# UMAP chess dataset EDA

---

## Staging

---

import section:

In [76]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

import umap

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

general parameters:

In [77]:
## metadata
notebook_ver = 0.2

## dataset
data_pth = './data/games.csv'

## Define nominal data variables to be thrown at one-hot-encoder
to_hot_encode = [
    'opening_eco',
    'increment_code',
    'victory_status'
]

## Define string variables for vectorization
to_vectorize = [
    'white_id',
    'black_id'
]

## Define numerical cols to use
num_cols = [
    'winner_bin',
    'rated_bin',
    'game_len_mins',
#     'turns', # dense var
#     'white_rating', # dense var
#     'black_rating', # dense var
    'opening_ply'
]

Data load:

In [78]:
games = pd.read_csv(data_pth)

## Preprocessing

---

In [79]:
## Encode binary vars to integers
games['rated_bin'] = np.where(games['rated'] == True, 1, 0)

games.drop(columns='rated',
          inplace=True)

In [80]:
games['winner_bin'] = np.where(games['winner'] == 'white', 0, 1)

games.drop(columns='winner',
          inplace=True)

In [81]:
## Opening, closing games dates transformation
games['created_at_dt'] = pd.to_datetime(games['created_at']/1000, unit='s', origin='unix')
games['last_move_at_dt'] = pd.to_datetime(games['last_move_at']/1000, unit='s', origin='unix')

games['game_len_dt'] = games['last_move_at_dt'] - games['created_at_dt']
games['game_len'] = games['last_move_at'] - games['created_at']

games['game_len_mins'] = games['game_len_dt'].dt.components['minutes']

In [82]:
## Instantiate One Hot Encoder
ohe = OneHotEncoder(handle_unknown='error')

In [83]:
## Instantiate Count Vectorizer
cve = CountVectorizer()

In [84]:
## Set column transformer
column_trans = make_column_transformer((ohe,to_hot_encode),
                                       remainder='passthrough')

In [85]:
## Fit column transformer
column_trans.fit(games[to_hot_encode])

ColumnTransformer(remainder='passthrough',
                  transformers=[('onehotencoder', OneHotEncoder(),
                                 ['opening_eco', 'increment_code',
                                  'victory_status'])])

In [86]:
## Produce one-hot-encoded array 
hot_encoded = column_trans.transform(games[to_hot_encode]).toarray()

hot_encoded_df = pd.DataFrame(hot_encoded, 
                              columns = column_trans.get_feature_names())



In [87]:
## Form final dataframe
hot_df = games[num_cols].merge(hot_encoded_df,
                     left_index=True,
                     right_index=True)

## Final DF

---

In [88]:
hot_df

Unnamed: 0,winner_bin,rated_bin,game_len_mins,opening_ply,onehotencoder__x0_A00,onehotencoder__x0_A01,onehotencoder__x0_A02,onehotencoder__x0_A03,onehotencoder__x0_A04,onehotencoder__x0_A05,...,onehotencoder__x1_9+9,onehotencoder__x1_90+0,onehotencoder__x1_90+120,onehotencoder__x1_90+30,onehotencoder__x1_90+5,onehotencoder__x1_90+8,onehotencoder__x2_draw,onehotencoder__x2_mate,onehotencoder__x2_outoftime,onehotencoder__x2_resign
0,0,0,0,5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,1,0,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,1,0,5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20053,0,1,5,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
20054,1,1,12,2,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
20055,0,1,2,3,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
20056,0,1,15,4,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [89]:
## create a algebraic notation to iccf notation board mapping
iccf_board = [str(x)+str(y) for x in range(1,9) for y in range(1,9)]
algebraic_board = [l + str(i) for l in list('abcdefgh') for i in range(1,9)]
board_mapping = {algebraic : iccf for algebraic, iccf in zip(algebraic_board, iccf_board)}

In [90]:
games.columns

Index(['id', 'created_at', 'last_move_at', 'turns', 'victory_status',
       'increment_code', 'white_id', 'white_rating', 'black_id',
       'black_rating', 'moves', 'opening_eco', 'opening_name', 'opening_ply',
       'rated_bin', 'winner_bin', 'created_at_dt', 'last_move_at_dt',
       'game_len_dt', 'game_len', 'game_len_mins'],
      dtype='object')

In [91]:
def to_uni(mov):
    move_unicode = []
    for char in mov:
        move_unicode.append(ord(char))
    return move_unicode

In [92]:
def encode_moves(moves_seq):
    moves = []
    for move in moves_seq.split(' '):
        ## Convert str encoding to unicode vals
        move_uni = to_uni(move)
        ## Multiply all unicode vals together
        move_prod = np.prod(np.array(move_uni))  
        ## Take log2 for every move and multiply by 10_000_000
        moves.append(int(np.log2(move_prod)*10_000_000))
    return(moves)

In [93]:
games['moves_encoded'] = games['moves'].apply(lambda x: encode_moves(x))

In [94]:
games['moves_len'] = games['moves_encoded'].apply(len)

In [95]:
games_moves = games['moves_encoded']

In [106]:
games.columns

Index(['id', 'created_at', 'last_move_at', 'turns', 'victory_status',
       'increment_code', 'white_id', 'white_rating', 'black_id',
       'black_rating', 'moves', 'opening_eco', 'opening_name', 'opening_ply',
       'rated_bin', 'winner_bin', 'created_at_dt', 'last_move_at_dt',
       'game_len_dt', 'game_len', 'game_len_mins', 'moves_encoded',
       'moves_len'],
      dtype='object')

In [96]:
games_moves_exploded = games_moves.explode()

In [None]:
# games_moves_exploded.pivot(index='order', 
#                            columns='index', 
#                            values='moves_encoded')

In [97]:
games_moves_exploded

0        123442959
0        123717766
0        123297963
0        123842441
0        259080238
           ...    
20057    241209151
20057    186540345
20057    241087450
20057    186800298
20057    239346994
Name: moves_encoded, Length: 1212827, dtype: object

In [104]:
exploded_moves = []

for game in games_moves_exploded.index.values:
    trans = pd.DataFrame(games_moves_exploded.loc[game]).T
    exploded_moves.append(trans)
    break

In [105]:
exploded_moves[0]

Unnamed: 0,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,0.10,0.11,0.12
moves_encoded,123442959,123717766,123297963,123842441,259080238,124130989,259638457,259924149,186302529,237858084,185871841,183722274,184172591


In [None]:
games_s['moves_class'].

In [None]:
games[['moves_win','winner_bin']]

## UMAP model

---

Parameters:

In [None]:
distance_metric = 'euclidean'
# metrics_available = euclidean, manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis,
# wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard, dice, russelrao, kulsinski,
# ll_dirichlet, hellinger, rogerstanimoto, sokalmichener, sokalsneath, yule

n_neighbors = 15
min_dist = 0.2

Instatiate & train the model:

In [None]:
## Instantiate UMAP model
reducer = umap.UMAP(metric=distance_metric,
                   n_neighbors=n_neighbors,
                   min_dist=min_dist)

In [None]:
## Train the model
embedding = reducer.fit_transform(hot_df)

---

## Plot

---

Parameters:

In [None]:
## switches
cbar_switch = True

## display
alpha = 1
marker_shape = 'o'
marker_size = 2
dpi = 300

## data
cmap_variable = games['game_len_mins']

## colors
facecolor = 'black'
labelcolor = 'white'
cmap = 'viridis'

Plot script:

In [None]:
## Render vis
fig = plt.figure(figsize=(14,12))
fig.patch.set_facecolor(facecolor)

plot = plt.scatter(embedding[:,0], 
                embedding[:,1], 
                marker=marker_shape,
                s=marker_size,
                edgecolor='none',
                c=cmap_variable,
                cmap='viridis', 
                alpha=alpha)

plt.axis("off")
plt.title(f'20k chess games; distance={distance_metric}; nn={n_neighbors}, min=dist={min_dist}', 
          color='w')

if cbar_switch:
    cbar = plt.colorbar(fraction=0.05, 
                    shrink=0.5,
                    ticks=[games['game_len_mins'].min(), 
                        games['game_len_mins'].quantile(q=0.50),
                        games['game_len_mins'].quantile(q=0.75),
                        games['game_len_mins'].max()], alpha=alpha
                           )

    cbar.set_label(cmap_variable.name, color=labelcolor)
    cbar.ax.yaxis.set_tick_params(color=labelcolor)
    cbar.solids.set_edgecolor("face")

plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=labelcolor)

_ = plot

# plt.savefig(f"screens/{distance_metric}_nn{n_neighbors}_mind{min_dist}_v{notebook_ver}.png", 
#             dpi=dpi, 
#             facecolor=facecolor)