# UMAP chess dataset EDA

---

## Staging

---

import section:

In [None]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt

import umap

from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import make_column_transformer

general parameters:

In [None]:
## metadata
# Notebook version tag; within this notebook it is only referenced by the
# commented-out savefig filename at the very end.
notebook_ver = 0.3

## dataset
# Relative path of the games CSV that the "Data load" cell reads into GAMES.
data_pth = './data/games.csv'

# ## Define nominal data variables to be thrown at one-hot-encoder
# to_hot_encode = [
#     'opening_eco',
#     'increment_code',
#     'victory_status'
# ]

# ## Define string variables for vectorization
# to_vectorize = [
#     'white_id',
#     'black_id'
# ]

# ## Define numerical cols to use
# num_cols = [
#     'winner_bin',
#     'rated_bin',
#     'game_len_mins',
# #     'turns', # dense var
# #     'white_rating', # dense var
# #     'black_rating', # dense var
#     'opening_ply'
# ]

Data load:

In [None]:
# Load the raw games table from the CSV path configured in `data_pth`.
GAMES = pd.read_csv(data_pth)

## Preprocessing

---

In [None]:
## Binary flag: 1 where `rated` equals True, 0 for everything else
GAMES['rated_bin'] = np.where(GAMES['rated'].eq(True), 1, 0)

## The source column is replaced by its integer encoding
GAMES.drop(columns=['rated'], inplace=True)

In [None]:
## Binary flag: 0 where `winner` is 'white', 1 for every other value
## NOTE(review): any non-'white' outcome (not only a black win) ends up as 1 -
## confirm this is intended if the column can hold more than two values.
GAMES['winner_bin'] = np.where(GAMES['winner'] != 'white', 1, 0)

## The source column is replaced by its integer encoding
GAMES.drop(columns=['winner'], inplace=True)

In [None]:
## Opening, closing games dates transformation
# The raw timestamps are divided by 1000 and read as seconds since the Unix
# epoch, i.e. the stored values are treated as milliseconds.
GAMES['created_at_dt'] = pd.to_datetime(GAMES['created_at']/1000, unit='s', origin='unix')
GAMES['last_move_at_dt'] = pd.to_datetime(GAMES['last_move_at']/1000, unit='s', origin='unix')

# Game duration, once as a Timedelta and once in the raw timestamp units.
GAMES['game_len_dt'] = GAMES['last_move_at_dt'] - GAMES['created_at_dt']
GAMES['game_len'] = GAMES['last_move_at'] - GAMES['created_at']

# Total whole minutes of the game. `.dt.components['minutes']` only returns
# the 0-59 minutes field of the duration (a 1h05m game would give 5), so the
# full Timedelta is floor-divided by one minute instead.
GAMES['game_len_mins'] = GAMES['game_len_dt'] // pd.Timedelta(minutes=1)

## Part 1 - Metadata table

---

In [None]:
# ## Instantiate One Hot Encoder
# ohe = OneHotEncoder(handle_unknown='error')

In [None]:
# ## Instantiate Count Vectorizer
# cve = CountVectorizer()

In [None]:
# ## Set column transformer
# column_trans = make_column_transformer((ohe,to_hot_encode),
#                                        remainder='passthrough')

In [None]:
# ## Fit column transformer
# column_trans.fit(GAMES[to_hot_encode])

In [None]:
# ## Produce one-hot-encoded array 
# hot_array = column_trans.transform(games[to_hot_encode]).toarray()

# HOT_ENCODED = pd.DataFrame(hot_array, 
#                               columns = column_trans.get_feature_names())

In [None]:
# ## Form final dataframe
# META = GAMES[num_cols].merge(HOT_ENCODED,
#                      left_index=True,
#                      right_index=True)

### Metadata table

In [None]:
# META

## Part 2 - Vectorized moves table

---

In [None]:
## Create an algebraic-notation to ICCF-notation board mapping
# iccf_board = [str(x)+str(y) for x in range(1,9) for y in range(1,9)]
# algebraic_board = [l + str(i) for l in list('abcdefgh') for i in range(1,9)]
# board_mapping = {algebraic : iccf for algebraic, iccf in zip(algebraic_board, iccf_board)}

In [None]:
# Display the column labels currently present in GAMES.
GAMES.columns

In [None]:
def to_uni(mov):
    '''
    Return the Unicode code points of the characters in `mov`, in order,
    as a list of ints (an empty string gives an empty list).
    '''
    return [ord(symbol) for symbol in mov]

In [None]:
def encode_moves(moves_seq):
    '''
    Encode a space-separated string of moves as a list of integers.

    Every token is converted to its Unicode code points, the code points
    are multiplied together, and the move is stored as
    int(log2(product) * 10_000_000).

    moves_seq : str with moves separated by single spaces
    returns   : list of int, one entry per token (an empty token encodes to 0)
    '''
    moves = []
    for move in moves_seq.split(' '):
        ## Convert str encoding to unicode vals
        move_uni = to_uni(move)
        ## Multiply all unicode vals together.
        ## The accumulator is forced to 64 bits: NumPy's default integer can
        ## be 32-bit on some platforms, where the product of five or more
        ## code points would silently overflow.
        move_prod = np.prod(np.array(move_uni, dtype=np.int64))
        ## Take log2 for every move and multiply by 10_000_000
        moves.append(int(np.log2(move_prod) * 10_000_000))
    return moves

In [None]:
## Store each game's integer-encoded move list in its own column
GAMES['moves_encoded'] = GAMES['moves'].apply(encode_moves)

In [None]:
## Number of encoded moves per game, and the largest such count in the dataset
GAMES['moves_len'] = GAMES['moves_encoded'].map(len)
moves_max = GAMES['moves_len'].max()

In [None]:
## Keep only the encoded moves and the game id
MOVES = GAMES.loc[:, ['moves_encoded', 'id']]

In [None]:
## One row per encoded move; rows of the same game keep that game's index label
MOVES_EXPLODED = MOVES.explode('moves_encoded')

In [None]:
# Display the exploded moves table.
MOVES_EXPLODED

In [None]:
# Spot-check a single exploded row, selected by position (not by index label).
MOVES_EXPLODED.iloc[1946]

In [None]:
from tqdm import tqdm_notebook, tnrange

## Build one single-row frame per game: index = game id, columns = 1..n_moves
transposed_moves = []

## Iterate over the distinct index labels of the exploded table (one per game).
## Looping over range(number of exploded rows) would request labels that do
## not exist once the counter passes the last game and raise a KeyError.
for game in tqdm_notebook(MOVES_EXPLODED.index.unique()):

    ## A list selector always returns a DataFrame, even for a one-move game
    ## (a scalar label would return a Series there, and the id would then
    ## not be dropped and would leak into the move row).
    GRABBED_MOVES = MOVES_EXPLODED.loc[[game]]

    cols = range(1, len(GRABBED_MOVES) + 1)

    ## Every exploded row of a game carries the same id, so take the first
    game_id = GRABBED_MOVES['id'].iloc[0]

    GRABBED_MOVES = GRABBED_MOVES.drop(columns='id')

    ## n_moves x 1 -> 1 x n_moves
    TRANS_MOVES = GRABBED_MOVES.T

    TRANS_MOVES.index = [game_id]
    TRANS_MOVES.columns = cols

    transposed_moves.append(TRANS_MOVES)

In [None]:
## Stack the per-game rows into one table. `concat` lives in the pandas
## namespace (imported as `pd`); the bare name is undefined in this notebook.
MOVES_TRANS = pd.concat(transposed_moves)

In [None]:
# Persist the transposed moves table as CSV in the current working directory.
MOVES_TRANS.to_csv('./moves_trans.csv')

# @@@ Cell > Run All Above @@@

In [None]:
## The dangling attribute access (trailing '.') was a SyntaxError and is removed.
## NOTE(review): `games_s` and its 'moves_class' column are not created anywhere
## in the visible notebook - confirm where they come from before running.
games_s['moves_class']

In [None]:
# NOTE(review): lowercase `games` and the 'moves_win' column are not defined in
# the visible notebook (the loaded frame is `GAMES`) - confirm before running.
games[['moves_win','winner_bin']]

## UMAP model

---

Parameters:

In [None]:
# Distance metric passed to umap.UMAP(metric=...)
distance_metric = 'euclidean'
# metrics_available = euclidean, manhattan, chebyshev, minkowski, canberra, braycurtis, mahalanobis,
# wminkowski, seuclidean, cosine, correlation, haversine, hamming, jaccard, dice, russellrao, kulsinski,
# ll_dirichlet, hellinger, rogerstanimoto, sokalmichener, sokalsneath, yule

# Passed to umap.UMAP as n_neighbors / min_dist; also shown in the plot title
n_neighbors = 15
min_dist = 0.2

Instantiate & train the model:

In [None]:
## Build the UMAP reducer from the parameters defined in the cell above
reducer = umap.UMAP(
    n_neighbors=n_neighbors,
    min_dist=min_dist,
    metric=distance_metric,
)

In [None]:
## Train the model
# Fit the reducer and keep the embedding; the plot cell reads columns 0 and 1 of it.
# NOTE(review): `hot_df` is not defined anywhere in the visible notebook (the
# cells that built HOT_ENCODED / META are commented out) - confirm which table
# is meant to be embedded here.
embedding = reducer.fit_transform(hot_df)

---

## Plot

---

Parameters:

In [None]:
## switches
# Draw a colorbar next to the scatter plot when True
cbar_switch = True

## display
alpha = 1
marker_shape = 'o'
marker_size = 2
# Resolution used by the (commented-out) savefig call
dpi = 300

## data
# Series used to color the points; its name is also used as the colorbar label.
# The loaded frame is `GAMES` - lowercase `games` is never defined in this notebook.
cmap_variable = GAMES['game_len_mins']

## colors
facecolor = 'black'
labelcolor = 'white'
cmap = 'viridis'

Plot script:

In [None]:
## Render vis
fig = plt.figure(figsize=(14,12))
fig.patch.set_facecolor(facecolor)

## 2-D scatter of the embedding, colored by `cmap_variable`.
## The colormap comes from the `cmap` parameter instead of a hard-coded name.
plot = plt.scatter(embedding[:,0],
                embedding[:,1],
                marker=marker_shape,
                s=marker_size,
                edgecolor='none',
                c=cmap_variable,
                cmap=cmap,
                alpha=alpha)

plt.axis("off")
plt.title(f'20k chess games; distance={distance_metric}; nn={n_neighbors}, min_dist={min_dist}',
          color='w')

if cbar_switch:
    ## Ticks at min / median / 75th percentile / max of the plotted variable.
    ## They are read from `cmap_variable` so ticks, colors and label always
    ## describe the same series.
    cbar = plt.colorbar(fraction=0.05,
                    shrink=0.5,
                    ticks=[cmap_variable.min(),
                        cmap_variable.quantile(q=0.50),
                        cmap_variable.quantile(q=0.75),
                        cmap_variable.max()], alpha=alpha
                           )

    cbar.set_label(cmap_variable.name, color=labelcolor)
    cbar.ax.yaxis.set_tick_params(color=labelcolor)
    cbar.solids.set_edgecolor("face")

    ## Tick-label styling needs `cbar`, so it stays inside the switch;
    ## outside it raised a NameError whenever the colorbar was disabled.
    plt.setp(plt.getp(cbar.ax.axes, 'yticklabels'), color=labelcolor)

_ = plot

# plt.savefig(f"screens/{distance_metric}_nn{n_neighbors}_mind{min_dist}_v{notebook_ver}.png", 
#             dpi=dpi, 
#             facecolor=facecolor)