In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans

# Shared seed: passed as `random_state` to PCA and MiniBatchKMeans further down.
SEED = 1660

ModuleNotFoundError: No module named 'sklearn'

**Read the data**

In [None]:
# Load the train and test feature tables; each gets a fresh 0..n-1 index.
df_train, df_test = (
    pd.read_parquet(parquet_path).reset_index(drop=True)
    for parquet_path in (
        "03_datasets/final_train.parquet",
        "03_datasets/final_test.parquet",
    )
)

In [None]:
# Drop the "ECO" column and continue with a 10,000-row random subsample of each split.
# The draw is seeded with SEED so that Restart & Run All selects the same rows every
# time; without `random_state` every run clustered and plotted a different subsample.
# Note: `.sample` keeps the original (now shuffled) index labels.
df_train = df_train.drop(columns=["ECO"]).sample(10_000, random_state=SEED)
df_test = df_test.drop(columns=["ECO"]).sample(10_000, random_state=SEED)

In [None]:
# Alternative feature/target split, currently disabled:
# X_train = df_train.drop(columns=["GameId", "WhiteElo", "BlackElo", "Elo"])
# Y_train = df_train["Elo"]

# X_test = df_test.drop(columns=["GameId", "WhiteElo", "BlackElo", "Elo"])
# Y_test = df_test["Elo"]

# Columns selected into X_train / X_test.
# The literal list repeats 'MeanStartLoss'; dict.fromkeys() removes the repeat while
# keeping the declared order. A plain set() would also de-duplicate, but its iteration
# order depends on per-process string hashing, so the column order of X_train/X_test
# could change between kernel restarts.
# NOTE(review): 'Elo' is part of the feature set and is also copied into
# df_train_clust["Elo"] later on - confirm it is meant to influence the projection.
features = list(dict.fromkeys([
    'Elo',
    'MaxMoveNumber',
    'MeanStartLoss',
    'Opening',
    'StdWinodds', 'MeanAbsWinodds', 'MedianAbsWinodds', 'MaxAbsWinodds',
    'MeanStartAbsWinodds', 'MaxStartAbsWinodds', 'MedianAdvLoss',
    'MeanAdvLoss', 'MaxAdvLoss', 'StdAdvLoss',
    'MeanStartLoss',
    'MeanQueenLossStart', 'MeanRookLoss',
    'MeanKnightLossStart', 'MeanBishopLoss', 'MeanBishopLossStart',
    'MeanKingLoss1', 'MeanKingLoss2', 'MeanPawnLossStart', 'MeanPawnLoss',
    'MeanFlagPawnLoss1', 'MeanFlagPawnLoss2', 'MeanCenterPawnLoss1',
    'MeanCenterPawnLoss2', 'MinSoonMateMoveNumber',
    'MedianSoonMateMoveNumber', 'MeanSoonMateMoveNumber', 'MeanQueenLoss',
    'MeanKnightLoss',
    'LineTreeMean'
]))

X_train = df_train[features]
X_test = df_test[features]

In [None]:
# Replace every training feature by its within-column rank (DataFrame.rank defaults).
# NOTE(review): only X_train is rank-transformed. X_test keeps its raw values, yet the
# next cell standardizes X_test with the mean/std of the *ranked* X_train. Confirm
# whether the test split should be mapped onto the training rank scale before scaling.
X_train = X_train.rank()

**PCA**

In [None]:
# Standardize both splits with per-column statistics estimated on the training
# split only, so the test split is expressed on the training scale.
mean = X_train.mean()
std = X_train.std()
X_train_scaled = X_train.sub(mean).div(std)
X_test_scaled = X_test.sub(mean).div(std)

In [None]:
# Fit a 2-component PCA on the standardized training features, then project both
# splits with it. Each projection becomes a DataFrame whose columns are 0 and 1.
pca = PCA(n_components=2, random_state=SEED).fit(X_train_scaled)
df_train_clust, df_test_clust = (
    pd.DataFrame(pca.transform(scaled_split))
    for scaled_split in (X_train_scaled, X_test_scaled)
)
# print(pca.explained_variance_ratio_ * 100)
# print(np.cumsum(pca.explained_variance_ratio_) * 100)

In [23]:
000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000

0

**Plot**

In [None]:
# Attach MaxMoveNumber (as "NMoves") and Elo to the PCA coordinates. `.values` is used
# so the columns are matched by position rather than by index label: df_train still
# carries the index produced by `.sample`, while df_train_clust has a fresh one.
df_train_clust = df_train_clust.assign(
    NMoves=df_train["MaxMoveNumber"].values,
    Elo=df_train["Elo"].values,
)

In [None]:
# Partition the PCA plane into N_CLUSTERS groups. The model is fitted on the training
# coordinates only; both splits are then labelled with the fitted model.
N_CLUSTERS = 20
clustering_components = [0, 1]  # the two PCA component columns
cluster_model = MiniBatchKMeans(
    n_clusters=N_CLUSTERS,
    batch_size=20_000,
    random_state=SEED,
    n_init=2,
).fit(df_train_clust[clustering_components])

for split_frame in (df_train_clust, df_test_clust):
    split_frame["cluster"] = cluster_model.predict(split_frame[clustering_components])

In [None]:
# Map every cluster id to the mean NMoves of its training points, then colour the
# points of both splits by the value of the cluster they fall into.
cluster_mean_moves = df_train_clust.groupby("cluster").agg({"NMoves": "mean"})
coloring = cluster_mean_moves.squeeze().to_dict()

for split_frame in (df_train_clust, df_test_clust):
    split_frame["color"] = split_frame["cluster"].map(coloring)

# Scatter of the training points in the PCA plane, coloured by the per-cluster mean.
fig = px.scatter(
    df_train_clust.sample(10_000, random_state=0),
    x=0,
    y=1,
    color="color",
    labels={"color": "........"},
    color_continuous_scale=["darkred", "red", "orange", "yellow", "lime", "green"],
    template="plotly_dark",
)

# Shrink the markers of the (single) scatter trace.
fig.data[0].marker.size = 3

# Blank axis titles, then fix the canvas size and font.
fig.update_xaxes(title="").update_yaxes(title="").update_layout(
    height=1080,
    width=1920 // 2,
    font_size=20,
    font_family="Consolas",
)
fig.show()
# fig.write_image("presentation/images/pca_smooth.png", scale=2)

In [None]:
# Scatter of the training points in the PCA plane, coloured by cluster id.
# At most 150,000 points are drawn. DataFrame.sample() raises ValueError when asked
# for more rows than the frame holds (df_train_clust is built from a 10,000-row
# subsample), so the requested size is capped at the frame length.
fig = px.scatter(
    df_train_clust.sample(min(150_000, len(df_train_clust)), random_state=0),
    x=0,
    y=1,
    color="cluster",
    labels={"cluster": "........"},
    # Reversed purples followed by blues.
    color_continuous_scale=(
        px.colors.sequential.Purples[::-1]+
        px.colors.sequential.Blues
    ),
    template="plotly_dark"
)

# Shrink the markers of the first scatter trace.
fig.data[0].marker.size=3

# Blank axis titles.
fig.update_xaxes(title="")
fig.update_yaxes(title="")

fig.update_layout(
    height=1080,
    width=1920//2,
    font_size=20,
    font_family="Consolas"
)
fig.show()
# fig.write_image("presentation/images/pca_clusters.png", scale=2)

In [None]:
# Scatter of the training points in the PCA plane, coloured by each row's own Elo.
# Two fixes compared with the previous version of this cell:
#   * the colour column was "Y", which is never added to df_train_clust; the target
#     is stored there under "Elo", so that column is used instead;
#   * DataFrame.sample() raises ValueError when asked for more rows than the frame
#     holds, so the 150,000-point request is capped at the frame length.
fig = px.scatter(
    df_train_clust.sample(min(150_000, len(df_train_clust)), random_state=0),
    x=0,
    y=1,
    color="Elo",
    labels={"Elo": "........"},
    color_continuous_scale=["black", "darkred", "red", "orange", "yellow", "lime", "green"],
    template="plotly_dark"
)

# Shrink the markers of the first scatter trace.
fig.data[0].marker.size=3

# Blank axis titles.
fig.update_xaxes(title="")
fig.update_yaxes(title="")

fig.update_layout(
    height=1080,
    width=1920//2+1,
    font_size=20,
    font_family="Consolas",
)
fig.show()
# fig.write_image("presentation/images/pca_original.png", scale=2)

In [None]:
# 