In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

import json

In [None]:
# Load the train and test splits from their parquet files.
train_path = "datasets/binned_train.parquet"
test_path = "datasets/binned_test.parquet"

df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)

In [None]:
# Display the training frame for a quick visual check of the loaded columns.
df_train

In [None]:
# Peek at the first ten rows of the Opening and ECO columns.
opening_preview_columns = ["Opening", "ECO"]
df_train.loc[:, opening_preview_columns].head(10)

**Mean encoding**

In [None]:
# df_train["OpeningType"] = df_train["Opening"].str.split(":", expand=True)[0]
# df_train["ECOType"] = df_train["ECO"].str[:2]

# df_test["OpeningType"] = df_test["Opening"].str.split(":", expand=True)[0]
# df_test["ECOType"] = df_test["ECO"].str[:2]

In [None]:
# Mean (target) encoding: replace each categorical opening feature by the mean
# Elo of the training games that share its value.
#
# NOTE: the feature column is overwritten in place, so re-running this cell
# without reloading the data would encode the already-encoded values.

# opening_features = ["ECO", "OpeningType", "ECOType", "Opening"]
opening_features = ["Opening"]
for feature in opening_features:

    # Categories seen at least 100 times in train get their own encoding.
    category_counts = df_train[feature].value_counts()
    common_openings = category_counts[category_counts >= 100].index

    # Mean Elo per common category. Selecting the "Elo" column before
    # aggregating always yields a Series, so this also works when only one
    # category is common (squeeze() on a 1x1 frame would return a scalar,
    # which has no to_dict()).
    means_dict = (
        df_train
        .groupby(feature)["Elo"]
        .mean()
        .loc[common_openings]
        .to_dict()
    )

    # Shared fallback for rare categories: mean Elo over the training rows
    # whose value of *this* feature is not common. (Previously this always
    # looked at "Opening", regardless of the feature being encoded.)
    is_rare = ~df_train[feature].isin(common_openings)
    mean_if_rare = df_train.loc[is_rare, "Elo"].mean()
    if pd.isna(mean_if_rare):
        # No rare rows in train: use the overall mean so that categories
        # unseen in train do not end up as NaN in the test set.
        mean_if_rare = df_train["Elo"].mean()

    # Apply the train-derived mapping to both splits; anything without its
    # own mean (rare or unseen) receives the shared fallback.
    df_train[feature] = (
        df_train[feature]
        .map(means_dict)
        .fillna(mean_if_rare)
    )

    df_test[feature] = (
        df_test[feature]
        .map(means_dict)
        .fillna(mean_if_rare)
    )

In [None]:
# s = pd.Series(means_dict).sort_values().reset_index(drop=True)

# fig = px.line(
#     s.values, 
#     template="plotly_white", 
# )
# fig.data[0].marker.line.width=0
# fig.update_xaxes(title="")
# fig.update_yaxes(title="")
# fig.update_layout(
#     height=1080//2, 
#     width=1920//2, 
#     font_size=20,
#     font_family="Consolas",
#     showlegend=False,
#     yaxis_range=[900, 2200]
# )

# fig.data[0].line.color="#008C45"
# fig.data[0].line.width=2

# fig.show()
# fig.write_image("presentation/images/opening_mean.png", scale=2)

**Opening line**

In [None]:
# df_train["FirstMoves"] = df_train["FirstMoves"].str.split(" ")
# df_test["FirstMoves"] = df_test["FirstMoves"].str.split(" ")

# elos = df_train["Elo"].values
# moves = df_train["FirstMoves"].values

In [None]:
# Per-prefix accumulators, keyed by the space-joined move prefix.
line_count = {}   # number of games whose moves start with the prefix
line_rating = {}  # sum of Elo over those games

# Build the inputs here so the cell runs on a fresh kernel instead of relying
# on variables left behind by a previously executed (now commented-out) cell.
# "FirstMoves" entries are accepted either as a space-separated string or as
# an already split sequence of moves.
elos = df_train["Elo"].values
moves = [
    game.split(" ") if isinstance(game, str) else game
    for game in df_train["FirstMoves"].values
]

for elo, move in zip(elos, moves):

    # Missing move lists (None / NaN) have no length; skip them.
    if not hasattr(move, "__len__"):
        continue

    # Prefix lengths run from 1 to min(20, len(move)) - 1: the full move list
    # of a game shorter than 20 moves is itself not counted.
    for i in range(1, min(20, len(move))):

        key = " ".join(move[:i])
        line_count[key] = line_count.get(key, 0) + 1
        line_rating[key] = line_rating.get(key, 0) + elo

In [None]:
# Mean Elo per move prefix, keeping only prefixes seen in more than 100 games.
result = {}
for line, rating_sum in line_rating.items():
    games = line_count[line]
    if games > 100:
        result[line] = rating_sum / games

In [None]:
# Prefix means as a Series indexed by move prefix, ordered by ascending mean.
line_means = pd.Series(result)
series = line_means.sort_values()

In [None]:

# fig = px.line(
#     series.values, 
#     template="plotly_white", 
# )
# fig.data[0].marker.line.width=0
# fig.update_xaxes(title="")
# fig.update_yaxes(title="")
# fig.update_layout(
#     height=1080//2, 
#     width=1920//2, 
#     font_size=20,
#     font_family="Consolas",
#     showlegend=False,
#     yaxis_range=[900, 2200]
# )

# fig.data[0].line.color="#008C45"
# fig.data[0].line.width=2

# fig.show()
# fig.write_image("presentation/images/line_mean.png", scale=2)

In [None]:
# Nested dict of moves: each node maps the next move to its child node, and
# stores the mean Elo of the prefix leading to it under the "mean" key.
tree = {}
sorted_lines = series.sort_index().to_dict()
for line, mean_elo in sorted_lines.items():
    node = tree
    # Descend one move at a time, creating child nodes on first visit.
    for san in line.split(" "):
        node = node.setdefault(san, {})
    node["mean"] = mean_elo

In [None]:
# Spot check: mean stored for the line "e4 e5 Bc4" (raises KeyError if that
# line is not present in the tree).
tree["e4"]["e5"]["Bc4"]["mean"]

In [None]:
def get_line_mean(moves_list, line_tree=None, default=1500):
    """Return the mean stored for the deepest known prefix of a move list.

    Walks the nested move tree one move at a time and stops at the first move
    that has no child node. The ``"mean"`` stored on the last node reached is
    returned, including when every move of ``moves_list`` was found.

    Parameters
    ----------
    moves_list : sequence of str, or str
        Moves of the game, in order. A space-separated string is split into
        moves first. Missing values (``None`` / NaN) yield ``default``.
    line_tree : dict, optional
        Nested mapping ``{move: {"mean": value, next_move: {...}}}``.
        Defaults to the module-level ``tree``.
    default : number, optional
        Value returned when no prefix of ``moves_list`` has a stored mean
        (unknown first move, empty or missing move list).

    Returns
    -------
    number
        The mean of the deepest matching prefix, or ``default``.
    """
    if line_tree is None:
        line_tree = tree

    if isinstance(moves_list, str):
        moves_list = moves_list.split(" ")
    elif moves_list is None or not hasattr(moves_list, "__iter__"):
        # Missing value (None / float NaN): nothing to look up.
        return default

    node = line_tree
    for move in moves_list:
        child = node.get(move)
        # Stop at the first unknown move; the isinstance check also keeps a
        # move token equal to "mean" from being mistaken for a child node.
        if not isinstance(child, dict):
            break
        node = child

    # The root carries no "mean", so an unmatched first move gives `default`.
    return node.get("mean", default)

In [None]:
# Attach the line-tree feature to both splits; any lookup that produced a
# missing value falls back to 1500.
for frame in (df_train, df_test):
    frame["LineTreeMean"] = frame["FirstMoves"].map(get_line_mean).fillna(1500)

In [None]:
# .drop(columns=["FirstMoves"])
# Persist both splits, including the newly added feature columns.
outputs = {
    "datasets/final_train.parquet": df_train,
    "datasets/final_test.parquet": df_test,
}
for out_path, frame in outputs.items():
    frame.to_parquet(out_path)