In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

# Render up to 1000 columns when displaying wide frames.
pd.set_option("display.max_columns", 1000)

**Merge files**

In [None]:
train_n_batches = 15
test_n_batches = 4

# Batch files are numbered from 1: the first `train_n_batches` files make up
# the training frame, the next `test_n_batches` files make up the test frame.
train_batch_ids = range(1, train_n_batches + 1)
test_batch_ids = range(train_n_batches + 1, train_n_batches + test_n_batches + 1)

# NOTE(review): concat keeps each batch's own index labels, so labels may
# repeat across batches - consider ignore_index=True if the index is not meaningful.
df_train = pd.concat(
    [pd.read_parquet(f"features/batch_{i}.parquet") for i in train_batch_ids]
)

df_test = pd.concat(
    [pd.read_parquet(f"features/batch_{i}.parquet") for i in test_batch_ids]
)

In [None]:
# df_train = df_train[ df_train["MaxMoveNumber"] >= 5 ]
# df_test = df_test[ df_test["MaxMoveNumber"] >= 5 ]

In [None]:
# Number of rows in the training and test frames.
df_train.shape[0], df_test.shape[0]

In [None]:
# Preview the first rows of the training frame.
df_train.head()

**Check features**

In [None]:
# List the column labels available in the training frame.
df_train.columns

In [None]:
# px.line(
#     df_train.groupby(
#     df_train['Elo'].clip(600, 2500) // 100
#     ).agg({'MaxMoveNumber': 'mean'}).squeeze()
# )

In [None]:
# Display the training frame.
df_train

In [None]:
# Bucket MeanCheckEarly into bins of width 0.01 and cap the bin number to [0, 40].
# NOTE(review): float floor-division can put a value lying exactly on a bin
# edge one bin too low (e.g. 0.03 // 0.01 == 2.0); harmless for a quick look.
feature = df_train["MeanCheckEarly"].floordiv(0.01).clip(lower=0, upper=40)

In [None]:
# Number of training rows per bin of `feature`, ordered by bin value.
bin_counts = feature.value_counts().sort_index()

fig = px.line(bin_counts)

# Draw a marker on every bin so sparsely populated bins stay visible.
fig.data[0].update(mode="lines+markers")
fig.update_layout(template="plotly_white", showlegend=False)

fig.show()

In [None]:
# Mean Elo of the training rows that fall into each bin of `feature`.
mean_elo_per_bin = df_train.groupby(feature)[["Elo"]].mean()

fig = px.line(mean_elo_per_bin)

# Draw a marker on every bin so sparsely populated bins stay visible.
fig.data[0].update(mode="lines+markers")
fig.update_layout(template="plotly_white", showlegend=False)

fig.show()

**Save files**

In [None]:
def bin_features(df, reference=None, lower_q=0.005, upper_q=0.995):
    """Return a copy of ``df`` with outliers in its numeric feature columns clipped.

    Despite the name, no discretisation is performed: each numeric feature
    column is clipped to the range between its ``lower_q`` and ``upper_q``
    quantiles. The columns named in ``passthrough`` below are copied
    unchanged, and so is every column that is not numeric (booleans
    included), because a quantile cannot be computed for it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is not modified.
    reference : pandas.DataFrame, optional
        Frame the clipping bounds are computed from. Defaults to ``df``
        itself. Pass the training frame when transforming a held-out frame
        so that both are clipped with the same bounds. It must contain every
        column of ``df`` that gets clipped, otherwise ``KeyError`` is raised.
    lower_q, upper_q : float, optional
        Quantiles used as the lower and upper clipping bounds.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns, column order and index as ``df``.
    """
    passthrough = {
        "GameId", "White", "Black", "WhiteElo", "BlackElo", "Elo",
        "Opening", "ECO", "FirstMoves",
    }
    bounds_source = df if reference is None else reference

    # Start from a copy: untouched columns keep their values and dtype, and
    # the result is not grown one inserted column at a time.
    df_new = df.copy()

    for feature in df.columns:
        if feature in passthrough:
            continue

        column = df[feature]
        # Quantile clipping only makes sense for real numeric data.
        if not pd.api.types.is_numeric_dtype(column) or pd.api.types.is_bool_dtype(column):
            continue

        # One quantile call yields both bounds.
        low, high = bounds_source[feature].quantile([lower_q, upper_q])
        df_new[feature] = column.clip(low, high)

    return df_new

In [None]:
# Clip each split and write it to disk. Each call derives its clipping bounds
# from the frame it is given.
for frame, path in (
    (df_train, "datasets/binned_train.parquet"),
    (df_test, "datasets/binned_test.parquet"),
):
    bin_features(frame).to_parquet(path)