In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm

from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error
from sklearn.model_selection import cross_validate, KFold

# Seed passed to the seeded components of this notebook (e.g. the KFold
# splitter and the CatBoost models) so that results are repeatable.
SEED = 1660

## Загружаем данные

In [None]:
# Read the prepared train / test splits from their parquet files.
train_path = "datasets/final_train.parquet"
test_path = "datasets/final_test.parquet"

df_train = pd.read_parquet(train_path)
df_test = pd.read_parquet(test_path)

In [None]:
# Remove the "ECO" column from both splits.
# errors="ignore" keeps this cell idempotent: it reassigns its own inputs, so
# re-running it after the column is already gone would otherwise raise KeyError.
df_train = df_train.drop(columns=["ECO"], errors="ignore")
df_test = df_test.drop(columns=["ECO"], errors="ignore")

In [None]:
# Columns listed here are excluded from the feature matrices; "Elo" is the
# regression target.
non_feature_columns = ["GameId", "Elo", "White", "Black", "WhiteElo", "BlackElo"]

X_train, Y_train = df_train.drop(columns=non_feature_columns), df_train["Elo"]
X_test, Y_test = df_test.drop(columns=non_feature_columns), df_test["Elo"]

In [None]:
# Display the shapes of both feature matrices as a sanity check.
X_train.shape, X_test.shape

In [None]:
# Standardise the features with statistics taken from the training split only;
# the test split is transformed with the same mean and std.
mean = X_train.mean()
# A column that is constant in the training split has std == 0, and dividing by
# it would produce NaN/inf. Such columns are scaled by 1 instead, which turns
# them into all-zero columns in the training split.
std = X_train.std().replace(0, 1.0)

X_train = (X_train - mean) / std
X_test = (X_test - mean) / std

In [None]:
# Add the intercept ("const") column expected by sm.OLS to both splits.
# NOTE(review): add_constant is called with its default has_constant setting,
# which presumably skips the column when the frame already contains a constant
# one — confirm that train and test always end up with identical columns.
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)

In [None]:
# Histogram of the "MeanQueenMoveEarly" column of the (already standardised)
# training features.
px.histogram(X_train["MeanQueenMoveEarly"])

In [None]:
# Ordinary least squares of the target on the training feature matrix
# (which at this point includes the "const" column).
model = sm.OLS(Y_train, X_train)

In [None]:
# Fit the OLS model and keep the fitted results object.
result = model.fit()

In [None]:
# Print the statsmodels summary table of the fitted OLS model.
print(result.summary())

## Вспомогательные функции

In [None]:
def plot_bars(x, title, colors=("white", "green")):
    """Show a grouped bar chart of ``x`` on the dark Plotly template.

    Parameters
    ----------
    x
        Data accepted by ``plotly.express.bar``; in this notebook a DataFrame
        with one column per series.
    title : str
        Text used as the figure title.
    colors : sequence of str, optional
        Colours applied to the traces in order. Traces beyond the length of
        this sequence keep Plotly's default styling.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    fig = px.bar(x)

    # Pair traces with colours instead of indexing fig.data[0] / fig.data[1]
    # directly, so that input with a single series no longer raises IndexError.
    for trace, color in zip(fig.data, colors):
        trace.marker.color = color
        trace.marker.line.width = 0

    fig.update_layout(
        barmode="group",
        bargroupgap=0.0,
        template="plotly_dark",
        xaxis_title="K",
        yaxis_title="Value",
        title_text=title
    )

    fig.show()

In [None]:
def get_crossval_report(model, X_train, Y_train, n_splits=3):
    """Cross-validate ``model`` and plot its per-fold train/test R^2 and MAE.

    Parameters
    ----------
    model
        Estimator passed to ``sklearn.model_selection.cross_validate``.
    X_train, Y_train
        Features and target used for cross-validation.
    n_splits : int, optional
        Number of shuffled K-fold splits (default 3). The splitter is seeded
        with the notebook-level ``SEED``.

    Returns
    -------
    None
        Two bar charts are displayed through ``plot_bars``.
    """
    cv_scores = cross_validate(
        model,
        X_train, Y_train,
        cv=KFold(n_splits=n_splits, random_state=SEED, shuffle=True),
        scoring=["r2", "neg_mean_absolute_error"],
        return_train_score=True
    )

    def _fold_scores(metric):
        # One row per fold, train and test scores side by side.
        return pd.DataFrame({
            "Train": cv_scores[f"train_{metric}"],
            "Test": cv_scores[f"test_{metric}"]
        })

    # R^2 is plotted multiplied by 100, so the title states the percent scale.
    plot_bars(_fold_scores("r2") * 100, title="R^2, %")

    # The scorer returns negated MAE; flip the sign back before plotting.
    plot_bars(-1 * _fold_scores("neg_mean_absolute_error"), title="MAE")

## Линейная модель

In [None]:
# Linear regression with scikit-learn's default settings.
linear_model = LinearRegression()

In [None]:
# Plot the cross-validated R^2 and MAE of the linear model on the training split.
get_crossval_report(linear_model, X_train, Y_train)

## Градиентный бустинг

In [None]:
# NOTE(review): the variable name and the section heading refer to gradient
# boosting, but the estimator created here is a depth-2 random forest —
# confirm which one is intended.
# random_state=SEED makes the forest repeatable between runs, matching the
# seeded KFold splitter and CatBoost models used elsewhere in this notebook.
catboost_model = RandomForestRegressor(max_depth=2, random_state=SEED)

In [None]:
# Plot the cross-validated R^2 and MAE of `catboost_model` on the training split.
get_crossval_report(catboost_model, X_train, Y_train)

**Смесь моделей**

In [None]:
# Columns given to the first model; the second model is trained on every
# remaining column of the feature matrix.
features_split = [
    "LineTreeMean", "MeanStartLoss", "Opening",
    "MeanPawnLossStart", "MeanKnightLossStart", "MeanBishopLossStart", "MeanQueenLossStart"
]


def _fit_catboost(train_features, test_features, target):
    """Fit a CatBoost regressor and return it together with its test predictions.

    Both models of the blend share the same hyper-parameters, so they are
    built by this single helper instead of two copy-pasted blocks.
    """
    regressor = CatBoostRegressor(
        iterations=250,
        max_depth=2,
        random_seed=SEED,
        verbose=0
    )
    regressor.fit(train_features, target)
    return regressor, regressor.predict(test_features)


model_1, pred_1 = _fit_catboost(
    X_train[features_split],
    X_test[features_split],
    Y_train
)

model_2, pred_2 = _fit_catboost(
    X_train.drop(columns=features_split),
    X_test.drop(columns=features_split),
    Y_train
)

In [None]:
# Test-split R^2 of the model trained on the `features_split` columns only.
r2_score(Y_test, pred_1)

In [None]:
# Test-split R^2 of the model trained on the remaining columns.
r2_score(Y_test, pred_2)

In [None]:
# Test-split R^2 of the equal-weight average of both models' predictions.
r2_score(Y_test, (pred_1+pred_2) / 2)

In [None]:
# Feature-importance table of the second model (trained without `features_split`).
model_2.get_feature_importance(prettified=True)

## Финальная модель

In [None]:
# The final model is a linear regression fitted on the whole training split.
# NOTE(review): X_train already carries the "const" column from sm.add_constant
# while LinearRegression fits its own intercept by default, so that column
# looks redundant here — confirm it is intended.
final_model = LinearRegression()

final_model.fit(X_train, Y_train)

In [None]:
# Coefficients of the fitted linear model, labelled with the columns of the
# frame it was fitted on so each value can be matched to its feature.
# (The previous commented-out get_feature_importance call was removed:
# LinearRegression has no such method.)
pd.Series(final_model.coef_, index=X_train.columns, name="coef")

In [None]:
# Test-split R^2 of the final linear model.
r2_score(
    Y_test,
    final_model.predict(X_test)
)

**Error Group By Player**

In [None]:
# Store the final model's predictions for the training rows next to the
# original columns of df_train.
# NOTE(review): this mutates df_train in place; re-running the cell that builds
# X_train afterwards would keep "Prediction" as a feature — run cells in order.
df_train["Prediction"] = final_model.predict(X_train)

In [None]:
# Stack the white-side and black-side columns of df_train into one long frame
# with unified "Player" / "Elo" columns; the row's "Prediction" is carried
# along for both sides.
side_columns = (("White", "WhiteElo"), ("Black", "BlackElo"))

players = pd.concat([
    df_train[[name_col, elo_col, "Prediction"]].rename(
        columns={name_col: "Player", elo_col: "Elo"}
    )
    for name_col, elo_col in side_columns
])

In [None]:
# Keep only the players that occur in at least 5 rows of `players`.
rows_per_player = players["Player"].value_counts()
common_players = rows_per_player[rows_per_player >= 5].index

is_common = players["Player"].isin(common_players)
players_filtered = players[is_common]

In [None]:
# Per-player mean of the "Elo" and "Prediction" columns.
# NOTE(review): `aaa` is a placeholder name that is read by the following cell;
# rename both together.
aaa = players_filtered.groupby("Player", as_index=False)[["Elo", "Prediction"]].mean()

In [None]:
# R^2 between each frequent player's mean "Elo" and mean prediction.
# NOTE(review): the predictions were made on X_train, the data the final model
# was fitted on, so this is an in-sample score.
r2_score(
    aaa["Elo"],
    aaa["Prediction"]
)