In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, StratifiedKFold

from catboost import CatBoostClassifier, cv, Pool

from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

# Single seed reused for the CV fold shuffling and for CatBoost, so reruns are repeatable.
SEED = 1660

## Загрузим данные

In [None]:
# Read the prepared train and test tables from parquet.
df_train, df_test = (
    pd.read_parquet("03_datasets/final_train.parquet"),
    pd.read_parquet("03_datasets/final_test.parquet"),
)

In [None]:
# Features are every column except GameId and the target; the target is the Elo column.
X_train, Y_train = df_train.drop(columns=["GameId", "Elo"]), df_train["Elo"]
X_test, Y_test = df_test.drop(columns=["GameId", "Elo"]), df_test["Elo"]

In [None]:
# Histogram of the raw training target, before it is split into classes.
fig = px.histogram(Y_train)
fig.update_layout(
    dict(
        template="plotly_dark",
        showlegend=False,
        title_text="Распределение ELO-рейтинга",
    )
)
fig.show()

## Разбиваем на классы

In [None]:
# Disabled: optionally clip extreme ratings to [600, 2600] before binning.
# Y_train = Y_train.clip(600, 2600)
# Y_test = Y_test.clip(600, 2600)

In [None]:
# Class boundaries are the quintile edges of the training target (min, 20%, ..., max).
quantile_levels = [0.0, 0.20, 0.40, 0.60, 0.80, 1.00]
bins = np.quantile(Y_train, quantile_levels)
bins

In [None]:
# Convert raw Elo into ordinal class ids (0 = lowest bin) using the train-derived edges.
#
# The edges span only the train min..max. A test rating outside that range would come
# back from pd.cut as NaN, and the int cast below would then raise. Clamp to the outer
# edges first so such ratings land in the first/last class. For the training target
# this is a no-op, because its own min/max define those edges.
lowest_edge, highest_edge = bins[0], bins[-1]

Y_train_cat = pd.cut(
    Y_train.clip(lowest_edge, highest_edge), bins, include_lowest=True, labels=False
)
Y_test_cat = pd.cut(
    Y_test.clip(lowest_edge, highest_edge), bins, include_lowest=True, labels=False
)

Y_train_cat = Y_train_cat.astype(int)
Y_test_cat = Y_test_cat.astype(int)

In [None]:
# Same histogram of the training target, coloured by the class each rating was assigned to.
class_colors = {0: "darkred", 1: "red", 2: "orange", 3: "yellow", 4: "lime", 5: "green"}

fig = px.histogram(Y_train, color=Y_train_cat, color_discrete_map=class_colors)
fig.update_layout(
    dict(
        template="plotly_dark",
        showlegend=False,
        title_text="Распределение ELO-рейтинга",
    )
)
fig.show()

In [None]:
# Share of each class in the training set, in percent, ordered by class id.
Y_train_cat.value_counts(normalize=True).sort_index() * 100

## Модели

In [None]:
def plot_bars(x, title):
    """Render a grouped bar chart of ``x`` with the notebook's black/green styling.

    Args:
        x: Data passed straight to ``px.bar``. In this notebook it is a DataFrame
            with a "Train" and a "Test" column, one row per CV fold.
        title: Figure title.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    fig = px.bar(x)

    # Style every trace that exists rather than indexing fig.data[0] / fig.data[1]:
    # a single series no longer raises IndexError, and additional series also get
    # a colour (cycling through the palette) and a borderless marker.
    palette = ("black", "green")
    for position, trace in enumerate(fig.data):
        trace.marker.color = palette[position % len(palette)]
        trace.marker.line.width = 0

    fig.update_layout(
        barmode="group",
        bargroupgap=0.0,
        template="plotly_white",
        xaxis_title="Итерация",
        yaxis_title="Метрика",
        title_text=title
    )

    fig.show()

In [None]:
def get_crossval_report(model, X_train, Y_train, n_splits=5):
    """Cross-validate ``model`` and plot per-fold train/test accuracy in percent.

    Args:
        model: Estimator handed to ``cross_validate``.
        X_train: Feature matrix.
        Y_train: Class labels; the folds are stratified on them.
        n_splits: Number of stratified folds. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        None. A bar chart titled "Accuracy" is shown through ``plot_bars``.
    """
    # Shuffled, seeded folds so repeated runs compare models on identical splits.
    folds = StratifiedKFold(n_splits=n_splits, random_state=SEED, shuffle=True)

    cv_scores = cross_validate(
        model,
        X_train, Y_train,
        cv=folds,
        scoring=["accuracy"],
        return_train_score=True
    )

    # Named for what it holds: these are accuracy values, not R^2.
    accuracy_scores = pd.DataFrame({
        "Train": cv_scores["train_accuracy"],
        "Test": cv_scores["test_accuracy"]
    })

    plot_bars(accuracy_scores * 100, title="Accuracy")

**Встроенная поддержка множества классов**

In [None]:
# Small, shallow CatBoost model: 100 iterations of depth-2 trees, seeded and silent.
catboost_params = dict(iterations=100, max_depth=2, random_seed=SEED, verbose=0)
catboost_model = CatBoostClassifier(**catboost_params)

In [None]:
# Cross-validate CatBoost with the class labels passed to it directly (no OvR/OvO wrapper).
get_crossval_report(model=catboost_model, X_train=X_train, Y_train=Y_train_cat)

**OneVsRest**

In [None]:
# linear_ovr = OneVsRestClassifier(LogisticRegression(max_iter=1000, random_state=SEED))
# get_crossval_report(linear_ovr, X_train, Y_train_cat)

# Left disabled: very slow to run, and it emits ConvergenceWarning.

In [None]:
# Wrap the same CatBoost configuration in a one-vs-rest scheme.
catboost_ovr = OneVsRestClassifier(estimator=catboost_model)

In [None]:
# Cross-validate the one-vs-rest wrapper on the same folds.
get_crossval_report(model=catboost_ovr, X_train=X_train, Y_train=Y_train_cat)

In [None]:
# Fit on the full training set so the per-class estimators can be inspected below.
catboost_ovr.fit(X=X_train, y=Y_train_cat)

In [None]:
# Feature importances of the third fitted one-vs-rest estimator (index 2).
third_ovr_estimator = catboost_ovr.estimators_[2]
third_ovr_estimator.get_feature_importance(prettified=True)

**OneVsOne**

In [None]:
# linear_ovo = OneVsOneClassifier(LogisticRegression(max_iter=1000, random_state=SEED))
# get_crossval_report(linear_ovo, X_train, Y_train_cat)

# Left disabled: very slow to run, and it emits ConvergenceWarning.

In [None]:
# Wrap the same CatBoost configuration in a one-vs-one scheme.
catboost_ovo = OneVsOneClassifier(estimator=catboost_model)

In [None]:
# Cross-validate the one-vs-one wrapper on the same folds.
get_crossval_report(model=catboost_ovo, X_train=X_train, Y_train=Y_train_cat)

## Финальная модель

In [None]:
# Final model: a fresh one-vs-rest CatBoost wrapper fitted on the full training set.
model_final = OneVsRestClassifier(catboost_model)
model_final = model_final.fit(X_train, Y_train_cat)

**Метрики**

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Predicted class ids for the held-out test set.
cat_prediction = model_final.predict(X=X_test)

In [None]:
# Test-set accuracy, in percent.
100 * accuracy_score(y_true=Y_test_cat, y_pred=cat_prediction)

In [None]:
# Heatmap of the test-set confusion matrix.
# sklearn's confusion_matrix puts true classes on rows and predicted classes on columns.
# The axes are labelled explicitly because an unlabelled heatmap leaves that ambiguous.
fig = px.imshow(
    confusion_matrix(Y_test_cat, cat_prediction),
    labels={"x": "Предсказанный класс", "y": "Истинный класс", "color": "Количество"},
    color_continuous_scale=["white", "cyan", "blue", "darkblue"],
    template="plotly_dark"
)

fig.show()

**Важность признаков**

In [None]:
# Collect one feature-importance table per one-vs-rest binary model, tagged with the
# class that model separates from the rest.
# Pairing classes_ with estimators_ replaces the hard-coded range(5), so this keeps
# working (and keeps the labels right) if the number of classes changes.
fi = pd.concat([
    estimator.get_feature_importance(prettified=True).assign(cat=class_label)
    for class_label, estimator in zip(model_final.classes_, model_final.estimators_)
])

# Reshape to one row per class and one column per feature.
fi = fi.pivot(index="cat", columns="Feature Id", values="Importances")

In [None]:
# Importance table: one row per class ("cat"), one column per feature.
fi

In [None]:
# Take the 10 features with the highest importance averaged over the per-class models,
# then draw one line per feature showing its importance across classes.
important_features = fi.mean().nlargest(10).index

fig = px.line(fi[important_features], template="plotly_dark")
fig.update_traces(mode="lines+markers")
fig