In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from src.preprocess_data import (
    create_df,
    process_train_df,
)

### Overview

Fit a quick Random Forest model to establish a reasonable performance baseline.

We will not take the ordered nature of the outcome into account here.

### Set-up data

In [2]:
# Load the games table and derive the integer class label used as the target:
# p1_outcome = 1 + 2 * p1_score, truncated to int.
# NOTE(review): presumably p1_score is 0 / 0.5 / 1, giving classes 1 / 2 / 3 — confirm.
df = create_df("../data/games.csv")
df["p1_outcome"] = df["p1_score"].mul(2).add(1).astype(int)

We will also use the players' titles as features.

Per Wikipedia: _The WGM title is ranked lower than that of International Master (IM) but higher than that of FIDE Master (FM)._ WGM is therefore encoded as 2.5, between FM (2) and IM (3).

In [3]:
# Numeric encoding of player titles (larger value = higher-ranked title).
# WGM gets the fractional value 2.5 so that it sits between FM and IM.
title_dict = {
    "GM": 4,
    "IM": 3,
    "FM": 2,
    "WGM": 2.5,
}

In [4]:
# Encode each player's title through title_dict. Direct indexing is kept on
# purpose: a title that is not a key of title_dict raises KeyError instead of
# silently becoming NaN.
for side in ("p1", "p2"):
    df[f"{side}_title_numeric"] = df[f"{side}_title"].apply(
        lambda title: title_dict[title]
    )

# Rating gap seen from player 1: positive when p1_elo exceeds p2_elo.
# Computed on the underlying arrays, i.e. row by row in positional order.
p1_minus_p2 = df["p1_elo"].values - df["p2_elo"].values
df["elo_diff"] = p1_minus_p2

### Set-up training

In [5]:
# Chronological hold-out: rows dated before the cutoff are used for fitting,
# rows dated on or after it for evaluation. Both comparisons are written out
# explicitly, so a row whose date compares False on both sides (e.g. a missing
# date) lands in neither subset.
split_date = "2019-01-01"
train = df.loc[df["date"] < split_date]
test = df.loc[df["date"] >= split_date]

In [6]:
# Feature columns fed to the model, in the order the estimator sees them
# (feature_importances_ follows this order).
cols = [
    "elo_diff",
    "p1_white",
    "p1_title_numeric",
    "p2_title_numeric",
]

In [7]:
# Sanity check: show the first rows of the training feature matrix.
train.loc[:, cols].head()

Unnamed: 0,elo_diff,p1_white,p1_title_numeric,p2_title_numeric
435,-572.0,False,2.0,4.0
436,415.0,False,4.0,3.0
437,401.0,True,4.0,4.0
439,-353.0,False,3.0,4.0
440,-342.0,True,3.0,4.0


In [8]:
# Hyper-parameter grid for the random forest:
# 3 depths x 4 forest sizes x 2 split criteria x 3 leaf sizes = 72 candidates.
rf_params = dict(
    max_depth=[2, 4, 6],
    n_estimators=[25, 50, 75, 100],
    criterion=["gini", "entropy"],
    min_samples_leaf=[2, 6, 12],
)

In [9]:
# Exhaustive 10-fold cross-validated search over rf_params, using every
# available core (n_jobs=-1).
#
# A random forest is stochastic (bootstrap resampling and random feature
# sub-sampling), so random_state is pinned here. Without it the selected
# hyper-parameters, the feature importances and the accuracies reported further
# down can differ on every Restart & Run All.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=rf_params,
    cv=10,
    n_jobs=-1,
)
grid_search.fit(train[cols].values, train["p1_outcome"].values)

# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 4,
 'min_samples_leaf': 6,
 'n_estimators': 100}

In [10]:
# Take the winning model from the grid search. GridSearchCV was built without a
# `refit` argument, so (per its documented default, refit=True) this estimator
# has been refit on the whole training set with the best parameters.
clf = grid_search.best_estimator_

# Relative importance of each feature, in the same order as `cols`
# (the model was fitted on train[cols].values).
clf.feature_importances_

array([0.75272061, 0.14967283, 0.06006922, 0.03753733])

### Predict

In [11]:
# Class probabilities per game. Column j of predict_proba corresponds to
# clf.classes_[j], so the most likely column is mapped back through
# clf.classes_ instead of assuming the labels are exactly 1, 2, 3
# ("argmax + 1"). That shortcut would silently mislabel every prediction if a
# class were missing from the training data; with all three classes present
# the result is unchanged.
ypred_test = clf.predict_proba(test[cols].values)
ypred_test_cls = clf.classes_[np.argmax(ypred_test, axis=1)]

ypred_train = clf.predict_proba(train[cols].values)
ypred_train_cls = clf.classes_[np.argmax(ypred_train, axis=1)]

Let's look at how often the model predicted the exact outcome, and how often it missed by one or two classes:

In [12]:
def _share_at_distance(y_true, y_pred, distance):
    """Return the fraction of predictions exactly `distance` classes from the truth.

    Parameters
    ----------
    y_true : array-like of numeric class labels (observed outcomes).
    y_pred : array-like of numeric class labels, same length as `y_true`.
    distance : int
        Absolute gap between predicted and observed class to count
        (0 = exact hit, 1 = off by one class, 2 = off by two classes).

    Returns
    -------
    float
        Mean of the boolean mask ``|y_true - y_pred| == distance``.
    """
    return np.mean(np.abs(y_true - y_pred) == distance)


# Observed labels as plain arrays, extracted once per split.
_y_test = test["p1_outcome"].values
_y_train = train["p1_outcome"].values

# Hold-out set: share of exact hits and of misses by one / two classes.
test_diff_0_accuracy = _share_at_distance(_y_test, ypred_test_cls, 0)
test_diff_1_accuracy = _share_at_distance(_y_test, ypred_test_cls, 1)
test_diff_2_accuracy = _share_at_distance(_y_test, ypred_test_cls, 2)

# Training set: same three quantities, to gauge over-fitting.
train_diff_0_accuracy = _share_at_distance(_y_train, ypred_train_cls, 0)
train_diff_1_accuracy = _share_at_distance(_y_train, ypred_train_cls, 1)
train_diff_2_accuracy = _share_at_distance(_y_train, ypred_train_cls, 2)

In [13]:
# One summary line per split: share of exact predictions and share of
# predictions that are off by a single class.
train_summary = f"Train - exact class: {train_diff_0_accuracy:.4f}, miss by 1: {train_diff_1_accuracy:.4f}"
test_summary = f"Test - exact class: {test_diff_0_accuracy:.4f}, miss by 1: {test_diff_1_accuracy:.4f}"

print(train_summary)
print(test_summary)

Train - exact class: 0.5578, miss by 1: 0.4363
Test - exact class: 0.5497, miss by 1: 0.4411


Performance is actually similar to the Bayesian model (and with none of the fun).