# Modelling and Results

## Imports

In [None]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import chess
import io
import sys
# import chess.pgn
from io import StringIO

import plotly.express as px
import plotly.graph_objects as go

import joblib
from tempfile import mkdtemp
import re

from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from tempfile import mkdtemp
from sklearn.pipeline import Pipeline
# from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix,\
                            make_scorer, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [None]:
import detecting_cheaters_in_chess_helpers as hp

In [None]:
import warnings
# Suppress every warning category for the rest of the kernel session so that
# library warnings do not clutter the cell outputs.
# NOTE(review): this blanket filter also hides deprecation and convergence
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# plt.style.use('seaborn')

----

## Introduction

For this project, the classification models selected were logistic regression, K-nearest neighbours (for the first two rounds of modelling), decision tree, random forest and XGBoost.

Accuracy was selected as the overall evaluation metric. Future work should investigate prioritising the $F_{1}$ score (the harmonic mean of precision and recall), because a balance must be found between minimising the number of cheaters who go undetected and the number of non-cheaters who are wrongly flagged as cheaters.

-----

## First Pass

### Loading Data

In [None]:
big_df = joblib.load('./data/preprocessed/2022_2018_CvH.pkl')

In [None]:
# Split the loaded frame into features and target using the project helper.
X, y = hp.X_y_split_simple(big_df)

# Re-encode the target through the helper (presumably to integer class labels,
# going by its name — TODO confirm in the helpers module).
y = hp.y_convert_to_ints(y)

# Exclude the 'emt' and 'moves' columns from the feature matrix for this pass.
X_ = X.drop(columns=['emt', 'moves'])

In [None]:
# Stratified 80/20 hold-out split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_, y,
    test_size=0.2,
    stratify=y,
    random_state=22,
)

In [None]:
y_train_, y_test_ = hp.flat_y(y_train, y_test)

In [None]:
# One-hot encoder for the 'ECO' opening-code column.
# - dtype is passed as the NumPy type itself (np.int8); the previous `np.int8()`
#   handed over a scalar *instance* (value 0) where a dtype is expected.
# - handle_unknown='ignore' encodes categories that were not seen during fit as
#   all zeros instead of raising at transform time.
ohe = OneHotEncoder(sparse=False, dtype=np.int8, handle_unknown='ignore')

In [None]:
# Swap the raw 'ECO' column of the training split for its one-hot encoding.
# The index is reset first so that it lines up with the fresh RangeIndex of the
# encoded frame; this call also fits the encoder.
X_train_ = (
    X_train
    .drop(columns=['ECO'])
    .reset_index(drop=True)
    .merge(
        pd.DataFrame(ohe.fit_transform(X_train[['ECO']]),
                     columns=ohe.get_feature_names_out()),
        how='left',
        left_index=True,
        right_index=True)
)

In [None]:
# Same replacement for the test split, re-using the encoder fitted on the
# training split (transform only, no refit).
X_test_ = (
    X_test
    .drop(columns=['ECO'])
    .reset_index(drop=True)
    .merge(
        pd.DataFrame(ohe.transform(X_test[['ECO']]),
                     columns=ohe.get_feature_names_out()),
        how='left',
        left_index=True,
        right_index=True)
)

In [None]:
stan_scal = StandardScaler()

In [None]:
# Columns that are standardised; every other column is copied over unchanged.
first_pass_scale_cols = ['BlackElo', 'BlackRD', 'PlyCount', 'WhiteElo',
                         'WhiteRD', 'TimeControl_Base', 'TimeControl_Inc']

# Fit the scaler on the training split only ...
X_train_scaled = X_train_.copy()
X_train_scaled[first_pass_scale_cols] = stan_scal.fit_transform(
    X_train_[first_pass_scale_cols])

# ... and apply the fitted statistics to the test split.
X_test_scaled = X_test_.copy()
X_test_scaled[first_pass_scale_cols] = stan_scal.transform(
    X_test_[first_pass_scale_cols])



----

### Logistic Regression

In [None]:
# logreg = hp.train_default_logreg_model(X_train_scaled, y_train_)

In [None]:
logreg = joblib.load('data/pickled_models/first/first_logreg.pkl')

In [None]:
logreg_report, logreg_results, logreg_coefficients = hp.class_model_eval_logreg(
    logreg, X_train_scaled, X_test_scaled, y_train_, y_test_)

In [None]:
# Absolute coefficient magnitudes, one column per input feature.
log_reg_coefficients = pd.DataFrame(
    data=np.abs(logreg.coef_),
    columns=logreg.feature_names_in_)

# Display the 15 features with the largest absolute weight.
(log_reg_coefficients
 .T
 .rename(columns={0: 'coefficient_weights'})
 .sort_values(by='coefficient_weights', ascending=False)
 .head(15))

-----

### K-Nearest Neighbour

In [None]:
knn = hp.train_default_knn_model(X_train_scaled, y_train_)

In [None]:
# knn = joblib.load('./data/pickled_models/first_knn.pkl')

In [None]:
# knn_report, knn_model_results = \
#     hp.class_model_eval_logreg(
#         knn, X_train_scaled, X_test_scaled, y_train_, y_test_,
#         has_coeffs=False)

In [None]:
knn_results = joblib.load('./data/pickled_models/first/first_knn_report-results.pkl')

In [None]:
print(knn_results[0])

In [None]:
# Plot the confusion matrix of the KNN model on the scaled test split.
# NOTE(review): `plot_confusion_matrix` is deprecated as of scikit-learn 1.0 and
# removed in 1.2; `ConfusionMatrixDisplay.from_estimator` is the documented
# replacement — confirm the pinned scikit-learn version before upgrading.
conf_matr = plot_confusion_matrix(knn, X_test_scaled, y_test_)
plt.show()

----

### Decision Tree

In [None]:
dt = hp.train_default_dt_model(X_train_, y_train_)

In [None]:
# dt = joblib.load('data/pickled_models/first/first_dt.pkl')

In [None]:
dt_report, dt_model_results = \
    hp.class_model_eval_logreg(
        dt, X_train_, X_test_, y_train_, y_test_, has_coeffs=False)

-----

### Random Forest

In [None]:
random_forest = hp.train_default_rf_model(X_train_, y_train_)

In [None]:
# random_forest = joblib.load('data/pickled_models/first/first_randfor.pkl')

In [None]:
random_forest_report, random_forest_model_results = \
    hp.class_model_eval_logreg(
        random_forest, X_train_, X_test_, y_train_, y_test_, has_coeffs=False)

In [None]:
# decision_tree_train_scores = []
# for sub_tree in random_forest.estimators_:
#     decision_tree_train_scores.append(sub_tree.score(X_train_, y_train))
    
# print("Performance on fitted data:")
# print(f"Average Decision Tree: {np.mean(decision_tree_train_scores)}")
# print(f"Random Forest: {random_forest.score(X_train_, y_train)}")

In [None]:
# decision_tree_test_scores = []
# for sub_tree in random_forest.estimators_:
#     decision_tree_test_scores.append(sub_tree.score(X_test_, y_test))

# print("Performance on Test data:")
# print(f"Average Decision Tree: {np.mean(decision_tree_test_scores)}")
# print(f"Random Forest: {random_forest.score(X_test_, y_test)}")

In [None]:
hp.rf_trees_and_forest_strength(random_forest, X_train_, X_test_, y_train_, y_test_);

In [None]:
hp.tree_feature_importance(random_forest, X_train_)

----

### XGBoost

In [None]:
XGB = hp.train_default_xgboost_model(X_train_, y_train_)

In [None]:
xgb_report, xgb_model_results = hp.class_model_eval_logreg(XGB, X_train_, X_test_, y_train_, y_test_,
                                                          has_coeffs=False)

-----

## Second Pass

All of the first-pass models were suspiciously accurate, and the elo of each player was consistently one of the most important features. Games in which humans had played against each other were joined with the dataframe to see whether this had an impact on the model accuracy.

### Loading Data

In [None]:
X_CvH_HvH = joblib.load('./data/preprocessed/X_CvH_HvH.pkl')
y_CvH_HvH = joblib.load('./data/preprocessed/y_CvH_HvH.pkl')

In [None]:
X_CvH_HvH_train, X_CvH_HvH_test, y_CvH_HvH_train, y_CvH_HvH_test = train_test_split(
    X_CvH_HvH, y_CvH_HvH, test_size=0.2, random_state=22, stratify=y_CvH_HvH)

In [None]:
y_CvH_HvH_train_, y_CvH_HvH_test_ = hp.flat_y(y_CvH_HvH_train, y_CvH_HvH_test)

In [None]:
X_CvH_HvH_train_, X_CvH_HvH_test_, ohe_CvH_HvH = hp.OHE_ECO(X_CvH_HvH_train, X_CvH_HvH_test)

In [None]:
X_CvH_HvH_train_scaled=X_CvH_HvH_train_.copy()
X_CvH_HvH_test_scaled=X_CvH_HvH_test_.copy()

In [None]:
# Replace the 'na' placeholder in the rating-deviation columns with 0 and make
# both columns explicitly numeric. This is done column-wise in pandas instead
# of with a nested Python loop over every cell, and the explicit float cast
# means the result no longer depends on how NumPy coerces a mixed list.
X_CvH_HvH_train_scaled[['BlackRD', 'WhiteRD']] = (
    X_CvH_HvH_train_scaled[['BlackRD', 'WhiteRD']]
    .replace('na', 0)
    .astype(float)
)

In [None]:
# Replace the 'na' placeholder in the rating-deviation columns of the test
# split with 0 and make both columns explicitly numeric. This is done
# column-wise in pandas instead of with a nested Python loop over every cell.
X_CvH_HvH_test_scaled[['BlackRD', 'WhiteRD']] = (
    X_CvH_HvH_test_scaled[['BlackRD', 'WhiteRD']]
    .replace('na', 0)
    .astype(float)
)

In [None]:
stan_scal_CvH_HvH = StandardScaler()

In [None]:
# Columns that are standardised in place; all other columns stay as they are.
second_pass_scale_cols = ['BlackElo', 'BlackRD', 'PlyCount', 'WhiteElo',
                          'WhiteRD', 'TimeControl_Base', 'TimeControl_Inc']

# Fit on the training split only ...
X_CvH_HvH_train_scaled[second_pass_scale_cols] = stan_scal_CvH_HvH.fit_transform(
    X_CvH_HvH_train_scaled[second_pass_scale_cols])

# ... then apply the fitted statistics to the test split.
X_CvH_HvH_test_scaled[second_pass_scale_cols] = stan_scal_CvH_HvH.transform(
    X_CvH_HvH_test_scaled[second_pass_scale_cols])



----

### Logistic Regression

In [None]:
# takes a while to run - not recommended
# # logreg_CvH_HvH = hp.train_default_logreg_model(X_CvH_HvH_train_scaled, y_CvH_HvH_train_)

In [None]:
logreg_CvH_HvH = joblib.load('data/pickled_models/first/first_logreg_CvH_HvH.pkl')

In [None]:
logreg_CvH_HvH_report, logreg_CvH_HvH_results, logreg_CvH_HvH_coefficients = hp.class_model_eval_logreg(
    logreg_CvH_HvH, X_CvH_HvH_train_scaled,
    X_CvH_HvH_test_scaled, y_CvH_HvH_train_,
    y_CvH_HvH_test_)

The accuracy is still suspiciously high. The distribution of elos was investigated, and it became clear that the computers had a much higher elo on average than their human counterparts. For the next round of modelling, all elos were removed to see whether this would affect the accuracy of the models.

----

## Third Pass

### CvH

#### Data Prep

In [None]:
big_df = joblib.load('./data/preprocessed/2022_2018_CvH.pkl')

In [None]:
X, y = hp.X_y_split_simple(big_df)

y = hp.y_convert_to_ints(y)

In [None]:
X_ = hp.drop_emt_moves(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.2, random_state=22,
                                                                     stratify=y)

In [None]:
y_train_, y_test_ = hp.flat_y(y_train, y_test)

In [None]:
X_train_, X_test_, ohe_ = hp.OHE_ECO(X_train, X_test)

In [None]:
X_train_scaled, X_test_scaled, stan_scaler = hp.stan_scale(X_train_, X_test_, [
    'BlackElo', 'BlackRD', 'PlyCount', 'WhiteElo', 'WhiteRD', 'TimeControl_Base', 'TimeControl_Inc'
])

In [None]:
X_train_scaled=X_train_scaled.drop(columns=['BlackElo', 'WhiteElo'])
X_test_scaled=X_test_scaled.drop(columns=['BlackElo', 'WhiteElo'])

In [None]:
X_train_=X_train_.drop(columns=['BlackElo', 'WhiteElo'])
X_test_=X_test_.drop(columns=['BlackElo', 'WhiteElo'])

----

#### Logistic Regression

In [None]:
# noelo_logreg_CvH = hp.train_default_logreg_model(X_train_scaled, y_train_)

In [None]:
# Load the pickled logistic regression rather than retraining it.
# NOTE(review): the other third-pass CvH artefacts in this notebook are loaded
# from './data/pickled_models/noelo_CvH/'; confirm this file really lives one
# directory higher.
noelo_logreg_CvH = joblib.load('./data/pickled_models/noelo_CvH_logreg.pkl')

In [None]:
noelo_logreg_report, noelo_logreg_results, noelo_logreg_coefficients = hp.class_model_eval_logreg(
    noelo_logreg_CvH, X_train_scaled, X_test_scaled, y_train_, y_test_)

In [None]:
top_coeffs = hp.binary_logreg_ranked_coefs(noelo_logreg_CvH)

The accuracy has gone down, as expected. 

-----

#### K-Nearest Neighbour

In [None]:
# noelo_knn_CvH = hp.train_default_knn_model(X_train_scaled, y_train_)

In [None]:
noelo_knn_CvH = joblib.load('./data/pickled_models/noelo_CvH/noelo_CvH_knn.pkl')

In [None]:
# noelo_knn_report, noelo_knn_results = hp.class_model_eval_logreg(noelo_knn_CvH,
#                                                                  X_train_scaled, X_test_scaled, y_train_, y_test_,
#                                                                  has_coeffs=False)

In [None]:
noelo_knn_report, noelo_knn_results = joblib.load('./data/pickled_models/noelo_CvH/noelo_CvH_knn_report-results.pkl')

![title](./data/plots/noelo_CvH_knn_conf_mat.png)

In [None]:
print(noelo_knn_report)

display(noelo_knn_results)

-----

#### Decision Tree

In [None]:
# noelo_dt_CvH = hp.train_default_dt_model(X_train_scaled, y_train_,
#                                      path='./data/pickled_models/', name='noelo_CvH_scaled')

In [None]:
# noelo_dt_CvH = hp.train_default_dt_model(X_train_, y_train_,
#                                      path='./data/pickled_models/', name='noelo_CvH_unscaled')

In [None]:
noelo_dt_CvH = joblib.load('./data/pickled_models/noelo_CvH/noelo_CvH_scaled_dt.pkl')

In [None]:
noelo_dt_CvH_unscaled = joblib.load('./data/pickled_models/noelo_CvH/noelo_CvH_unscaled_dt.pkl')

In [None]:
noelo_dt_report, noelo_dt_model_results = \
    hp.class_model_eval_logreg(
        noelo_dt_CvH, X_train_scaled, X_test_scaled, y_train_, y_test_, has_coeffs=False)

In [None]:
noelo_dt_unscaled_report, noelo_dt_unscaled_model_results = \
    hp.class_model_eval_logreg(
        noelo_dt_CvH_unscaled, X_train_, X_test_, y_train_, y_test_, has_coeffs=False)

In [None]:
feature_df_dt = hp.tree_feature_importance(noelo_dt_CvH_unscaled, X_train_)
feature_df_dt

----

#### Random Forest

In [None]:
# noelo_rf_CvH_unscaled = train_default_rf_model(X_train_, y_train_,
#                                      path='./data/pickled_models/', name='noelo_CvH_unscaled')

In [None]:
# noelo_rf_CvH = hp.train_default_rf_model(X_train_scaled, y_train_,
#                                      path='./data/pickled_models/', name='noelo_CvH_scaled')

In [None]:
# noelo_rf_CvH_unscaled = hp.train_default_rf_model(X_train_, y_train_,
#                                      path='./data/pickled_models/', name='noelo_CvH_unscaled')

In [None]:
# noelo_rf_CvH_scaled = joblib.load('./data/pickled_models/noelo_CvH_scaled_rf.pkl')

In [None]:
noelo_rf_CvH_unscaled = joblib.load('./data/pickled_models/noelo_CvH/noelo_CvH_unscaled_rf.pkl')

In [None]:
noelo_rf_report, noelo_rf_model_results = \
    hp.class_model_eval_logreg(
        noelo_rf_CvH_unscaled, X_train_, X_test_, y_train_, y_test_, has_coeffs=False)

In [None]:
hp.rf_trees_and_forest_strength(noelo_rf_CvH_unscaled, X_train_, X_test_, y_train_, y_test_);

In [None]:
test_feature_df_rf = hp.tree_feature_importance(noelo_rf_CvH_unscaled, X_train_)
test_feature_df_rf

----

#### XGBoost

In [None]:
# noelo_xgb_CvH_unscaled = hp.train_default_xgboost_model(X_train_, y_train_)

In [None]:
noelo_xgb_CvH_unscaled = joblib.load('./data/pickled_models/noelo_CvH/noelo_CvH_unscaled_xgb.pkl')

In [None]:
# xgb_report, xgb_model_results = hp.class_model_eval_logreg(XGB, X_train_, X_test_, y_train_, y_test_,
#                                                           has_coeffs=False)

In [None]:
noelo_xgb_report, noelo_xgb_model_results = \
    hp.class_model_eval_logreg(
        noelo_xgb_CvH_unscaled, X_train_, X_test_, y_train_, y_test_, has_coeffs=False)

In [None]:
test_feature_df_xgb = hp.tree_feature_importance(noelo_xgb_CvH_unscaled, X_train_)
test_feature_df_xgb

----

### CvH and HvH

#### Data Prep

In [None]:
X_CvH_HvH = joblib.load('./data/preprocessed/X_CvH_HvH.pkl')
y_CvH_HvH = joblib.load('./data/preprocessed/y_CvH_HvH.pkl')

In [None]:
X_CvH_HvH_train, X_CvH_HvH_test, y_CvH_HvH_train, y_CvH_HvH_test = train_test_split(
    X_CvH_HvH, y_CvH_HvH, test_size=0.2, random_state=22, stratify=y_CvH_HvH)

In [None]:
y_CvH_HvH_train_, y_CvH_HvH_test_ = hp.flat_y(y_CvH_HvH_train, y_CvH_HvH_test)

In [None]:
X_CvH_HvH_train_, X_CvH_HvH_test_, ohe_CvH_HvH = hp.OHE_ECO(X_CvH_HvH_train, X_CvH_HvH_test)

In [None]:
X_CvH_HvH_train_ = hp.drop_uneeded_cols(X_CvH_HvH_train_, ['BlackElo', 'WhiteElo'])
X_CvH_HvH_test_ = hp.drop_uneeded_cols(X_CvH_HvH_test_, ['BlackElo', 'WhiteElo'])

In [None]:
# Sanity check: True if any training column contains a missing value.
bool(X_CvH_HvH_train_.isna().any().any()) # Should be False

In [None]:
# Replace the 'na' placeholder with 0 and cast every other rating-deviation
# value to float16, first for the training split and then for the test split.
# NOTE(review): float16 keeps only about three significant decimal digits, so
# the RD values are rounded here — confirm that this is acceptable.
X_CvH_HvH_train_[['BlackRD', 'WhiteRD']] = np.array([
    [np.int8(0) if y=='na' else np.float16(y) for y in x] for x in X_CvH_HvH_train_[['BlackRD', 'WhiteRD']].values
    ]) # 402 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

X_CvH_HvH_test_[['BlackRD', 'WhiteRD']] = np.array([
    [np.int8(0) if y=='na' else np.float16(y) for y in x] for x in X_CvH_HvH_test_[['BlackRD', 'WhiteRD']].values
    ]) 

In [None]:
# Per RD column: is any value something other than a built-in float?
# Expected output for both columns: False
for rd_col in ('BlackRD', 'WhiteRD'):
    display(any(type(val) != float for val in X_CvH_HvH_train_[rd_col]))

In [None]:
X_CvH_HvH_train_scaled, X_CvH_HvH_test_scaled, stan_scal_  = hp.stan_scale(X_CvH_HvH_train_,
                                                                           X_CvH_HvH_test_,
                                                                          ['BlackRD', 'PlyCount', 'WhiteRD',
                                                                          'TimeControl_Base', 'TimeControl_Inc'])

-----

#### Logistic Regression

In [None]:
# logreg_CvH_HvH_noelo = hp.train_default_logreg_model(X_CvH_HvH_train_scaled,
#                                                     y_CvH_HvH_train_)

In [None]:
logreg_CvH_HvH_noelo = joblib.load('./data/pickled_models/noelo_CvH_HvH/noelo_CvH_HvH_logreg.pkl')

In [None]:
noelo_logreg_CvH_HvH_report, noelo_logreg_CvH_HvH_results, noelo_logreg_CvH_HvH_coefficients =\
    hp.class_model_eval_logreg(
        logreg_CvH_HvH_noelo, X_CvH_HvH_train_scaled,
        X_CvH_HvH_test_scaled, y_CvH_HvH_train_,
        y_CvH_HvH_test_,
        num_class=3)

In [None]:
# noelo_logreg_CvH_HvH_results

In [None]:
noelo_logreg_CvH_HvH_coefficients

In [None]:
# For every coefficient column, show its 15 largest values.
for coef_col in noelo_logreg_CvH_HvH_coefficients.columns:
    ranked = noelo_logreg_CvH_HvH_coefficients[[coef_col]].sort_values(
        by=coef_col, ascending=False)
    display(ranked.head(15))

-----

#### K-Nearest Neighbours

In [None]:
# noelo_CvH_HvH_knn = hp.train_default_knn_model(X_CvH_HvH_train_scaled,
#                                                y_CvH_HvH_train_,
#                                                path='./data/pickled_models/noelo_CvH_HvH/',
#                                                name='noelo_CvH_HvH')

In [None]:
noelo_CvH_HvH_knn = joblib.load('./data/pickled_models/noelo_CvH_HvH/noelo_CvH_HvH_knn.pkl')

In [None]:
# noelo_CvH_HvH_knn_report, noelo_CvH_HvH_knn_results, conf_matr_knn = hp.class_model_eval_logreg(
#     noelo_CvH_HvH_knn,
#     X_CvH_HvH_train_scaled, X_CvH_HvH_test_scaled,
#     y_CvH_HvH_train_, y_CvH_HvH_test_,
#     has_coeffs=False,
#     is_knn_=True,
#     num_class=3)

In [None]:
# joblib.dump([noelo_CvH_HvH_knn_report, noelo_CvH_HvH_knn_results],
#             './data/pickled_models/noelo_CvH_HvH/noelo_CvH_HvH_knn_report-results.pkl',
#             compress=9)

In [None]:
# conf_matr_knn.figure_.savefig('./data/plots/noelo_CvH_HvH_knn_conf_mat.png',dpi=300)

In [None]:
noelo_CvH_HvH_knn_report, noelo_CvH_HvH_knn_results = joblib.load('./data/pickled_models/noelo_CvH_HvH/noelo_CvH_HvH_knn_report-results.pkl')

<img src='./data/plots/noelo_CvH_HvH_knn_conf_mat.png' alt='alt text' />

In [None]:
print(noelo_CvH_HvH_knn_report)

display(noelo_CvH_HvH_knn_results)

-----

#### Decision Tree

In [None]:
# noelo_CvH_HvH_dt = hp.train_default_dt_model(X_CvH_HvH_train_,
#                                              y_CvH_HvH_train_)

In [None]:
noelo_CvH_HvH_dt = joblib.load('./data/pickled_models/noelo_CvH_HvH/noelo_CvH_HvH_dt.pkl')

In [None]:
noelo_CvH_HvH_dt_report, noelo_CvH_HvH_dt_results = hp.class_model_eval_logreg(
    noelo_CvH_HvH_dt,
    X_CvH_HvH_train_, X_CvH_HvH_test_,
    y_CvH_HvH_train_, y_CvH_HvH_test_,
    has_coeffs=False,
    num_class=3)

In [None]:
hp.tree_feature_importance(noelo_CvH_HvH_dt, X_CvH_HvH_train_)

----

#### Random Forest

In [None]:
# noelo_CvH_HvH_rf = hp.train_default_rf_model(X_CvH_HvH_train_,
#                                              y_CvH_HvH_train_,
#                                              path='./data/pickled_models/noelo_CvH_HvH/',
#                                              name='noelo_CvH_HvH')

In [None]:
noelo_CvH_HvH_rf = joblib.load('./data/pickled_models/noelo_CvH_HvH/noelo_CvH_HvH_rf.pkl')

In [None]:
noelo_CvH_HvH_rf_report, noelo_CvH_HvH_rf_results = hp.class_model_eval_logreg(
    noelo_CvH_HvH_rf,
    X_CvH_HvH_train_, X_CvH_HvH_test_,
    y_CvH_HvH_train_, y_CvH_HvH_test_,
    has_coeffs=False,
    num_class=3)

In [None]:
hp.tree_feature_importance(noelo_CvH_HvH_rf, X_CvH_HvH_train_)

In [None]:
len(noelo_CvH_HvH_rf.estimators_) # 100

In [None]:
hp.rf_trees_and_forest_strength(noelo_CvH_HvH_rf,
                                X_CvH_HvH_train_, X_CvH_HvH_test_,
                                y_CvH_HvH_train_, y_CvH_HvH_test_)

-----

#### XGBoost

In [None]:
# noelo_CvH_HvH_xgb = hp.train_default_xgboost_model(X_CvH_HvH_train_,
#                                                    y_CvH_HvH_train_)

In [None]:
noelo_CvH_HvH_xgb = joblib.load('./data/pickled_models/noelo_CvH_HvH/noelo_CvH_HvH_xgb.pkl')

In [None]:
noelo_CvH_HvH_xgb_report, noelo_CvH_HvH_xgb_results = hp.class_model_eval_logreg(
    noelo_CvH_HvH_xgb,
    X_CvH_HvH_train_, X_CvH_HvH_test_,
    y_CvH_HvH_train_, y_CvH_HvH_test_,
    has_coeffs=False,
    num_class=3)

In [None]:
hp.tree_feature_importance(noelo_CvH_HvH_xgb, X_CvH_HvH_train_)

-------

## Modelling Using Engineered Features

### Loading

In [None]:
# Column names used for the engineered-feature modelling.
# The order matters: the scaling cell further down takes wanted_cols[:-4],
# i.e. everything except the last four entries
# ('ECO', 'WhiteIsComp', 'BlackIsComp', 'NoComp').
wanted_cols = [
    'WhiteElo', 'WhiteRD', 'BlackElo', 'BlackRD', 'white_played_perc_top_move', 'black_played_perc_top_move',
    'av_white_emt', 'av_black_emt', 'PlyCount', 'TimeControl_Base', 'TimeControl_Inc', 'ECO', 'WhiteIsComp', 
    'BlackIsComp', 'NoComp']

In [None]:
df_ffpm = joblib.load('data/cleaned/df_for_fourth_pass_modelling.pkl')

In [None]:
df_ffpm.head()

---

### No Cross-validation

#### X-y split, train-test split, flatten y, OHE, scale

In [None]:
X, y = hp.X_y_split_simple(df_ffpm)
y = hp.y_convert_to_ints(y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=22, stratify=y)

y_train, y_test = hp.flat_y(y_train, y_test)

In [None]:
X_train.head()

In [None]:
# One-hot encode 'ECO' through the project helper, overwriting X_train / X_test.
# NOTE(review): because the inputs are overwritten, re-running this cell on its
# own presumably fails once 'ECO' has been replaced — re-run the split cell first.
X_train, X_test, ohe_ = hp.OHE_ECO(X_train, X_test)

In [None]:
# X_train.shape # (22031, 440)
# X_test.shape # (5508, 440)

In [None]:
# wanted_cols[:-4]

# # ['WhiteElo',
# #  'WhiteRD',
# #  'BlackElo',
# #  'BlackRD',
# #  'white_played_perc_top_move',
# #  'black_played_perc_top_move',
# #  'av_white_emt',
# #  'av_black_emt',
# #  'PlyCount',
# #  'TimeControl_Base',
# #  'TimeControl_Inc']

In [None]:
# Standardise only the first eleven entries of wanted_cols; the [:-4] slice
# leaves out 'ECO', 'WhiteIsComp', 'BlackIsComp' and 'NoComp'.
X_train_scaled, X_test_scaled, stan_scal_ = hp.stan_scale(X_train, X_test, list_of_cols=wanted_cols[:-4])

In [None]:
X_train_scaled.head()

-----

#### Logistic Regression

In [None]:
# fp_logreg = hp.train_default_logreg_model(X_train_scaled, y_train,
#                                           path='E:/data/capstone/pickled_models/fourth_pass/',
#                                           name='fp')

In [None]:
fp_logreg = joblib.load('data/pickled_models/fourth_pass/fp_logreg.pkl')

In [None]:
fp_logreg_report, fp_logreg_results, fp_logreg_coefficients =\
    hp.class_model_eval_logreg(
        fp_logreg, X_train_scaled, X_test_scaled, y_train, y_test,
        num_class=3)

In [None]:
# fp_logreg_coefficients

In [None]:
# For every coefficient column, show its 15 largest values.
for coef_col in fp_logreg_coefficients.columns:
    ranked = fp_logreg_coefficients[[coef_col]].sort_values(
        by=coef_col, ascending=False)
    display(ranked.head(15))

-----

#### Decision Tree

In [None]:
fp_dt = hp.train_default_dt_model(X_train, y_train)

In [None]:
fp_dt_report, fp_dt_results =\
    hp.class_model_eval_logreg(
        fp_dt, X_train, X_test, y_train, y_test,
        num_class=3, has_coeffs=False)

In [None]:
hp.tree_feature_importance(fp_dt, X_train)

-----

###### NoElo

In [None]:
# Copies of both splits without the two Elo columns.
X_train_noelo, X_test_noelo = (
    frame.drop(columns=['WhiteElo', 'BlackElo'])
    for frame in (X_train, X_test)
)

In [None]:
fp_dt_noelo = hp.train_default_dt_model(X_train_noelo, y_train)

In [None]:
fp_dt_noelo_report, fp_dt_noelo_results =\
    hp.class_model_eval_logreg(
        fp_dt_noelo, X_train_noelo, X_test_noelo, y_train, y_test,
        num_class=3, has_coeffs=False)

In [None]:
hp.tree_feature_importance(fp_dt_noelo, X_train_noelo)

-----

#### Random Forest

In [None]:
fp_rf = hp.train_default_rf_model(X_train, y_train)

In [None]:
fp_rf_report, fp_rf_results =\
    hp.class_model_eval_logreg(
        fp_rf, X_train, X_test, y_train, y_test,
        num_class=3, has_coeffs=False)

In [None]:
hp.rf_trees_and_forest_strength(fp_rf, X_train, X_test, y_train, y_test);

In [None]:
hp.tree_feature_importance(fp_rf, X_train)

In [None]:
len(fp_rf.estimators_) # 100

In [None]:
# decision_tree_train_scores = []
# for sub_tree in fp_rf.estimators_:
#     decision_tree_train_scores.append(fp_rf.score(X_train_, y_train))
    
# print("Performance on fitted data:")
# print(f"Average Decision Tree: {np.mean(decision_tree_train_scores)}")
# print(f"Random Forest: {fp_rf.score(X_train_, y_train)}")

In [None]:
# decision_tree_test_scores = []
# for sub_tree in random_forest.estimators_:
#     decision_tree_test_scores.append(sub_tree.score(X_test_, y_test))

# print("Performance on Test data:")
# print(f"Average Decision Tree: {np.mean(decision_tree_test_scores)}")
# print(f"Random Forest: {random_forest.score(X_test_, y_test)}")

-----

###### NoElo

In [None]:
# Copies of both splits without the two Elo columns. The same two statements
# appear in the decision-tree NoElo section above; they are repeated here so
# this section can be run without it.
X_train_noelo = X_train.drop(columns=['WhiteElo', 'BlackElo'])
X_test_noelo = X_test.drop(columns=['WhiteElo', 'BlackElo'])

In [None]:
fp_rf_noelo = hp.train_default_rf_model(X_train_noelo, y_train)

In [None]:
fp_rf_noelo_report, fp_rf_noelo_results =\
    hp.class_model_eval_logreg(
        fp_rf_noelo, X_train_noelo, X_test_noelo, y_train, y_test,
        num_class=3, has_coeffs=False)

In [None]:
hp.rf_trees_and_forest_strength(fp_rf_noelo, X_train_noelo, X_test_noelo, y_train, y_test);

In [None]:
hp.tree_feature_importance(fp_rf_noelo, X_train_noelo)

-----

-----

#### XGBoost

In [None]:
fp_xgb = hp.train_default_xgboost_model(X_train, y_train)

In [None]:
fp_xgb_report, fp_xgb_results =\
    hp.class_model_eval_logreg(
        fp_xgb, X_train, X_test, y_train, y_test,
        num_class=3, has_coeffs=False)

In [None]:
hp.tree_feature_importance(fp_xgb, X_train)

----

### With GridSearch

In [None]:
df_ffpm = joblib.load('data/cleaned/df_for_fourth_pass_modelling.pkl')

In [None]:
df_ffpm.head()

-----

#### X-y split, train-validation-test split, flatten y

In [None]:
X, y = hp.X_y_split_simple(df_ffpm)
y = hp.y_convert_to_ints(y)

In [None]:
X_remainder, X_test, y_remainder, y_test = train_test_split(X, y, test_size=0.2,
                                                            random_state=22, stratify=y)

In [None]:
y_remainder, y_test = hp.flat_y(y_remainder, y_test)

In [None]:
# Print info on how the data has been split: first the feature frames, then
# the label arrays (the two pairs of counts should agree).
for remainder_part, test_part in ((X_remainder, X_test), (y_remainder, y_test)):
    print(f'The remainder set has {len(remainder_part)} data points.')
    print(f'The test set has {len(test_part)} data points.')

----

#### Preparing pipeline

In [None]:
# Column transformer with scaling: the listed numeric columns are standardised
# and 'ECO' is one-hot encoded. No `remainder` is given, so the transformer's
# default handling applies to any column not named here.
ct_scale = ColumnTransformer(
    transformers=[
        ('normalize',
         StandardScaler(),
         ['WhiteElo', 'WhiteRD', 'BlackElo', 'BlackRD',
          'white_played_perc_top_move', 'black_played_perc_top_move',
          'av_white_emt', 'av_black_emt', 'PlyCount',
          'TimeControl_Base', 'TimeControl_Inc']),
        ('OHE',
         OneHotEncoder(sparse=False, handle_unknown='ignore'),
         ['ECO']),
    ],
    n_jobs=-1,
)

In [None]:
# Column transformer without scaling: only 'ECO' is one-hot encoded, and
# remainder='passthrough' carries every other column through untouched.
ct_unscale = ColumnTransformer(
    transformers=[
        ('OHE',
         OneHotEncoder(sparse=False, handle_unknown='ignore'),
         ['ECO']),
    ],
    remainder='passthrough',
    n_jobs=-1,
)

In [None]:
# # testing
# ct_unscale.fit(X_remainder)
# ct_unscale.get_feature_names()[-12:]

# # ['OHE__x0_E99',
# #  'WhiteElo',
# #  'WhiteRD',
# #  'BlackElo',
# #  'BlackRD',
# #  'white_played_perc_top_move',
# #  'black_played_perc_top_move',
# #  'av_white_emt',
# #  'av_black_emt',
# #  'PlyCount',
# #  'TimeControl_Base',
# #  'TimeControl_Inc']

In [None]:
# # testing
# ct_scale.fit(X_remainder)
# ct_scale.get_feature_names_out()

# # ['normalize__WhiteElo', 'normalize__WhiteRD', 'normalize__BlackElo',
# #        'normalize__BlackRD', 'normalize__white_played_perc_top_move',
# #        'normalize__black_played_perc_top_move', 'normalize__av_white_emt',
# #        'normalize__av_black_emt', 'normalize__PlyCount',
# #        'normalize__TimeControl_Base', 'normalize__TimeControl_Inc',
# #        'OHE__ECO_A00', 'OHE__ECO_A01', 'OHE__ECO_A02', 'OHE__ECO_A03',

In [None]:
# Temporary directories passed as the `memory` cache of the two pipelines
# defined in the following cells.
# NOTE(review): mkdtemp() leaves deletion to the caller and nothing visible in
# this notebook removes these directories — consider shutil.rmtree once the
# grid searches are finished.
cachedir_scale = mkdtemp()
cachedir_unscale = mkdtemp()

In [None]:
# Scaled pipeline: column transformer followed by a logistic regression as the
# initial model; transformer results are cached in cachedir_scale.
my_pipeline_scale = Pipeline(
    steps=[
        ('column_transform', ct_scale),
        ('model', LogisticRegression()),
    ],
    memory=cachedir_scale,
    verbose=True,
)

In [None]:
# Unscaled pipeline: column transformer followed by a decision tree as the
# initial model; transformer results are cached in cachedir_unscale.
my_pipeline_unscale = Pipeline(
    steps=[
        ('column_transform', ct_unscale),
        ('model', DecisionTreeClassifier()),
    ],
    memory=cachedir_unscale,
    verbose=True,
)

-----

#### Logistic Regression

In [None]:
# c_values = [.00001, .0001, .001, .1, 1, 10, 100, 1000, 10000]

# logreg_param_grid = [

#     {'column_transform': [ct_scale],
#      'model__n_jobs': [-1], 'model__max_iter': [1000],
#      'model__solver': ['lbfgs'], 'model__C': c_values, 'model__penalty': ['l2', 'none']},
#     {'column_transform': [ct_scale],
#      'model__random_state': [22], 'model__n_jobs': [-1], 'model__max_iter': [1000],
#      'model__solver': ['liblinear'], 'model__C': c_values, 'model__penalty': ['l1', 'l2']}    
# ]

In [None]:
# # for testing that it  works

# c_values = [.1, 1, 10]

# logreg_param_grid = [
    
#     {'column_transform': [ct_scale],
#      'model': [LogisticRegression()], 'model__random_state':  [22],
#      'model__n_jobs': [-1], 'model__max_iter': [1000],
#      'model__solver': ['lbfgs'], 'model__C': c_values, 'model__penalty': ['l2', 'none']}
# ]

In [None]:
# logreg_gs = GridSearchCV(
#     estimator=my_pipeline_scale,
#     param_grid=logreg_param_grid,
#     cv=5, n_jobs=-1, verbose=10)

In [None]:
# fitted_logreg_gs = logreg_gs.fit(X_remainder, y_remainder)

In [None]:
# fitted_logreg_gs.best_params_

In [None]:
fitted_logreg_gs = joblib.load('data/pickled_models/gridsearch/best_logreg.pkl')

In [None]:
# Score the loaded search result on each split and report the accuracy.
logreg_remainder_acc = fitted_logreg_gs.score(X_remainder, y_remainder)
print(f"The best logistic regression's accuracy on the remainder set: {logreg_remainder_acc}")
logreg_test_acc = fitted_logreg_gs.score(X_test, y_test)
print(f"The best logistic regression's accuracy on the test set: {logreg_test_acc}")

-----

#### Decision Tree

In [None]:
# # for testing that it  works

# dt_param_grid = [
    
#     {'column_transform': [ct_unscale],
#      'model': [DecisionTreeClassifier()],
#      'model__criterion': ['gini', 'entropy'], 'model__splitter': ['best', 'random'],
#      'model__max_depth': [None, 50, 100, 150], 'model__min_samples_split': [2, 4, 6],
#      'model__min_samples_leaf': [1, 2, 4, 5], 'model__max_features': ['sqrt', 'auto', 'log2', None]
#     }
# ]

In [None]:
# dt_gs = GridSearchCV(
#     my_pipeline_unscale,
#     param_grid=dt_param_grid,
#     cv=5, n_jobs=-1, verbose=10)

In [None]:
# fitted_dt_gs = dt_gs.fit(X_remainder, y_remainder)

In [None]:
# fitted_dt_gs.best_estimator_

In [None]:
# fitted_dt_gs.best_params_

In [None]:
fitted_dt_gs = joblib.load('data/pickled_models/gridsearch/best_dt.pkl')

In [None]:
# Print the accuracies of the loaded decision-tree search result on both
# splits (the printed label previously misspelt "decision").
print(f"The best decision tree's accuracy on the remainder set: {fitted_dt_gs.score(X_remainder, y_remainder)}")
print(f"The best decision tree's accuracy on the test set: {fitted_dt_gs.score(X_test, y_test)}")

-----

#### Random Forest

In [None]:
# # for testing that it  works

# rf_param_grid = [
    
#     {'column_transform': [ct_unscale],
#      'model': [RandomForestClassifier()],
#      'model__n_estimators': [25, 50, 100], 'model__criterion': ['gini', 'entropy'],
#      'model__max_depth': [None, 50, 100, 150], 'model__min_samples_split': [2, 4, 6],
#      'model__min_samples_leaf': [1], 'model__max_features': ['sqrt']
#     }
# ]

In [None]:
# rf_gs = GridSearchCV(
#     my_pipeline_unscale,
#     param_grid=rf_param_grid,
#     cv=5, n_jobs=-1, verbose=10)

In [None]:
# fitted_rf_gs = rf_gs.fit(X_remainder, y_remainder)

In [None]:
# fitted_rf_gs.best_estimator_

In [None]:
# fitted_rf_gs.best_params_

In [None]:
# Load the pickled random-forest model rather than re-running the (commented-out) grid search.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load pickles that were produced by this project.
fitted_rf_gs = joblib.load('data/pickled_models/gridsearch/best_rf.pkl')

In [None]:
# Score the loaded random-forest model on the remainder set, then on the test set.
# The printed label previously said "decistion tree's" (copied from the decision-tree cell),
# which mislabelled these random-forest scores in the notebook output.
# NOTE(review): `.score` is assumed to return accuracy here - confirm no custom scorer
# was stored with the pickled object.
print(f"The best random forest's accuracy on the remainder set: {fitted_rf_gs.score(X_remainder, y_remainder)}")
print(f"The best random forest's accuracy on the test set: {fitted_rf_gs.score(X_test, y_test)}")

-----

#### XGBoost

In [None]:
# # for testing that it works
# Disabled XGBoost search space; a pickled `fitted_xgb_gs` is loaded further down instead.
# The grid below spans 3 * 2 * 2 = 12 candidates (60 fits with the cv=5 used
# in the disabled GridSearchCV cell); the remaining parameters are held fixed.
# NOTE(review): XGBoost documents learning_rate (eta) as ranging over [0, 1], so 1.5 looks
# out of range - confirm before re-enabling.
# NOTE(review): sampling_method='gradient_based' is documented as GPU-only - confirm the
# training setup supports it before re-enabling.

# xgb_param_grid = [
    
#     {'column_transform': [ct_unscale],
#      'model': [XGBClassifier()],
#      'model__n_estimators': [100], 'model__learning_rate': [0.3, 0.8, 1.5], 'model__gamma': [0],
#      'model__max_depth': [6], 'model__max_delta_step': [0, 1],
#      'model__sampling_method': ['uniform', 'gradient_based'],
#      'model__lambda': [1], 'model__verbosity': [1]
#     }
# ]

In [None]:
# xgb_gs = GridSearchCV(
#     my_pipeline_unscale,
#     param_grid=xgb_param_grid,
#     cv=5, n_jobs=-1, verbose=10)

In [None]:
# fitted_xgb_gs = xgb_gs.fit(X_remainder, y_remainder)

In [None]:
# fitted_xgb_gs.best_estimator_

In [None]:
# fitted_xgb_gs.best_params_

In [None]:
# Load the pickled XGBoost model rather than re-running the (commented-out) grid search.
# NOTE(review): the file name is spelled 'best_xgbost.pkl' (no second 'o') - confirm this
# matches the file on disk before renaming anything.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load pickles that were produced by this project.
fitted_xgb_gs = joblib.load('data/pickled_models/gridsearch/best_xgbost.pkl')

In [None]:
# Score the loaded XGBoost model on the remainder set, then on the test set.
# The printed label previously said "decistion tree's" (copied from the decision-tree cell),
# which mislabelled these XGBoost scores in the notebook output.
# NOTE(review): `.score` is assumed to return accuracy here - confirm no custom scorer
# was stored with the pickled object.
print(f"The best XGBoost model's accuracy on the remainder set: {fitted_xgb_gs.score(X_remainder, y_remainder)}")
print(f"The best XGBoost model's accuracy on the test set: {fitted_xgb_gs.score(X_test, y_test)}")

-----

## Results

The first two rounds of modelling ran into problems due to underlying issues with the collected data - primarily that the elo distributions of humans and computers were so different. Adding games where humans played against other humans and removing each player's elo decreased the accuracy of all models tested.


The engineered features - the percentage of moves each side played that matched the engine's top recommendation, and the average time spent per move - improved the accuracy of all models tested, even after only taking games within a certain elo range to address the difference in average elo between humans and engines. This is not surprising, as games where an engine is playing against a human are likely to have a high percentage of moves matching the engine's top recommendation. What is interesting is that, for the logistic regression model using the engineered features, the 15 highest-valued coefficients for each sub-classification model did not include the percentage of moves that were the engine's top recommendation. In predicting whether white or black was cheating, the elo and average time per move were still within the top 7 and 6 respectively, so it is likely that not enough was done to address the difference in average elo of computers and humans, and that, because computers will typically spend the same amount of time per move, the average time spent is a strong indicator (though 'smart' cheaters will tamper with this metric by consciously moving slower at different points in the game).


For the decision tree, the player's elo, average time per move and percentage of moves that were the engine's top recommendation were all within the top 10 most important features in predicting whether a player was cheating. Interestingly, an opening was not the most important feature - instead, time control took the top spot. This suggests that, in the dataset, the CvH games had a significantly different distribution of time controls from the HvH games. Indeed, even when the elos were removed, all models still performed well due to the engineered features, but the most important features were still the time controls.


Future work should further investigate the openings that were the strongest indicators of whether white or black was cheating, as one can infer that computers tend to favour particular openings over others. This is indeed a known trait of modern engines, as their playstyle does differ from that of human players: engines have been noted to play in such a way as to keep their options open as long as possible, whereas humans will seek an early conflict which forces the position into more set possibilities.


In [None]:
# Summary table of accuracies: one row per modelling round, one column per model family.
# Entries the original table marked with the string 'na' (presumably: no result recorded
# for that round/model combination) are stored as NaN instead. Mixing strings with floats
# forced every column to object dtype; with NaN all columns stay float64, so the table can
# be sorted, rounded, compared or plotted numerically.
model_names = ['LogReg', 'KNN', 'DT', 'RF', 'XGBoost']

# Keys become the row index, in insertion order.
round_accuracies = {
    'CvH':                          [0.9090, 0.9458, 0.9343, 0.9703, 0.9587],
    'CvH&HvH':                      [0.8701, np.nan, np.nan, np.nan, np.nan],
    'NoElo CvH':                    [0.7532, 0.8428, 0.8455, 0.8927, 0.9286],
    'NoElo CvH&HvH':                [0.6911, 0.8126, 0.8554, 0.8960, 0.9093],
    'Feat Eng CvH&HvH':             [0.8900, np.nan, 0.9336, 0.9882, np.nan],
    'NoElo Feat Eng CvH&HvH':       [np.nan, np.nan, 0.8631, 0.9541, 0.9549],
    'Grid Search Feat Eng CvH&HvH': [0.8900, np.nan, 0.9747, 0.9863, 0.9881],
}

First_Models_CvH = pd.DataFrame.from_dict(round_accuracies, orient='index', columns=model_names)
First_Models_CvH

----