In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import re
import sklearn
# Compare the numeric (major, minor) pair: a plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would wrongly pass the check.
assert tuple(map(int, re.findall(r"\d+", sklearn.__version__)[:2])) >= (0, 20)

# Common imports
import joblib
import copy
import pandas as pd
import numpy as np
import os
PRJ_ROOT_DIR = os.path.dirname(os.path.abspath(''))

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
# Figures are written to <notebook dir>/images/<chapter id>
NOTE_ROOT_DIR = os.path.abspath('')
CHAPTER_ID = "04_modelling"
IMAGES_PATH = os.path.join(os.path.join(NOTE_ROOT_DIR, "images"), CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)  # no-op when the folder already exists

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under ``IMAGES_PATH``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    destination = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(destination, format=fig_extension, dpi=resolution)

## 1 - Load the Data

In [3]:
# Read the integrated match table, drop the `season` column and replace
# every -100 with 0 (presumably -100 is a missing-value sentinel — TODO confirm).
df_match = (
    pd.read_csv(os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "matches.csv"))
    .drop(columns=["season"])
    .replace(-100, 0)
)
df_match.head()

Unnamed: 0,home_result,defence/clean_sheets_0,defence/goals_conceded_0,defence/tackles_0,defence/tackle_success_%_0,defence/last_man_tackles_0,defence/blocked_shots_0,defence/interceptions_0,defence/clearances_0,defence/headed_clearance_0,...,goalkeeping/catches_26,goalkeeping/sweeper_clearances_26,goalkeeping/throw_outs_26,goalkeeping/goal_kicks_26,defence/recoveries_26,defence/duels_won_26,defence/duels_lost_26,teamplay/passes_26,teamplay/crosses_26,teamplay/accurate_long_balls_26
0,lose,5.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1097.0,88.0,0.0
1,win,6.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,win,13.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,win,11.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,248.0,13.0,0.0
4,win,13.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1.1 - Recover the Test Set

In [4]:
# Split train-test
from sklearn.model_selection import train_test_split
test_size = 0.2
# Select the predictors and the label by name rather than by position, so a
# change in column order cannot leak `home_result` into the features.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_match.drop(columns=["home_result"]),
    df_match["home_result"],
    test_size=test_size,
    random_state=42,
)
X_train.head()

Unnamed: 0,defence/clean_sheets_0,defence/goals_conceded_0,defence/tackles_0,defence/tackle_success_%_0,defence/last_man_tackles_0,defence/blocked_shots_0,defence/interceptions_0,defence/clearances_0,defence/headed_clearance_0,defence/clearances_off_line_0,...,goalkeeping/catches_26,goalkeeping/sweeper_clearances_26,goalkeeping/throw_outs_26,goalkeeping/goal_kicks_26,defence/recoveries_26,defence/duels_won_26,defence/duels_lost_26,teamplay/passes_26,teamplay/crosses_26,teamplay/accurate_long_balls_26
872,8.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,10.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
617,9.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
409,11.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
589,9.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1.2 - Preprocessing

In [7]:
# Preprocessing pipelines (same setup as in 04_modelling)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Predictors: standardise every feature column
pipe_predictor_preprocess = Pipeline(steps=[("scaler", StandardScaler())])
# Labels: encode the result strings as ordinal codes
pipe_label_preprocess = Pipeline(steps=[("encoder", OrdinalEncoder())])

In [8]:
# Fit both pipelines on the training split and transform it.
# The encoder expects a 2-D input, hence the reshape to a single column.
X_prep = pipe_predictor_preprocess.fit_transform(X_train)
y_prep = pipe_label_preprocess.fit_transform(
    y_train.values.reshape(-1, 1)
)
pipe_label_preprocess.named_steps["encoder"].categories_

[array(['draw', 'lose', 'win'], dtype=object)]

In [9]:
# Apply the already-fitted pipelines to the test split (transform only, no refit)
X_prep_test = pipe_predictor_preprocess.transform(X_test)
y_prep_test = pipe_label_preprocess.transform(
    y_test.values.reshape(-1, 1)
)
pipe_label_preprocess.named_steps["encoder"].categories_

[array(['draw', 'lose', 'win'], dtype=object)]

## 2 - Evaluate the Models

In [55]:
# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# Import metrics
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score

# Import cross-validation tools
from sklearn.model_selection import cross_val_predict

# Setup to save/load the model
def save_model(model, id_):
    """Persist ``model`` with joblib as ``<NOTE_ROOT_DIR>/models/<CHAPTER_ID>/<id_>.pkl``.

    Parameters
    ----------
    model : object
        Object to serialise with ``joblib.dump``.
    id_ : str
        File name without the ``.pkl`` extension.
    """
    print("Saving model", id_)
    model_dir = os.path.join(NOTE_ROOT_DIR, "models", CHAPTER_ID)
    # joblib.dump does not create missing folders; make sure the target exists
    # (mirrors how the images folder is created).
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, id_ + ".pkl"))
def load_model(id_):
    """Load and return the object stored at ``<NOTE_ROOT_DIR>/models/<CHAPTER_ID>/<id_>.pkl``.

    NOTE: ``joblib.load`` unpickles the file, so only load files you trust.
    """
    print("Loading model", id_)
    model_file = os.path.join(NOTE_ROOT_DIR, "models", CHAPTER_ID, id_ + ".pkl")
    return joblib.load(model_file)

# Setup display
def report(y_true, y_pred, average=None):
    """Print the accuracy and F1 score of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted labels.
    average : str or None
        Averaging mode forwarded to ``f1_score``. When ``None`` (default) it is
        chosen from the data: ``"binary"`` if at most two distinct labels occur
        (the previous behaviour), otherwise ``"macro"``. ``f1_score`` raises a
        ValueError when asked for a binary average on a multiclass target such
        as draw/lose/win.
    """
    if average is None:
        seen_labels = np.unique(np.concatenate([np.ravel(y_true), np.ravel(y_pred)]))
        average = "binary" if seen_labels.size <= 2 else "macro"
    print("======")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1      :", f1_score(y_true, y_pred, average=average))
    print("======")

### 2.1 - Overfitting

In [17]:
# Setup to compare train & test performance
def train_vs_test(model, X_train=X_prep, y_train=y_prep, X_test=X_prep_test, y_test=y_prep_test, score_fn=accuracy_score):
    """Score a fitted model on the train and test data and print both scores.

    The default data arguments are bound when this cell is executed; re-run the
    cell after recomputing ``X_prep`` / ``y_prep`` / ``X_prep_test`` /
    ``y_prep_test``, otherwise the old arrays are still used.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_train, y_train, X_test, y_test : array-like
        Predictors and labels of the two splits.
    score_fn : callable
        Called as ``score_fn(y_true, y_pred)``.

    Returns
    -------
    tuple
        ``(train_score, test_score)``.
    """
    splits = ((X_train, y_train), (X_test, y_test))
    # Evaluated in order: train split first, then test split
    train_score, test_score = [
        score_fn(targets, model.predict(features)) for features, targets in splits
    ]
    rule = "======"
    print(rule)
    print("Train score:", train_score)
    print("Test score :", test_score)
    print(rule)
    return train_score, test_score

#### Random Forest

In [53]:
# Random forest (all features): train vs. test score
forest_bayes_search = load_model(id_="forest_bayes_search")
_, _ = train_vs_test(model=forest_bayes_search)

Loading model forest_bayes_search
Train score: 0.7546511627906977
Test score : 0.5601851851851852


In [23]:
# Indices of the features with a strictly positive importance in the random forest
feat_inds = np.flatnonzero(forest_bayes_search.feature_importances_ > 0.0)
len(feat_inds)

883

In [52]:
# Random forest (features in feat_inds only): train vs. test score
reduced_forest_bayes_search = load_model(id_="reduced_forest_bayes_search")
_, _ = train_vs_test(
    model=reduced_forest_bayes_search,
    X_train=X_prep[:, feat_inds],
    X_test=X_prep_test[:, feat_inds],
)

Loading model reduced_forest_bayes_search
Train score: 0.9267441860465117
Test score : 0.5601851851851852


**CAUTION:** Both random-forest models are overfitting: their train accuracy (0.75 and 0.93) is well above their test accuracy (0.56).

#### Gradient Boosting

In [51]:
# Gradient boosting (all features): train vs. test score
grad_bayes_search = load_model(id_="grad_bayes_search")
_, _ = train_vs_test(model=grad_bayes_search)

Loading model grad_bayes_search
Train score: 0.6488372093023256
Test score : 0.5509259259259259


In [50]:
# Gradient boosting (features in feat_inds only): train vs. test score
reduced_grad_bayes_search = load_model(id_="reduced_grad_bayes_search")
_, _ = train_vs_test(
    model=reduced_grad_bayes_search,
    X_train=X_prep[:, feat_inds],
    X_test=X_prep_test[:, feat_inds],
)

Loading model reduced_grad_bayes_search
Train score: 0.8162790697674419
Test score : 0.49074074074074076


#### AdaBoost

In [49]:
# AdaBoost (all features): train vs. test score
ada_bayes_search = load_model(id_="ada_bayes_search")
_, _ = train_vs_test(model=ada_bayes_search)

Loading model ada_bayes_search
Train score: 0.5476744186046512
Test score : 0.42592592592592593


In [48]:
# AdaBoost (features in feat_inds only): train vs. test score
reduced_ada_bayes_search = load_model(id_="reduced_ada_bayes_search")
_, _ = train_vs_test(
    model=reduced_ada_bayes_search,
    X_train=X_prep[:, feat_inds],
    X_test=X_prep_test[:, feat_inds],
)

Loading model reduced_ada_bayes_search
Train score: 0.5081395348837209
Test score : 0.44907407407407407


#### SVC

In [47]:
# SVC (all features): train vs. test score
svc_bayes_search = load_model(id_="svc_bayes_search")
_, _ = train_vs_test(model=svc_bayes_search)

Loading model svc_bayes_search
Train score: 0.9209302325581395
Test score : 0.49537037037037035


In [46]:
# SVC (features in feat_inds only): train vs. test score
reduced_svc_bayes_search = load_model(id_="reduced_svc_bayes_search")
_, _ = train_vs_test(
    model=reduced_svc_bayes_search,
    X_train=X_prep[:, feat_inds],
    X_test=X_prep_test[:, feat_inds],
)

Loading model reduced_svc_bayes_search
Train score: 0.7872093023255814
Test score : 0.5462962962962963


### 2.2 - Thresholding

In [61]:
# Out-of-fold decision scores of the gradient-boosting model (3-fold CV);
# the label column is squeezed to 1-D before being passed as the target.
y_scores_grad = cross_val_predict(
    estimator=grad_bayes_search,
    X=X_prep,
    y=y_prep.squeeze(),
    cv=3,
    method="decision_function",
)
y_scores_grad

array([[-0.93233172, -0.803085  , -0.69596384],
       [-1.0398113 , -1.22365062, -0.87866568],
       [-1.62555336, -0.96394598, -1.11778243],
       ...,
       [-1.44034896, -1.05677425, -0.74509738],
       [-1.37463204, -1.56512657, -0.47541028],
       [-1.27821555, -1.08044925, -1.11938415]])