In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Disable warnings
import warnings
warnings.filterwarnings('ignore')

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import copy
import pandas as pd
import numpy as np
import os
PRJ_ROOT_DIR = os.path.dirname(os.path.abspath(''))

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
import seaborn as sns

In [2]:
# Where to save the figures
NOTE_ROOT_DIR = os.path.abspath('')
CHAPTER_ID = "04_modelling"
IMAGES_PATH = os.path.join(NOTE_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

## 1 - Load the Data

In [15]:
# Load the integrated data
df_match = pd.read_csv(os.path.join(PRJ_ROOT_DIR, "data", "tabular", "integrate", "matches.csv"))
df_match.drop(['season'], axis=1, inplace=True)
df_match.head()

Unnamed: 0,home_result,defence/clean_sheets_0,defence/goals_conceded_0,defence/tackles_0,defence/tackle_success_%_0,defence/last_man_tackles_0,defence/blocked_shots_0,defence/interceptions_0,defence/clearances_0,defence/headed_clearance_0,...,goalkeeping/catches_26,goalkeeping/sweeper_clearances_26,goalkeeping/throw_outs_26,goalkeeping/goal_kicks_26,defence/recoveries_26,defence/duels_won_26,defence/duels_lost_26,teamplay/passes_26,teamplay/crosses_26,teamplay/accurate_long_balls_26
0,lose,5.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1097.0,88.0,0.0
1,win,6.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
2,win,13.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0
3,win,11.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,248.0,13.0,0.0
4,win,13.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0


### 1.1. Manage Empty Positions' Statistics

In [16]:
# Replace -100 with 0 (statistics' value of empty positions)
df_match = df_match.replace(-100, 0)
df_match.head()

Unnamed: 0,home_result,defence/clean_sheets_0,defence/goals_conceded_0,defence/tackles_0,defence/tackle_success_%_0,defence/last_man_tackles_0,defence/blocked_shots_0,defence/interceptions_0,defence/clearances_0,defence/headed_clearance_0,...,goalkeeping/catches_26,goalkeeping/sweeper_clearances_26,goalkeeping/throw_outs_26,goalkeeping/goal_kicks_26,defence/recoveries_26,defence/duels_won_26,defence/duels_lost_26,teamplay/passes_26,teamplay/crosses_26,teamplay/accurate_long_balls_26
0,lose,5.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1097.0,88.0,0.0
1,win,6.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,win,13.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,win,11.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,248.0,13.0,0.0
4,win,13.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1.2 Create a Test Set

In [19]:
# Split train-test
from sklearn.model_selection import train_test_split
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(df_match.iloc[:, 1:], df_match["home_result"], test_size=test_size, random_state=42)
X_train.head()

Unnamed: 0,defence/clean_sheets_0,defence/goals_conceded_0,defence/tackles_0,defence/tackle_success_%_0,defence/last_man_tackles_0,defence/blocked_shots_0,defence/interceptions_0,defence/clearances_0,defence/headed_clearance_0,defence/clearances_off_line_0,...,goalkeeping/catches_26,goalkeeping/sweeper_clearances_26,goalkeeping/throw_outs_26,goalkeeping/goal_kicks_26,defence/recoveries_26,defence/duels_won_26,defence/duels_lost_26,teamplay/passes_26,teamplay/crosses_26,teamplay/accurate_long_balls_26
872,8.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
626,10.0,51.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
617,9.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
409,11.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
589,9.0,48.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 1.3 - Preprocessing

In [30]:
# Setup the pipeline of preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

pipe_predictor_preprocess = Pipeline([
    ("scaler", StandardScaler())
])
pipe_label_preprocess = Pipeline([
    ("encoder", OrdinalEncoder())
])

In [31]:
# Transform the training data
X_prep = pipe_predictor_preprocess.fit_transform(X_train)
y_prep = pipe_label_preprocess.fit_transform(y_train.values.reshape(-1, 1))
pipe_label_preprocess["encoder"].categories_

[array(['draw', 'lose', 'win'], dtype=object)]

## 2 - Modelling

In [85]:
# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Import metrics
from sklearn.metrics import accuracy_score, f1_score

# Import validation
from sklearn.model_selection import cross_val_score

# Setup display
def report(y_true, y_pred):
    print("======")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("F1      :", f1_score(y_true, y_pred, average="micro"))
    print("======")

# Setup cross-validation
def cross_validate(model, X_prep, y_prep, scoring="accuracy", cv=10):
    return cross_val_score(model, X_prep, y_prep, scoring=scoring, cv=cv, verbose=1, n_jobs=3).mean()

### 2.1 - Train the Models

#### Logistic Regression

In [60]:
# Train logistic regression
log_reg = LogisticRegression(multi_class="multinomial")
log_reg.fit(X_prep, y_prep)

In [61]:
# Evaluate training
report(y_prep, log_reg.predict(X_prep))

Accuracy: 1.0
F1      : 1.0


In [68]:
# k-fold cross validation on the model
cross_validate(log_reg, X_prep, y_prep)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    2.0s finished


0.4383720930232558

#### Linear SVC

In [82]:
# Train linear svm 
lin_svc = LinearSVC(loss="hinge", C=0.001, random_state=42)
lin_svc.fit(X_prep, y_prep)

In [83]:
# Evaluate training
report(y_prep, lin_svc.predict(X_prep))

Accuracy: 0.7604651162790698
F1      : 0.7604651162790698


In [84]:
# k-fold cross validation on the model
cross_validate(lin_svc, X_prep, y_prep)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.4s finished


0.4965116279069767

#### DecisionTree

In [120]:
# Train decision tree
tree_clf = DecisionTreeClassifier(max_depth=10, random_state=42)
tree_clf.fit(X_prep, y_prep)

In [121]:
# Evaluate training
report(y_prep, tree_clf.predict(X_prep))

Accuracy: 0.9244186046511628
F1      : 0.9244186046511627


In [122]:
# Evaluate training
cross_validate(tree_clf, X_prep, y_prep)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  10 out of  10 | elapsed:    0.8s finished


0.43720930232558136

In [131]:
tree_clf.feature_importances_[np.where(tree_clf.feature_importances_ > 0.0)[0]]

109

In [136]:
lin_svc = LinearSVC(loss="hinge", C=0.001, random_state=42)
lin_svc.fit(X_prep[:, np.where(tree_clf.feature_importances_ > 0.0)[0]], y_prep)
report(y_prep, lin_svc.predict(X_prep))

ValueError: X has 1350 features, but LinearSVC is expecting 109 features as input.