In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import os.path as op
import os
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterSampler
from scipy.stats.distributions import norm, uniform

In [2]:
RANDOM_STATE = 42
np.random.RandomState(RANDOM_STATE)
INPUT = "../data"
TARGET = "delta"

In [3]:
df = pd.read_csv(op.join(INPUT, "dataset.csv")) # load data without header
df.sort_values(by="date", ascending=True)

Unnamed: 0,date,home,away,goals_home,goals_away,season,delta,year,home__val,away__val,...,away__scored,away__suffered,away__delta_minus_1,away__delta_minus_2,away__delta_minus_3,away__delta_minus_4,away__avg_scored_3,away__avg_suffered_3,away__avg_scored_5,away__avg_suffered_5
396,2011-08-12,Gil Vicente,Benfica,2,2,2011,0,2011,6850,174300,...,2,2,0,0,0,0,0.000000,0.000000,0.0,0.0
397,2011-08-13,Rio Ave FC,Braga,0,0,2011,0,2011,16300,57800,...,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0
398,2011-08-13,Sporting CP,Olhanense,1,1,2011,0,2011,128550,0,...,1,1,0,0,0,0,0.000000,0.000000,0.0,0.0
399,2011-08-14,Marítimo,Beira-Mar,0,0,2011,0,2011,24300,0,...,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0
400,2011-08-14,Feirense,Nacional,0,0,2011,0,2011,0,0,...,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,2019-11-10,Marítimo,Portimonense,1,1,2019,0,2019,15650,19150,...,1,1,0,0,-4,-4,0.666667,2.000000,0.6,2.2
378,2019-11-10,Marítimo,Portimonense,1,1,2019,0,2019,15650,19150,...,1,1,0,0,0,-4,1.000000,1.000000,0.6,2.2
377,2019-11-10,Marítimo,Portimonense,1,1,2019,0,2019,15650,19150,...,1,1,0,0,-4,-4,0.666667,2.000000,0.6,2.2
385,2019-11-10,Sporting CP,Belenenses SAD,2,0,2019,2,2019,215830,11200,...,0,2,1,1,-5,-5,0.666667,1.666667,1.0,2.4


In [30]:
COL_NAMES = [
    x for x in df.columns if
    x in ["home__val", "away__val"] or
    "__avg_" in x or
    "delta_minus" in x 
]

In [31]:
df.head()


Unnamed: 0,date,home,away,goals_home,goals_away,season,delta,year,home__val,away__val,...,away__scored,away__suffered,away__delta_minus_1,away__delta_minus_2,away__delta_minus_3,away__delta_minus_4,away__avg_scored_3,away__avg_suffered_3,away__avg_scored_5,away__avg_suffered_5
0,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,...,0,0,0,0,0,0,0.0,1.666667,0.0,3.2
1,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,...,0,0,0,0,0,0,0.0,4.333333,0.0,3.8
2,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,...,0,0,0,0,0,0,0.0,1.666667,0.0,3.2
3,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,...,0,0,0,0,0,0,0.0,4.333333,0.0,3.8
4,2019-08-10,Santa Clara,Famalicão,0,2,2019,-1,2019,15350,14150,...,2,0,2,0,0,0,0.0,0.0,0.0,0.0


In [32]:
# we're just trying to understand if
# a specific team wins, loses or draws, no need to have the goal difference
df.loc[df[TARGET] > 0, TARGET] = 1
df.loc[df[TARGET] < 0, TARGET] = -1

 # A first approach

In [33]:
X = df[COL_NAMES]
y = df[TARGET]

In [34]:
lc = SGDClassifier(
    loss="log",
    random_state=RANDOM_STATE
)
lc.fit(X=X, y=y)
print(f"Our accuracy is {int(lc.score(X=X, y=y)*100)}%!")


Our accuracy is 47%!


 It seems we have a model in our hands that we can expect to guess the result of over half of the matches.
 Do we really?
 Is this a fair way of evaluating our model?
 You may read hint nr. 1.

 # A fair accessment of the performance; aka Cross validation

In [47]:
splitter = TimeSeriesSplit(n_splits=2)
tr_ind, te_ind = (*splitter.split(X),)[0]
X_tr, y_tr = X.loc[tr_ind], y.loc[tr_ind]
X_te, y_te = X.loc[te_ind], y.loc[te_ind]

In [48]:
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr)
X_te = scaler.transform(X_te)

In [49]:
lc = SGDClassifier(
    loss="log",
    random_state=RANDOM_STATE 
)
lc.fit(X=X_tr, y=y_tr)
print(f"Our accuracy is {int(lc.score(X=X_te, y=y_te)*100)}%!\n A little worse than what we previously thought!")

Our accuracy is 46%!
 A little worse than what we previously thought!


 # Let's try other models

In [50]:
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt.fit(X=X_tr, y=y_tr)
print(f"Our accuracy is {int(dt.score(X=X_te, y=y_te)*100)}%!")

Our accuracy is 45%!


In [51]:
df_imp = pd.DataFrame(
dt.feature_importances_,
columns=["importance"]
)
df_imp.index = COL_NAMES
df_imp.sort_values(by="importance", ascending=False, inplace=True)

In [52]:
df[COL_NAMES].astype(float).describe()

Unnamed: 0,home__val,away__val,home__delta_minus_1,home__delta_minus_2,home__delta_minus_3,home__delta_minus_4,home__avg_scored_3,home__avg_suffered_3,home__avg_scored_5,home__avg_suffered_5,away__delta_minus_1,away__delta_minus_2,away__delta_minus_3,away__delta_minus_4,away__avg_scored_3,away__avg_suffered_3,away__avg_scored_5,away__avg_suffered_5
count,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0,3042.0
mean,46780.108481,46900.460224,-0.17094,0.097304,-0.106838,0.127876,1.086675,1.280517,1.027153,1.251085,0.172255,-0.096318,0.110125,-0.125247,1.171598,1.236139,1.04977,1.247074
std,77869.625111,78471.921279,1.759375,1.73074,1.701333,1.679446,0.855126,0.80204,0.80535,0.662705,1.746866,1.733738,1.699576,1.669038,0.897051,0.776833,0.808737,0.670607
min,0.0,0.0,-10.0,-7.0,-10.0,-6.0,0.0,0.0,0.0,0.0,-7.0,-10.0,-6.0,-10.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,-1.0,-1.0,-1.0,-1.0,0.333333,0.666667,0.4,0.8,-1.0,-1.0,-1.0,-1.0,0.666667,0.666667,0.4,0.8
50%,15350.0,14950.0,0.0,0.0,0.0,0.0,1.0,1.333333,1.0,1.2,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.2
75%,26380.0,28650.0,1.0,1.0,1.0,1.0,1.666667,1.666667,1.6,1.8,1.0,1.0,1.0,1.0,1.666667,1.666667,1.6,1.6
max,314700.0,314700.0,7.0,10.0,6.0,10.0,5.666667,5.333333,4.6,4.2,10.0,7.0,10.0,6.0,6.333333,4.666667,5.2,4.2


In [53]:
df_imp.head(10)

Unnamed: 0,importance
away__val,0.154873
home__val,0.132238
away__delta_minus_2,0.084275
home__avg_suffered_5,0.076401
away__delta_minus_1,0.060679
home__avg_scored_5,0.057309
away__avg_suffered_5,0.055474
home__delta_minus_2,0.050408
home__delta_minus_4,0.047311
home__delta_minus_1,0.041105


In [54]:
X.columns

Index(['home__val', 'away__val', 'home__delta_minus_1', 'home__delta_minus_2',
       'home__delta_minus_3', 'home__delta_minus_4', 'home__avg_scored_3',
       'home__avg_suffered_3', 'home__avg_scored_5', 'home__avg_suffered_5',
       'away__delta_minus_1', 'away__delta_minus_2', 'away__delta_minus_3',
       'away__delta_minus_4', 'away__avg_scored_3', 'away__avg_suffered_3',
       'away__avg_scored_5', 'away__avg_suffered_5'],
      dtype='object')

In [55]:
y.name

'delta'

In [56]:
X.columns

Index(['home__val', 'away__val', 'home__delta_minus_1', 'home__delta_minus_2',
       'home__delta_minus_3', 'home__delta_minus_4', 'home__avg_scored_3',
       'home__avg_suffered_3', 'home__avg_scored_5', 'home__avg_suffered_5',
       'away__delta_minus_1', 'away__delta_minus_2', 'away__delta_minus_3',
       'away__delta_minus_4', 'away__avg_scored_3', 'away__avg_suffered_3',
       'away__avg_scored_5', 'away__avg_suffered_5'],
      dtype='object')

In [57]:
help(KNeighborsClassifier)

Help on class KNeighborsClassifier in module sklearn.neighbors.classification:

class KNeighborsClassifier(sklearn.neighbors.base.NeighborsBase, sklearn.neighbors.base.KNeighborsMixin, sklearn.neighbors.base.SupervisedIntegerMixin, sklearn.base.ClassifierMixin)
 |  KNeighborsClassifier(n_neighbors=5, weights='uniform', algorithm='auto', leaf_size=30, p=2, metric='minkowski', metric_params=None, n_jobs=None, **kwargs)
 |  
 |  Classifier implementing the k-nearest neighbors vote.
 |  
 |  Read more in the :ref:`User Guide <classification>`.
 |  
 |  Parameters
 |  ----------
 |  n_neighbors : int, optional (default = 5)
 |      Number of neighbors to use by default for :meth:`kneighbors` queries.
 |  
 |  weights : str or callable, optional (default = 'uniform')
 |      weight function used in prediction.  Possible values:
 |  
 |      - 'uniform' : uniform weights.  All points in each neighborhood
 |        are weighted equally.
 |      - 'distance' : weight points by the inverse of th

In [58]:
knn = KNeighborsClassifier()
knn.fit(X=X_tr, y=y_tr)
print(f"Our accuracy is {int(knn.score(X=X_te, y=y_te)*100)}%!")

Our accuracy is 42%!


 The DT accuracy looks pretty good.
 Is there reason for concern?
 Any queue on why this happened only with this model?
 Why this difference in performance between the models?
 Read hint nr. 2 for an explanation

 We've came up with a simple cross validation scheme, but is it good enough for this situation?
 Should we take any extra care when we are dealing with data gathered at different moments in time?
 Read hint nr. 3 for an explanation