In [0]:
import os
import sys
import urllib
import os.path as op
import urllib

import pandas as pd
import seaborn as sns
import numpy as np

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterSampler
from scipy.stats.distributions import norm, uniform

In [0]:
# Local directory for the CSV data files: created below if missing, and
# dataset.csv is later read from it.
INPUT = "./data/"

In [0]:
# File names that the download loop fetches (one request per entry, formatted
# into BASE_URL) whenever they are not already present locally.
FILE_IDS = [
    "dataset.csv",
    "games.csv",
    "upcoming_games.csv",
    "values.csv",
]

In [0]:
# NOTE(review): `opener` is not used by any cell visible here (the download
# loop calls urllib.request.urlopen directly), and URLopener is a deprecated
# legacy class — consider removing this cell after confirming nothing else
# depends on it.
opener = urllib.request.URLopener()

In [0]:
# URL template for the raw data files; the `{}` placeholder is filled with a
# file name from FILE_IDS by the download loop.
BASE_URL = "https://github.com/mppldev/ds_workshop/raw/master/data/{}"

In [0]:
# Make sure the local data directory exists before anything is written to it.
# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate exists() test.
os.makedirs(INPUT, exist_ok=True)

In [0]:
# Download every file listed in FILE_IDS that is not already cached in INPUT.
for f in FILE_IDS:
  # Build the path from INPUT so the cache location is defined in one place.
  file_path = op.join(INPUT, f)
  if op.exists(file_path):
    # Already downloaded on a previous run; skip the network round-trip.
    continue
  # The context manager closes the HTTP response even if read() fails.
  with urllib.request.urlopen(BASE_URL.format(f)) as res:
    content = res.read()
  # Write the payload byte-for-byte: a decode + text-mode write would re-encode
  # with the platform's default encoding and translate newlines, which can
  # corrupt the non-ASCII characters in the CSV files on some systems.
  with open(file_path, "wb") as file:
    file.write(content)

In [117]:
# Single seed shared by every stochastic step in the notebook.
RANDOM_STATE = 42
# Seed NumPy's global generator. Constructing np.random.RandomState(seed)
# without keeping the result only creates (and discards) an independent
# generator and leaves the global one unseeded.
np.random.seed(RANDOM_STATE)

RandomState(MT19937) at 0x7F9CF7CB1678

In [0]:
# Name of the dataframe column used as the prediction target (becomes `y`).
TARGET = "delta"

In [119]:
df = pd.read_csv(op.join(INPUT, "dataset.csv")) # the header row is read by default and supplies the column names
# Display only: the sorted copy is shown as the cell output but is not assigned,
# so `df` itself keeps the row order of the file.
df.sort_values(by="date", ascending=True)

Unnamed: 0,date,home,away,goals_home,goals_away,season,delta,year,home__val,away__val,home__scored,home__suffered,home__delta_minus_1,home__delta_minus_2,home__delta_minus_3,home__delta_minus_4,home__avg_scored_3,home__avg_suffered_3,home__avg_scored_5,home__avg_suffered_5,away__scored,away__suffered,away__delta_minus_1,away__delta_minus_2,away__delta_minus_3,away__delta_minus_4,away__avg_scored_3,away__avg_suffered_3,away__avg_scored_5,away__avg_suffered_5
396,2011-08-12,Gil Vicente,Benfica,2,2,2011,0,2011,6850,174300,2,2,0,0,0,0,0.000000,0.000000,0.0,0.0,2,2,0,0,0,0,0.000000,0.000000,0.0,0.0
397,2011-08-13,Rio Ave FC,Braga,0,0,2011,0,2011,16300,57800,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0
398,2011-08-13,Sporting CP,Olhanense,1,1,2011,0,2011,128550,0,1,1,0,0,0,0,0.000000,0.000000,0.0,0.0,1,1,0,0,0,0,0.000000,0.000000,0.0,0.0
399,2011-08-14,Marítimo,Beira-Mar,0,0,2011,0,2011,24300,0,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0
400,2011-08-14,Feirense,Nacional,0,0,2011,0,2011,0,0,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0,0,0,0,0,0,0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,2019-11-10,Marítimo,Portimonense,1,1,2019,0,2019,15650,19150,1,1,-2,-2,0,0,0.333333,1.666667,0.4,1.2,1,1,0,0,-4,-4,0.666667,2.000000,0.6,2.2
378,2019-11-10,Marítimo,Portimonense,1,1,2019,0,2019,15650,19150,1,1,-2,-2,0,0,0.333333,1.666667,0.4,1.2,1,1,0,0,0,-4,1.000000,1.000000,0.6,2.2
377,2019-11-10,Marítimo,Portimonense,1,1,2019,0,2019,15650,19150,1,1,0,-2,-2,0,0.333333,1.666667,0.6,1.4,1,1,0,0,-4,-4,0.666667,2.000000,0.6,2.2
385,2019-11-10,Sporting CP,Belenenses SAD,2,0,2019,2,2019,215830,11200,2,0,2,-1,-1,1,0.666667,0.666667,1.2,0.8,0,2,1,1,-5,-5,0.666667,1.666667,1.0,2.4


In [0]:
# Feature columns, kept in dataframe order: a fixed set of per-side columns
# plus every column whose name contains "__avg_" or "delta_minus".
COL_NAMES = [
    col
    for col in df.columns
    if col in (
        "home__val",
        "away__val",
        "away__suffered",
        "away__scored",
        "home__scored",
        "home__suffered",
    )
    or any(marker in col for marker in ("__avg_", "delta_minus"))
]

In [121]:
# Preview the first rows; `df` is still in file order because the earlier
# sort_values call was not assigned back.
df.head()

Unnamed: 0,date,home,away,goals_home,goals_away,season,delta,year,home__val,away__val,home__scored,home__suffered,home__delta_minus_1,home__delta_minus_2,home__delta_minus_3,home__delta_minus_4,home__avg_scored_3,home__avg_suffered_3,home__avg_scored_5,home__avg_suffered_5,away__scored,away__suffered,away__delta_minus_1,away__delta_minus_2,away__delta_minus_3,away__delta_minus_4,away__avg_scored_3,away__avg_suffered_3,away__avg_scored_5,away__avg_suffered_5
0,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,0,0,0,0,0,0,0.0,1.333333,0.0,1.8,0,0,0,0,0,0,0.0,1.666667,0.0,3.2
1,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,0,0,0,0,0,0,0.0,1.333333,0.0,1.8,0,0,0,0,0,0,0.0,4.333333,0.0,3.8
2,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,0,0,0,0,0,0,0.0,3.0,0.0,2.0,0,0,0,0,0,0,0.0,1.666667,0.0,3.2
3,2019-08-09,Portimonense,Belenenses SAD,0,0,2019,0,2019,19150,11200,0,0,0,0,0,0,0.0,3.0,0.0,2.0,0,0,0,0,0,0,0.0,4.333333,0.0,3.8
4,2019-08-10,Santa Clara,Famalicão,0,2,2019,-2,2019,15350,14150,0,2,-2,0,0,0,0.0,3.333333,0.0,2.2,2,0,2,0,0,0,0.0,0.0,0.0,0.0


In [0]:
# Reduce the target column to its sign: positive values become 1, negative
# values become -1, and zeros stay 0.
df[TARGET] = np.sign(df[TARGET])

# A First Approach

In [0]:
# Feature matrix (the COL_NAMES columns) and label vector (the TARGET column).
X, y = df[COL_NAMES], df[TARGET]

In [124]:
# Fit an SGD-trained linear classifier on the complete dataset.
# NOTE(review): recent scikit-learn releases spell this loss "log_loss" —
# confirm against the installed version before re-running.
lc = SGDClassifier(
    loss="log",
    random_state=RANDOM_STATE
)
lc.fit(X=X, y=y)
# Scored on the same rows the model was fitted on, so this is training
# accuracy (truncated to a whole percent by int()).
print(f"Our accuracy is {int(lc.score(X=X, y=y)*100)}%!")

Our accuracy is 54%!


It seems we have a model in our hands that we can expect to guess the result of over half of the matches.

Do we really?

Is this a fair way of evaluating our model?
You may read **hint nr. 1.**

# A fair assessment of the performance - Cross Validation

In [0]:
# Hold out half of the rows (test_size=0.5) as a test set; the split is
# seeded with RANDOM_STATE so it is the same on every run.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=RANDOM_STATE
)

In [126]:
# Same classifier configuration as before, but fitted on the training half
# only and scored on the held-out half.
# NOTE(review): recent scikit-learn releases spell this loss "log_loss" —
# confirm against the installed version before re-running.
lc = SGDClassifier(
    loss="log",
    random_state=RANDOM_STATE 
)
lc.fit(X=X_tr, y=y_tr)
print(f"Our accuracy is {int(lc.score(X=X_te, y=y_te)*100)}%!\n A little worse than what we previously thought!")

Our accuracy is 52%!
 A little worse than what we previously thought!


# Let's try other models

In [127]:
# Decision tree with default hyper-parameters (only the seed is set), fitted
# on the training half and scored on the held-out half.
dt = DecisionTreeClassifier(random_state=RANDOM_STATE)
dt.fit(X=X_tr, y=y_tr)
print(f"Our accuracy is {int(dt.score(X=X_te, y=y_te)*100)}%!")

Our accuracy is 99%!


In [91]:
# k-nearest-neighbours classifier with default settings, fitted on the
# training half and scored on the held-out half.
knn = KNeighborsClassifier()
knn.fit(X=X_tr, y=y_tr)
print(f"Our accuracy is {int(knn.score(X=X_te, y=y_te)*100)}%!")

Our accuracy is 63%!


The DT accuracy looks pretty good.

Is there reason for concern?

Any idea on why this happened only with this model?

Why this difference in performance between the models?

Read **hint nr. 2** for an explanation

We've come up with a simple cross validation scheme, but is it good enough for this situation?

Should we take any extra care when we are dealing with data gathered at different moments in time?

Read **hint nr. 3** for an explanation