In [40]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
football = pd.read_csv('england-clean.csv')
football.head()

Unnamed: 0,month,day,season,home,visitor,hgoal,vgoal
0,9,9,1986,23,48,2,0
1,9,9,1986,44,14,0,1
2,9,9,1986,52,25,0,0
3,9,9,1986,27,7,3,1
4,9,9,1986,35,2,2,0


In [3]:
def get_match_result(xg_home, xg_visitor):
    diff = abs(xg_home - xg_visitor)
    if diff < 0.2:
        return 0 #Draw
    else:
        if xg_home > xg_visitor:
            return 1 #Local wins
    return 2 #Local loses

In [4]:
match_results = [get_match_result(hgoal, vgoal) for hgoal, vgoal in zip(football.hgoal, football.vgoal)]

In [7]:
football['match_result'] = match_results
football.head()

Unnamed: 0,month,day,season,home,visitor,hgoal,vgoal,match_result
0,9,9,1986,23,48,2,0,1
1,9,9,1986,44,14,0,1,2
2,9,9,1986,52,25,0,0,0
3,9,9,1986,27,7,3,1,1
4,9,9,1986,35,2,2,0,1


In [8]:
X = football.loc[:,'month':'visitor'].to_numpy()

In [9]:
X.shape

(13338, 5)

In [43]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [44]:
y_hgoal = football.hgoal.to_numpy()
y_vgoal = football.vgoal.to_numpy()

In [45]:
tcsv = TimeSeriesSplit(n_splits=10)
for train_index, test_index in tcsv.split(X):
    #print(f'Train idx: {train_index}, Test idx: {test_index}')
    X_train, X_test = X[train_index], X[test_index]
    y_hgoal_train, y_hgoal_test = y_hgoal[train_index], y_hgoal[test_index]
    y_vgoal_train, y_vgoal_test = y_vgoal[train_index], y_vgoal[test_index]
    reg_home = LinearRegression().fit(X_train, y_hgoal_train)
    reg_visitor = LinearRegression().fit(X_train, y_vgoal_train)
    y_real = football.iloc[test_index, -1]
    y_home_pred = reg_home.predict(X_test)
    y_visitor_pred = reg_visitor.predict(X_test)
    y_pred = [get_match_result(hgoal, vgoal) for hgoal, vgoal in zip(y_home_pred, y_visitor_pred)]
    print(classification_report(y_real, y_pred, labels=[0,1,2]))

              precision    recall  f1-score   support

           0       0.27      0.55      0.36       334
           1       0.50      0.11      0.18       572
           2       0.25      0.32      0.28       306

    accuracy                           0.29      1212
   macro avg       0.34      0.33      0.27      1212
weighted avg       0.37      0.29      0.26      1212

              precision    recall  f1-score   support

           0       0.35      0.07      0.12       353
           1       0.45      0.95      0.61       538
           2       0.00      0.00      0.00       321

    accuracy                           0.44      1212
   macro avg       0.27      0.34      0.24      1212
weighted avg       0.30      0.44      0.31      1212

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       342
           1       0.46      1.00      0.63       556
           2       0.00      0.00      0.00       314

    accuracy        

In [46]:
for i in range(len(y_home_pred)):
    print(y_home_pred[i], '---', y_visitor_pred[i])

986648440988 --- 1.1300056531542544
1.51078650402612 --- 1.1377320819740748
1.4941529509942255 --- 1.1486007250299315
1.5436692980685658 --- 1.1314529559308284
1.5614233709939853 --- 1.149800419123618
1.5515537767653282 --- 1.1441078840551513
1.5206481006218033 --- 1.1520011528065055
1.535752828839909 --- 1.134549483362362
1.5212003629355921 --- 1.1595607416947926
1.544069605355856 --- 1.1291151618344337
1.5383221972174865 --- 1.1486920986389362
1.5441895698504602 --- 1.144998375374972
1.5072965565793575 --- 1.140702153849752
1.558862000249381 --- 1.1348937680830797
1.5102662322442255 --- 1.1437179125299315
1.535304535754777 --- 1.1395183670425415
1.5265474637866714 --- 1.1520872239866848
1.5319505480685658 --- 1.1519607684308284
1.50053259777612 --- 1.1572633319740748
1.5397230599036977 --- 1.139179384750649
1.565801906978038 --- 1.1435159906515462
1.522056960940988 --- 1.1436775281542544
1.560286855834513 --- 1.1289483219029002
1.543749274398302 --- 1.1521329107911873
1.5289328866058

In [47]:
df = pd.DataFrame({'h':y_home_pred, 'v':y_visitor_pred})
df.head()df = pd.DataFrame({'h':y_home_pred, 'v':y_visitor_pred})
df.head()
df.describe()
df.describe()

Unnamed: 0,h,v
count,1212.0,1212.0
mean,1.545769,1.172013
std,0.020511,0.030298
min,1.48923,1.109118
25%,1.530969,1.144039
50%,1.545538,1.175595
75%,1.560695,1.198052
max,1.603416,1.234336


In [48]:
df = pd.DataFrame({'h':football.hgoal, 'v':football.vgoal})
df.describe()

Unnamed: 0,h,v
count,13338.0,13338.0
mean,1.526691,1.116059
std,1.290898,1.113097
min,0.0,0.0
25%,1.0,0.0
50%,1.0,1.0
75%,2.0,2.0
max,9.0,9.0
