In [35]:
import pandas as pd
import numpy as np
import os
import math, random, csv

In [36]:
def convert_score_to_numerical(fulltime_scores):
    if fulltime_scores[0] > fulltime_scores[1]:
        return 1
    elif fulltime_scores[0] == fulltime_scores[1]:
        return 0
    else:
        return -1

In [37]:
data = pd.read_csv(os.getcwd() + '/Data/2015-16Season.csv')
data['target'] = data[['Hometeam Score', 'Awayteam Score']].apply(convert_score_to_numerical, axis=1)
df = pd.DataFrame(data)
df

Unnamed: 0,Match_ID,Hometeam,Awayteam,Hometeam Score,Awayteam Score,Home_Possession,Away_Possession,Home_Shots_on_target,Away_Shots_on_target,Home_Total_Shots,...,Away_Corners,Home_Offsides,Away_Offsides,Home_Yellow_Cards,Away_Yellow_Cards,Home_Red_Cards,Away_Red_Cards,Home_Fouls_Conceded,Away_Fouls_Conceded,target
0,12494,Manchester United,AFC Bournemouth,3,1,66.3,33.7,5,0,12,...,1,4,2,1,0,0,0,9,8,1
1,12485,Arsenal,Aston Villa,4,0,59.6,40.4,7,2,16,...,4,0,0,0,1,0,0,12,6,1
2,12486,Chelsea,Leicester City,1,1,54.0,46.0,4,5,17,...,6,8,0,0,0,0,0,5,12,0
3,12487,Everton,Norwich City,3,0,45.5,54.5,8,2,15,...,11,1,1,0,0,0,0,7,11,1
4,12488,Newcastle United,Tottenham Hotspur,5,1,38.5,61.5,10,5,19,...,8,1,1,1,2,1,0,7,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,12115,AFC Bournemouth,Aston Villa,0,1,58.4,41.6,2,3,11,...,3,0,0,3,4,0,0,13,13,-1
376,12117,Everton,Watford,2,2,66.6,33.4,5,5,10,...,2,4,1,1,2,0,0,7,13,0
377,12118,Leicester City,Sunderland,4,2,44.0,56.0,8,5,20,...,3,0,1,2,4,0,0,13,17,1
378,12120,Norwich City,Crystal Palace,1,3,62.9,37.1,6,7,17,...,4,4,2,1,0,0,0,14,20,-1


In [38]:
dropped_df = df.drop(columns=['Match_ID', 'Hometeam','Awayteam','Hometeam Score', 'Awayteam Score', 'target'])
dropped_df

Unnamed: 0,Home_Possession,Away_Possession,Home_Shots_on_target,Away_Shots_on_target,Home_Total_Shots,Away_Total_Shots,Home_Touches,Away_Touches,Home_Passes,Away_Pasees,...,Home_Corners,Away_Corners,Home_Offsides,Away_Offsides,Home_Yellow_Cards,Away_Yellow_Cards,Home_Red_Cards,Away_Red_Cards,Home_Fouls_Conceded,Away_Fouls_Conceded
0,66.3,33.7,5,0,12,7,677,342,677,342,...,7,1,4,2,1,0,0,0,9,8
1,59.6,40.4,7,2,16,5,609,414,609,414,...,5,4,0,0,0,1,0,0,12,6
2,54.0,46.0,4,5,17,18,525,444,525,444,...,7,6,8,0,0,0,0,0,5,12
3,45.5,54.5,8,2,15,11,382,445,382,445,...,4,11,1,1,0,0,0,0,7,11
4,38.5,61.5,10,5,19,18,351,551,351,551,...,3,8,1,1,1,2,1,0,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,58.4,41.6,2,3,11,7,548,391,548,391,...,6,3,0,0,3,4,0,0,13,13
376,66.6,33.4,5,5,10,11,558,286,558,286,...,8,2,4,1,1,2,0,0,7,13
377,44.0,56.0,8,5,20,11,342,442,342,442,...,6,3,0,1,2,4,0,0,13,17
378,62.9,37.1,6,7,17,11,470,272,470,272,...,1,4,4,2,1,0,0,0,14,20


In [39]:
def gd_vectors(scores):
    """
    Calculate the goal difference of the each game from the perspective of the
    home team, for use in form calcluations.
    """
    gd_dict = {}
    for game in scores:
        # goal difference from the perspective of the home team
        id, home_team, away_team, home_goals, away_goals = game
        score = home_goals - away_goals
        gd_dict[home_team] = gd_dict.get(home_team, []) + [(id, score)]
        gd_dict[away_team] = gd_dict.get(away_team, []) + [(id,-1 * score)]
    return gd_dict

In [40]:
scores = df[['Match_ID', 'Hometeam', 'Awayteam', 'Hometeam Score', 'Awayteam Score']].values
gd = gd_vectors(scores)

In [41]:
def exponential_momentum(matchID, team, gd_vectors, alpha, boolean = True):
    """
    Calculate an exponentially-decaying weight of a team's recent performance
    which places more emphasis on recent result.
    """
    if alpha > .69:
        raise ValueError
    avg_vec, i = [], 1
    while sum(avg_vec) < 1:
        avg_vec.append( math.e ** (-1 * (alpha * i)) )
        i += 1
    avg_vec = sorted(avg_vec)
    previous_results = get_window(matchID, team, gd_vectors, len(avg_vec), boolean)
    if not previous_results:
        return 0
    return np.dot( np.array(avg_vec), np.array(previous_results) )

def get_window(matchID, team, gd_vectors, window = 5, boolean = False):
    """
    Pull out the window length previous results for the input team. If boolean,
    then simply the win/loss values are added, otherwise goal difference is used.
    """
    team_results = gd_vectors[team]
    idx = -1
    for i, result in enumerate(team_results):
        if result[0] == matchID:
            idx = i
            break
    if idx < window - 1:
        return None
    return [ team_results[i][1] for i in range(idx - window, idx) ]

def linear_momentum(matchID, team, gd_vectors, window = 5, boolean = False):
    """
    Calcluated a linear momentum measure for a team given a history of goal
    difference. Simply, the linear sum of their previous results.
    """
    previous_results = get_window(matchID, team, gd_vectors, window, boolean)
    if not previous_results:
        return 0
    return sum(previous_results)

In [42]:
away_form_linear = []
home_form_linear = []
away_form_exp = []
home_form_exp = []
for game in scores:
    id, home_team, away_team, _, _ = game
    away_form_exp.append( exponential_momentum(id, away_team, gd, alpha = .65) )
    home_form_exp.append( exponential_momentum(id, home_team, gd, alpha = .65) )
    away_form_linear.append( linear_momentum(id, away_team, gd) )
    home_form_linear.append( linear_momentum(id, home_team, gd) )


df_form = df.copy()
df_form['away_form_exp'] = pd.Series(away_form_exp)
df_form['home_form_exp'] = pd.Series(home_form_exp)
list(df_form)

['Match_ID',
 'Hometeam',
 'Awayteam',
 'Hometeam Score',
 'Awayteam Score',
 'Home_Possession',
 'Away_Possession',
 'Home_Shots_on_target',
 'Away_Shots_on_target',
 'Home_Total_Shots',
 'Away_Total_Shots',
 'Home_Touches',
 'Away_Touches',
 'Home_Passes',
 'Away_Pasees',
 'Home_Takles',
 'Away_Takles',
 'Home_Clearance',
 'Away_Clearance',
 'Home_Corners',
 'Away_Corners',
 'Home_Offsides',
 'Away_Offsides',
 'Home_Yellow_Cards',
 'Away_Yellow_Cards',
 'Home_Red_Cards',
 'Away_Red_Cards',
 'Home_Fouls_Conceded',
 'Away_Fouls_Conceded',
 'target',
 'away_form_exp',
 'home_form_exp']

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [47]:
clfs = [LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier(),
        KNeighborsClassifier()]
df_form

Unnamed: 0,Home_Possession,Away_Possession,Home_Shots_on_target,Away_Shots_on_target,Home_Total_Shots,Away_Total_Shots,Home_Touches,Away_Touches,Home_Passes,Away_Pasees,...,Home_Offsides,Away_Offsides,Home_Yellow_Cards,Away_Yellow_Cards,Home_Red_Cards,Away_Red_Cards,Home_Fouls_Conceded,Away_Fouls_Conceded,away_form_exp,home_form_exp
0,66.3,33.7,5,0,12,7,677,342,677,342,...,4,2,1,0,0,0,9,8,0.000000,0.000000
1,59.6,40.4,7,2,16,5,609,414,609,414,...,0,0,0,1,0,0,12,6,0.000000,0.000000
2,54.0,46.0,4,5,17,18,525,444,525,444,...,8,0,0,0,0,0,5,12,0.000000,0.000000
3,45.5,54.5,8,2,15,11,382,445,382,445,...,1,1,0,0,0,0,7,11,0.000000,0.000000
4,38.5,61.5,10,5,19,18,351,551,351,551,...,1,1,1,2,1,0,7,6,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,58.4,41.6,2,3,11,7,548,391,548,391,...,0,0,3,4,0,0,13,13,-0.868851,-0.398061
376,66.6,33.4,5,5,10,11,558,286,558,286,...,4,1,1,2,0,0,7,13,-0.210275,1.169621
377,44.0,56.0,8,5,20,11,342,442,342,442,...,0,1,2,4,0,0,13,17,-1.118365,0.596319
378,62.9,37.1,6,7,17,11,470,272,470,272,...,4,2,1,0,0,0,14,20,-0.181513,0.765816


In [49]:
# df_form.drop(columns = ['target', 'Match_ID', 'Hometeam', 'Awayteam', 
#                                  'Hometeam Score', 'Awayteam Score'], inplace = True)
X = dropped_df.values
X_form = df_form.values
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

X_train_form, X_test_form, y_train_form, y_test_form = train_test_split(X_form, y, test_size = .2, random_state = 42)
X_train

df_form

Unnamed: 0,Home_Possession,Away_Possession,Home_Shots_on_target,Away_Shots_on_target,Home_Total_Shots,Away_Total_Shots,Home_Touches,Away_Touches,Home_Passes,Away_Pasees,...,Home_Offsides,Away_Offsides,Home_Yellow_Cards,Away_Yellow_Cards,Home_Red_Cards,Away_Red_Cards,Home_Fouls_Conceded,Away_Fouls_Conceded,away_form_exp,home_form_exp
0,66.3,33.7,5,0,12,7,677,342,677,342,...,4,2,1,0,0,0,9,8,0.000000,0.000000
1,59.6,40.4,7,2,16,5,609,414,609,414,...,0,0,0,1,0,0,12,6,0.000000,0.000000
2,54.0,46.0,4,5,17,18,525,444,525,444,...,8,0,0,0,0,0,5,12,0.000000,0.000000
3,45.5,54.5,8,2,15,11,382,445,382,445,...,1,1,0,0,0,0,7,11,0.000000,0.000000
4,38.5,61.5,10,5,19,18,351,551,351,551,...,1,1,1,2,1,0,7,6,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,58.4,41.6,2,3,11,7,548,391,548,391,...,0,0,3,4,0,0,13,13,-0.868851,-0.398061
376,66.6,33.4,5,5,10,11,558,286,558,286,...,4,1,1,2,0,0,7,13,-0.210275,1.169621
377,44.0,56.0,8,5,20,11,342,442,342,442,...,0,1,2,4,0,0,13,17,-1.118365,0.596319
378,62.9,37.1,6,7,17,11,470,272,470,272,...,4,2,1,0,0,0,14,20,-0.181513,0.765816


In [50]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)
X_train_form_std = sc.fit_transform(X_train_form)
X_test_form_std = sc.transform(X_test_form)

X_train_std

array([[ 0.80301396, -0.80301396,  1.23783115, ..., -0.29371501,
         1.17331744, -1.3284985 ],
       [-1.72038148,  1.72038148,  0.11230572, ..., -0.29371501,
         1.76191233, -1.03327661],
       [-0.1172832 ,  0.1172832 ,  0.48748086, ..., -0.29371501,
         1.76191233, -0.44283283],
       ...,
       [ 0.94155332, -0.94155332,  0.48748086, ..., -0.29371501,
        -1.18106211,  1.03327661],
       [-0.26571823,  0.26571823,  0.11230572, ..., -0.29371501,
        -1.769657  , -0.73805472],
       [ 1.070197  , -1.070197  , -0.26286943, ..., -0.29371501,
        -0.59246722, -0.14761094]])

In [51]:
for clf in clfs:
    clf.fit(X_train, y_train)
    print(type(clf))
    print("score = ", clf.score(X_test, y_test), "\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<class 'sklearn.linear_model._logistic.LogisticRegression'>
score =  0.6052631578947368 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
score =  0.5789473684210527 

<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
score =  0.5263157894736842 

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
score =  0.3157894736842105 



In [52]:
#With scaled variables
for clf in clfs:
    clf.fit(X_train_std, y_train)
    print(type(clf))
    print("score = ", clf.score(X_test_std, y_test), "\n")

<class 'sklearn.linear_model._logistic.LogisticRegression'>
score =  0.5789473684210527 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
score =  0.5657894736842105 

<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
score =  0.5394736842105263 

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
score =  0.5 



In [53]:
for clf in clfs:
    clf.fit(X_train_form, y_train_form)
    print(type(clf))
    print("score = ", clf.score(X_test_form, y_test_form), "\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<class 'sklearn.linear_model._logistic.LogisticRegression'>
score =  0.5921052631578947 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
score =  0.5394736842105263 

<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
score =  0.5394736842105263 

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
score =  0.3157894736842105 



In [54]:
#With scaled variables
for clf in clfs:
    clf.fit(X_train_form_std, y_train_form)
    print(type(clf))
    print("score = ", clf.score(X_test_form_std, y_test_form), "\n")

<class 'sklearn.linear_model._logistic.LogisticRegression'>
score =  0.5526315789473685 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
score =  0.5789473684210527 

<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
score =  0.5526315789473685 

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
score =  0.4868421052631579 



In [55]:
for i, feature in enumerate(list(df_form)):
    print(feature, ": ", clfs[0].coef_[:,i])

Home_Possession :  [ 0.6605366   0.18323566 -0.84377226]
Away_Possession :  [-0.6605366  -0.18323566  0.84377226]
Home_Shots_on_target :  [-0.66059672 -0.13118317  0.7917799 ]
Away_Shots_on_target :  [ 0.75386366 -0.22787062 -0.52599304]
Home_Total_Shots :  [-0.0447065  0.1404836 -0.0957771]
Away_Total_Shots :  [-0.04403824  0.31441535 -0.27037711]
Home_Touches :  [-0.31597381 -0.08977266  0.40574647]
Away_Touches :  [ 0.54411081 -0.04092956 -0.50318124]
Home_Passes :  [-0.31597381 -0.08977266  0.40574647]
Away_Pasees :  [ 0.54411081 -0.04092956 -0.50318124]
Home_Takles :  [-0.14824477  0.14628812  0.00195664]
Away_Takles :  [ 0.37956562 -0.2922608  -0.08730482]
Home_Clearance :  [-0.34408521  0.08206291  0.2620223 ]
Away_Clearance :  [ 0.64038476 -0.10954265 -0.53084211]
Home_Corners :  [ 0.1957412   0.05663908 -0.25238028]
Away_Corners :  [ 0.09114902  0.0099698  -0.10111882]
Home_Offsides :  [ 0.00104089 -0.0536704   0.05262951]
Away_Offsides :  [ 0.03705912 -0.08239441  0.04533528]

In [58]:
features_to_drop = ['Home_Total_Shots', 'Away_Total_Shots', 'Home_Touches', 
                    'Away_Touches', 'Home_Possession', 'Away_Possession',
                    'Home_Takles', 'Away_Takles', 'Home_Corners',
                    'Away_Corners','Home_Red_Cards', 'Away_Red_Cards',
                    'Home_Yellow_Cards', 'Away_Yellow_Cards']
df_sub = df_form.drop(columns = features_to_drop)
print(list(df_sub))

['Home_Shots_on_target', 'Away_Shots_on_target', 'Home_Passes', 'Away_Pasees', 'Home_Clearance', 'Away_Clearance', 'Home_Offsides', 'Away_Offsides', 'Home_Fouls_Conceded', 'Away_Fouls_Conceded', 'away_form_exp', 'home_form_exp']


In [59]:
X_sub = df_sub.values
X_train, X_test, y_train, y_test = train_test_split(X_sub, y, test_size = .2, random_state = 42)

for clf in clfs:
    clf.fit(X_train, y_train)
    print(type(clf))
    print("score = ", clf.score(X_test, y_test), "\n")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<class 'sklearn.linear_model._logistic.LogisticRegression'>
score =  0.5657894736842105 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
score =  0.5921052631578947 

<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
score =  0.631578947368421 

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
score =  0.35526315789473684 



In [60]:
for i, feature in enumerate(list(df_sub)):
    print(feature, ": ", clfs[0].coef_[:,i])

Home_Shots_on_target :  [-0.23949301 -0.06662919  0.3061222 ]
Away_Shots_on_target :  [ 0.22500482  0.01298334 -0.23798816]
Home_Passes :  [-0.00052363  0.0003017   0.00022193]
Away_Pasees :  [ 0.00080086 -0.00146317  0.00066231]
Home_Clearance :  [-0.04037736  0.0145321   0.02584527]
Away_Clearance :  [ 0.04931735  0.00407341 -0.05339076]
Home_Offsides :  [-0.02104403 -0.00159495  0.02263898]
Away_Offsides :  [ 0.00697467 -0.037107    0.03013234]
Home_Fouls_Conceded :  [-0.06379742  0.04841378  0.01538364]
Away_Fouls_Conceded :  [ 0.01416711 -0.00737148 -0.00679563]
away_form_exp :  [ 0.02239116  0.01971346 -0.04210462]
home_form_exp :  [-0.02675422 -0.01953561  0.04628983]


In [61]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(df_sub.values)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = .2, random_state = 42)

for clf in clfs:
    clf.fit(X_train, y_train)
    print(type(clf))
    print("score = ", clf.score(X_test, y_test), "\n")

<class 'sklearn.linear_model._logistic.LogisticRegression'>
score =  0.6052631578947368 

<class 'sklearn.ensemble._forest.RandomForestClassifier'>
score =  0.631578947368421 

<class 'sklearn.ensemble._gb.GradientBoostingClassifier'>
score =  0.6447368421052632 

<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
score =  0.4473684210526316 



In [62]:
forest = RandomForestClassifier(n_estimators=500,random_state=42)
forest.fit(X_train_form, y_train)

RandomForestClassifier(n_estimators=500, random_state=42)

In [63]:
features = df_form.columns
importances = forest.feature_importances_

indices = np.argsort(importances)[::-1]
indices
for f in range(X_train_form.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,features[indices[f]], importances[indices[f]]))

 1) Away_Clearance                 0.074864
 2) Home_Shots_on_target           0.067475
 3) Away_Shots_on_target           0.067385
 4) home_form_exp                  0.057397
 5) Home_Clearance                 0.053534
 6) Away_Total_Shots               0.050393
 7) Home_Total_Shots               0.049102
 8) Away_Pasees                    0.044302
 9) Away_Touches                   0.044166
10) away_form_exp                  0.042208
11) Away_Takles                    0.040015
12) Home_Passes                    0.039141
13) Home_Possession                0.037564
14) Home_Takles                    0.037535
15) Home_Corners                   0.037242
16) Home_Touches                   0.036711
17) Away_Possession                0.035232
18) Away_Fouls_Conceded            0.032896
19) Home_Fouls_Conceded            0.032731
20) Away_Corners                   0.027745
21) Away_Offsides                  0.023463
22) Away_Yellow_Cards              0.020159
23) Home_Offsides               

In [65]:
from SBS import *

ModuleNotFoundError: No module named 'SBS'