In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib import pyplot
import scipy as sp
import sklearn
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [29]:
filename = 'events.csv'
events = pd.read_csv('football-events/events.csv')
events.info()
games = pd.read_csv('football-events/ginf.csv')
shots = events[(events.event_type==1)]
shots.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941009 entries, 0 to 941008
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id_odsp        941009 non-null  object 
 1   id_event       941009 non-null  object 
 2   sort_order     941009 non-null  int64  
 3   time           941009 non-null  int64  
 4   text           941009 non-null  object 
 5   event_type     941009 non-null  int64  
 6   event_type2    214293 non-null  float64
 7   side           941009 non-null  int64  
 8   event_team     941009 non-null  object 
 9   opponent       941009 non-null  object 
 10  player         880009 non-null  object 
 11  player2        291310 non-null  object 
 12  player_in      51715 non-null   object 
 13  player_out     51738 non-null   object 
 14  shot_place     227459 non-null  float64
 15  shot_outcome   228498 non-null  float64
 16  is_goal        941009 non-null  int64  
 17  location       467067 non-nul

In [4]:
shots_prediction = shots.iloc[:,-6:]
dummies = pd.get_dummies(shots_prediction, columns=['location', 'bodypart','assist_method', 'situation'])
dummies.columns = ['is_goal', 'fast_break', 'loc_centre_box', 'loc_diff_angle_lr', 'diff_angle_left', 'diff_angle_right', 'left_side_box', 'left_side_6ybox', 'right_side_box', 'right_side_6ybox', 'close_range', 'penalty', 'outside_box', 'long_range', 'more_35y', 'more_40y', 'not_recorded', 'right_foot', 'left_foot', 'header', 'no_assist', 'assist_pass', 'assist_cross', 'assist_header', 'assist_through_ball', 'open_play', 'set_piece', 'corner', 'free_kick']
dummies.head()

Unnamed: 0,is_goal,fast_break,loc_centre_box,loc_diff_angle_lr,diff_angle_left,diff_angle_right,left_side_box,left_side_6ybox,right_side_box,right_side_6ybox,...,header,no_assist,assist_pass,assist_cross,assist_header,assist_through_ball,open_play,set_piece,corner,free_kick
0,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
13,1,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,1,0,0,0
14,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [8]:
X = dummies.iloc[:,1:]
y = dummies.iloc[:,0]
print(X.shape)
print(y.shape)

(229135, 28)
(229135,)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=1)

In [10]:
%%time
mlp = MLPClassifier(random_state=0, hidden_layer_sizes=(28,28,28,28), max_iter=2000, activation='relu')
mlp.fit(X_train, y_train)

CPU times: user 26.9 s, sys: 0 ns, total: 26.9 s
Wall time: 27.1 s


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(28, 28, 28, 28), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=2000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=0, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [11]:
mlp.score(X_train, y_train)
mlp.score(X_test, y_test)
accuracy = mlp.score(X_test, y_test)
print('The accuracy of classifying whether a shot is goal or not is {:.2f} %.'.format(accuracy*100))
y_pred = mlp.predict_proba(X_test)
predict = mlp.predict(X_test)
y_total = y_train.count()
y_positive = y_train.sum()
print('The training set contains {} examples of which {} are positives.'.format(y_total, y_positive))
auc_roc = roc_auc_score(y_test, y_pred[:,1])
print('Our MLP classifier obtains an AUC-ROC of {:.4f}.'.format(auc_roc))
auc_pr_baseline = y_positive / y_total
print('The baseline performance for AUC-PR is {:.4f}. This is what we would get by random guessing'.format(auc_pr_baseline))
auc_pr = average_precision_score(y_test, y_pred[:,1])
print('Our MLP classifier obtains an AUC-PR of {:.4f}.'.format(auc_pr))
cohen_kappa = cohen_kappa_score(y_test,predict)
print('Our classifier obtains a Cohen Kappa of {:.4f}.'.format(cohen_kappa))
MSE = sklearn.metrics.mean_squared_error(y_test, y_pred[:,1])
print('Our MLP classifier obtains a Mean Squared Error (MSE) of {:.4f}.'.format(MSE))

The accuracy of classifying whether a shot is goal or not is 91.10 %.
The training set contains 148937 examples of which 15937 are positives.
Our MLP classifier obtains an AUC-ROC of 0.8189.
The baseline performance for AUC-PR is 0.1070. This is what we would get by random guessing
Our MLP classifier obtains an AUC-PR of 0.4741.
Our classifier obtains a Cohen Kappa of 0.3533.
Our MLP classifier obtains a Mean Squared Error (MSE) of 0.0726.


In [12]:
print('Confusion Matrix:')
print(confusion_matrix(y_test,predict))
print('Report:')
print(classification_report(y_test,predict))

Confusion Matrix:
[[70771   923]
 [ 6214  2290]]
Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     71694
           1       0.71      0.27      0.39      8504

    accuracy                           0.91     80198
   macro avg       0.82      0.63      0.67     80198
weighted avg       0.90      0.91      0.89     80198



In [13]:
predictions = X_test.copy()
predictions['true_goals'] = y_test
predictions['expected_goals'] = y_pred[:,1]
predictions['difference'] = predictions['expected_goals'] - predictions['true_goals']
predictions = predictions.iloc[:,28:31]

In [42]:
m = (mlp.predict_proba(dummies.iloc[:,1:]))
m = pd.DataFrame(m,columns=["No Goal","Goal"])
m.info()
test = shots_prediction
#test = test.insert(-1,)
test = test.assign(xG = m['Goal'])
test.info()
test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229135 entries, 0 to 229134
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   No Goal  229135 non-null  float64
 1   Goal     229135 non-null  float64
dtypes: float64(2)
memory usage: 3.5 MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 229135 entries, 0 to 941006
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id_odsp        229135 non-null  object 
 1   id_event       229135 non-null  object 
 2   sort_order     229135 non-null  int64  
 3   time           229135 non-null  int64  
 4   text           229135 non-null  object 
 5   event_type     229135 non-null  int64  
 6   event_type2    168560 non-null  float64
 7   side           229135 non-null  int64  
 8   event_team     229135 non-null  object 
 9   opponent       229135 non-null  object 
 10  player         229122 non-null  object

Unnamed: 0,id_odsp,id_event,sort_order,time,text,event_type,event_type2,side,event_team,opponent,...,player_out,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break,xG
0,UFot0hit/,UFot0hit1,1,2,Attempt missed. Mladen Petric (Hamburg) left f...,1,12.0,2,Hamburg SV,Borussia Dortmund,...,,6.0,2.0,0,9.0,2.0,1,1.0,0,0.047499
11,UFot0hit/,UFot0hit12,12,14,Attempt missed. Shinji Kagawa (Borussia Dortmu...,1,12.0,1,Borussia Dortmund,Hamburg SV,...,,13.0,2.0,0,15.0,1.0,1,1.0,0,0.022749
13,UFot0hit/,UFot0hit14,14,17,"Goal! Borussia Dortmund 1, Hamburg 0. Kevin G...",1,12.0,1,Borussia Dortmund,Hamburg SV,...,,4.0,1.0,1,9.0,2.0,1,1.0,0,0.248381
14,UFot0hit/,UFot0hit15,15,19,Attempt blocked. Mats Hummels (Borussia Dortmu...,1,,1,Borussia Dortmund,Hamburg SV,...,,2.0,3.0,0,15.0,1.0,0,1.0,0,0.188227
17,UFot0hit/,UFot0hit18,18,20,Attempt blocked. Tomas Rincon (Hamburg) right ...,1,,2,Hamburg SV,Borussia Dortmund,...,,2.0,3.0,0,15.0,1.0,0,1.0,0,0.009994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
940983,z5L2OT5E/,z5L2OT5E102,102,81,Attempt blocked. Remo Freuler (Atalanta) right...,1,,1,Atalanta,Sampdoria,...,,2.0,3.0,0,15.0,1.0,0,1.0,0,
940991,z5L2OT5E/,z5L2OT5E110,110,84,Attempt missed. Alberto Grassi (Atalanta) righ...,1,12.0,1,Atalanta,Sampdoria,...,,10.0,2.0,0,15.0,1.0,1,1.0,0,
940992,z5L2OT5E/,z5L2OT5E111,111,86,Attempt saved. Alejandro Gomez (Atalanta) righ...,1,12.0,1,Atalanta,Sampdoria,...,,5.0,1.0,0,9.0,1.0,1,1.0,0,
940993,z5L2OT5E/,z5L2OT5E112,112,87,Attempt saved. Fabio Quagliarella (Sampdoria) ...,1,12.0,2,Sampdoria,Atalanta,...,,5.0,1.0,0,15.0,1.0,1,1.0,0,
