In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib import pyplot
import scipy as sp
import sklearn
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Widen the notebook container so wide DataFrames and reports are not clipped.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Load the raw event log and the per-game metadata from the local dataset folder.
events = pd.read_csv('football-events/events.csv')
games = pd.read_csv('football-events/ginf.csv')

# Keep only the rows whose event_type is 1; these are the rows treated as
# shots in the rest of the notebook.
is_shot = events['event_type'] == 1
shots = events[is_shot]

# Data Validation

In [4]:
# Validate team naming: within a single game (id_odsp) each side must map to
# exactly one distinct event_team value.
# The count is checked for EVERY (game, side) group. Looking only at the first
# distinct count (e.g. `.unique()[0] == 1`) would let a mix such as [1, 2]
# pass silently.
teams_per_game_side = events.groupby(['id_odsp', 'side'])['event_team'].nunique()
assert teams_per_game_side.eq(1).all(), (
    'Found (id_odsp, side) groups without exactly one team name:\n'
    '{}'.format(teams_per_game_side[teams_per_game_side.ne(1)])
)

# Data preparation and MLP instantiation

In [5]:
# Take the last six columns of the shots frame as the modelling inputs, then
# one-hot encode the four categorical ones.
shots_prediction = shots.iloc[:, -6:]
dummies = pd.get_dummies(
    shots_prediction,
    columns=['location', 'bodypart', 'assist_method', 'situation'],
)

# Positional rename: the list must match the encoded frame column-for-column.
# get_dummies keeps the non-encoded columns first and appends the encoded
# groups in the order given to `columns`.
# NOTE(review): the grouping below is inferred from the names and that
# ordering -- confirm against the dataset's code book.
dummies.columns = [
    # columns that were not encoded
    'is_goal', 'fast_break',
    # presumably the location categories
    'loc_centre_box', 'loc_diff_angle_lr', 'diff_angle_left', 'diff_angle_right',
    'left_side_box', 'left_side_6ybox', 'right_side_box', 'right_side_6ybox',
    'close_range', 'penalty', 'outside_box', 'long_range', 'more_35y', 'more_40y',
    'not_recorded',
    # presumably the bodypart categories
    'right_foot', 'left_foot', 'header',
    # presumably the assist_method categories
    'no_assist', 'assist_pass', 'assist_cross', 'assist_header', 'assist_through_ball',
    # presumably the situation categories
    'open_play', 'set_piece', 'corner', 'free_kick',
]

In [6]:
# The first column of `dummies` is the target; every remaining column is a feature.
y = dummies.iloc[:, 0]
X = dummies.iloc[:, 1:]

In [7]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=1)
mlp = MLPClassifier(random_state=0, hidden_layer_sizes=(28, 28, 28, 28), max_iter=2000, activation='relu')
mlp.fit(X_train, y_train)

Wall time: 31.2 s


MLPClassifier(hidden_layer_sizes=(28, 28, 28, 28), max_iter=2000,
              random_state=0)

# Confusion Matrix and metrics analysis

In [8]:
# Predict hard class labels for the held-out rows, then print the confusion
# matrix followed by the per-class precision / recall / F1 report.
predict = mlp.predict(X_test)
print('Confusion Matrix:', confusion_matrix(y_test, predict), sep='\n')
print('Report:', classification_report(y_test, predict), sep='\n')

Confusion Matrix:
[[70771   923]
 [ 6214  2290]]
Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     71694
           1       0.71      0.27      0.39      8504

    accuracy                           0.91     80198
   macro avg       0.82      0.63      0.67     80198
weighted avg       0.90      0.91      0.89     80198



In [9]:
# Mean accuracy on the held-out rows. Read it together with the classification
# report: the classes are heavily imbalanced (71694 vs 8504 test rows), so
# accuracy alone is a weak indicator of how well goals are identified.
accuracy = mlp.score(X_test, y_test)
accuracy_message = 'The accuracy of classifying whether a shot is goal or not is {:.2f} %.'
print(accuracy_message.format(accuracy * 100))

The accuracy of classifying whether a shot is goal or not is 91.10 %.


# Join xG with original dataset

In [10]:
# xG of a shot = predicted probability of the second class (column 1 of
# predict_proba), i.e. class 1 in the report above.
# Every row of X is scored, so rows that were part of X_train are included.
# `dummies` is reduced to a single 'xG' column that keeps the original row index.
dummies = pd.DataFrame({'xG': mlp.predict_proba(X)[:, 1]}, index=dummies.index)

# Attach xG to the shot rows (index-aligned join), then total it per game and side.
dataWithXG = shots.join(dummies)
matchXG = (
    dataWithXG
    .groupby(['id_odsp', 'side'])['xG']
    .sum()
    .reset_index()
)

In [11]:
# Show the summed xG per (id_odsp, side); the rendered table is truncated for long frames.
matchXG

Unnamed: 0,id_odsp,side,xG
0,004f4ING/,1,1.413347
1,004f4ING/,2,0.453106
2,00LMl81F/,1,2.949324
3,00LMl81F/,2,1.608309
4,00OX4xFp/,1,1.166929
...,...,...,...
18139,zyKwAQxf/,2,1.218277
18140,zyrHmI8P/,1,2.183775
18141,zyrHmI8P/,2,1.008775
18142,zyzdxP10/,1,1.587103
