In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib import pyplot
import scipy as sp
import sklearn
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import cohen_kappa_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
# Widen the notebook container to the full browser width so wide tables fit.
# `display` and `HTML` are imported from the public `IPython.display` module;
# the `IPython.core.display` path is deprecated for these names.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [20]:
# Load the event log and the per-game information table.
events = pd.read_csv('football-events/events.csv')
games = pd.read_csv('football-events/ginf.csv')

# Keep only the rows whose event_type is 1 (treated as shots in this notebook).
is_shot = events['event_type'] == 1
shots = events.loc[is_shot]

# Data Validation

In [5]:
# Check that, within each game, every side maps to exactly one team name.
# The comparison is applied to every (game, side) group; inspecting only the
# first distinct count would let inconsistent groups slip through unnoticed.
teams_per_side = events.groupby(['id_odsp', 'side'])['event_team'].nunique()
assert (teams_per_side == 1).all(), (
    "{} (game, side) pairs do not map to exactly one team name".format(
        int((teams_per_side != 1).sum())
    )
)

# Data preparation and MLP instantiation

In [6]:
# The model inputs are taken from the last six columns of the shots table.
shots_prediction = shots.iloc[:, -6:]

# One-hot encode the categorical shot descriptors.
categorical_columns = ['location', 'bodypart', 'assist_method', 'situation']
dummies = pd.get_dummies(shots_prediction, columns=categorical_columns)

# Readable column names, assigned purely by position.
# NOTE(review): this relies on the encoded frame having exactly these 29
# columns in this order -- confirm against the dataset's category codes.
dummies.columns = [
    'is_goal', 'fast_break',
    'loc_centre_box', 'loc_diff_angle_lr', 'diff_angle_left', 'diff_angle_right',
    'left_side_box', 'left_side_6ybox', 'right_side_box', 'right_side_6ybox',
    'close_range', 'penalty', 'outside_box', 'long_range', 'more_35y', 'more_40y',
    'not_recorded',
    'right_foot', 'left_foot', 'header',
    'no_assist', 'assist_pass', 'assist_cross', 'assist_header', 'assist_through_ball',
    'open_play', 'set_piece', 'corner', 'free_kick',
]

In [7]:
# The first column of `dummies` is the target; every remaining column is a feature.
target_column = dummies.columns[0]
feature_columns = dummies.columns[1:]
X = dummies[feature_columns]
y = dummies[target_column]

In [8]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=1)
mlp = MLPClassifier(random_state=0, hidden_layer_sizes=(28, 28, 28, 28), max_iter=2000, activation='relu')
mlp.fit(X_train, y_train)

CPU times: user 26.8 s, sys: 13.2 ms, total: 26.9 s
Wall time: 27 s


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(28, 28, 28, 28), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=2000,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=0, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

# Confusion Matrix and metrics analysis

In [9]:
# Predict on the held-out shots, then print each summary under its heading.
predict = mlp.predict(X_test)
for heading, summary in (
    ('Confusion Matrix:', confusion_matrix(y_test, predict)),
    ('Report:', classification_report(y_test, predict)),
):
    print(heading)
    print(summary)

Confusion Matrix:
[[70771   923]
 [ 6214  2290]]
Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     71694
           1       0.71      0.27      0.39      8504

    accuracy                           0.91     80198
   macro avg       0.82      0.63      0.67     80198
weighted avg       0.90      0.91      0.89     80198



In [10]:
# Score the fitted model on the held-out split and report it as a percentage.
accuracy = mlp.score(X_test, y_test)
accuracy_pct = accuracy * 100
print('The accuracy of classifying whether a shot is goal or not is {:.2f} %.'.format(accuracy_pct))

The accuracy of classifying whether a shot is goal or not is 91.10 %.


# Join xG with original dataset

In [11]:
# xG is the model's predicted probability of the positive class (column 1 of
# predict_proba) for every shot, indexed like the encoded shots frame.
# After this cell `dummies` holds only the xG column.
dummies = pd.DataFrame({'xG': mlp.predict_proba(X)[:, 1]}, index=dummies.index)

# Attach xG to the original shot rows (index-aligned join).
dataWithXG = shots.join(dummies[['xG']])

# Sum xG per game and side.
xg_by_match_side = dataWithXG.groupby(['id_odsp', 'side'])['xG'].sum()
matchXG = xg_by_match_side.reset_index()

In [12]:
# Display the summed xG per game (id_odsp) and side.
matchXG

Unnamed: 0,id_odsp,side,xG
0,004f4ING/,1,1.413347
1,004f4ING/,2,0.453106
2,00LMl81F/,1,2.949324
3,00LMl81F/,2,1.608309
4,00OX4xFp/,1,1.166929
...,...,...,...
18139,zyKwAQxf/,2,1.218277
18140,zyrHmI8P/,1,2.183775
18141,zyrHmI8P/,2,1.008775
18142,zyzdxP10/,1,1.587103


#### TODO: Insert the number of attempts, corners, fouls, first yellow cards, second yellow cards, red cards, free kicks and offsides

## Inserting the other dependent variables

In [88]:
def _count_events_by_side(events_df, event_type, count_name):
    """Count the events of one type per game and side.

    Parameters
    ----------
    events_df : pandas.DataFrame
        Event log. Must contain the columns ``event_type``, ``id_odsp``,
        ``side`` and ``id_event``.
    event_type : int
        Value of ``event_type`` selecting the rows to count.
    count_name : str
        Name given to the count column of the cleaned result.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(filtered, counted, clean)`` where ``filtered`` is a copy of the
        matching rows, ``counted`` holds the per-column non-null counts for
        each (id_odsp, side) group, and ``clean`` keeps only ``id_odsp``,
        ``side`` and the ``id_event`` count renamed to ``count_name``.
    """
    filtered = events_df[events_df.event_type == event_type].copy()
    counted = filtered.groupby(['id_odsp', 'side']).count().reset_index()
    # `count()` counts non-null cells per column; the id_event count is used as
    # the number of events (presumably id_event is never null -- TODO confirm).
    clean = counted[['id_odsp', 'side', 'id_event']].rename(columns={'id_event': count_name})
    return filtered, counted, clean


# One call per event type. Every name bound by the original cell is still
# bound here. The attempts stanza previously read `shotsSorted`, which is not
# defined anywhere in the notebook and fails on a fresh kernel; the counts are
# now derived from `attempts` itself.
attempts, attemptsSorted, shotsClean = _count_events_by_side(events, 1, "shot_count")
corners, cornersSorted, cornersClean = _count_events_by_side(events, 2, "corner_count")
fouls, foulsSorted, foulsClean = _count_events_by_side(events, 3, "foul_count")
yellow_cards, ycSorted, ycClean = _count_events_by_side(events, 4, "yellow_card_count")
second_yellow_cards, sycSorted, sycClean = _count_events_by_side(events, 5, "second_yellow_card_count")
red_cards, redCardsSorted, redCardsClean = _count_events_by_side(events, 6, "red_card_count")
free_kicks, fkSorted, fkClean = _count_events_by_side(events, 8, "free_kick_count")
offsides, offsidesSorted, offsidesClean = _count_events_by_side(events, 9, "offside_count")



In [89]:
# Display the offside counts per game (id_odsp) and side.
offsidesClean

Unnamed: 0,id_odsp,side,offside_count
0,004f4ING/,2,3
1,00LMl81F/,2,6
2,00OX4xFp/,1,3
3,00QH2XdM/,2,2
4,00QL4t1L/,1,3
...,...,...,...
15645,zyKwAQxf/,2,2
15646,zyrHmI8P/,1,1
15647,zyrHmI8P/,2,3
15648,zyzdxP10/,1,3
