In [37]:
import pandas as pd
import numpy as np
import matplotlib as plt
from matplotlib import pyplot
import scipy as sp
import seaborn as sb
import sklearn
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,mean_squared_error, mean_absolute_error
from sklearn.metrics import cohen_kappa_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

In [2]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing them
# from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Load the raw event log and the per-game information table.
events = pd.read_csv('football-events/events.csv')
games = pd.read_csv('football-events/ginf.csv')

# Keep only the rows whose event_type is 1 (a later cell counts these as shots).
isShot = events['event_type'] == 1
shots = events[isShot]

# Data Validation

In [4]:
# Check that every (game, side) pair is associated with exactly one team name.
# Every group is checked, not just the first distinct count, so a single
# inconsistent game anywhere in the data makes the assertion fail.
teamsPerSide = events.groupby(['id_odsp', 'side'])['event_team'].nunique()
assert (teamsPerSide == 1).all(), 'some (id_odsp, side) pairs do not map to exactly one event_team'

# Data preparation and MLP instantiation

In [5]:
# The last six columns of the shot rows are the target plus the shot descriptors.
shots_prediction = shots.iloc[:, -6:]

# One-hot encode the four categorical descriptors.
categoricalColumns = ['location', 'bodypart', 'assist_method', 'situation']
dummies = pd.get_dummies(shots_prediction, columns=categoricalColumns)

# Readable column names, assigned purely by position: the list must stay in the
# exact order that get_dummies produced. The grouping below follows the names;
# presumably each group corresponds to one encoded column -- TODO confirm against
# the dataset dictionary.
passthroughNames = ['is_goal', 'fast_break']
locationNames = [
    'loc_centre_box', 'loc_diff_angle_lr', 'diff_angle_left', 'diff_angle_right',
    'left_side_box', 'left_side_6ybox', 'right_side_box', 'right_side_6ybox',
    'close_range', 'penalty', 'outside_box', 'long_range',
    'more_35y', 'more_40y', 'not_recorded',
]
bodypartNames = ['right_foot', 'left_foot', 'header']
assistNames = ['no_assist', 'assist_pass', 'assist_cross', 'assist_header', 'assist_through_ball']
situationNames = ['open_play', 'set_piece', 'corner', 'free_kick']
dummies.columns = passthroughNames + locationNames + bodypartNames + assistNames + situationNames

In [6]:
# Summarise dtypes and non-null counts of the six columns selected for modelling.
shots_prediction.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229135 entries, 0 to 941006
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   is_goal        229135 non-null  int64  
 1   location       229135 non-null  float64
 2   bodypart       229135 non-null  float64
 3   assist_method  229135 non-null  int64  
 4   situation      229135 non-null  float64
 5   fast_break     229135 non-null  int64  
dtypes: float64(3), int64(3)
memory usage: 12.2 MB


In [7]:
# The first column is the target; every remaining column is a model input.
y = dummies.iloc[:, 0]
X = dummies.iloc[:, 1:]

In [8]:
%%time
# Hold out 35% of the rows for evaluation; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.35, random_state=1
)

# Four hidden layers of 28 units each with ReLU activation and a seeded initialisation.
mlp = MLPClassifier(
    hidden_layer_sizes=(28,) * 4,
    activation='relu',
    max_iter=2000,
    random_state=0,
)
mlp.fit(X_train, y_train)

Wall time: 32.5 s


MLPClassifier(hidden_layer_sizes=(28, 28, 28, 28), max_iter=2000,
              random_state=0)

# Confusion Matrix and metrics analysis

In [9]:
# Classify the held-out rows, then summarise the results per class.
predict = mlp.predict(X_test)
testConfusion = confusion_matrix(y_test, predict)
testReport = classification_report(y_test, predict)

print('Confusion Matrix:')
print(testConfusion)
print('Report:')
print(testReport)

Confusion Matrix:
[[70771   923]
 [ 6214  2290]]
Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     71694
           1       0.71      0.27      0.39      8504

    accuracy                           0.91     80198
   macro avg       0.82      0.63      0.67     80198
weighted avg       0.90      0.91      0.89     80198



In [10]:
# Mean accuracy of the classifier on the held-out rows, reported as a percentage.
accuracy = mlp.score(X_test, y_test)
accuracyMessage = 'The accuracy of classifying whether a shot is goal or not is {:.2f} %.'
print(accuracyMessage.format(accuracy * 100))

The accuracy of classifying whether a shot is goal or not is 91.10 %.


# Join xG with original dataset

In [11]:
# The predicted probability of the second class (column 1 of predict_proba) is used as xG.
goalProbability = mlp.predict_proba(X)[:, 1]
dummies = dummies.assign(xG=goalProbability)[['xG']].copy()

# Attach xG to the shot rows by index, then total it per game and side.
dataWithXG = shots.join(dummies[['xG']])
matchXG = (
    dataWithXG
    .groupby(['id_odsp', 'side'])['xG']
    .sum()
    .reset_index()
)

## Inserting the other dependent variables

In [12]:
def _events_per_side(selectedEvents):
    """Count the non-null values of every column per (id_odsp, side) group."""
    return selectedEvents.groupby(['id_odsp', 'side']).count().reset_index()


def _as_count(perSide, countName):
    """Keep the grouping keys plus the `id_event` count, renamed to `countName`."""
    return perSide[['id_odsp', 'side', 'id_event']].rename(columns={"id_event": countName})


# Each stanza filters the events of one event_type, counts them per game and
# side, and keeps that count under a descriptive column name.

# Number of shots
attempts = events[events.event_type == 1]
shotsSorted = _events_per_side(attempts)
shotsClean = _as_count(shotsSorted, "shot_count")

# Number of corners
corners = events[events.event_type == 2]
cornersSorted = _events_per_side(corners)
cornersClean = _as_count(cornersSorted, "corner_count")

# Number of fouls
fouls = events[events.event_type == 3]
foulsSorted = _events_per_side(fouls)
foulsClean = _as_count(foulsSorted, "foul_count")

# Number of yellow cards
yellow_cards = events[events.event_type == 4]
ycSorted = _events_per_side(yellow_cards)
ycClean = _as_count(ycSorted, "yellow_card_count")

# Number of second yellow cards
second_yellow_cards = events[events.event_type == 5]
sycSorted = _events_per_side(second_yellow_cards)
sycClean = _as_count(sycSorted, "second_yellow_card_count")

# Number of red cards
red_cards = events[events.event_type == 6]
redCardsSorted = _events_per_side(red_cards)
redCardsClean = _as_count(redCardsSorted, "red_card_count")

# Number of freekicks
free_kicks = events[events.event_type == 8]
fkSorted = _events_per_side(free_kicks)
fkClean = _as_count(fkSorted, "free_kick_count")

# Number of offsides
offsides = events[events.event_type == 9].copy()
offsidesSorted = _events_per_side(offsides)
offsidesClean = _as_count(offsidesSorted, "offside_count")



In [13]:
# Join the per-side counts onto the per-side xG totals.
joinColumns = ['id_odsp', 'side']

fullDataset = matchXG

# Left joins keep every (game, side) row that has an xG total; sides with no
# event of a given type get NaN here and are zero-filled further down.
# NOTE(review): foulsClean is built in the previous cell but is not merged
# here -- confirm whether leaving the foul count out is intentional.
countFrames = (
    shotsClean,
    cornersClean,
    ycClean,
    sycClean,
    redCardsClean,
    fkClean,
    offsidesClean,
)
for countFrame in countFrames:
    fullDataset = fullDataset.merge(countFrame, on=joinColumns, how='left')

# Attach the team name recorded for each (game, side) pair.
teamPerSide = events[['id_odsp', 'side', 'event_team']].drop_duplicates()
fullDataset = fullDataset.merge(teamPerSide, on=joinColumns, how='left')
fullDataset = fullDataset.fillna(0)

# The joins and the zero-fill leave the counts as floats; turn them back into integers.
countColumns = [name for name in fullDataset.columns if name.endswith('_count')]
fullDataset = fullDataset.astype({name: int for name in countColumns})





In [14]:
# Build one row per (game, side) holding the goal total: `fthg` is stored
# under side 1 and `ftag` under side 2.
homeGoals = games[['id_odsp', 'fthg']].rename(columns={'fthg':'goals'})
homeGoals.insert(2, 'side', 1)
awayGoals = games[['id_odsp', 'ftag']].rename(columns={'ftag':'goals'})
awayGoals.insert(2, 'side', 2)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (row-wise, original index labels kept).
totalGoals = pd.concat([homeGoals, awayGoals])
fullDataset = fullDataset.merge(totalGoals, on=joinColumns, how='left')

In [15]:
# Code of the opposing side for every row: side 1 faces 2, any other value faces 1.
fullDataset['other_side'] = fullDataset['side'].eq(1).map({True: 2, False: 1})

In [16]:
# Re-key every row's own team under `other_side`, so that joining on
# (id_odsp, other_side) pulls in the team name stored for the opposite side.
opponentRenames = {'event_team': 'other_event_team', 'side': 'other_side'}
teamNames = fullDataset[['id_odsp', 'event_team', 'side']].rename(columns=opponentRenames)
fullDataset = (
    fullDataset
    .merge(teamNames, on=['id_odsp', 'other_side'], how='left')
    .drop(columns=['other_side'])
)

# Analysing Data

# Regression

## Data preparation

In [17]:
# Encode the team names as integers. The built-in hash() is salted per
# interpreter process for strings, so it produces different codes after every
# kernel restart; pd.util.hash_pandas_object is deterministic, which keeps the
# encoded features reproducible across runs.
for teamColumn in ('event_team', 'other_event_team'):
    teamHashes = pd.util.hash_pandas_object(fullDataset[teamColumn], index=False)
    # The hashes are unsigned 64-bit; reinterpret them as signed 64-bit integers
    # (wrapping on overflow) so the column keeps a signed integer dtype.
    fullDataset[teamColumn] = teamHashes.to_numpy().astype('int64')

# Every column except the target and the game identifier is a candidate feature.
colunas = list(fullDataset.columns)
colunas.remove('goals')
colunas.remove('id_odsp')
fullX = fullDataset[colunas]
# Flatten the single-column target into a 1-D array.
fullY = fullDataset[['goals']].values.ravel()

In [18]:
from itertools import combinations

# Enumerate every non-empty subset of the feature columns as a list of names:
# subsets are grouped by size (smallest first) and sorted within each size.
columnList = [
    list(subset)
    for size in range(1, len(fullX.columns) + 1)
    for subset in sorted(combinations(fullX.columns, size))
]

## Finder Function

In [19]:
def finder(model):
    """Score `model` on every column subset in `columnList`.

    For each subset the notebook-level `fullX`/`fullY` are split 65/35 with a
    fixed seed, `model` is refit on the training part and scored on the held-out
    part. The same `model` object is reused (refit in place) for every subset.

    Parameters
    ----------
    model : object exposing `fit(X, y)` and `score(X, y)`.

    Returns
    -------
    list of (score, columns) tuples, sorted in descending order.
    """
    results = []
    for i,entry in enumerate(columnList):
        # '\r' rewrites the same output line instead of printing one line per subset.
        print(f'\rTrying for {i} out of {len(columnList)-1}', end='')
        fullXTrain, fullXTest, fullYTrain, fullYTest = train_test_split(fullX[entry], fullY, test_size=0.35, random_state=1)
        model.fit(fullXTrain, fullYTrain)
        results.append((model.score(fullXTest, fullYTest), entry))
    results.sort(reverse=True)
    # Hand the ranking back to the caller; a bare `results` expression here
    # would be evaluated and discarded, leaving the function returning None.
    return results

In [43]:
def adjusted_r2(r2, mod):
    """Return R^2 adjusted for the number of columns in `mod`.

    Computes 1 - (1 - r2) * (n - 1) / (n - p - 1), where `n` is `mod.shape[0]`
    and `p` is `mod.shape[1]`.

    Parameters
    ----------
    r2 : float
        Unadjusted coefficient of determination.
    mod : object with a two-element `shape`
        Its row and column counts supply `n` and `p`.
    """
    rowCount, columnCount = mod.shape[0], mod.shape[1]
    penalty = (rowCount - 1) / (rowCount - columnCount - 1)
    return 1 - (1 - r2) * penalty

## Models

### Random Forest

In [54]:
%%time
# Fit on the match-level features without the two team-identifier columns.
fullXTrain, fullXTest, fullYTrain, fullYTest = train_test_split(fullX.drop(columns=['event_team', 'other_event_team']), fullY, test_size=0.35, random_state=1)
# A forest is built from random bootstrap samples; seeding it makes the
# reported metrics reproducible from one run to the next.
randForest = RandomForestRegressor(max_depth=5, random_state=0)
randForest.fit(fullXTrain, fullYTrain)

# Evaluate on the held-out rows.
predict = randForest.predict(fullXTest)
r2 = randForest.score(fullXTest, fullYTest)
absErr = mean_absolute_error(fullYTest,predict)
sqErr = mean_squared_error(fullYTest,predict)
adjR2 = adjusted_r2(r2, fullXTest)

print(f'{r2=} {adjR2=} {absErr=} {sqErr=}')

r2=0.375093756646799 adjR2=0.3742068056626989 absErr=0.7649132991508495 sqErr=0.9584023175271174
Wall time: 1.04 s


### Multi-layer Perceptron

In [47]:
%%time
# Fit on the match-level features without the two team-identifier columns.
fullXTrain, fullXTest, fullYTrain, fullYTest = train_test_split(fullX.drop(columns=['other_event_team','event_team']), fullY, test_size=0.35, random_state=1)
# Weight initialisation and batch shuffling are random; seeding the network
# makes the reported metrics reproducible from one run to the next.
mlpReg = MLPRegressor(random_state=0)
mlpReg.fit(fullXTrain, fullYTrain)

# Evaluate on the held-out rows.
predict = mlpReg.predict(fullXTest)
r2 = mlpReg.score(fullXTest, fullYTest)
absErr = mean_absolute_error(fullYTest,predict)
sqErr = mean_squared_error(fullYTest,predict)
adjR2 = adjusted_r2(r2, fullXTest)

print(f'{r2=} {adjR2=} {absErr=} {sqErr=}')

r2=0.3842637012867849 adjR2=0.38338976552138215 absErr=0.758131660258145 sqErr=0.9443386139107165
Wall time: 3.52 s


### Support Vector Machines - Regression

In [48]:
%%time
# Import the estimator class directly so that the `svm` variable below does not
# rebind the name of the imported `sklearn.svm` module.
from sklearn.svm import LinearSVR

# Fit on the match-level features without the two team-identifier columns.
fullXTrain, fullXTest, fullYTrain, fullYTest = train_test_split(fullX.drop(columns=['other_event_team','event_team']), fullY, test_size=0.35, random_state=1)
# Seeded so the solver's random choices, and therefore the metrics, are reproducible.
svm = LinearSVR(max_iter=40000, random_state=0)
svm.fit(fullXTrain, fullYTrain)

# Evaluate on the held-out rows.
predict = svm.predict(fullXTest)
r2 = svm.score(fullXTest, fullYTest)
absErr = mean_absolute_error(fullYTest,predict)
sqErr = mean_squared_error(fullYTest,predict)
adjR2 = adjusted_r2(r2, fullXTest)

print(f'{r2=} {adjR2=} {absErr=} {sqErr=}')

r2=0.36784983925948056 adjR2=0.3669526067335912 absErr=0.7557235666113568 sqErr=0.9695121236553577
Wall time: 1.56 s


### Decision Trees

In [49]:
%%time
# Fit on the match-level features without the two team-identifier columns.
fullXTrain, fullXTest, fullYTrain, fullYTest = train_test_split(fullX.drop(columns=['event_team', 'other_event_team']), fullY, test_size=0.35, random_state=1)
# Seeded so that ties between equally good splits are broken the same way on
# every run, which keeps the fitted tree and its metrics reproducible.
decTree = tree.DecisionTreeRegressor(max_depth=5, random_state=0)
decTree.fit(fullXTrain, fullYTrain)

# Evaluate on the held-out rows.
predict = decTree.predict(fullXTest)
r2 = decTree.score(fullXTest, fullYTest)
absErr = mean_absolute_error(fullYTest,predict)
sqErr = mean_squared_error(fullYTest,predict)
adjR2 = adjusted_r2(r2, fullXTest)

print(f'{r2=} {adjR2=} {absErr=} {sqErr=}')

r2=0.351361636870826 adjR2=0.35044100207061113 absErr=0.7758382314746908 sqErr=0.9947996472626568
Wall time: 25 ms


In [50]:
import graphviz

In [51]:
# Export the fitted tree as Graphviz DOT source. The feature names are passed
# as a plain list, which every scikit-learn version accepts.
dot_data = tree.export_graphviz(decTree,out_file=None,feature_names=list(fullXTrain.columns),class_names=True)
graph = graphviz.Source(dot_data)
# The output name must be a single-line string literal; render() writes the
# file and, with view=True, opens it in the default viewer.
graph.render("decTree",view=True)

'image.pdf'