# Comparing diferent machine learning techniques to predict football results
# - Authors and affiliations
- Abstract
- 1. Introduction

In [None]:
## 2. Description of the problem/dataset 

In [None]:
## 3. Approach

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

events = pd.read_csv('football-events/events.csv')
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941009 entries, 0 to 941008
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   id_odsp        941009 non-null  object 
 1   id_event       941009 non-null  object 
 2   sort_order     941009 non-null  int64  
 3   time           941009 non-null  int64  
 4   text           941009 non-null  object 
 5   event_type     941009 non-null  int64  
 6   event_type2    214293 non-null  float64
 7   side           941009 non-null  int64  
 8   event_team     941009 non-null  object 
 9   opponent       941009 non-null  object 
 10  player         880009 non-null  object 
 11  player2        291310 non-null  object 
 12  player_in      51715 non-null   object 
 13  player_out     51738 non-null   object 
 14  shot_place     227459 non-null  float64
 15  shot_outcome   228498 non-null  float64
 16  is_goal        941009 non-null  int64  
 17  location       467067 non-nul

In [13]:
# I manually converted the ../input/dictionary.txt to python dicts
event_types = {1:'Attempt', 2:'Corner', 3:'Foul', 4:'Yellow card', 5:'Second yellow card', 6:'Red card', 7:'Substitution', 8:'Free kick won', 9:'Offside', 10:'Hand ball', 11:'Penalty conceded'}
event_types2 = {12:'Key Pass', 13:'Failed through ball', 14:'Sending off', 15:'Own goal'}
sides = {1:'Home', 2:'Away'}
shot_places = {1:'Bit too high', 2:'Blocked', 3:'Bottom left corner', 4:'Bottom right corner', 5:'Centre of the goal', 6:'High and wide', 7:'Hits the bar', 8:'Misses to the left', 9:'Misses to the right', 10:'Too high', 11:'Top centre of the goal', 12:'Top left corner', 13:'Top right corner'}
shot_outcomes = {1:'On target', 2:'Off target', 3:'Blocked', 4:'Hit the bar'}
locations = {1:'Attacking half', 2:'Defensive half', 3:'Centre of the box', 4:'Left wing', 5:'Right wing', 6:'Difficult angle and long range', 7:'Difficult angle on the left', 8:'Difficult angle on the right', 9:'Left side of the box', 10:'Left side of the six yard box', 11:'Right side of the box', 12:'Right side of the six yard box', 13:'Very close range', 14:'Penalty spot', 15:'Outside the box', 16:'Long range', 17:'More than 35 yards', 18:'More than 40 yards', 19:'Not recorded'}
bodyparts = {1:'right foot', 2:'left foot', 3:'head'}
assist_methods = {0:np.nan, 1:'Pass', 2:'Cross', 3:'Headed pass', 4:'Through ball'}
situations = {1:'Open play', 2:'Set piece', 3:'Corner', 4:'Free kick'}

In [14]:
# Mapping the dicts onto the events dataframe
events['event_type'] = events['event_type'].map(event_types)
events['event_type2'] = events['event_type2'].map(event_types2)
events['side'] = events['side'].map(sides)
events['shot_place'] = events['shot_place'].map(shot_places)
events['shot_outcome'] = events['shot_outcome'].map(shot_outcomes)
events['location'] = events['location'].map(locations)
events['bodypart'] = events['bodypart'].map(bodyparts)
events['assist_method'] = events['assist_method'].map(assist_methods)
events['situation'] = events['situation'].map(situations)

In [32]:
# Many of the objects are in fact categorical, use astype from panda to fix this
cats = ['id_odsp', 'event_type','side', 'player', 'player2', 'event_team', 'opponent', 'shot_place', 'shot_outcome', 'location', 'bodypart', 'assist_method', 'situation']
d = dict.fromkeys(cats,'category')
events = events.astype(d)
events['is_goal'] = events['is_goal'].astype('bool') # this is a bool, we can fix that too while we're at it
events.info() # much better

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 941009 entries, 0 to 941008
Data columns (total 22 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   id_odsp        941009 non-null  category
 1   id_event       941009 non-null  object  
 2   sort_order     941009 non-null  int64   
 3   time           941009 non-null  int64   
 4   text           941009 non-null  object  
 5   event_type     941009 non-null  category
 6   event_type2    214293 non-null  object  
 7   side           941009 non-null  category
 8   event_team     941009 non-null  category
 9   opponent       941009 non-null  category
 10  player         880009 non-null  category
 11  player2        291310 non-null  category
 12  player_in      51715 non-null   object  
 13  player_out     51738 non-null   object  
 14  shot_place     227459 non-null  category
 15  shot_outcome   228498 non-null  category
 16  is_goal        941009 non-null  bool    
 17  location  

In [40]:
shots = events[(events.event_type=='Attempt')]
shots = shots.drop(['player_in','player_out','sort_order','text','event_type2','id_odsp','id_event'], axis=1) #atributes used for substituion and not considered for the model(yet?)
#possivelmente dropar equipas
shots.head()

Unnamed: 0,time,event_type,side,event_team,opponent,player,player2,shot_place,shot_outcome,is_goal,location,bodypart,assist_method,situation,fast_break
0,2,Attempt,Away,Hamburg SV,Borussia Dortmund,mladen petric,gokhan tore,High and wide,Off target,False,Left side of the box,left foot,Pass,Open play,0
11,14,Attempt,Home,Borussia Dortmund,Hamburg SV,shinji kagawa,mario gotze,Top right corner,Off target,False,Outside the box,right foot,Pass,Open play,0
13,17,Attempt,Home,Borussia Dortmund,Hamburg SV,kevin grosskreutz,mario gotze,Bottom right corner,On target,True,Left side of the box,left foot,Pass,Open play,0
14,19,Attempt,Home,Borussia Dortmund,Hamburg SV,mats hummels,,Blocked,Blocked,False,Outside the box,right foot,,Open play,0
17,20,Attempt,Away,Hamburg SV,Borussia Dortmund,tomas rincon,,Blocked,Blocked,False,Outside the box,right foot,,Open play,0


In [48]:
y= shots.iloc[:,8]
X= shots.drop(['is_goal'],axis=1)
print(X.shape)
print(y.shape)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 229135 entries, 0 to 941006
Data columns (total 14 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   time           229135 non-null  int64   
 1   event_type     229135 non-null  category
 2   side           229135 non-null  category
 3   event_team     229135 non-null  category
 4   opponent       229135 non-null  category
 5   player         229122 non-null  category
 6   player2        167798 non-null  category
 7   shot_place     227452 non-null  category
 8   shot_outcome   228498 non-null  category
 9   location       229135 non-null  category
 10  bodypart       229135 non-null  category
 11  assist_method  167859 non-null  category
 12  situation      229135 non-null  category
 13  fast_break     229135 non-null  int64   
dtypes: category(12), int64(2)
memory usage: 9.2 MB
(229135, 14)
(229135,)


In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [46]:
from sklearn.neural_network import MLPClassifier
#using classifier for testing
mlp = MLPClassifier(random_state=0, hidden_layer_sizes=(28,28,28,28), max_iter=2000, activation='relu')
mlp.fit(X_train, y_train)

ValueError: could not convert string to float: 'Attempt'

In [3]:

game_data = pd.read_csv('football-events/ginf.csv')
game_data.info()
game_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10112 entries, 0 to 10111
Data columns (total 18 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id_odsp    10112 non-null  object 
 1   link_odsp  10112 non-null  object 
 2   adv_stats  10112 non-null  bool   
 3   date       10112 non-null  object 
 4   league     10112 non-null  object 
 5   season     10112 non-null  int64  
 6   country    10112 non-null  object 
 7   ht         10112 non-null  object 
 8   at         10112 non-null  object 
 9   fthg       10112 non-null  int64  
 10  ftag       10112 non-null  int64  
 11  odd_h      10112 non-null  float64
 12  odd_d      10112 non-null  float64
 13  odd_a      10112 non-null  float64
 14  odd_over   977 non-null    float64
 15  odd_under  977 non-null    float64
 16  odd_bts    977 non-null    float64
 17  odd_bts_n  977 non-null    float64
dtypes: bool(1), float64(7), int64(3), object(7)
memory usage: 1.3+ MB


Unnamed: 0,id_odsp,link_odsp,adv_stats,date,league,season,country,ht,at,fthg,ftag,odd_h,odd_d,odd_a,odd_over,odd_under,odd_bts,odd_bts_n
0,UFot0hit/,/soccer/germany/bundesliga-2011-2012/dortmund-...,True,2011-08-05,D1,2012,germany,Borussia Dortmund,Hamburg SV,3,1,1.56,4.41,7.42,,,,
1,Aw5DflLH/,/soccer/germany/bundesliga-2011-2012/augsburg-...,True,2011-08-06,D1,2012,germany,FC Augsburg,SC Freiburg,2,2,2.36,3.6,3.4,,,,
2,bkjpaC6n/,/soccer/germany/bundesliga-2011-2012/werder-br...,True,2011-08-06,D1,2012,germany,Werder Bremen,Kaiserslautern,2,0,1.83,4.2,4.8,,,,
3,CzPV312a/,/soccer/france/ligue-1-2011-2012/paris-sg-lori...,True,2011-08-06,F1,2012,france,Paris Saint-Germain,Lorient,0,1,1.55,4.5,9.4,,,,
4,GUOdmtII/,/soccer/france/ligue-1-2011-2012/caen-valencie...,True,2011-08-06,F1,2012,france,Caen,Valenciennes,1,0,2.5,3.4,3.45,,,,


In [13]:
game_data.describe()

Unnamed: 0,season,fthg,ftag,odd_h,odd_d,odd_a,odd_over,odd_under,odd_bts,odd_bts_n
count,10112.0,10112.0,10112.0,10112.0,10112.0,10112.0,977.0,977.0,977.0,977.0
mean,2014.290249,1.548062,1.154964,2.93012,4.278434,5.537545,2.046817,2.105629,1.942917,2.063941
std,1.610074,1.309591,1.142596,2.370135,1.863643,5.700485,0.367062,0.536646,0.211102,0.251276
min,2012.0,0.0,0.0,1.06,1.91,1.11,1.14,1.42,1.41,1.44
25%,2013.0,1.0,0.0,1.76,3.43,2.74,1.79,1.78,1.8,1.87
50%,2014.0,1.0,1.0,2.27,3.68,3.86,2.03,1.97,1.92,2.05
75%,2016.0,2.0,2.0,3.08,4.3,6.0,2.28,2.27,2.05,2.2
max,2017.0,10.0,9.0,46.0,35.0,81.0,3.4,7.5,3.25,3.46
