In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from pathlib import Path
import warnings
import psycopg2
from sqlalchemy import create_engine
warnings.filterwarnings('ignore')
import sys
import pickle
from pprint import pprint
from sklearn.preprocessing import LabelEncoder

# Put the project's `src` directory (two levels up) at the front of sys.path so
# the `database` package imported below can be found.
# The path is built with os.path.join instead of a backslash literal: '..\..\src'
# contains the invalid escape sequences "\." and "\s" (a warning on current
# Python versions) and only resolves to the intended directory on Windows.
sys.path.insert(0, os.path.realpath(os.path.join('..', '..', 'src')))
from database.database_config import DB_NAME, DB_USER, DB_PASSWORD, DB_HOST

In [2]:
# Assemble the PostgreSQL (psycopg2 driver) connection URL from the settings
# imported from database_config, create the engine, and open the connection
# that the query cell below reads from.
# NOTE(review): the credentials are interpolated verbatim; a password containing
# characters such as '@' or '/' would need URL-escaping - confirm.
# NOTE(review): no conn.close() is visible in this part of the notebook.
connection_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"
db = create_engine(connection_url)
conn = db.connect()

In [3]:
# Query against the `laliga_wh` schema: selects the match statistics from
# `match_facts` and inner-joins it to the team table (twice, once for the home
# side and once for the away side), the date table, the referee table and the
# venue table to bring in the readable names.  The match date/time is also
# split into numeric parts (month, day of month, day of week, year, hour,
# minute, second) with EXTRACT.
# NOTE(review): every join is an INNER join, so a match whose team, date,
# referee or venue key has no matching row would be dropped - confirm that this
# is intended.
SQL = """
select
	match_id,
	mf.date_id,
	mf.venue_id,
	mf.referee_id,
	home_team_id as home_id,
	away_team_id as away_id,
	attendance,
    match_week,
	xg_home,
	xg_away,
	home_goal,
	away_goal,
	home_captain_name,
	away_captain_name,
	home_formation,
	away_formation,
	home_possession,
	away_possession,
	home_shots,
	home_shots_on_target,
	home_distance,
	home_freekicks,
	home_penalty_kicks,
	home_touches,
	home_touches_att_pen,
	home_touches_def_pen,
	home_touches_def_third,
	home_touches_mid_third,
	home_touches_att_third,
	home_touches_live,
	home_dribbles_success,
	home_dribbles_att,
	home_dribbles_players_number,
	home_dribbles_megs,
	home_carries,
	home_carries_tot_dist,
	home_carries_prog_dist,
	home_carries_prog,
	home_carries_one_third,
	home_carries_cpa,
	home_carries_miss,
	home_carries_dis,
	home_receiving_target,
	home_receiving_rec,
	home_receiving_prog,
	away_shots,
	away_shots_on_target,
	away_distance,
	away_freekicks,
	away_penalty_kicks,
	away_touches,
	away_touches_att_pen,
	away_touches_def_pen,
	away_touches_def_third,
	away_touches_mid_third,
	away_touches_att_third,
	away_touches_live,
	away_dribbles_success,
	away_dribbles_att,
	away_dribbles_players_number,
	away_dribbles_megs,
	away_carries,
	away_carries_tot_dist,
	away_carries_prog_dist,
	away_carries_prog,
	away_carries_one_third,
	away_carries_cpa,
	away_carries_miss,
	away_carries_dis,
	away_receiving_target,
	away_receiving_rec,
	away_receiving_prog,
	match_result,
	ht.team_name as home_team,
	aw.team_name as away_team,
	md."_date_",
	md."_time_" ,
	md."_day_",
	extract(MONTH from md."_date_") AS month,
	extract(DAY from md."_date_") AS day_of_month,
	extract(DOW from md."_date_") AS day_of_week,
	extract(YEAR from md."_date_") AS year,
	extract(HOUR from md."_time_") AS hour,
	extract(MINUTE from md."_time_") AS minute,
    extract(SECOND from md."_time_") AS second,
	mr.referee_name,
	mv.venue_name
from
	laliga_wh.match_facts mf
inner join laliga_wh.match_teams ht 
on
	ht.team_id = mf.home_team_id
inner join laliga_wh.match_teams aw 
on
	aw.team_id = mf.away_team_id
inner join laliga_wh.match_dates md 
on
	md.date_id = mf.date_id
inner join laliga_wh.match_referees mr 
on
	mr.referee_id = mf.referee_id
inner join laliga_wh.match_venues mv 
on
	mv.venue_id = mf.venue_id
"""


In [4]:
# Run the warehouse query over the open connection and load the result set
# into a DataFrame.
match_df = pd.read_sql_query(sql=SQL, con=conn)

In [5]:
# Preview the loaded frame (the notebook shows a truncated view of rows and columns).
match_df

Unnamed: 0,match_id,date_id,venue_id,referee_id,home_id,away_id,attendance,match_week,xg_home,xg_away,...,_day_,month,day_of_month,day_of_week,year,hour,minute,second,referee_name,venue_name
0,1,1,13,22,17,1,9231.0,1,1.4,1.2,...,Fri,8.0,18.0,5.0,2017.0,20.0,15.0,0.0,José Munuera,Estadio Municipal de Butarque
1,2,2,25,18,26,16,35971.0,1,1.9,0.3,...,Fri,8.0,18.0,5.0,2017.0,22.0,15.0,0.0,Jesús Gil,Estadio de Mestalla
2,3,3,22,7,6,24,16961.0,1,1.6,2.4,...,Sat,8.0,19.0,6.0,2017.0,18.0,15.0,0.0,Antonio Matéu Lahoz,Estadio de Balaídos
3,4,4,3,24,12,3,11511.0,1,2.1,0.8,...,Sat,8.0,19.0,6.0,2017.0,20.0,15.0,0.0,Juan Martínez,Estadi Municipal de Montilivi
4,5,5,18,4,25,10,30487.0,1,2.1,1.1,...,Sat,8.0,19.0,6.0,2017.0,22.0,15.0,0.0,Alejandro Hernández,Estadio Ramón Sánchez Pizjuán
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,1896,1732,16,4,13,10,17951.0,38,2.0,0.4,...,Sun,5.0,22.0,0.0,2022.0,20.0,0.0,0.0,Alejandro Hernández,Estadio Nuevo Los Cármenes
1896,1897,1732,8,9,21,19,18717.0,38,0.8,1.3,...,Sun,5.0,22.0,0.0,2022.0,20.0,0.0,0.0,César Soto,Estadio El Sadar
1897,1898,1733,1,21,4,28,54850.0,38,0.7,0.7,...,Sun,5.0,22.0,0.0,2022.0,22.0,0.0,0.0,José Luis Munuera,Camp Nou
1898,1899,1733,12,18,24,3,23586.0,38,2.3,0.8,...,Sun,5.0,22.0,0.0,2022.0,22.0,0.0,0.0,Jesús Gil,Estadio Municipal de Anoeta


In [6]:
# Column dtypes as inferred by pandas when reading the query result.
match_df.dtypes

match_id          int64
date_id           int64
venue_id          int64
referee_id        int64
home_id           int64
                 ...   
hour            float64
minute          float64
second          float64
referee_name     object
venue_name       object
Length: 87, dtype: object

# Preprocess
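- Drop the columns that are not needed for modelling: the leading ID columns and the redundant columns pulled in from the joins (e.g. `_day_`).
- Verify that the data types are correct, i.e. that they match what is in the data warehouse.
- Clean the formation columns.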

In [7]:
# Keep everything from `attendance` onwards.  The first six columns are the key
# columns selected first in the query (match_id, date_id, venue_id, referee_id,
# home_id, away_id).
# .copy() makes match_results an independent frame: the cells below assign
# columns and drop in place, which on a bare slice of match_df is chained
# assignment (SettingWithCopyWarning, hidden here by filterwarnings('ignore')).
match_results = match_df.iloc[:, 6:].copy()

In [8]:
# First rows of the trimmed frame, to confirm that it now starts at `attendance`.
match_results.head()

Unnamed: 0,attendance,match_week,xg_home,xg_away,home_goal,away_goal,home_captain_name,away_captain_name,home_formation,away_formation,...,_day_,month,day_of_month,day_of_week,year,hour,minute,second,referee_name,venue_name
0,9231.0,1,1.4,1.2,1,0,Martín Mantovani,Manu García,4-2-3-1,4-4-2,...,Fri,8.0,18.0,5.0,2017.0,20.0,15.0,0.0,José Munuera,Estadio Municipal de Butarque
1,35971.0,1,1.9,0.3,1,0,Daniel Parejo,Jonathan Viera,4-4-2,4-5-1,...,Fri,8.0,18.0,5.0,2017.0,22.0,15.0,0.0,Jesús Gil,Estadio de Mestalla
2,16961.0,1,1.6,2.4,2,3,Hugo Mallo,Xabi Prieto,4-3-3,4-3-3,...,Sat,8.0,19.0,6.0,2017.0,18.0,15.0,0.0,Antonio Matéu Lahoz,Estadio de Balaídos
3,11511.0,1,2.1,0.8,2,2,Álex Granell,Gabi,3-4-3,4-4-2,...,Sat,8.0,19.0,6.0,2017.0,20.0,15.0,0.0,Juan Martínez,Estadi Municipal de Montilivi
4,30487.0,1,2.1,1.1,1,1,Jesús Navas,Gerard Moreno,4-1-4-1,4-4-2,...,Sat,8.0,19.0,6.0,2017.0,22.0,15.0,0.0,Alejandro Hernández,Estadio Ramón Sánchez Pizjuán


In [9]:
# Print every column together with its dtype, one per line (the plain
# `.dtypes` repr elides the middle of a frame this wide).
# Series.items() replaces Series.iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0.
for column_name, column_dtype in match_results.dtypes.items():
    print(column_name, column_dtype)

attendance float64
match_week int64
xg_home float64
xg_away float64
home_goal int64
away_goal int64
home_captain_name object
away_captain_name object
home_formation object
away_formation object
home_possession float64
away_possession float64
home_shots int64
home_shots_on_target int64
home_distance float64
home_freekicks int64
home_penalty_kicks int64
home_touches int64
home_touches_att_pen int64
home_touches_def_pen int64
home_touches_def_third int64
home_touches_mid_third int64
home_touches_att_third int64
home_touches_live int64
home_dribbles_success int64
home_dribbles_att int64
home_dribbles_players_number int64
home_dribbles_megs int64
home_carries int64
home_carries_tot_dist int64
home_carries_prog_dist int64
home_carries_prog float64
home_carries_one_third float64
home_carries_cpa float64
home_carries_miss int64
home_carries_dis int64
home_receiving_target int64
home_receiving_rec int64
home_receiving_prog int64
away_shots int64
away_shots_on_target int64
away_distance float64


In [10]:
# Fix data types for certain columns
# The date/time parts come back from EXTRACT as floats; cast them to integers.
DATE_PART_COLUMNS = ['month', 'day_of_month', 'day_of_week', 'year', 'hour', 'minute', 'second']

# Built as one chained expression that returns a new frame, so the cell can be
# re-run safely: the original in-place drop raised KeyError on a second run
# because '_day_' was already gone.
match_results = (
    match_results
    # Parse the match date into a proper datetime column.
    .assign(_date_=lambda frame: pd.to_datetime(frame['_date_']))
    # attendance uses the nullable Int64 dtype, which can hold missing values.
    .astype({'attendance': pd.Int64Dtype(), **{column: 'int' for column in DATE_PART_COLUMNS}})
    # '_day_' is not needed: day_of_week already holds the weekday numerically.
    .drop(columns='_day_', errors='ignore')
)

In [11]:
# Frequency of each home formation label; the output shows that some labels
# carry a trailing '◆' marker (e.g. '4-4-2◆') and so split one formation in two.
match_results['home_formation'].value_counts()

4-4-2        636
4-2-3-1      363
4-3-3        273
4-1-4-1      144
3-5-2        125
3-4-3         89
4-4-2◆        62
4-2-2-2       56
4-4-1-1       41
4-5-1         22
3-2-2-2-1     14
4-3-2-1       12
3-4-1-2       11
3-1-4-2        8
3-3-2-2        7
3-2-3-2        5
4-3-1-2◆       5
3-2-2-1-2      4
3-5-1-1        4
4-3-3◆         3
4-2-2-1-1      3
5-3-2          2
5-4-1          2
4-1-3-2        1
4-1-3-2◆       1
4-3-1-2        1
3-1-4-1-1      1
4-2-3-1◆       1
4-1-2-3        1
3-2-2-3        1
3-2-1-2-2      1
3-4-3◆         1
Name: home_formation, dtype: int64

In [12]:
# Some formation labels carry a '◆' marker (e.g. '4-4-2◆'); remove it so both
# spellings collapse into a single category.
# Stripping the marker itself replaces the two hand-written mappings, which had
# drifted apart (only the away mapping listed '4-3-2-1◆') and would miss any
# marked formation that was not listed explicitly.
FORMATION_COLUMNS = ['home_formation', 'away_formation']
match_results = match_results.assign(
    **{
        column: match_results[column].str.replace('◆', '', regex=False)
        for column in FORMATION_COLUMNS
    }
)

In [13]:
# Re-check the home formation labels after the cleanup.
match_results['home_formation'].value_counts()

4-4-2        698
4-2-3-1      364
4-3-3        276
4-1-4-1      144
3-5-2        125
3-4-3         90
4-2-2-2       56
4-4-1-1       41
4-5-1         22
3-2-2-2-1     14
4-3-2-1       12
3-4-1-2       11
3-1-4-2        8
3-3-2-2        7
4-3-1-2        6
3-2-3-2        5
3-5-1-1        4
3-2-2-1-2      4
4-2-2-1-1      3
5-3-2          2
5-4-1          2
4-1-3-2        2
4-1-2-3        1
3-1-4-1-1      1
3-2-2-3        1
3-2-1-2-2      1
Name: home_formation, dtype: int64

In [14]:
# Same check for the away formation labels.
match_results['away_formation'].value_counts()

4-4-2        642
4-2-3-1      380
4-3-3        255
4-1-4-1      151
3-5-2        145
3-4-3        107
4-2-2-2       59
4-4-1-1       36
4-3-2-1       21
4-5-1         18
3-2-2-2-1     15
3-1-4-2       13
3-4-1-2       11
3-3-2-2       11
3-2-3-2        8
3-5-1-1        7
5-3-2          6
4-3-1-2        4
5-4-1          3
3-1-2-2-2      2
4-1-3-2        2
3-2-2-1-2      2
5-1-2-2        2
Name: away_formation, dtype: int64

In [15]:
# Print every column with its dtype again, to confirm the casts applied above.
# Series.items() replaces Series.iteritems(), which was deprecated in
# pandas 1.5 and removed in pandas 2.0.
for column_name, column_dtype in match_results.dtypes.items():
    print(column_name, column_dtype)

attendance Int64
match_week int64
xg_home float64
xg_away float64
home_goal int64
away_goal int64
home_captain_name object
away_captain_name object
home_formation object
away_formation object
home_possession float64
away_possession float64
home_shots int64
home_shots_on_target int64
home_distance float64
home_freekicks int64
home_penalty_kicks int64
home_touches int64
home_touches_att_pen int64
home_touches_def_pen int64
home_touches_def_third int64
home_touches_mid_third int64
home_touches_att_third int64
home_touches_live int64
home_dribbles_success int64
home_dribbles_att int64
home_dribbles_players_number int64
home_dribbles_megs int64
home_carries int64
home_carries_tot_dist int64
home_carries_prog_dist int64
home_carries_prog float64
home_carries_one_third float64
home_carries_cpa float64
home_carries_miss int64
home_carries_dis int64
home_receiving_target int64
home_receiving_rec int64
home_receiving_prog int64
away_shots int64
away_shots_on_target int64
away_distance float64
aw

In [16]:
# Inspect the frame after the dtype and formation fixes.
match_results

Unnamed: 0,attendance,match_week,xg_home,xg_away,home_goal,away_goal,home_captain_name,away_captain_name,home_formation,away_formation,...,_time_,month,day_of_month,day_of_week,year,hour,minute,second,referee_name,venue_name
0,9231,1,1.4,1.2,1,0,Martín Mantovani,Manu García,4-2-3-1,4-4-2,...,20:15:00,8,18,5,2017,20,15,0,José Munuera,Estadio Municipal de Butarque
1,35971,1,1.9,0.3,1,0,Daniel Parejo,Jonathan Viera,4-4-2,4-5-1,...,22:15:00,8,18,5,2017,22,15,0,Jesús Gil,Estadio de Mestalla
2,16961,1,1.6,2.4,2,3,Hugo Mallo,Xabi Prieto,4-3-3,4-3-3,...,18:15:00,8,19,6,2017,18,15,0,Antonio Matéu Lahoz,Estadio de Balaídos
3,11511,1,2.1,0.8,2,2,Álex Granell,Gabi,3-4-3,4-4-2,...,20:15:00,8,19,6,2017,20,15,0,Juan Martínez,Estadi Municipal de Montilivi
4,30487,1,2.1,1.1,1,1,Jesús Navas,Gerard Moreno,4-1-4-1,4-4-2,...,22:15:00,8,19,6,2017,22,15,0,Alejandro Hernández,Estadio Ramón Sánchez Pizjuán
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,17951,38,2.0,0.4,0,0,Víctor Díaz,David López,4-3-3,4-4-2,...,20:00:00,5,22,0,2022,20,0,0,Alejandro Hernández,Estadio Nuevo Los Cármenes
1896,18717,38,0.8,1.3,0,2,Oier Sanjurjo,Manolo Reina,4-3-3,3-5-2,...,20:00:00,5,22,0,2022,20,0,0,César Soto,Estadio El Sadar
1897,54850,38,0.7,0.7,0,2,Sergio Busquets,Mario Gaspar,4-3-3,4-4-2,...,22:00:00,5,22,0,2022,22,0,0,José Luis Munuera,Camp Nou
1898,23586,38,2.3,0.8,1,2,Asier Illarramendi,Koke,4-4-2,3-5-2,...,22:00:00,5,22,0,2022,22,0,0,Jesús Gil,Estadio Municipal de Anoeta


In [18]:
# One hot encode
# Each listed text column is expanded into one indicator column per distinct
# value; every other column is passed through unchanged.
categorical_columns = [
    'home_captain_name',
    'away_captain_name',
    'home_formation',
    'away_formation',
    'home_team',
    'away_team',
    'referee_name',
    'venue_name',
]
match_results_encode = pd.get_dummies(match_results, columns=categorical_columns)

In [19]:
# Inspect the one-hot encoded frame.
match_results_encode

Unnamed: 0,attendance,match_week,xg_home,xg_away,home_goal,away_goal,home_possession,away_possession,home_shots,home_shots_on_target,...,venue_name_Estadio Wanda Metropolitano,venue_name_Estadio de Balaídos,venue_name_Estadio de Gran Canaria,venue_name_Estadio de Mendizorroza,venue_name_Estadio de Mestalla,venue_name_Estadio de la Cerámica,venue_name_Estadio del Rayo Vallecano,venue_name_Iberostar Estadi,venue_name_RCDE Stadium,venue_name_San Mamés
0,9231,1,1.4,1.2,1,0,54.0,46.0,14,3,...,0,0,0,0,0,0,0,0,0,0
1,35971,1,1.9,0.3,1,0,48.0,52.0,22,6,...,0,0,0,0,1,0,0,0,0,0
2,16961,1,1.6,2.4,2,3,53.0,47.0,16,5,...,0,1,0,0,0,0,0,0,0,0
3,11511,1,2.1,0.8,2,2,53.0,47.0,14,6,...,0,0,0,0,0,0,0,0,0,0
4,30487,1,2.1,1.1,1,1,62.0,38.0,9,4,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1895,17951,38,2.0,0.4,0,0,40.0,60.0,14,5,...,0,0,0,0,0,0,0,0,0,0
1896,18717,38,0.8,1.3,0,2,62.0,38.0,11,1,...,0,0,0,0,0,0,0,0,0,0
1897,54850,38,0.7,0.7,0,2,70.0,30.0,15,3,...,0,0,0,0,0,0,0,0,0,0
1898,23586,38,2.3,0.8,1,2,61.0,39.0,13,3,...,0,0,0,0,0,0,0,0,0,0


# Feature Selection

In [17]:
# check for missing values