In [176]:
import pandas as pd
import numpy as np

import unidecode
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
from sklearn.metrics import r2_score

In [177]:
fb_df = pd.read_csv('data/Premier-League-2015-2019.csv')


In [178]:
fb_df.sample(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,outcome_by_HOME,AVERAGE_ODD_WIN,AVERAGE_ODD_DRAW,AVERAGE_ODD_OPPONENT_WIN,RANKINGHOME,RANKING AWAY,LAST_GAME_RHOMETEAM,LAST_GAME_RAWAYTEAM
1120,05/05/2018,West Brom,Tottenham,W,11.619,5.185,2.114,10,2,1,1
328,16/04/2016,Norwich,Sunderland,L,7.605,3.163,3.115,20,16,-1,-1
9,10/08/2015,West Brom,Man City,L,9.263,4.136,2.105,13,2,0,0
265,27/02/2016,West Ham,Sunderland,W,7.238,3.383,4.153,12,16,0,-1
150,12/12/2015,Bournemouth,Man United,W,8.561,3.329,2.415,20,4,-1,1
688,08/04/2017,West Brom,Southampton,L,7.829,3.231,2.589,14,6,1,1
612,04/02/2017,Everton,Bournemouth,W,7.063,3.624,4.857,11,16,1,1
159,14/12/2015,Leicester,Chelsea,W,8.202,3.388,2.438,14,1,-1,1
1050,10/03/2018,Chelsea,Crystal Palace,W,6.183,5.1,10.229,1,14,-1,-1
1430,09/03/2019,Crystal Palace,Brighton,L,5.98,6.632,4.214,11,15,1,1


In [179]:
fb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date                      1520 non-null   object 
 1   HomeTeam                  1520 non-null   object 
 2   AwayTeam                  1520 non-null   object 
 3   outcome_by_HOME           1520 non-null   object 
 4   AVERAGE_ODD_WIN           1520 non-null   float64
 5   AVERAGE_ODD_DRAW          1520 non-null   float64
 6   AVERAGE_ODD_OPPONENT_WIN  1520 non-null   float64
 7   RANKINGHOME               1520 non-null   int64  
 8   RANKING AWAY              1520 non-null   int64  
 9   LAST_GAME_RHOMETEAM       1520 non-null   int64  
 10  LAST_GAME_RAWAYTEAM       1520 non-null   int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 130.8+ KB


Check for duplicates

In [180]:
fb_df.duplicated().sum()

0

Checking for missing values

In [181]:
fb_df.isnull().sum()

Date                        0
HomeTeam                    0
AwayTeam                    0
outcome_by_HOME             0
AVERAGE_ODD_WIN             0
AVERAGE_ODD_DRAW            0
AVERAGE_ODD_OPPONENT_WIN    0
RANKINGHOME                 0
RANKING AWAY                0
LAST_GAME_RHOMETEAM         0
LAST_GAME_RAWAYTEAM         0
dtype: int64

 normalize and standardize column names

In [182]:
fb_df.rename(columns={
    'Date': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'outcome_by_HOME': 'home_outcome',
    'AVERAGE_ODD_WIN': 'avg_odd_home_win',
    'AVERAGE_ODD_DRAW': 'avg_odd_draw',
    'AVERAGE_ODD_OPPONENT_WIN': 'avg_odd_away_win',
    'RANKINGHOME': 'home_ranking',
    'RANKING AWAY': 'away_ranking',
    'LAST_GAME_RHOMETEAM': 'last_home_result',
    'LAST_GAME_RAWAYTEAM': 'last_away_result'
}, inplace=True)

In [183]:
fb_df.head()

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
0,08/08/2015,Bournemouth,Aston Villa,L,7.024,3.303,3.748,20,17,0,0
1,08/08/2015,Chelsea,Swansea,D,6.697,4.338,8.967,1,8,0,0
2,08/08/2015,Everton,Watford,D,6.842,3.514,4.852,11,20,0,0
3,08/08/2015,Leicester,Sunderland,W,7.0,3.207,3.742,14,16,0,0
4,08/08/2015,Man United,Tottenham,W,6.477,3.651,5.318,4,5,0,0


Label encoding

label encode home and away team and make sure a team in Home will have the same ID in Away

In [184]:
all_teams = pd.concat([fb_df['home_team'], fb_df['away_team']]).unique()

team_encoder = LabelEncoder()
team_encoder.fit(all_teams)

fb_df['home_team'] = team_encoder.transform(fb_df['home_team'])
fb_df['away_team'] = team_encoder.transform(fb_df['away_team'])

In [185]:
fb_df.sample(10)

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
175,26/12/2015,17,8,L,8.384,3.447,2.363,15,11,-1,1
1200,29/09/2018,0,24,W,6.764,7.508,5.565,6,14,1,0
505,26/11/2016,22,7,W,7.801,3.226,2.864,12,15,-1,-1
1334,29/12/2018,23,27,L,6.707,7.92,7.592,3,20,1,0
855,28/10/2017,24,20,L,5.94,3.36,3.574,17,13,-1,-1
25,22/08/2015,26,2,L,6.835,3.231,3.086,12,20,1,0
640,04/03/2017,15,2,D,6.99,5.66,12.377,5,16,-1,0
350,30/04/2016,24,1,W,6.927,3.537,4.964,20,17,-1,-1
1054,10/03/2018,17,19,W,7.0,3.115,2.791,20,8,-1,0
1413,27/02/2019,0,2,W,6.257,7.634,6.168,6,12,1,0


Label encoding for home_outcome

In [186]:
#label encoding for home_outcome
df_label = fb_df
df_label['home_outcome'] = df_label['home_outcome'].map({'L': 0, 'D': 1, 'W': 2})

In [187]:
df_label.sample(10)

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
290,12/03/2016,20,19,0,7.521,3.112,2.778,9,7,-1,1
101,31/10/2015,7,15,1,8.606,3.42,2.281,10,4,-1,1
81,17/10/2015,7,26,0,7.148,3.354,3.64,10,12,1,1
1403,23/02/2019,2,27,1,6.383,6.601,2.603,12,20,-1,0
1298,09/12/2018,17,27,0,6.849,6.596,2.692,10,20,0,1
183,28/12/2015,15,6,1,8.013,3.162,2.871,4,1,-1,-1
752,21/05/2017,6,21,2,6.805,7.346,18.088,10,17,-1,1
92,24/10/2015,12,7,2,6.971,3.25,3.234,14,10,1,1
1117,05/05/2018,12,26,0,6.979,3.274,3.232,12,11,-1,-1
732,07/05/2017,13,19,1,6.556,3.854,5.574,8,6,1,1


One-Hot encode for home_outcome

might be useful if using differnt ml model?

In [None]:
# One-hot encoding for home_outcome
##df_onehot = fb_df
##df_onehot = pd.get_dummies(fb_df, columns=['home_outcome'])

In [190]:
df_label.sample(10)

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
1099,21/04/2018,25,13,1,10.715,4.669,2.111,10,4,1,1
828,01/10/2017,8,4,0,5.889,3.462,5.309,7,16,1,0
38,30/08/2015,19,18,2,6.477,3.282,3.666,7,20,0,1
936,16/12/2017,24,10,0,5.739,3.467,4.642,17,20,-1,-1
62,26/09/2015,15,21,2,6.398,5.995,13.069,4,16,-1,-1
1503,04/05/2019,17,13,0,11.185,8.838,2.359,10,4,0,1
930,16/12/2017,0,17,2,5.675,5.661,11.763,5,20,0,-1
168,20/12/2015,24,13,2,8.915,3.499,2.219,20,6,-1,0
392,20/08/2016,12,0,1,8.92,3.44,2.52,1,2,-1,-1
96,25/10/2015,2,23,0,7.875,3.337,2.369,20,5,-1,1
