In [1]:
import pandas as pd
import numpy as np

import unidecode
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
from sklearn.metrics import r2_score

In [2]:
# Load the Premier League match CSV into fb_df (path is relative to the notebook's working directory).
fb_df = pd.read_csv('data/Premier-League-2015-2019.csv')


In [3]:
fb_df.sample(10)

Unnamed: 0,Date,HomeTeam,AwayTeam,outcome_by_HOME,AVERAGE_ODD_WIN,AVERAGE_ODD_DRAW,AVERAGE_ODD_OPPONENT_WIN,RANKINGHOME,RANKING AWAY,LAST_GAME_RHOMETEAM,LAST_GAME_RAWAYTEAM
1118,05/05/2018,Stoke,Crystal Palace,L,7.277,3.261,2.77,13,14,0,1
868,05/11/2017,Man City,Arsenal,W,5.777,4.65,6.456,3,5,1,1
483,05/11/2016,Man City,Middlesbrough,D,7.507,6.138,13.306,4,20,-1,-1
369,11/05/2016,Sunderland,Everton,W,7.174,3.601,4.68,16,11,0,0
1294,08/12/2018,Chelsea,Man City,W,7.527,7.29,2.146,5,1,-1,1
746,16/05/2017,Arsenal,Sunderland,W,7.223,9.904,23.862,2,17,-1,1
632,25/02/2017,Everton,Sunderland,W,6.972,4.514,7.436,11,17,0,-1
948,23/12/2017,Swansea,Crystal Palace,D,6.823,3.179,2.519,15,14,-1,1
682,05/04/2017,Swansea,Tottenham,L,9.478,4.035,2.09,12,3,1,1
477,30/10/2016,Everton,West Ham,W,8.148,3.571,4.336,11,7,0,0


In [4]:
fb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date                      1520 non-null   object 
 1   HomeTeam                  1520 non-null   object 
 2   AwayTeam                  1520 non-null   object 
 3   outcome_by_HOME           1520 non-null   object 
 4   AVERAGE_ODD_WIN           1520 non-null   float64
 5   AVERAGE_ODD_DRAW          1520 non-null   float64
 6   AVERAGE_ODD_OPPONENT_WIN  1520 non-null   float64
 7   RANKINGHOME               1520 non-null   int64  
 8   RANKING AWAY              1520 non-null   int64  
 9   LAST_GAME_RHOMETEAM       1520 non-null   int64  
 10  LAST_GAME_RAWAYTEAM       1520 non-null   int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 130.8+ KB


Check for duplicates

In [5]:
fb_df.duplicated().sum()

0

Checking for missing values

In [6]:
fb_df.isnull().sum()

Date                        0
HomeTeam                    0
AwayTeam                    0
outcome_by_HOME             0
AVERAGE_ODD_WIN             0
AVERAGE_ODD_DRAW            0
AVERAGE_ODD_OPPONENT_WIN    0
RANKINGHOME                 0
RANKING AWAY                0
LAST_GAME_RHOMETEAM         0
LAST_GAME_RAWAYTEAM         0
dtype: int64

Normalize and standardize column names

In [7]:
# Map the raw CSV headers to consistent snake_case names.
column_renames = {
    'Date': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'outcome_by_HOME': 'home_outcome',
    'AVERAGE_ODD_WIN': 'avg_odd_home_win',
    'AVERAGE_ODD_DRAW': 'avg_odd_draw',
    'AVERAGE_ODD_OPPONENT_WIN': 'avg_odd_away_win',
    'RANKINGHOME': 'home_ranking',
    'RANKING AWAY': 'away_ranking',
    'LAST_GAME_RHOMETEAM': 'last_home_result',
    'LAST_GAME_RAWAYTEAM': 'last_away_result',
}
fb_df = fb_df.rename(columns=column_renames)

In [8]:
fb_df.head()

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
0,08/08/2015,Bournemouth,Aston Villa,L,7.024,3.303,3.748,20,17,0,0
1,08/08/2015,Chelsea,Swansea,D,6.697,4.338,8.967,1,8,0,0
2,08/08/2015,Everton,Watford,D,6.842,3.514,4.852,11,20,0,0
3,08/08/2015,Leicester,Sunderland,W,7.0,3.207,3.742,14,16,0,0
4,08/08/2015,Man United,Tottenham,W,6.477,3.651,5.318,4,5,0,0


Label encoding

Label-encode the home and away teams with a single shared encoder, so that a given team gets the same ID whether it appears in the home or the away column.

In [9]:
# Collect every team name that appears in either column, then fit ONE encoder on
# that union so a club gets the same integer ID as home side and as away side.
all_teams = pd.concat([fb_df['home_team'], fb_df['away_team']]).unique()
team_encoder = LabelEncoder().fit(all_teams)

# Replace the team names with their integer IDs in both columns.
for team_col in ('home_team', 'away_team'):
    fb_df[team_col] = team_encoder.transform(fb_df[team_col])

In [10]:
fb_df.sample(10)

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
779,21/08/2017,14,8,D,5.509,4.999,8.93,3,7,1,1
1032,24/02/2018,4,19,D,7.063,3.063,2.766,16,8,-1,-1
882,25/11/2017,13,6,D,6.009,3.362,3.338,4,1,1,1
1290,08/12/2018,0,10,W,7.082,8.368,9.421,6,16,0,-1
285,05/03/2016,24,12,L,7.817,3.244,2.559,20,14,1,1
1466,13/04/2019,3,2,L,6.099,6.545,3.133,15,12,-1,-1
57,20/09/2015,13,18,D,6.243,3.618,5.224,6,20,0,-1
1001,30/01/2018,22,0,W,10.017,4.731,2.123,15,5,1,1
1471,13/04/2019,23,10,W,7.156,9.204,12.236,3,16,1,-1
1165,25/08/2018,27,14,D,12.8,10.527,2.383,20,1,-1,1


Label encoding for home_outcome

In [11]:
# Label-encode the match outcome from the home team's perspective: L=0, D=1, W=2.
#
# Work on a real copy of fb_df. A plain `df_label = fb_df` only creates a second
# name for the same frame, so the mapping would overwrite fb_df['home_outcome']
# too, and re-running this cell would map the already-encoded integers to NaN.
# Mapping from the untouched fb_df column keeps the cell idempotent.
OUTCOME_CODES = {'L': 0, 'D': 1, 'W': 2}
df_label = fb_df.copy()
df_label['home_outcome'] = fb_df['home_outcome'].map(OUTCOME_CODES)

# Series.map turns any value that is missing from the mapping into NaN; fail
# loudly here instead of letting NaNs leak into the later steps.
assert df_label['home_outcome'].notna().all(), "unexpected value in 'home_outcome'"

In [12]:
df_label.sample(10)

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
481,05/11/2016,4,7,2,8.655,3.292,2.473,20,15,-1,0
1452,31/03/2019,13,23,2,6.009,7.103,4.87,4,3,1,-1
386,13/08/2016,19,24,1,8.138,3.372,4.399,6,13,0,0
716,29/04/2017,20,26,1,6.974,3.258,3.438,9,7,1,0
821,30/09/2017,6,14,0,6.825,3.364,2.564,1,3,1,1
1383,02/02/2019,6,10,2,7.316,9.416,13.628,5,16,-1,-1
1156,19/08/2018,3,15,2,8.837,7.775,2.215,15,2,-1,1
1093,16/04/2018,26,20,1,6.681,3.267,3.82,11,13,0,-1
1250,10/11/2018,5,3,2,6.578,6.55,2.948,20,15,-1,-1
57,20/09/2015,13,18,1,6.243,3.618,5.224,6,20,0,-1


One-Hot encode for home_outcome

This might be useful if a different ML model is used later (one that needs the outcome as separate indicator columns).

In [13]:
# One-hot encoding for home_outcome
##df_onehot = fb_df
##df_onehot = pd.get_dummies(fb_df, columns=['home_outcome'])

In [14]:
df_label.sample(10)

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
56,19/09/2015,22,8,1,6.662,3.21,3.172,8,11,1,0
864,04/11/2017,22,3,0,6.261,3.073,3.233,15,20,-1,0
81,17/10/2015,7,26,0,7.148,3.354,3.64,10,12,1,1
85,17/10/2015,23,13,1,7.385,3.281,2.9,5,6,1,1
999,22/01/2018,22,13,2,12.998,6.226,2.376,15,4,0,1
1222,20/10/2018,6,15,1,6.443,7.22,4.32,5,2,1,1
627,12/02/2017,4,6,1,11.695,5.169,2.192,20,10,1,0
1124,08/05/2018,22,19,0,7.463,3.208,2.57,15,8,-1,0
391,20/08/2016,4,13,2,11.795,4.371,2.112,20,8,0,0
1176,01/09/2018,26,27,0,6.767,7.203,2.733,13,20,-1,0


Checking for outliers

In [15]:
df_label.describe()

Unnamed: 0,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
count,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0
mean,13.6875,13.6875,1.155921,7.576486,4.878582,4.74102,10.65,10.65,-0.067763,0.067763
std,8.070483,8.070483,0.859462,1.715519,2.011393,3.807761,5.987578,5.987578,0.85253,0.857918
min,0.0,0.0,0.0,5.324,2.935,2.005,1.0,1.0,-1.0,-1.0
25%,7.0,7.0,0.0,6.507,3.29075,2.4915,5.75,5.75,-1.0,-1.0
50%,14.0,14.0,1.0,7.1175,3.9215,3.2855,10.5,10.5,0.0,0.0
75%,21.0,21.0,2.0,8.05325,6.61725,5.095,15.25,15.25,1.0,1.0
max,27.0,27.0,2.0,18.767,14.597,33.015,20.0,20.0,1.0,1.0


In [16]:
df_label.shape

(1520, 11)

removing outliers for "avg_odd_home_win", "avg_odd_draw" and "avg_odd_away_win"

In [17]:
def remove_outliers_iqr(df: pd.DataFrame, column: str, multiplier: float = 1.5) -> pd.DataFrame:
    """Drop the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are kept. Rows whose value
    is NaN are dropped as well, because NaN never compares as inside a range.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column : str
        Name of the numeric column to filter on.
    multiplier : float, default 1.5
        Width of the fences in units of IQR. Must not be negative.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the rows inside the fences; the
        original index labels are preserved.

    Raises
    ------
    ValueError
        If ``multiplier`` is negative.
    KeyError
        If ``column`` is not a column of ``df``.

    Notes
    -----
    Prints how many rows were removed for ``column``.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # `between` is inclusive on both ends, i.e. lower_bound <= value <= upper_bound.
    within_fences = values.between(lower_bound, upper_bound)

    # Explicit copy so that assigning columns on the result later on does not
    # raise SettingWithCopyWarning.
    df_filtered = df.loc[within_fences].copy()

    removed_count = len(df) - len(df_filtered)
    print(f"{removed_count} rows removed from '{column}' due to outliers.")

    return df_filtered

In [18]:
# Filter the three odds columns one after another. The filters are chained:
# each column's quartiles are computed on the rows that survived the previous
# columns' filters, so the order of this list affects which rows are removed.
odds_columns = ['avg_odd_home_win', 'avg_odd_draw', 'avg_odd_away_win']
for odds_col in odds_columns:
    df_label = remove_outliers_iqr(df_label, odds_col)

101 rows removed from 'avg_odd_home_win' due to outliers.
6 rows removed from 'avg_odd_draw' due to outliers.
167 rows removed from 'avg_odd_away_win' due to outliers.


In [19]:
# Select the three odds columns from the outlier-filtered frame and show their
# summary statistics.
odds_features = ['avg_odd_home_win', 'avg_odd_draw', 'avg_odd_away_win']
X = df_label[odds_features]

X.describe()

Unnamed: 0,avg_odd_home_win,avg_odd_draw,avg_odd_away_win
count,1246.0,1246.0,1246.0
mean,7.274838,4.441469,3.684693
std,1.043937,1.647506,1.604586
min,5.463,2.935,2.005
25%,6.492,3.25,2.54475
50%,7.056,3.535,3.197
75%,7.88525,6.38325,4.292
max,10.355,9.088,9.057


In [20]:
#df_label.to_csv('data/cleaned-premier-league-data.csv', index=False)
