In [364]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [365]:
fb_df = pd.read_csv('../data/Premier-League-2015-2019.csv')


In [366]:
fb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date                      1520 non-null   object 
 1   HomeTeam                  1520 non-null   object 
 2   AwayTeam                  1520 non-null   object 
 3   outcome_by_HOME           1520 non-null   object 
 4   AVERAGE_ODD_WIN           1520 non-null   float64
 5   AVERAGE_ODD_DRAW          1520 non-null   float64
 6   AVERAGE_ODD_OPPONENT_WIN  1520 non-null   float64
 7   RANKINGHOME               1520 non-null   int64  
 8   RANKING AWAY              1520 non-null   int64  
 9   LAST_GAME_RHOMETEAM       1520 non-null   int64  
 10  LAST_GAME_RAWAYTEAM       1520 non-null   int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 130.8+ KB


### Check for duplicates

In [367]:
fb_df.duplicated().sum()

0

### Checking for missing values

In [368]:
fb_df.isnull().sum()

Date                        0
HomeTeam                    0
AwayTeam                    0
outcome_by_HOME             0
AVERAGE_ODD_WIN             0
AVERAGE_ODD_DRAW            0
AVERAGE_ODD_OPPONENT_WIN    0
RANKINGHOME                 0
RANKING AWAY                0
LAST_GAME_RHOMETEAM         0
LAST_GAME_RAWAYTEAM         0
dtype: int64

### normalize and standardize column names

In [369]:
fb_df.rename(columns={
    'Date': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'outcome_by_HOME': 'home_outcome',
    'AVERAGE_ODD_WIN': 'avg_odd_home_win',
    'AVERAGE_ODD_DRAW': 'avg_odd_draw',
    'AVERAGE_ODD_OPPONENT_WIN': 'avg_odd_away_win',
    'RANKINGHOME': 'home_ranking',
    'RANKING AWAY': 'away_ranking',
    'LAST_GAME_RHOMETEAM': 'last_home_result',
    'LAST_GAME_RAWAYTEAM': 'last_away_result'
}, inplace=True)

In [370]:
fb_df.head()

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
0,08/08/2015,Bournemouth,Aston Villa,L,7.024,3.303,3.748,20,17,0,0
1,08/08/2015,Chelsea,Swansea,D,6.697,4.338,8.967,1,8,0,0
2,08/08/2015,Everton,Watford,D,6.842,3.514,4.852,11,20,0,0
3,08/08/2015,Leicester,Sunderland,W,7.0,3.207,3.742,14,16,0,0
4,08/08/2015,Man United,Tottenham,W,6.477,3.651,5.318,4,5,0,0


## Label encoding

#### label encode home and away team and make sure a team in Home will have the same ID in Away

In [371]:
all_teams = pd.concat([fb_df['home_team'], fb_df['away_team']]).unique()

team_encoder = LabelEncoder()
team_encoder.fit(all_teams)

fb_df['home_team'] = team_encoder.transform(fb_df['home_team'])
fb_df['away_team'] = team_encoder.transform(fb_df['away_team'])

### Encoding for home_outcome

#### we will use both one-Hot and label encoding and see if they will affect our results

#### One-Hot encode for home_outcome

In [372]:
fb_df.sample(1)

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
1220,20/10/2018,2,19,D,6.449,7.087,3.347,12,17,1,-1


#### Label encoding for home_outcome

In [373]:
#label encoding for home_outcome
df_label = fb_df
df_label['home_outcome'] = df_label['home_outcome'].map({'L': 0, 'D': 1, 'W': 2})

In [374]:
# One-hot encoding for home_outcome
df_onehot = df_label
df_onehot = pd.get_dummies(fb_df, columns=['home_outcome'])


In [375]:
# One-hot encode last_home_result
df_onehot['last_home_result_L'] = (df_onehot['last_home_result'] == -1).astype(int)
df_onehot['last_home_result_D'] = (df_onehot['last_home_result'] == 0).astype(int)
df_onehot['last_home_result_W'] = (df_onehot['last_home_result'] == 1).astype(int)

# One-hot encode last_away_result
df_onehot['last_away_result_L'] = (df_onehot['last_away_result'] == -1).astype(int)
df_onehot['last_away_result_D'] = (df_onehot['last_away_result'] == 0).astype(int)
df_onehot['last_away_result_W'] = (df_onehot['last_away_result'] == 1).astype(int)
# Drop the original last_home_result and last_away_result columns
df_onehot.drop(columns=['last_home_result', 'last_away_result'], inplace=True)

In [376]:
df_onehot.sample(1)

Unnamed: 0,date,home_team,away_team,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,home_outcome_0,home_outcome_1,home_outcome_2,last_home_result_L,last_home_result_D,last_home_result_W,last_away_result_L,last_away_result_D,last_away_result_W
666,01/04/2017,12,20,7.138,3.291,3.737,1,9,False,False,True,0,1,0,0,0,1


In [377]:
# have to run again to get the correct values for some weird reason
#print(df_label['away_ranking'].value_counts().sort_index())

In [378]:
# have to run again to get the correct values for some weird reason
#print(df_label['home_ranking'].value_counts().sort_index())


#### home and away rankings seem to be missing values for 18-19 and 20 seems to be bloated

since it seems like the rankings 18-19 might have ended up in 20. i will take all the 20s and randomly assign them 18,19 or 20

In [379]:
"""def redistribute_twenty(column):
    # Find where the column equals 20
    mask = df_label[column] == 20
    n = mask.sum()
    
    # Generate replacements: evenly split into 18, 19, 20
    replacements = np.array([18, 19, 20] * (n // 3 + 1))[:n]
    np.random.shuffle(replacements)

    # Apply replacements
    df_label.loc[mask, column] = replacements

# Apply to both columns
redistribute_twenty('home_ranking')
redistribute_twenty('away_ranking')
"""


"def redistribute_twenty(column):\n    # Find where the column equals 20\n    mask = df_label[column] == 20\n    n = mask.sum()\n    \n    # Generate replacements: evenly split into 18, 19, 20\n    replacements = np.array([18, 19, 20] * (n // 3 + 1))[:n]\n    np.random.shuffle(replacements)\n\n    # Apply replacements\n    df_label.loc[mask, column] = replacements\n\n# Apply to both columns\nredistribute_twenty('home_ranking')\nredistribute_twenty('away_ranking')\n"

In [380]:
#df_label['home_ranking']

In [381]:
#print(df_label['away_ranking'].value_counts())

In [382]:
df_label.shape

(1520, 11)

In [398]:
print(df_label['away_ranking'].apply(type).value_counts())

away_ranking
<class 'int'>    1520
Name: count, dtype: int64


In [399]:
print(df_label['away_ranking'].unique())

[17  8 20 16  5 10 12  7  6  2  4 11 15  9 13 14  3  1]


In [402]:
print(sorted(df_label['away_ranking'].unique()))

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 20]


In [400]:
print(df_label['away_ranking'].astype(str).str.strip().value_counts().sort_index())

away_ranking
1      76
10     76
11     76
12     76
13     76
14     76
15     76
16     76
17     76
2      76
20    228
3      76
4      76
5      76
6      76
7      76
8      76
9      76
Name: count, dtype: int64


In [401]:
print(df_label['away_ranking'].describe())

count    1520.000000
mean       10.650000
std         5.987578
min         1.000000
25%         5.750000
50%        10.500000
75%        15.250000
max        20.000000
Name: away_ranking, dtype: float64


In [403]:
# have to run again to get the correct values for some weird reason
print(df_label['away_ranking'].value_counts().sort_index())

away_ranking
1      76
2      76
3      76
4      76
5      76
6      76
7      76
8      76
9      76
10     76
11     76
12     76
13     76
14     76
15     76
16     76
17     76
20    228
Name: count, dtype: int64


### Checking for outliers

In [384]:
df_label.describe()

Unnamed: 0,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
count,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0,1520.0
mean,13.6875,13.6875,1.155921,7.576486,4.878582,4.74102,10.65,10.65,-0.067763,0.067763
std,8.070483,8.070483,0.859462,1.715519,2.011393,3.807761,5.987578,5.987578,0.85253,0.857918
min,0.0,0.0,0.0,5.324,2.935,2.005,1.0,1.0,-1.0,-1.0
25%,7.0,7.0,0.0,6.507,3.29075,2.4915,5.75,5.75,-1.0,-1.0
50%,14.0,14.0,1.0,7.1175,3.9215,3.2855,10.5,10.5,0.0,0.0
75%,21.0,21.0,2.0,8.05325,6.61725,5.095,15.25,15.25,1.0,1.0
max,27.0,27.0,2.0,18.767,14.597,33.015,20.0,20.0,1.0,1.0


In [385]:
df_label.shape

(1520, 11)

#### removing outliers for "avg_odd_home_win", "avg_odd_draw" and "avg_odd_away_win"

In [386]:
"""def remove_outliers_iqr(df, column):
    initial_count = len(df)
    
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    
    removed_count = initial_count - len(df_filtered)
    print(f"{removed_count} rows removed from '{column}' due to outliers.")
    
    return df_filtered
    """

'def remove_outliers_iqr(df, column):\n    initial_count = len(df)\n    \n    Q1 = df[column].quantile(0.25)\n    Q3 = df[column].quantile(0.75)\n    IQR = Q3 - Q1\n    lower_bound = Q1 - 1.5 * IQR\n    upper_bound = Q3 + 1.5 * IQR\n    \n    df_filtered = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]\n    \n    removed_count = initial_count - len(df_filtered)\n    print(f"{removed_count} rows removed from \'{column}\' due to outliers.")\n    \n    return df_filtered\n    '

In [387]:
""""
for col in ['avg_odd_home_win', 'avg_odd_draw', 'avg_odd_away_win']:
    df_label = remove_outliers_iqr(df_label, col)
    """

'"\nfor col in [\'avg_odd_home_win\', \'avg_odd_draw\', \'avg_odd_away_win\']:\n    df_label = remove_outliers_iqr(df_label, col)\n    '

In [388]:
""""
X = df_label [['avg_odd_home_win', 'avg_odd_draw', 'avg_odd_away_win']]
X.describe()
"""

'"\nX = df_label [[\'avg_odd_home_win\', \'avg_odd_draw\', \'avg_odd_away_win\']]\nX.describe()\n'

In [389]:
df_label.to_csv('../data/cleaned-premier-league-data.csv', index=False)