In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the raw match data from a CSV path relative to the notebook's working directory.
fb_df = pd.read_csv('../data/Premier-League-2015-2019.csv')

In [3]:
# Show column names, dtypes and non-null counts for the raw frame.
fb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1520 entries, 0 to 1519
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Date                      1520 non-null   object 
 1   HomeTeam                  1520 non-null   object 
 2   AwayTeam                  1520 non-null   object 
 3   outcome_by_HOME           1520 non-null   object 
 4   AVERAGE_ODD_WIN           1520 non-null   float64
 5   AVERAGE_ODD_DRAW          1520 non-null   float64
 6   AVERAGE_ODD_OPPONENT_WIN  1520 non-null   float64
 7   RANKINGHOME               1520 non-null   int64  
 8   RANKING AWAY              1520 non-null   int64  
 9   LAST_GAME_RHOMETEAM       1520 non-null   int64  
 10  LAST_GAME_RAWAYTEAM       1520 non-null   int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 130.8+ KB


### Check for duplicates

In [4]:
# Count rows that exactly duplicate an earlier row.
fb_df.duplicated().sum()

0

### Checking for missing values

In [5]:
# Count missing values per column.
fb_df.isnull().sum()

Date                        0
HomeTeam                    0
AwayTeam                    0
outcome_by_HOME             0
AVERAGE_ODD_WIN             0
AVERAGE_ODD_DRAW            0
AVERAGE_ODD_OPPONENT_WIN    0
RANKINGHOME                 0
RANKING AWAY                0
LAST_GAME_RHOMETEAM         0
LAST_GAME_RAWAYTEAM         0
dtype: int64

### normalize and standardize column names

In [6]:
# Map the raw CSV headers onto consistent snake_case column names.
column_renames = {
    'Date': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'outcome_by_HOME': 'home_outcome',
    'AVERAGE_ODD_WIN': 'avg_odd_home_win',
    'AVERAGE_ODD_DRAW': 'avg_odd_draw',
    'AVERAGE_ODD_OPPONENT_WIN': 'avg_odd_away_win',
    'RANKINGHOME': 'home_ranking',
    'RANKING AWAY': 'away_ranking',
    'LAST_GAME_RHOMETEAM': 'last_home_result',
    'LAST_GAME_RAWAYTEAM': 'last_away_result',
}
fb_df = fb_df.rename(columns=column_renames)

In [7]:
# Preview the first rows to confirm the renamed columns.
fb_df.head()

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
0,08/08/2015,Bournemouth,Aston Villa,L,7.024,3.303,3.748,20,17,0,0
1,08/08/2015,Chelsea,Swansea,D,6.697,4.338,8.967,1,8,0,0
2,08/08/2015,Everton,Watford,D,6.842,3.514,4.852,11,20,0,0
3,08/08/2015,Leicester,Sunderland,W,7.0,3.207,3.742,14,16,0,0
4,08/08/2015,Man United,Tottenham,W,6.477,3.651,5.318,4,5,0,0


In [8]:
# Frequency of each away-team ranking value, ordered by ranking.
print(fb_df['away_ranking'].value_counts().sort_index())

away_ranking
1      76
2      76
3      76
4      76
5      76
6      76
7      76
8      76
9      76
10     76
11     76
12     76
13     76
14     76
15     76
16     76
17     76
20    228
Name: count, dtype: int64


#### The ranking columns appear to have no 18 or 19 values, while 20 is over-represented (228 rows vs. 76 for every other rank in `away_ranking`)

It looks like rankings 18 and 19 were collapsed into 20, so I will take all rows ranked 20 and randomly reassign them to 18, 19 or 20 in equal shares.

In [9]:
def redistribute_twenty(column, df=None, seed=42):
    """Spread the rows ranked 20 in ``column`` evenly across 18, 19 and 20.

    The frame is modified in place. Only rows whose current value is 20 are
    touched; which of them receive 18, 19 or 20 is random but reproducible.

    Parameters
    ----------
    column : str
        Name of the ranking column to rebalance.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``fb_df`` so existing
        one-argument calls keep working.
    seed : int or None, optional
        Non-negative seed for the shuffle. The column name is mixed into the
        seed so different columns get different (but still repeatable)
        shuffles. Pass ``None`` for a fresh, non-reproducible shuffle.

    Returns
    -------
    None
    """
    if df is None:
        df = fb_df  # fall back to the notebook's working frame

    # Rows currently holding the over-represented rank.
    mask = df[column] == 20
    n = int(mask.sum())
    if n == 0:
        return  # nothing to redistribute

    # Repeating 18, 19, 20 pattern trimmed to exactly n values (as even a split as possible).
    replacements = np.tile([18, 19, 20], n // 3 + 1)[:n]

    if seed is None:
        rng = np.random.default_rng()
    else:
        # Seed from (seed, column-name bytes): repeatable per column, distinct across columns.
        rng = np.random.default_rng([seed, *column.encode("utf-8")])

    df.loc[mask, column] = rng.permutation(replacements)

# Rebalance the over-represented rank 20 in each of the two ranking columns.
for ranking_column in ('home_ranking', 'away_ranking'):
    redistribute_twenty(ranking_column)

In [10]:
# Re-check the away-ranking frequencies after the redistribution.
print(fb_df['away_ranking'].value_counts().sort_index())

away_ranking
1     76
2     76
3     76
4     76
5     76
6     76
7     76
8     76
9     76
10    76
11    76
12    76
13    76
14    76
15    76
16    76
17    76
18    76
19    76
20    76
Name: count, dtype: int64


### Checking for outliers

In [11]:
# Summary statistics for the three betting-odds columns, used to look for outliers.
X = fb_df [['avg_odd_home_win', 'avg_odd_draw', 'avg_odd_away_win']]
X.describe()

Unnamed: 0,avg_odd_home_win,avg_odd_draw,avg_odd_away_win
count,1520.0,1520.0,1520.0
mean,7.576486,4.878582,4.74102
std,1.715519,2.011393,3.807761
min,5.324,2.935,2.005
25%,6.507,3.29075,2.4915
50%,7.1175,3.9215,3.2855
75%,8.05325,6.61725,5.095
max,18.767,14.597,33.015


In [12]:
# Row/column count before outlier removal.
fb_df.shape

(1520, 11)

#### removing outliers for "avg_odd_home_win", "avg_odd_draw" and "avg_odd_away_win"

In [13]:
def remove_outliers_iqr(df, column, factor=1.5):
    """Drop rows whose ``column`` value lies outside the IQR fences.

    A row is kept when its value is within
    ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (both ends inclusive), where
    Q1/Q3 are the 25th/75th percentiles of ``column`` in ``df``. Rows with a
    missing value in ``column`` fail the comparison and are dropped too.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Numeric column to test.
    factor : float, optional
        Fence width as a multiple of the IQR. Defaults to 1.5.

    Returns
    -------
    pandas.DataFrame
        An independent copy holding only the kept rows (original index
        labels preserved). Because it is a copy, later column assignments
        on it do not raise SettingWithCopyWarning.

    Side effects
    ------------
    Prints how many rows were removed.
    """
    initial_count = len(df)

    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - factor * IQR
    upper_bound = Q3 + factor * IQR

    # .copy() detaches the result from df so callers can safely add or overwrite columns.
    within_fences = df[column].between(lower_bound, upper_bound)
    df_filtered = df[within_fences].copy()

    removed_count = initial_count - len(df_filtered)
    print(f"{removed_count} rows removed from '{column}' due to outliers.")

    return df_filtered

In [14]:
# Filter the odds columns one after another. fb_df is reassigned on every pass,
# so each column's IQR bounds are computed on the rows that survived the previous pass.
for col in ['avg_odd_home_win', 'avg_odd_draw', 'avg_odd_away_win']:
    fb_df = remove_outliers_iqr(fb_df, col)

101 rows removed from 'avg_odd_home_win' due to outliers.
6 rows removed from 'avg_odd_draw' due to outliers.
167 rows removed from 'avg_odd_away_win' due to outliers.


In [15]:
# Recompute the odds summary statistics on the filtered frame.
X = fb_df [['avg_odd_home_win', 'avg_odd_draw', 'avg_odd_away_win']]
X.describe()

Unnamed: 0,avg_odd_home_win,avg_odd_draw,avg_odd_away_win
count,1246.0,1246.0,1246.0
mean,7.274838,4.441469,3.684693
std,1.043937,1.647506,1.604586
min,5.463,2.935,2.005
25%,6.492,3.25,2.54475
50%,7.056,3.535,3.197
75%,7.88525,6.38325,4.292
max,10.355,9.088,9.057


## Label encoding

#### label encode home and away team and make sure a team in Home will have the same ID in Away

In [16]:
""""
all_teams = pd.concat([fb_df['home_team'], fb_df['away_team']]).unique()

team_encoder = LabelEncoder()
team_encoder.fit(all_teams)

fb_df['home_team'] = team_encoder.transform(fb_df['home_team'])
fb_df['away_team'] = team_encoder.transform(fb_df['away_team'])
"""

'"\nall_teams = pd.concat([fb_df[\'home_team\'], fb_df[\'away_team\']]).unique()\n\nteam_encoder = LabelEncoder()\nteam_encoder.fit(all_teams)\n\nfb_df[\'home_team\'] = team_encoder.transform(fb_df[\'home_team\'])\nfb_df[\'away_team\'] = team_encoder.transform(fb_df[\'away_team\'])\n'

### Encoding for home_outcome

#### we will use both one-Hot and label encoding and see if they will affect our results

#### One-Hot encode for home_outcome

In [17]:
# One-hot encode home_outcome: the column is replaced by one 0/1 integer
# indicator column per outcome value, in a new frame.
df_onehot = pd.get_dummies(data=fb_df, columns=['home_outcome'], dtype=int)


#### Label encoding for home_outcome

In [18]:
# Label-encode home_outcome (L -> 0, D -> 1, W -> 2) on an independent copy.
# A plain `df_label = fb_df` would only alias the frame, so the mapping would
# overwrite fb_df['home_outcome'] as well and make this cell unsafe to re-run.
df_label = fb_df.copy()
df_label['home_outcome'] = df_label['home_outcome'].map({'L': 0, 'D': 1, 'W': 2})

In [19]:
# Preview the one-hot encoded frame.
df_onehot.head()

Unnamed: 0,date,home_team,away_team,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result,home_outcome_D,home_outcome_L,home_outcome_W
0,08/08/2015,Bournemouth,Aston Villa,7.024,3.303,3.748,20,17,0,0,0,1,0
1,08/08/2015,Chelsea,Swansea,6.697,4.338,8.967,1,8,0,0,1,0,0
2,08/08/2015,Everton,Watford,6.842,3.514,4.852,11,18,0,0,1,0,0
3,08/08/2015,Leicester,Sunderland,7.0,3.207,3.742,14,16,0,0,0,0,1
4,08/08/2015,Man United,Tottenham,6.477,3.651,5.318,4,5,0,0,0,0,1


In [20]:
# Preview the label-encoded frame.
df_label.head()

Unnamed: 0,date,home_team,away_team,home_outcome,avg_odd_home_win,avg_odd_draw,avg_odd_away_win,home_ranking,away_ranking,last_home_result,last_away_result
0,08/08/2015,Bournemouth,Aston Villa,0,7.024,3.303,3.748,20,17,0,0
1,08/08/2015,Chelsea,Swansea,1,6.697,4.338,8.967,1,8,0,0
2,08/08/2015,Everton,Watford,1,6.842,3.514,4.852,11,18,0,0
3,08/08/2015,Leicester,Sunderland,2,7.0,3.207,3.742,14,16,0,0
4,08/08/2015,Man United,Tottenham,2,6.477,3.651,5.318,4,5,0,0


### Currently the data runs from 2015 to 2019 (per the file name and the first match date); maybe we could get better training data if we split it up by season/year

Convert the `date` column from object (string) to datetime

In [21]:
# Parse the day/month/year strings into datetimes in both encoded frames.
for encoded_frame in (df_onehot, df_label):
    encoded_frame["date"] = pd.to_datetime(encoded_frame["date"], format="%d/%m/%Y")

make season column

In [22]:
def get_season(date):
    """Return the season label ``"YYYY/YYYY+1"`` that ``date`` belongs to.

    Dates from August onwards count toward the season starting that calendar
    year; dates from January through July count toward the season that
    started the previous year.

    ``date`` may be any object exposing ``.year`` and ``.month``.
    """
    start_year = date.year if date.month >= 8 else date.year - 1
    return f"{start_year}/{start_year + 1}"

In [23]:
# Tag every match with its season label.
df_onehot["season"] = df_onehot["date"].apply(get_season)
df_label["season"] = df_label["date"].apply(get_season)

# Seasons to keep when splitting the data into one frame per season.
target_seasons = ["2014/2015", "2015/2016", "2016/2017", "2017/2018", "2018/2019", "2019/2020"]

# Keep one split per encoding. Previously both splits were written to the same
# name, so the one-hot split was discarded as soon as the label split was built.
season_dfs_onehot = {
    season: season_frame
    for season, season_frame in df_onehot.groupby("season")
    if season in target_seasons
}
season_dfs_label = {
    season: season_frame
    for season, season_frame in df_label.groupby("season")
    if season in target_seasons
}

# Backward-compatible name: as before, `season_dfs` ends up holding the label-encoded split.
season_dfs = season_dfs_label



fix ordering

In [24]:
# Both frames share the same leading and trailing columns; only the outcome
# column(s) in the middle differ between the two encodings.
leading_columns = ["date", "season", "home_team", "away_team"]
trailing_columns = [
    "avg_odd_home_win",
    "avg_odd_draw",
    "avg_odd_away_win",
    "home_ranking",
    "away_ranking",
    "last_home_result",
    "last_away_result",
]

onehot_outcome_columns = ["home_outcome_D", "home_outcome_L", "home_outcome_W"]
label_outcome_columns = ["home_outcome"]

df_onehot = df_onehot[leading_columns + onehot_outcome_columns + trailing_columns]
df_label = df_label[leading_columns + label_outcome_columns + trailing_columns]

Another way we could maybe improve our dataset is by adding new columns, based on "last_home_result" and "last_away_result", that tally each team's wins/draws/losses so far in the season.

In [25]:
def add_season_result_totals(df):
    """Add per-season running win/draw/loss counters for both teams of each match.

    Works on a copy of ``df`` sorted by ``season`` then ``date`` with a fresh
    0..n-1 index, and returns that copy; the input frame is left untouched.
    Six integer columns are appended (any existing columns with these names
    are dropped first):

    ``home_season_wins_so_far``, ``home_season_draws_so_far``,
    ``home_season_losses_so_far``, ``away_season_wins_so_far``,
    ``away_season_draws_so_far``, ``away_season_losses_so_far``.

    For every row the counters of the home and away team (tracked per season)
    are written first, and only afterwards the row's ``last_home_result`` /
    ``last_away_result`` are folded into those tallies: 1 counts as a win,
    0 as a draw, -1 as a loss, anything else is ignored.

    NOTE(review): because the row's "last result" is added after the snapshot,
    it is not part of that same row's totals - confirm this lag is intended.
    NOTE(review): a value of 0 is always tallied as a draw; if 0 also encodes
    "no previous game" it will inflate the draw counts - verify the encoding.
    """
    df = df.sort_values(by=["season", "date"]).reset_index(drop=True)

    counter_columns = {
        "home": {
            "W": "home_season_wins_so_far",
            "D": "home_season_draws_so_far",
            "L": "home_season_losses_so_far",
        },
        "away": {
            "W": "away_season_wins_so_far",
            "D": "away_season_draws_so_far",
            "L": "away_season_losses_so_far",
        },
    }
    all_counter_columns = [
        name for per_side in counter_columns.values() for name in per_side.values()
    ]

    # Drop stale counters (if any) so the fresh ones always land at the end of the frame.
    df = df.drop(columns=all_counter_columns, errors="ignore")
    for name in all_counter_columns:
        df[name] = 0

    result_code_to_outcome = {1: "W", 0: "D", -1: "L"}
    tallies = {}  # (season, team) -> {"W": int, "D": int, "L": int}

    for idx, row in df.iterrows():
        participants = {
            "home": (row["home_team"], row["last_home_result"]),
            "away": (row["away_team"], row["last_away_result"]),
        }
        records = {
            side: tallies.setdefault((row["season"], team), {"W": 0, "D": 0, "L": 0})
            for side, (team, _) in participants.items()
        }

        # Snapshot the current totals for both sides before touching them.
        for side, record in records.items():
            for outcome, column_name in counter_columns[side].items():
                df.at[idx, column_name] = record[outcome]

        # Then fold this row's "last result" codes into the running tallies.
        for side, (_, result_code) in participants.items():
            outcome = result_code_to_outcome.get(result_code)
            if outcome is not None:
                records[side][outcome] += 1

    return df


In [26]:
# Add the running per-season win/draw/loss counters to both encoded frames.
# Each call returns a new frame sorted by season and date.
df_label = add_season_result_totals(df_label)

df_onehot = add_season_result_totals(df_onehot)

In [27]:
# Write both cleaned frames to CSV without the index column.
# NOTE(review): the input was read from '../data/' (lower-case) while these paths use
# '../Data/' - on a case-sensitive filesystem these are different directories; confirm.
df_onehot.to_csv('../Data/cleaned-premier-onehot.csv', index=False)
df_label.to_csv('../Data/cleaned-premier-label.csv', index=False)