<a href="https://colab.research.google.com/github/munyanza/premier_league_prediction_model/blob/main/football.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [106]:
import io

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [107]:
# Colab-only step: ask the user for a local file and keep the upload result in
# `uploaded` for the following cells.
from google.colab import files
uploaded = files.upload()

Saving football.csv to football (5).csv


In [108]:
#load the data
# A repeated upload is stored under a suffixed name (the upload cell reported
# "Saving football.csv to football (5).csv"), so reading the fixed path
# 'football.csv' would silently load an older copy. Parse the bytes that were
# just uploaded instead; fall back to the fixed path only if nothing was uploaded.
if uploaded:
    uploaded_name = next(iter(uploaded))
    df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
else:
    df = pd.read_csv('football.csv')
df.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA
0,E0,16/08/2024,20:00,Man United,Fulham,1,0,H,0,0,...,1.86,2.07,1.83,2.11,1.88,2.11,1.82,2.05,1.9,2.08
1,E0,17/08/2024,12:30,Ipswich,Liverpool,0,2,A,0,0,...,2.05,1.88,2.04,1.9,2.2,2.0,1.99,1.88,2.04,1.93
2,E0,17/08/2024,15:00,Arsenal,Wolves,2,0,H,1,0,...,2.02,1.91,2.0,1.9,2.05,1.93,1.99,1.87,2.02,1.96
3,E0,17/08/2024,15:00,Everton,Brighton,0,3,A,0,1,...,1.87,2.06,1.86,2.07,1.92,2.1,1.83,2.04,1.88,2.11
4,E0,17/08/2024,15:00,Newcastle,Southampton,1,0,H,1,0,...,1.87,2.06,1.88,2.06,1.89,2.1,1.82,2.05,1.89,2.1


In [109]:
# Show the column labels of the loaded frame before engineering features.
df.columns


Index(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR',
       'HTHG', 'HTAG',
       ...
       'B365CAHH', 'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA',
       'AvgCAHH', 'AvgCAHA', 'BFECAHH', 'BFECAHA'],
      dtype='object', length=120)

In [110]:
#Basic preprocessing
# Dates are day/month/year strings: parse them, then order fixtures
# chronologically so that "earlier rows" means "earlier matches" downstream.
df = (
    df
    .assign(Date=pd.to_datetime(df['Date'], format='%d/%m/%Y'))
    .sort_values('Date')
)

In [111]:
#Create target variable (1 if home team wins, 0 otherwise)
# FTR == 'H' marks a home win; every other result maps to 0.
home_won = df['FTR'].eq('H')
df['HomeWin'] = home_won.astype(int)

**Feature Engineering**

In [114]:
# Head-to-head history
def h2h(team1, team2, date, matches=None):
    """Return team1's share of wins in its earlier meetings with team2.

    Parameters
    ----------
    team1, team2 : str
        Team names as they appear in the ``HomeTeam`` / ``AwayTeam`` columns.
    date : datetime-like
        Only fixtures strictly before this date are considered.
    matches : pandas.DataFrame, optional
        Frame with ``Date``, ``HomeTeam``, ``AwayTeam`` and ``FTR`` columns.
        Defaults to the notebook-level ``df`` so existing three-argument
        calls behave as before.

    Returns
    -------
    float
        Wins by ``team1`` divided by the number of earlier meetings (draws
        count in the denominator only), or 0.5 when the teams have not met
        before ``date``.
    """
    if matches is None:
        matches = df

    # Earlier fixtures between the two sides, with either of them at home.
    earlier = matches['Date'] < date
    team1_hosting = (matches['HomeTeam'] == team1) & (matches['AwayTeam'] == team2)
    team1_visiting = (matches['HomeTeam'] == team2) & (matches['AwayTeam'] == team1)
    past = matches[earlier & (team1_hosting | team1_visiting)]
    if past.empty:
        return 0.5  # neutral prior when there is no history

    # team1 wins either as host (FTR == 'H') or as visitor (FTR == 'A').
    won_as_host = (past['HomeTeam'] == team1) & (past['FTR'] == 'H')
    won_as_visitor = (past['AwayTeam'] == team1) & (past['FTR'] == 'A')
    team1_wins = int((won_as_host | won_as_visitor).sum())
    return team1_wins / len(past)

def _home_side_h2h(match):
    """Head-to-head win share of the home side against this fixture's visitor."""
    return h2h(match['HomeTeam'], match['AwayTeam'], match['Date'])

# One value per fixture, computed row by row.
df['H2H_Advantage'] = df.apply(_home_side_h2h, axis=1)

# Recent goal difference
# HomeGD is the full-time margin from the home side's point of view.
df['HomeGD'] = df['FTHG'].sub(df['FTAG'])

def _mean_of_previous_five(margins):
    """5-row rolling mean shifted by one row, so the current fixture is excluded."""
    return margins.rolling(5).mean().shift()

# Home form: each team's average margin over its preceding five home fixtures.
df['HomeGD_5'] = df.groupby('HomeTeam')['HomeGD'].transform(_mean_of_previous_five)
# Away form: same window over the team's away fixtures, sign-flipped so the
# margin is expressed from the visitor's point of view.
df['AwayGD_5'] = df.groupby('AwayTeam')['HomeGD'].transform(
    lambda margins: -_mean_of_previous_five(margins)
)

**Model Building**

In [116]:
# Select features and target
# Only the engineered columns are used as model inputs.
features = ['H2H_Advantage', 'HomeGD_5', 'AwayGD_5']
# The rolling means are NaN until a team has enough earlier fixtures; keep
# only rows where every feature is available.
X = df.dropna(subset=features)[features]
# Select labels by index so they stay aligned with the surviving rows.
y = df['HomeWin'].loc[X.index]

# Split data
# NOTE(review): this is a shuffled 80/20 split of date-ordered fixtures, so
# test matches can precede training matches - consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Evaluate
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.61


In [117]:
# Get current season data (adjust filter as needed)
# The cutoff is only an example; set it to the start of the season you want.
# .copy() makes an independent frame, so adding the prediction column below
# does not write onto a slice of `df` (which raises SettingWithCopyWarning).
current_season = df[df['Date'] > '2023-07-01'].copy()

# Predict all matches
# Column 1 of predict_proba is the probability of class 1, i.e. a home win.
# Rows with NaN features (dropped for training) are passed to the model as-is.
current_season['PredictedHomeWinProb'] = model.predict_proba(current_season[features])[:, 1]

# Calculate expected points for each team
# count = fixtures in that role, sum = total predicted home-win probability.
home_teams = current_season.groupby('HomeTeam')['PredictedHomeWinProb'].agg(['count', 'sum'])
away_teams = current_season.groupby('AwayTeam')['PredictedHomeWinProb'].agg(['count', 'sum'])

# Combine home and away
# The outer join keeps teams that appear in only one role; their missing
# counts and sums become 0.
team_stats = home_teams.join(away_teams, how='outer', lsuffix='_home', rsuffix='_away')
team_stats = team_stats.fillna(0)

# Calculate total expected points (3 for win, 1 for draw)
# NOTE(review): the model is binary (home win vs. not), so there is no draw
# probability. Each side is credited 3 x its own "win" probability plus
# 1 x the opponent's, which distributes 4 expected points per fixture. Treat
# the totals as a ranking score rather than a literal points forecast.
team_stats['TotalExpectedPoints'] = (
    3 * team_stats['sum_home'] +
    3 * (team_stats['count_away'] - team_stats['sum_away']) +
    1 * (team_stats['count_home'] - team_stats['sum_home']) +
    1 * team_stats['sum_away']
)

# Get top 5 teams
top_5_teams = team_stats['TotalExpectedPoints'].sort_values(ascending=False).head(5)
print("Predicted Top 5 Teams:")
print(top_5_teams)

Predicted Top 5 Teams:
HomeTeam
Liverpool        90.496167
Arsenal          86.078190
Newcastle        84.234976
Chelsea          81.569881
Nott'm Forest    80.199571
Name: TotalExpectedPoints, dtype: float64
