In [39]:
# connect to my google drive
# Mounts Google Drive at /content/drive so later cells can read files stored there.
# NOTE(review): google.colab is imported here, so this cell presumably only runs inside Colab — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [69]:
# imports
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score

In [41]:
# Path of the matches CSV on the mounted Drive.
# NOTE(review): despite the `_DIR` suffix this is a file path, not a directory.
CSV_DIR = "/content/drive/My Drive/Summer 2025/ML Soccer/matches.csv"

In [42]:
# Load the raw match log into a DataFrame and preview the first rows.
matches = pd.read_csv(CSV_DIR)
matches.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt,Season,Team
0,2024-08-17,12:30 (11:30),Premier League,Matchweek 1,Sat,Away,W,2,0,Ipswich Town,...,Match Report,,18,5,14.8,0,0,0,2025,Liverpool
1,2024-08-25,16:30 (15:30),Premier League,Matchweek 2,Sun,Home,W,2,0,Brentford,...,Match Report,,19,8,13.6,1,0,0,2025,Liverpool
2,2024-09-01,16:00 (15:00),Premier League,Matchweek 3,Sun,Away,W,3,0,Manchester Utd,...,Match Report,,11,3,13.4,0,0,0,2025,Liverpool
3,2024-09-14,15:00 (14:00),Premier League,Matchweek 4,Sat,Home,L,0,1,Nott'ham Forest,...,Match Report,,14,5,14.9,0,0,0,2025,Liverpool
4,2024-09-21,15:00 (14:00),Premier League,Matchweek 5,Sat,Home,W,3,0,Bournemouth,...,Match Report,,19,12,16.6,0,0,0,2025,Liverpool


In [43]:
# Sanity check: 38 matches * 5 seasons (2021-2025) * 20 teams = 3800 rows expected
matches.shape

(3800, 28)

Create Predictors

In [48]:
# Turn the categorical match attributes into integer category codes for the model.
for source_col, code_col in (('Venue', 'venue_code'), ('Opponent', 'opp_code')):
    matches[code_col] = matches[source_col].astype('category').cat.codes

# Kick-off hour: strip everything from the first ':' onwards, then cast to integer.
matches['hour'] = (
    matches['Time']
    .str.replace(':.+', '', regex=True)
    .astype('int')
)

# Parse the dates, then derive the weekday number from them.
matches['Date'] = pd.to_datetime(matches['Date'])
matches['day_code'] = matches['Date'].dt.weekday

In [49]:
matches['opp_code']

Unnamed: 0,opp_code
0,10
1,3
2,16
3,19
4,2
...,...
3795,22
3796,7
3797,8
3798,17


In [50]:
matches['hour']

Unnamed: 0,hour
0,12
1,16
2,16
3,15
4,15
...,...
3795,19
3796,15
3797,19
3798,18


In [51]:
matches['day_code']

Unnamed: 0,day_code
0,5
1,6
2,6
3,5
4,5
...,...
3795,6
3796,5
3797,6
3798,2


In [52]:
# Binary target: 1 when the result is a win ('W'), 0 for anything else.
# TODO: look into accounting for draws separately in the future.
matches['target'] = matches['Result'].eq('W').astype('int')
matches['target']

Unnamed: 0,target
0,1
1,1
2,1
3,0
4,1
...,...
3795,0
3796,0
3797,1
3798,0


In [53]:
# Random forest with 50 trees; min_samples_split=10 stops nodes with fewer samples from being split.
# random_state is pinned so repeated runs build the same forest.
rf = RandomForestClassifier(n_estimators=50, min_samples_split=10, random_state=1)

# Time-based split: rows dated before 2024-06-01 train the model, the rest are held out.
train = matches[matches['Date'] < '2024-06-01']
test = matches[matches['Date'] >= '2024-06-01']

# Baseline feature set built in the "Create Predictors" section (no form information yet).
predictors = ['venue_code', 'opp_code', 'hour', 'day_code']

In [54]:
# Fit the baseline model on the training rows.
rf.fit(train[predictors], train['target'])

In [56]:
# Predict win (1) / not-win (0) for the held-out rows.
pred = rf.predict(test[predictors])

In [62]:
# Share of held-out rows whose target was predicted correctly.
acc = accuracy_score(test['target'], pred)
acc

0.5947368421052631

In [66]:
# Confusion table: rows are the actual targets, columns are the predicted ones.
combined_df = pd.DataFrame(dict(actual=test['target'], prediction = pred))
pd.crosstab(index=combined_df['actual'], columns=combined_df['prediction'])

prediction,0,1
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,375,98
1,210,77


In [71]:
# Precision: of the rows predicted as wins, the share that really were wins.
precision_score(test['target'], pred)

0.44

In [76]:
# Pull out a single team's matches as a sample group to try the rolling-average logic on.
group_matches = matches.groupby('Team')
group = group_matches.get_group('Chelsea')
group

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,FK,PK,PKatt,Season,Team,venue_code,opp_code,hour,day_code,target
114,2024-08-18,16:30 (15:30),Premier League,Matchweek 1,Sun,Home,L,0,2,Manchester City,...,0,0,0,2025,Chelsea,1,15,16,6,0
115,2024-08-25,14:00 (13:00),Premier League,Matchweek 2,Sun,Away,W,6,2,Wolves,...,1,0,0,2025,Chelsea,0,26,14,6,1
116,2024-09-01,13:30 (12:30),Premier League,Matchweek 3,Sun,Home,D,1,1,Crystal Palace,...,1,0,0,2025,Chelsea,1,7,13,6,0
117,2024-09-14,20:00 (19:00),Premier League,Matchweek 4,Sat,Away,W,1,0,Bournemouth,...,0,0,0,2025,Chelsea,0,2,20,5,1
118,2024-09-21,12:30 (11:30),Premier League,Matchweek 5,Sat,Away,W,3,0,West Ham,...,0,0,0,2025,Chelsea,0,25,12,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3187,2021-05-01,17:30 (16:30),Premier League,Matchweek 34,Sat,Home,W,2,0,Fulham,...,1,0,0,2021,Chelsea,1,9,17,5,1
3188,2021-05-08,17:30 (16:30),Premier League,Matchweek 35,Sat,Away,W,2,1,Manchester City,...,1,0,0,2021,Chelsea,0,15,17,5,1
3189,2021-05-12,20:15 (19:15),Premier League,Matchweek 36,Wed,Home,L,0,1,Arsenal,...,0,0,0,2021,Chelsea,1,0,20,2,0
3190,2021-05-18,20:15 (19:15),Premier League,Matchweek 37,Tue,Home,W,2,1,Leicester City,...,0,1,1,2021,Chelsea,1,12,20,1,1


In [81]:
# Also take recent form into account (rolling averages over previous matches)
def rolling_avgs(group, cols, new_cols):
    group = group.sort_values('Date')
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [84]:
# NOTE(review): `ne` is not used in any visible cell — looks like a stray auto-import; confirm before removing.
from operator import ne
# Per-match stats to average, and the names of the derived rolling-average columns.
cols = ['GF', 'GA', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']
new_cols = [stat + '_rolling' for stat in cols]
new_cols

['GF_rolling',
 'GA_rolling',
 'Sh_rolling',
 'SoT_rolling',
 'Dist_rolling',
 'FK_rolling',
 'PK_rolling',
 'PKatt_rolling']

In [85]:
rolling_avgs(group, cols, new_cols)

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
3157,2020-10-03,12:30 (11:30),Premier League,Matchweek 4,Sat,Home,W,4,0,Crystal Palace,...,5,1,2.000000,2.000000,11.666667,5.333333,21.133333,0.333333,0.333333,0.666667
3158,2020-10-17,15:00 (14:00),Premier League,Matchweek 5,Sat,Home,D,3,3,Southampton,...,5,0,2.333333,1.666667,13.666667,5.333333,19.766667,0.333333,0.666667,1.000000
3159,2020-10-24,17:30 (16:30),Premier League,Matchweek 6,Sat,Away,D,0,0,Manchester Utd,...,5,0,3.333333,2.000000,16.000000,6.333333,15.966667,0.333333,0.666667,0.666667
3160,2020-10-31,15:00,Premier League,Matchweek 7,Sat,Away,W,3,0,Burnley,...,5,1,2.333333,1.000000,10.666667,3.333333,16.266667,0.000000,0.666667,0.666667
3161,2020-11-07,17:30,Premier League,Matchweek 8,Sat,Home,W,4,1,Sheffield Utd,...,5,1,2.000000,1.000000,10.333333,5.000000,16.833333,0.333333,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,2025-04-26,12:30 (11:30),Premier League,Matchweek 34,Sat,Home,W,1,0,Everton,...,5,1,1.333333,1.000000,22.666667,7.000000,18.166667,0.333333,0.000000,0.000000
148,2025-05-04,16:30 (15:30),Premier League,Matchweek 35,Sun,Home,W,3,1,Liverpool,...,6,1,1.666667,1.000000,19.000000,7.666667,17.933333,0.333333,0.000000,0.000000
149,2025-05-11,12:00 (11:00),Premier League,Matchweek 36,Sun,Away,L,0,2,Newcastle Utd,...,6,0,2.000000,0.666667,13.000000,6.333333,16.700000,0.333333,0.333333,0.333333
150,2025-05-16,20:15 (19:15),Premier League,Matchweek 37,Fri,Home,W,1,0,Manchester Utd,...,4,1,1.333333,1.000000,12.000000,5.000000,16.633333,0.000000,0.333333,0.333333


In [86]:
# Apply rolling_avgs to each team's matches separately; the result is indexed by
# (Team, original row label).
# NOTE(review): the saved output echoes this line, which looks like a pandas warning
# raised by groupby.apply — presumably the grouping-columns deprecation; confirm on the
# pandas version in use before changing the call.
matches_rolling = matches.groupby('Team').apply(lambda x: rolling_avgs(x, cols, new_cols))
matches_rolling

  matches_rolling = matches.groupby('Team').apply(lambda x: rolling_avgs(x, cols, new_cols))


Unnamed: 0_level_0,Unnamed: 1_level_0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
Team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Arsenal,3309,2020-10-04,14:00 (13:00),Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
Arsenal,3310,2020-10-17,17:30 (16:30),Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
Arsenal,3311,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
Arsenal,3312,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
Arsenal,3313,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wolverhampton Wanderers,603,2025-04-26,15:00 (14:00),Premier League,Matchweek 34,Sat,Home,W,3,0,Leicester City,...,5,1,2.333333,1.000000,13.000000,4.333333,16.433333,0.666667,0.000000,0.000000
Wolverhampton Wanderers,604,2025-05-02,20:00 (19:00),Premier League,Matchweek 35,Fri,Away,L,0,1,Manchester City,...,4,0,2.666667,0.666667,12.333333,4.000000,18.633333,1.000000,0.000000,0.000000
Wolverhampton Wanderers,605,2025-05-10,15:00 (14:00),Premier League,Matchweek 36,Sat,Home,L,0,2,Brighton,...,5,0,1.333333,0.333333,10.000000,2.333333,18.900000,0.666667,0.000000,0.000000
Wolverhampton Wanderers,606,2025-05-20,20:00 (19:00),Premier League,Matchweek 37,Tue,Away,L,2,4,Crystal Palace,...,1,0,1.000000,1.000000,12.000000,2.666667,17.200000,0.333333,0.000000,0.000000


In [87]:
# Drop the Team index level added by the groupby; the original row labels remain as the index.
matches_rolling = matches_rolling.droplevel('Team')
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
3309,2020-10-04,14:00 (13:00),Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
3310,2020-10-17,17:30 (16:30),Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
3311,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3312,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
3313,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,2025-04-26,15:00 (14:00),Premier League,Matchweek 34,Sat,Home,W,3,0,Leicester City,...,5,1,2.333333,1.000000,13.000000,4.333333,16.433333,0.666667,0.000000,0.000000
604,2025-05-02,20:00 (19:00),Premier League,Matchweek 35,Fri,Away,L,0,1,Manchester City,...,4,0,2.666667,0.666667,12.333333,4.000000,18.633333,1.000000,0.000000,0.000000
605,2025-05-10,15:00 (14:00),Premier League,Matchweek 36,Sat,Home,L,0,2,Brighton,...,5,0,1.333333,0.333333,10.000000,2.333333,18.900000,0.666667,0.000000,0.000000
606,2025-05-20,20:00 (19:00),Premier League,Matchweek 37,Tue,Away,L,2,4,Crystal Palace,...,1,0,1.000000,1.000000,12.000000,2.666667,17.200000,0.333333,0.000000,0.000000


In [88]:
# Swap the leftover original row labels for a fresh 0..n-1 index.
matches_rolling = matches_rolling.reset_index(drop=True)
matches_rolling

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,...,day_code,target,GF_rolling,GA_rolling,Sh_rolling,SoT_rolling,Dist_rolling,FK_rolling,PK_rolling,PKatt_rolling
0,2020-10-04,14:00 (13:00),Premier League,Matchweek 4,Sun,Home,W,2,1,Sheffield Utd,...,6,1,2.000000,1.333333,8.000000,3.666667,14.633333,0.666667,0.000000,0.000000
1,2020-10-17,17:30 (16:30),Premier League,Matchweek 5,Sat,Away,L,0,1,Manchester City,...,5,0,1.666667,1.666667,5.666667,3.666667,15.366667,0.000000,0.000000,0.000000
2,2020-10-25,19:15,Premier League,Matchweek 6,Sun,Home,L,0,1,Leicester City,...,6,0,1.000000,1.666667,7.000000,3.666667,16.566667,0.666667,0.000000,0.000000
3,2020-11-01,16:30,Premier League,Matchweek 7,Sun,Away,W,1,0,Manchester Utd,...,6,1,0.666667,1.000000,9.666667,4.000000,16.566667,1.000000,0.000000,0.000000
4,2020-11-08,19:15,Premier League,Matchweek 8,Sun,Home,L,0,3,Aston Villa,...,6,0,0.333333,0.666667,9.666667,2.666667,19.333333,1.000000,0.333333,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3711,2025-04-26,15:00 (14:00),Premier League,Matchweek 34,Sat,Home,W,3,0,Leicester City,...,5,1,2.333333,1.000000,13.000000,4.333333,16.433333,0.666667,0.000000,0.000000
3712,2025-05-02,20:00 (19:00),Premier League,Matchweek 35,Fri,Away,L,0,1,Manchester City,...,4,0,2.666667,0.666667,12.333333,4.000000,18.633333,1.000000,0.000000,0.000000
3713,2025-05-10,15:00 (14:00),Premier League,Matchweek 36,Sat,Home,L,0,2,Brighton,...,5,0,1.333333,0.333333,10.000000,2.333333,18.900000,0.666667,0.000000,0.000000
3714,2025-05-20,20:00 (19:00),Premier League,Matchweek 37,Tue,Away,L,2,4,Crystal Palace,...,1,0,1.000000,1.000000,12.000000,2.666667,17.200000,0.333333,0.000000,0.000000


In [91]:
def make_predictions(data, predictors, model=None, cutoff='2024-06-01'):
    """Train on matches before ``cutoff``, predict the rest, and score precision.

    Parameters
    ----------
    data : pandas.DataFrame
        Match rows with a 'Date' column, a binary 'target' column and every
        column named in ``predictors``.
    predictors : list of str
        Feature columns passed to the model.
    model : estimator with ``fit`` / ``predict``, optional
        Classifier to (re)fit. Defaults to the notebook-level ``rf`` so
        existing calls keep working; note that the model is refitted in place.
    cutoff : str or datetime-like, default '2024-06-01'
        Rows dated before this value are used for training, rows on or after
        it for testing.

    Returns
    -------
    combined_df : pandas.DataFrame
        Columns 'actual' and 'prediction', indexed like the test rows.
    precision : float
        ``precision_score`` of the predictions against the test targets.

    Raises
    ------
    ValueError
        If the cutoff leaves the training or the test split empty.
    """
    if model is None:
        # Fall back to the shared notebook-level classifier.
        model = rf

    train = data[data['Date'] < cutoff]
    test = data[data['Date'] >= cutoff]
    if train.empty or test.empty:
        raise ValueError(
            f"cutoff {cutoff!r} leaves an empty split "
            f"(train={len(train)} rows, test={len(test)} rows)"
        )

    model.fit(train[predictors], train['target'])
    pred = model.predict(test[predictors])
    # Reuse the test index so the predictions can be joined back onto `data`.
    combined_df = pd.DataFrame(dict(actual=test['target'], prediction=pred), index=test.index)
    precision = precision_score(test['target'], pred)
    return combined_df, precision

In [92]:
# Retrain with the baseline predictors plus the rolling-form columns and report precision.
combined_df, precision = make_predictions(matches_rolling, predictors + new_cols)
precision

0.470873786407767

In [94]:
# Attach match context to each prediction. combined_df carries the index of the test rows
# taken from matches_rolling, so the two frames are joined on their index labels.
# NOTE(review): this overwrites combined_df from itself, so re-running the cell merges the
# context columns in a second time — re-run make_predictions first.
combined_df = combined_df.merge(matches_rolling[['Date', 'Team', 'Opponent', 'Result']], left_index=True, right_index=True)
combined_df

Unnamed: 0,actual,prediction,Date,Team,Opponent,Result
149,1,1,2024-08-17,Arsenal,Wolves,W
150,1,0,2024-08-24,Arsenal,Aston Villa,W
151,0,1,2024-08-31,Arsenal,Brighton,D
152,1,1,2024-09-15,Arsenal,Tottenham,W
153,0,0,2024-09-22,Arsenal,Manchester City,D
...,...,...,...,...,...,...
3711,1,0,2025-04-26,Wolverhampton Wanderers,Leicester City,W
3712,0,0,2025-05-02,Wolverhampton Wanderers,Manchester City,L
3713,0,0,2025-05-10,Wolverhampton Wanderers,Brighton,L
3714,0,0,2025-05-20,Wolverhampton Wanderers,Crystal Palace,L


In [95]:
class MissingDict(dict):
    """dict that hands back the looked-up key itself when the key is absent."""

    def __missing__(self, key):
        return key


# Long club names -> the short names they should be replaced with.
# Names without an entry pass through unchanged via MissingDict.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}
mapping = MissingDict(map_values)

In [96]:
mapping['West Ham United']

'West Ham'

In [97]:
# Map Team names through `mapping` so they use the same short spelling as the Opponent column
# (e.g. "Wolverhampton Wanderers" -> "Wolves"); names without an entry are kept as they are.
combined_df['new team'] = combined_df['Team'].map(mapping)
combined_df

Unnamed: 0,actual,prediction,Date,Team,Opponent,Result,new team
149,1,1,2024-08-17,Arsenal,Wolves,W,Arsenal
150,1,0,2024-08-24,Arsenal,Aston Villa,W,Arsenal
151,0,1,2024-08-31,Arsenal,Brighton,D,Arsenal
152,1,1,2024-09-15,Arsenal,Tottenham,W,Arsenal
153,0,0,2024-09-22,Arsenal,Manchester City,D,Arsenal
...,...,...,...,...,...,...,...
3711,1,0,2025-04-26,Wolverhampton Wanderers,Leicester City,W,Wolves
3712,0,0,2025-05-02,Wolverhampton Wanderers,Manchester City,L,Wolves
3713,0,0,2025-05-10,Wolverhampton Wanderers,Brighton,L,Wolves
3714,0,0,2025-05-20,Wolverhampton Wanderers,Crystal Palace,L,Wolves


In [98]:
# Self-join: pair each team's row with the row of the other side of the same match
# (same Date, and this row's 'new team' equals the other row's Opponent).
# `_x` columns hold the team's own row, `_y` columns the opponent's row.
merged = combined_df.merge(combined_df, left_on=['Date', 'new team'], right_on=['Date', 'Opponent'])
merged

Unnamed: 0,actual_x,prediction_x,Date,Team_x,Opponent_x,Result_x,new team_x,actual_y,prediction_y,Team_y,Opponent_y,Result_y,new team_y
0,1,1,2024-08-17,Arsenal,Wolves,W,Arsenal,0,0,Wolverhampton Wanderers,Arsenal,L,Wolves
1,1,0,2024-08-24,Arsenal,Aston Villa,W,Arsenal,0,0,Aston Villa,Arsenal,L,Aston Villa
2,0,1,2024-08-31,Arsenal,Brighton,D,Arsenal,0,0,Brighton and Hove Albion,Arsenal,D,Brighton
3,1,1,2024-09-15,Arsenal,Tottenham,W,Arsenal,0,0,Tottenham Hotspur,Arsenal,L,Tottenham
4,0,0,2024-09-22,Arsenal,Manchester City,D,Arsenal,0,1,Manchester City,Arsenal,D,Manchester City
...,...,...,...,...,...,...,...,...,...,...,...,...,...
711,1,0,2025-04-26,Wolverhampton Wanderers,Leicester City,W,Wolves,0,0,Leicester City,Wolves,L,Leicester City
712,0,0,2025-05-02,Wolverhampton Wanderers,Manchester City,L,Wolves,1,1,Manchester City,Wolves,W,Manchester City
713,0,0,2025-05-10,Wolverhampton Wanderers,Brighton,L,Wolves,1,1,Brighton and Hove Albion,Wolves,W,Brighton
714,0,0,2025-05-20,Wolverhampton Wanderers,Crystal Palace,L,Wolves,1,0,Crystal Palace,Wolves,W,Crystal Palace


In [101]:
# Keep only matches where the model predicted a win for team x and no win for its opponent,
# then count how often team x actually won (1) or did not (0).
merged[(merged['prediction_x'] == 1) & (merged['prediction_y'] == 0)]["actual_x"].value_counts()

Unnamed: 0_level_0,count
actual_x,Unnamed: 1_level_1
0,90
1,85


In [104]:
cols

['GF', 'GA', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']