In [1]:
"""
Data from the current season fragment will be plugged into the model to get picks.
The most recent full season will be the test dataset.
All seasons before that will be the train dataset.

In the same folder as this notebook, you should have folders called "data"
(to store the data files as CSV) and "models" (to store the machine learning 
models as pickle files). Otherwise sections of this code will throw errors.
"""

# import packages
import numpy as np
import pandas as pd
import datetime as dt
import http.client
import json
import time
from bs4 import BeautifulSoup
import joblib
import requests

# Settings
pd.set_option('display.max_columns', 100)

# Define current season 
#   (and the test dataset is the entire season before that one)
season_to_bet = 2024
season_to_test = season_to_bet - 1

# Define leagues 
#   E0: the English Premier League
#   D1: the German Bundesliga
#   SP1: Spanish La Liga
leagues_of_choice = ["E0", "D1", "SP1"]

# Starting year of the oldest season to download
#   (the historic CSV file names further down also use 2010)
first_season = 2010

# Define years being used to train the model
#   Season codes are built from season_to_bet instead of being typed by hand,
#   so bumping season_to_bet automatically adds the new season.
#   Newest first: "2425", "2324", ..., "1011"
years_of_choice = [
    f"{start_year % 100:02d}{(start_year + 1) % 100:02d}"
    for start_year in range(season_to_bet, first_season - 1, -1)
]

# URLs to scrape for upcoming fixtures
url_dict = {
    "E0": "https://fbref.com/en/comps/9/schedule/Premier-League-Scores-and-Fixtures",
    "D1": "https://fbref.com/en/comps/20/schedule/Bundesliga-Scores-and-Fixtures",
    "SP1": "https://fbref.com/en/comps/12/schedule/La-Liga-Scores-and-Fixtures"
}

# Set the days of the upcoming week
#   today() is read once so both ends of the window come from the same date
date_of_check = dt.date.today()
end_of_checked_week = date_of_check + dt.timedelta(days=7)
days_to_bet = pd.date_range(date_of_check, end_of_checked_week)
days_to_bet


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/markyoung/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Users/markyoung/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/Users/markyoung/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 736, in start
    self.io_loop.start()
  File "

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/markyoung/anaconda3/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/Users/markyoung/anaconda3/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
    app.start()
  File "/Users/markyoung/anaconda3/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 736, in start
    self.io_loop.start()
  File "

AttributeError: _ARRAY_API not found

DatetimeIndex(['2024-12-08', '2024-12-09', '2024-12-10', '2024-12-11',
               '2024-12-12', '2024-12-13', '2024-12-14', '2024-12-15'],
              dtype='datetime64[ns]', freq='D')

In [2]:
"""
Get data sets from football-data.co.uk
"""

# One CSV per league/season. Frames are collected in a list and concatenated
# once at the end rather than re-copying a growing DataFrame on every pass.
season_frames = []

for eachLeague in leagues_of_choice:
    for eachSeason in years_of_choice:
        # Pause between requests (presumably to avoid hammering the server)
        time.sleep(5)
        url = f"https://www.football-data.co.uk/mmz4281/{eachSeason}/{eachLeague}.csv"
        try:
            temp_df = pd.read_csv(url)
        except Exception as err:
            # Still best effort: a file that cannot be fetched or parsed is
            # skipped. But say so, and (unlike a bare "except:") do not
            # swallow KeyboardInterrupt while the loop is running.
            print(f"Skipping {url}: {err!r}")
            continue
        # The first two digits of the season code give the starting year
        temp_year = eachSeason[0:2]
        temp_df['Season'] = int(f"20{temp_year}")
        season_frames.append(temp_df)

# Same result as before when nothing could be downloaded: an empty DataFrame
all_matches_raw = pd.concat(season_frames, axis=0) if season_frames else pd.DataFrame()


In [3]:
"""
Target variables: home win vs away win
"""

# 1 when the 'FTR' column holds 'H' (homewin) or 'A' (awaywin), otherwise 0
all_matches_raw = all_matches_raw.assign(
    homewin=all_matches_raw['FTR'].eq('H').astype(int),
    awaywin=all_matches_raw['FTR'].eq('A').astype(int),
)

In [4]:
"""
Dates in the file are wonky. Make them consistent with 
the built-in Pandas date parser "to_datetime"
"""

# Parse every entry on its own (format='mixed'), reading the day before the month
parsed_dates = pd.to_datetime(
    all_matches_raw['Date'],
    dayfirst=True,
    format='mixed',
)
all_matches_raw['FixedDate'] = parsed_dates

all_matches_raw.tail()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,BFH,BFD,BFA,PSH,PSD,PSA,WHH,WHD,WHA,1XBH,1XBD,1XBA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,BFEH,BFED,...,IWD,IWA,VCH,VCD,VCA,IWCH,IWCD,IWCA,VCCH,VCCD,VCCA,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,LBH,LBD,LBA,SJH,SJD,SJA,GBH,GBD,GBA,BSH,BSD,BSA,SBH,SBD,SBA,Unnamed: 70,Unnamed: 71,Unnamed: 72,homewin,awaywin,FixedDate
375,SP1,21/05/11,,Mallorca,Ath Madrid,3.0,4.0,A,0.0,2.0,A,,25.0,12.0,14.0,6.0,17.0,8.0,10.0,4.0,1.0,3.0,0.0,0.0,2.5,2.75,3.3,2.55,2.7,3.15,,,,,,,2.62,2.75,3.1,,,,,,,,,,,,...,2.6,3.4,2.5,3.0,3.2,,,,,,,40.0,2.65,2.5,3.0,2.79,3.4,3.22,33.0,1.75,1.7,2.19,2.09,23.0,0.0,1.77,1.71,2.26,2.17,2.38,2.8,2.88,2.5,2.7,3.4,2.5,2.75,3.1,2.4,2.75,3.1,2.5,2.6,3.2,,,,0,1,2011-05-21
376,SP1,21/05/11,,Osasuna,Villarreal,1.0,0.0,H,1.0,0.0,H,,14.0,4.0,6.0,3.0,20.0,10.0,11.0,3.0,1.0,2.0,0.0,0.0,1.8,3.0,5.75,1.8,2.8,5.95,,,,,,,1.83,2.8,6.0,,,,,,,,,,,,...,2.8,5.0,1.8,3.1,6.0,,,,,,,40.0,1.87,1.8,3.18,2.94,6.05,5.65,37.0,2.0,1.9,1.98,1.89,21.0,-0.5,1.86,1.81,2.12,2.07,1.91,2.62,4.5,1.8,3.0,6.0,1.85,2.75,5.5,1.83,3.0,4.5,1.83,2.7,5.5,,,,1,0,2011-05-21
377,SP1,21/05/11,,Real Madrid,Almeria,8.0,1.0,H,2.0,1.0,H,,27.0,6.0,13.0,1.0,11.0,10.0,15.0,6.0,1.0,1.0,0.0,0.0,1.13,8.5,17.0,1.1,9.5,15.5,,,,,,,1.14,7.5,17.0,,,,,,,,,,,,...,7.3,15.0,1.11,11.0,19.0,,,,,,,40.0,1.14,1.11,10.91,8.76,22.0,18.15,25.0,1.25,1.23,4.3,3.9,19.0,-2.25,1.75,1.69,2.29,2.24,1.11,6.5,15.0,1.13,8.5,17.0,1.12,8.0,17.0,1.12,8.0,15.0,1.11,7.5,15.0,,,,1,0,2011-05-21
378,SP1,21/05/11,,Santander,Ath Bilbao,1.0,2.0,A,0.0,2.0,A,,18.0,9.0,6.0,7.0,14.0,12.0,9.0,2.0,2.0,0.0,1.0,0.0,3.0,3.5,2.25,3.0,3.65,2.1,,,,,,,2.88,3.4,2.1,,,,,,,,,,,,...,3.3,2.3,3.1,3.75,2.2,,,,,,,41.0,3.26,3.08,3.81,3.43,2.3,2.21,28.0,1.72,1.64,2.26,2.15,20.0,0.25,2.01,1.97,1.96,1.91,2.75,3.25,2.2,3.0,3.5,2.25,3.0,3.25,2.25,2.8,3.25,2.3,2.9,3.3,2.2,,,,0,1,2011-05-21
379,SP1,21/05/11,,Sociedad,Getafe,1.0,1.0,D,0.0,1.0,A,,12.0,8.0,7.0,1.0,8.0,12.0,4.0,3.0,0.0,1.0,0.0,0.0,2.38,3.25,3.0,2.3,3.25,2.95,,,,,,,2.4,3.25,2.9,,,,,,,,,,,,...,3.3,3.1,2.4,3.4,3.1,,,,,,,41.0,2.45,2.36,3.44,3.26,3.18,2.95,37.0,1.83,1.72,2.2,2.07,21.0,0.0,1.75,1.71,2.21,2.15,2.2,3.2,2.8,2.38,3.4,2.88,2.35,3.3,2.8,2.38,3.2,2.75,2.35,3.2,2.75,,,,0,0,2011-05-21


In [5]:
"""
Drop columns I won't be using

And make the others all lower case
"""

# Columns kept for modelling (names as they appear in the downloaded files,
# plus the Season / FixedDate / homewin / awaywin columns added above)
my_cols = [
    'Div', 'Season', 'FixedDate', 'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'homewin', 'awaywin',
    'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF',
]

# Select and lower-case the column names in one step
df_raw = all_matches_raw[my_cols].rename(columns=str.lower)

df_raw.head()

Unnamed: 0,div,season,fixeddate,hometeam,awayteam,fthg,ftag,homewin,awaywin,hs,as,hst,ast,hc,ac,hf,af
0,E0,2024,2024-08-16,Man United,Fulham,1.0,0.0,1,0,14.0,10.0,5.0,2.0,7.0,8.0,12.0,10.0
1,E0,2024,2024-08-17,Ipswich,Liverpool,0.0,2.0,0,1,7.0,18.0,2.0,5.0,2.0,10.0,9.0,18.0
2,E0,2024,2024-08-17,Arsenal,Wolves,2.0,0.0,1,0,18.0,9.0,6.0,3.0,8.0,2.0,17.0,14.0
3,E0,2024,2024-08-17,Everton,Brighton,0.0,3.0,0,1,9.0,10.0,1.0,5.0,1.0,5.0,8.0,8.0
4,E0,2024,2024-08-17,Newcastle,Southampton,1.0,0.0,1,0,3.0,19.0,1.0,4.0,3.0,12.0,15.0,16.0


In [6]:
"""
Make rolling averages for the last three matches
"""

def rolling_averages(group, cols, new_cols):
    """Add trailing three-row means of `cols` to a frame ordered by date.

    The rows are sorted by "fixeddate". For each row, the mean of each column
    in `cols` over the three preceding rows is stored under the matching name
    in `new_cols`; the row itself is excluded (closed='left'). Rows without
    three full predecessors have no value and are dropped.

    Parameters
    ----------
    group : DataFrame with a "fixeddate" column and the columns in `cols`
    cols : list of source column names
    new_cols : list of output column names, paired with `cols` by position

    Returns
    -------
    A new, date-sorted DataFrame; the frame passed in is not modified.
    """
    ordered = group.sort_values("fixeddate")
    prior_means = ordered[cols].rolling(3, closed='left').mean()
    ordered[new_cols] = prior_means
    return ordered.dropna(subset=new_cols)

In [7]:
# Match statistics to average, and the names of the resulting columns
cols = ["hs", "as", "hst", "ast", "hc", "ac", "hf", "af"]
new_cols = [stat + "_rolling" for stat in cols]

# Rolling averages are computed per home team. include_groups=False keeps the
# 'hometeam' column out of the frames handed to rolling_averages, and
# droplevel then removes it from the index, so the result carries no team name.
raw_rolling = (
    df_raw
    .groupby('hometeam')
    .apply(lambda x: rolling_averages(x, cols, new_cols), include_groups=False)
    .droplevel('hometeam')
)

# Keep only seasons before the one being bet on
before_bet_season = raw_rolling['season'] < season_to_bet
raw_rolling = raw_rolling[before_bet_season]

raw_rolling.head()

Unnamed: 0,div,season,fixeddate,awayteam,fthg,ftag,homewin,awaywin,hs,as,hst,ast,hc,ac,hf,af,hs_rolling,as_rolling,hst_rolling,ast_rolling,hc_rolling,ac_rolling,hf_rolling,af_rolling
75,SP1,2016,2016-10-16,Malaga,1.0,1.0,0,0,8.0,12.0,4.0,5.0,6.0,7.0,14.0,14.0,12.666667,12.0,3.0,1.666667,4.333333,5.666667,13.666667,13.0
91,SP1,2016,2016-10-29,Real Madrid,1.0,4.0,0,1,8.0,15.0,3.0,5.0,4.0,3.0,15.0,6.0,11.0,13.666667,3.666667,3.0,5.0,6.0,15.0,13.666667
115,SP1,2016,2016-11-20,Espanol,0.0,1.0,0,1,10.0,10.0,4.0,3.0,2.0,5.0,20.0,14.0,10.666667,13.333333,4.333333,4.333333,5.0,5.333333,14.666667,13.0
134,SP1,2016,2016-12-04,Las Palmas,1.0,1.0,0,0,9.0,10.0,2.0,3.0,3.0,5.0,15.0,11.0,8.666667,12.333333,3.666667,4.333333,4.0,5.0,16.333333,11.333333
150,SP1,2016,2016-12-16,Betis,1.0,0.0,1,0,10.0,6.0,2.0,3.0,6.0,4.0,20.0,6.0,9.0,11.666667,3.0,3.666667,3.0,4.333333,16.666667,10.333333


In [8]:
"""
Get unique values in the "Div" column 

(We don't just use the list of league codes from above, as a way of checking the data)
"""

# Distinct league codes actually present in the data, in order of first appearance
league_code_list = pd.unique(raw_rolling['div'])

league_code_list

array(['SP1', 'E0', 'D1'], dtype=object)

In [9]:
# Write one historic CSV per league, covering every season before season_to_bet

for league in league_code_list:
    is_league = raw_rolling['div'] == league
    is_past_season = raw_rolling['season'] < season_to_bet
    league_hist_df = raw_rolling[is_league & is_past_season]
    league_hist_df.to_csv(f"data/{league}_2010to{season_to_bet - 1}.csv")

In [10]:
# Build, per league, a table of home-team features for the current season and
# save it as data/{league}_{season_to_bet}.csv. Each row is one home team:
#   - `cols`:     season-to-date totals from that team's home fixtures
#   - `new_cols`: totals over its three most recent home fixtures
# Both sets are standardised (mean 0, std 1) across the teams of the league.
# NOTE(review): the training data uses per-match values and three-match means,
# whereas these are sums - confirm the two are meant to be comparable.

for league in league_code_list:
    this_league = df_raw['div'] == league
    this_season = df_raw['season'] == season_to_bet
    league_season_df = df_raw[this_league & this_season]

    # Last three home fixtures of every team, by date
    league_season_last3 = (
        league_season_df
        .sort_values(["hometeam", "fixeddate"])
        .groupby('hometeam')
        .tail(3)
    )
    last3_stats = league_season_last3.groupby("hometeam")[cols].sum()
    last3_stats.columns = new_cols
    last3_centered = last3_stats[new_cols].sub(last3_stats[new_cols].mean(axis=0))
    last3_stats[new_cols] = last3_centered.div(last3_stats[new_cols].std(axis=0))

    # Season-to-date totals of every team's home fixtures
    league_season_sum = league_season_df.groupby('hometeam')[cols].sum()
    season_centered = league_season_sum[cols].sub(league_season_sum[cols].mean(axis=0))
    league_season_sum[cols] = season_centered.div(league_season_sum[cols].std(axis=0))

    # Attach the last-three features (aligned on the hometeam index) and save
    league_season_sum[new_cols] = last3_stats
    league_season_sum.to_csv(f"data/{league}_{season_to_bet}.csv")
    

In [11]:
"""
Function to make models

I liked the XGBoosted model, but it was picking very strange upsets
and may have been overfitted to the training data.

So I'm going back to my previous model, Logistic Regression
"""

from sklearn.linear_model import LogisticRegression

def make_models(leagueName):
    """Train, save and score the home-win and away-win models for one league.

    Reads data/{leagueName}_2010to{season_to_test}.csv, trains on every season
    before `season_to_test`, and writes the two fitted LogisticRegression
    models to models/{leagueName}_homewin.pkl and
    models/{leagueName}_awaywin.pkl.

    Parameters
    ----------
    leagueName : str
        League code used in the data and model file names (e.g. "E0").

    Returns
    -------
    dict
        Mean accuracy of each model on the `season_to_test` rows, keyed by
        "homewin" and "awaywin". A value is None when the file holds no rows
        for that season. (The held-out season used to be prepared but never
        evaluated, and the function returned None.)
    """
    # Get data, isolate relevant rows/columns for modeling
    matches = pd.read_csv(f"data/{leagueName}_2010to{season_to_test}.csv", index_col=0)
    targets = ["homewin", "awaywin"]
    predictors = ["hs", "as", "hst", "ast", "hc", "ac", "hf", "af",
                  "hs_rolling", "as_rolling", "hst_rolling", "ast_rolling",
                  "hc_rolling", "ac_rolling", "hf_rolling", "af_rolling"]
    all_vars = targets + predictors

    # .copy() gives each split its own data, so the normalisation below is not
    # a chained assignment onto a slice of `matches`.
    train_df = matches.loc[matches["season"] < season_to_test, all_vars].copy()
    test_df = matches.loc[matches["season"] == season_to_test, all_vars].copy()

    # Data transforms: Normalize and fill NA with 0
    #   Each split is standardised with its own mean and standard deviation,
    #   as before. A constant column gives 0/0 = NaN, which fillna turns into 0.
    for split in (train_df, test_df):
        centered = split[predictors].sub(split[predictors].mean(axis=0))
        split[predictors] = centered.div(split[predictors].std(axis=0))
    train_df = train_df.fillna(0)
    test_df = test_df.fillna(0)

    # Same estimator settings for both targets: home win first, then away win
    scores = {}
    for target in targets:
        model = LogisticRegression(C=0.01, solver='liblinear')
        model.fit(train_df[predictors], train_df[target])
        joblib.dump(model, f'models/{leagueName}_{target}.pkl')
        # score() is the mean accuracy; it cannot be computed on zero rows
        if len(test_df) > 0:
            scores[target] = model.score(test_df[predictors], test_df[target])
        else:
            scores[target] = None
    return scores

In [12]:
"""
Run the function
"""

# Train and save both models for every configured league
for league in leagues_of_choice:
    make_models(leagueName=league)

In [13]:
"""
Define arrays of column names that we will need
after scraping the upcoming matches
"""

# Columns kept from the scraped fixtures table
cols_to_keep = "Wk Day Date Home Away".split()

# Model inputs: the eight match statistics followed by their "_rolling"
# counterparts, in the same order
_match_stats = ['hs', 'as', 'hst', 'ast', 'hc', 'ac', 'hf', 'af']
model_cols = _match_stats + [f"{stat}_rolling" for stat in _match_stats]

In [14]:
"""
Function to scrape the week's matches
"""

from io import StringIO

def get_matches(theLeague, theSeason, name_map=None):
    """Scrape one league's fixtures for the coming week and attach home-team features.

    Parameters
    ----------
    theLeague : str
        League code; key into `url_dict` and part of the feature file name.
    theSeason : int
        Season whose feature file data/{theLeague}_{theSeason}.csv is read.
    name_map : dict, optional
        Maps a team name as spelled in the feature file's `hometeam` column to
        its spelling on the fixtures page. Teams missing from the mapping are
        taken to be spelled identically in both sources. When omitted, the
        two sources are paired by alphabetical position (the original
        behaviour, see the note in the body).

    Returns
    -------
    DataFrame
        One row per fixture dated within `days_to_bet`: the `cols_to_keep`
        columns plus the feature-file columns of the home team.

    Raises
    ------
    requests.HTTPError
        The fixtures page answered with an error status.
    ValueError
        The team names of the two sources cannot be paired, or a home team in
        the fixtures has no row in the feature file.
    """
    # Get upcoming fixtures; fail on an HTTP error page or a stalled
    # connection instead of parsing garbage or hanging.
    response = requests.get(url_dict[theLeague], timeout=30)
    response.raise_for_status()
    schedule = pd.read_html(StringIO(response.text))[0]
    schedule['Date'] = pd.to_datetime(schedule['Date'])
    # Rows without a date carry no fixture
    schedule = schedule.dropna(subset=['Date'])
    thisfix = schedule.loc[schedule['Date'].isin(days_to_bet), cols_to_keep]

    # Attach predictive data from this season to those fixtures
    season_to_date = pd.read_csv(f"data/{theLeague}_{theSeason}.csv")

    # Need to execute a join, but names are not consistent across files
    if name_map is not None:
        season_to_date['newhome'] = (
            season_to_date['hometeam'].map(name_map).fillna(season_to_date['hometeam'])
        )
    else:
        # Positional pairing of the two alphabetically sorted name lists.
        # Team names come from the WHOLE schedule, not just this week's
        # fixtures, so a team without a match in the window no longer breaks
        # the pairing.
        # NOTE(review): this is only right when both sources sort their team
        # names into the same order; spellings that sort differently get the
        # wrong team's features. Pass `name_map` to join on explicit names.
        names1 = sorted(set(schedule['Home'].dropna()) | set(schedule['Away'].dropna()))
        if len(names1) != len(season_to_date):
            raise ValueError(
                f"{theLeague}: {len(names1)} teams in the fixture list but "
                f"{len(season_to_date)} rows in data/{theLeague}_{theSeason}.csv; "
                "cannot pair team names by position"
            )
        season_to_date['newhome'] = names1

    matches_to_bet = thisfix.merge(season_to_date, how='left', left_on='Home', right_on='newhome')

    # A home team without a partner row would reach the models as all-NaN features
    unmatched = matches_to_bet.loc[matches_to_bet['newhome'].isna(), 'Home']
    if not unmatched.empty:
        raise ValueError(
            f"{theLeague}: no feature row for home team(s) "
            f"{sorted(set(unmatched.astype(str)))}"
        )

    return matches_to_bet

In [15]:
"""
Function to apply our models to the dataframe we just scraped
"""

def apply_models(theLeague, theDF):
    """Score a league's upcoming fixtures with its saved home-win and away-win models.

    Parameters
    ----------
    theLeague : str
        League code; selects models/{theLeague}_homewin.pkl and
        models/{theLeague}_awaywin.pkl and is written to the 'div' column.
    theDF : DataFrame
        Fixtures with 'Wk', 'Day', 'Home', 'Away' and the `model_cols`
        feature columns. It is not modified; scoring happens on a copy.

    Returns
    -------
    DataFrame
        One row per fixture with the predicted class and probability for a
        home win and an away win, a derived draw probability and a 0/1
        'draw_threat' flag. Empty (same columns) when `theDF` has no rows,
        e.g. when the league has no fixtures in the window.

    Raises
    ------
    ValueError
        A fixture has missing feature values.
    """
    slip_cols = ['div', 'Wk', 'Day', 'Home', 'Away',
                 'pred_homewin', 'prob_homewin',
                 'pred_awaywin', 'prob_awaywin',
                 'prob_draw', 'draw_threat']

    scored = theDF.copy()
    scored['div'] = theLeague

    # The estimators reject zero-row input, so return early with the
    # betslip's columns and no rows.
    if scored.empty:
        return scored.reindex(columns=slip_cols)

    features = scored[model_cols]
    incomplete = features.isna().any(axis=1)
    if incomplete.any():
        teams = sorted(set(scored.loc[incomplete, 'Home'].astype(str)))
        raise ValueError(f"{theLeague}: missing feature values for home team(s) {teams}")

    # Apply the home-win model, then the away-win model.
    # joblib.load unpickles the file: only load model files you created yourself.
    for side in ('homewin', 'awaywin'):
        model = joblib.load(f'models/{theLeague}_{side}.pkl')
        scored[f'pred_{side}'] = model.predict(features)
        # Column 1 of predict_proba is taken as the probability of class 1
        scored[f'prob_{side}'] = model.predict_proba(features)[:, 1]

    # The two models are fitted independently, so their probabilities can add
    # up to more than 1; the remainder is floored at 0 to stay a probability.
    scored['prob_draw'] = (1 - scored['prob_homewin'] - scored['prob_awaywin']).clip(lower=0)
    prob_diff = np.abs(scored['prob_homewin'] - scored['prob_awaywin'])
    # Flag a draw risk when the two sides are close or a draw is fairly likely
    scored['draw_threat'] = ((prob_diff < 0.1) | (scored['prob_draw'] > 0.25)).astype(int)

    return scored[slip_cols]

In [16]:
"""
Bring the two functions together
"""

def make_betslip(leagueList, aYear):
    """Build one combined betslip for several leagues.

    For each league code in `leagueList`, the upcoming fixtures are fetched
    with get_matches(code, aYear) and scored with apply_models; the
    per-league results are stacked row-wise in list order.

    Returns an empty DataFrame when `leagueList` is empty.
    """
    league_slips = [
        apply_models(code, get_matches(code, aYear))
        for code in leagueList
    ]
    # The leading empty frame keeps the result a DataFrame even with no leagues
    return pd.concat([pd.DataFrame(), *league_slips], axis=0)

In [17]:
"""
This week's betslip
"""

# Displayed as the cell output: picks for every configured league this week
make_betslip(leagueList=leagues_of_choice, aYear=season_to_bet)

Unnamed: 0,div,Wk,Day,Home,Away,pred_homewin,prob_homewin,pred_awaywin,prob_awaywin,prob_draw,draw_threat
0,E0,15.0,Sun,Fulham,Arsenal,1,0.646981,0,0.149679,0.203341,0
1,E0,15.0,Sun,Ipswich Town,Bournemouth,0,0.165993,1,0.601319,0.232689,0
2,E0,15.0,Sun,Leicester City,Brighton,0,0.062585,1,0.814969,0.122446,0
3,E0,15.0,Sun,Tottenham,Chelsea,1,0.614904,0,0.168717,0.216379,0
4,E0,15.0,Mon,West Ham,Wolves,0,0.428324,0,0.343961,0.227715,1
5,E0,16.0,Sat,Newcastle Utd,Leicester City,0,0.250124,0,0.478296,0.27158,1
6,E0,16.0,Sat,Arsenal,Everton,1,0.782708,0,0.078085,0.139208,0
7,E0,16.0,Sat,Wolves,Ipswich Town,0,0.209981,1,0.606016,0.184003,0
8,E0,16.0,Sat,Liverpool,Fulham,1,0.887151,0,0.051638,0.061211,0
9,E0,16.0,Sat,Nott'ham Forest,Aston Villa,0,0.45621,0,0.269786,0.274004,1
