In [28]:
## MIKE - run all the cells below this one, then run this cell.

#Configuration
league = "EPL"

year = 2024

num_to_check = 10

# Get today's date
todays_date = datetime.now().date().strftime("%Y-%m-%d")

# File names
lg_table_file = "lg_table_" + todays_date + ".csv"
match_table_file = "match_table_" + todays_date + ".csv"

lg_table = pd.DataFrame()
match_table = pd.DataFrame()

try:
    # Try reading the CSV files
    lg_table = pd.read_csv(lg_table_file)
    match_table = pd.read_csv(match_table_file)

    print("Files loaded successfully!")
except FileNotFoundError:
    # If the files do not exist, handle the exception here
    print("CSV file(s) not found for today's date:", todays_date)
    # You can create new DataFrames or perform any other necessary action in case of file absence
    


if(match_table.empty or lg_table.empty):
    lg_table, match_table = scrape_league_and_matches(league, year)
    match_table.to_csv(lg_table_file, index=False)
    lg_table.to_csv(match_table_file, index=False)


    
away_averages, home_averages = get_team_averages(lg_table)
merged = merge_lg_and_match_tables(match_table, home_averages, away_averages)

#Derive league xg and xGA's for home/away (league strength values)
home_atk_str = merged["Home xG"].mean()
away_atk_str = merged["Away xG"].mean()
home_def_str = merged["Home xGA"].mean()
away_def_str = merged["Away xGA"].mean()
    
team_stats = get_team_stats(merged, home_atk_str, away_atk_str, home_def_str, away_def_str)
homes, aways = get_next_matches(num_to_check)

homes, aways = clean_names(homes, aways)

for i in range(0, len(homes)):
    title = "== Home: " + homes[i] + " | Away: " + aways[i] + " ==" 
    print(title)
    print(""+"-"*len(title)+"")
    #try:
    home, away = compareTeams(homes[i], aways[i], team_stats, home_atk_str, away_atk_str)
    probs = getPoisson(home, away)
    summarise(probs, homes[i], aways[i])
    #except IndexError:
        #print("\nSkipping, as team name not possible.\n")
        
    
    print(""+"-"*len(title)+"\n")


Files loaded successfully!
== Home: Everton | Away: Newcastle United ==
--------------------------------------------
               Team  Win Chance (%)  Odds
0           Everton           12.00  8.33
1  Newcastle United           63.00  1.59
2              Draw           24.00  4.17
3             Total            0.99     -
--------------------------------------------

== Home: Tottenham Hotspur | Away: West Ham United ==
-----------------------------------------------------
                Team  Win Chance (%)  Odds
0  Tottenham Hotspur           26.00  3.85
1    West Ham United           50.00   2.0
2               Draw           23.00  4.35
3              Total            0.99     -
-----------------------------------------------------

== Home: Crystal Palace | Away: Liverpool ==
--------------------------------------------
             Team  Win Chance (%)  Odds
0  Crystal Palace           18.00  5.56
1       Liverpool           57.00  1.75
2            Draw           24.00  4.17

  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)
  result = float(atk) * float(dfnce) * float(avg_away_xG)


TypeError: cannot convert the series to <class 'float'>

In [None]:
import ScraperFC as sfc
import traceback2 as traceback
import pandas as pd
from urllib.parse import urlparse
import requests
from scipy.stats import poisson
from datetime import datetime

In [None]:
def clean_names(homes, aways):
    
    for index, squad in enumerate(homes):
        if "Bournemouth" in squad:
            homes[index] = "Bournemouth"
    
    for index, squad in enumerate(aways):
        if "Bournemouth" in squad:
            homes[index] = "Bournemouth"
        
    return homes, aways

In [29]:
#https://www.football-data.org/client/register
def get_next_matches(num):
    # Replace 'YOUR_API_KEY' with your actual API key if required by the Premier League API
    api_key = '0ea7a91de7094131b0d1cd0e6753b21a'
    url = f'https://api.football-data.org/v2/competitions/PL/matches?status=SCHEDULED&limit=10'

    headers = {
        'X-Auth-Token': api_key  # Add your API key here if required
    }

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        data = response.json()

        matches = []
        for match in data['matches']:
            home_team = match['homeTeam']['name'].replace(" FC", "")
            away_team = match['awayTeam']['name'].replace(" FC", "")
            match_date = match['utcDate']
            matches.append({'Home Team': home_team, 'Away Team': away_team, 'Date': match_date})

        # Create a DataFrame
        df = pd.DataFrame(matches)

        # Convert 'Date' column to datetime
        df['Date'] = pd.to_datetime(df['Date'])

        # Sort by the 'Date' column (most recent matches first)
        df = df.sort_values(by='Date', ascending=True)
        selection = df.head(num)
        home_team = list(selection["Home Team"])
        away_team = list(selection["Away Team"])
        

        return home_team, away_team
    
    else:
        print("Failed to fetch data. Status code:", response.status_code)
    

In [30]:
# Initialize the FBRef scraper
def scrape_league_and_matches(league, year):
    scraper = sfc.FBRef()

    try:
        # Scrape the League table
        lg_table = scraper.scrape_league_table(year=year, league=league)

        # clean league table columns
        lg_table.drop(columns=["Goalkeeper", "Attendance", "Top Team Scorer", "Notes"], inplace=True)

        # Scrape the Match table
        match_table = scraper.scrape_matches(year, league, save=False)
        # Define unneccesary columns to be removed
        rm_cols = ['Home Team ID',
               'Away Team ID', 'Home Formation', 'Away Formation', 'Home Player Stats',
               'Away Player Stats', "Away Ast", "Home Ast", "Home xAG", "Away xAG", "Home npxG", "Away npxG"]
        #remove
        match_table.drop(columns=rm_cols, inplace=True)
   
    except:
        # Catch and print any exceptions. This allows us to still close the
        # scraper below, even if an exception occurs.
        traceback.print_exc()
    finally:
        # It's important to close the scraper when you're done with it. Otherwise,
        # you'll have a bunch of webdrivers open and running in the background.
        scraper.close()
    return lg_table, match_table

In [31]:
def get_team_averages(lg_table):
    #Get unique team names
    away_teams = lg_table["Away Team"].unique()
    home_teams = lg_table["Home Team"].unique()
    away_averages = pd.DataFrame()
    home_averages = pd.DataFrame()

    #For each away team, get the last 5 match results from match table, getting the mean GA, GF, xGA xG values.
    # Store as dataframe away_averages
    for team in away_teams:
        team_table = lg_table[lg_table["Away Team"] == team].tail(5)
        team_table.rename(columns={"Home xG" : "Away xGA"}, inplace=True)
        team_table.rename(columns={"Home Goals" : "Away GA"}, inplace=True)
        team_table.rename(columns={"Away Goals" : "Away GF"}, inplace=True)

        new_data = {
            "Squad": [team],
            "Away GA": [team_table["Away GA"].mean()],
            "Away GF": [team_table["Away GF"].mean()],
            "Away xGA": [team_table["Away xGA"].mean()],
            "Away xG": [team_table["Away xG"].mean()]
        }
        new_df = pd.DataFrame(new_data)
       
        away_averages = pd.concat([away_averages, new_df], ignore_index=True)


    #For each home team, get the last 5 match results from match table, getting the mean GA, GF, xGA xG values.
    # Store as dataframe home_averages   
    for team in home_teams:
        team_table = lg_table[lg_table["Home Team"] == team].tail(5)

        team_table.rename(columns={"Away xG" : "Home xGA"}, inplace=True)
        team_table.rename(columns={"Away Goals" : "Home GA"}, inplace=True)
        team_table.rename(columns={"Home Goals" : "Home GF"}, inplace=True)

        new_data = {
            "Squad": [team],
            "Home GA": [team_table["Home GA"].mean()],
            "Home GF": [team_table["Home GF"].mean()],
            "Home xGA": [team_table["Home xGA"].mean()],
            "Home xG": [team_table["Home xG"].mean()]
        }
        new_df = pd.DataFrame(new_data)
        home_averages = pd.concat([home_averages, new_df], ignore_index=True)
    
    return away_averages, home_averages

In [32]:
# Make the squad names in lg_table more consistent with home_averages and away_averages
def merge_lg_and_match_tables(lg_table, home_averages, away_averages):
    
    # Ensure consistent squad names
    corrections = {
        "Nott'ham Forest": "Nottingham Forest",
        "Wolves": "Wolverhampton Wanderers",
        "Tottenham": "Tottenham Hotspur",
        "Manchester Utd": "Manchester United",
        "Newcastle Utd": "Newcastle United",
        "Sheffield Utd": "Sheffield United",
        "Brighton": "Brighton & Hove Albion",
        "West Ham": "West Ham United"
    }
    
    lg_table["Squad"] = lg_table["Squad"].replace(corrections)
    
    #Merge the cleaned results from home_averages and away_averages with match_table
    merged = pd.merge(lg_table, home_averages, on="Squad", how="inner")
    merged = pd.merge(merged, away_averages, on="Squad", how="inner")
    #merged = merged.drop(columns=["Last 5"])
    
    return merged

In [33]:
def get_team_stats(merged, home_atk_str, away_atk_str, home_def_str, away_def_str):
    
    team_stats = pd.DataFrame()

    #Calculate squad home/away attacking strength for each squad
    for squad in merged["Squad"].unique():
        team = merged[merged["Squad"] == squad]

        strengths = {
        "Squad" : squad,
        "home_atk_str" : team["Home xG"]/home_atk_str,
        "away_atk_str" : team["Away xG"]/away_atk_str,
        "home_def_str" : team["Home xGA"]/home_def_str,
        "away_def_str" : team["Away xGA"]/away_def_str
        }

        team_stats = pd.concat([team_stats, pd.DataFrame(strengths)], ignore_index=True)
    return team_stats

In [34]:
#Retrieve poisson distribution probabilities for home/away goals
def compareTeams(home_team, away_team, stats, avg_home_xG, avg_away_xG):
    
    atk = stats[stats["Squad"] == home_team]["away_atk_str"] 
    dfnce = stats[stats["Squad"] == away_team]["home_def_str"]
    result = float(atk) * float(dfnce) * float(avg_away_xG)
    
    home_probs = []
    for i in range(0, 6):   
        numbers = poisson.pmf(i, result)
        home_probs.append(numbers)

    
    atk = stats[stats["Squad"] == away_team]["away_atk_str"] 
    dfnce = stats[stats["Squad"] == home_team]["home_def_str"]
    
    result = float(atk) * float(dfnce) * float(avg_away_xG)
    
    away_probs = []
    for i in range(0, 6):   
        numbers = poisson.pmf(i, result)
        away_probs.append(numbers)
        
    return home_probs, away_probs
    


In [35]:
# Create poisson probability distribution dataframe
def getPoisson(home, away):
    probs = pd.DataFrame(columns=["0", "1", "2", "3", "4", "5"])
    for h in home:
        home_goals = []
        for a in away:
            home_goals.append(round(h*a, 5))
        probs.loc[len(probs)] = home_goals
    return probs

In [36]:
# Sum total probabilities for draw, home and away.
def summarise(probs, home_squad, away_squad):
    draw_probs = []
    away_probs = []
    home_probs = []
    for i in range(0, 6):
        draw_probs.append(probs.iloc[i, i])

        for j in range(0, 6):
            if(j > i):
                away_probs.append(probs.iloc[i, j])
            if(j < i):
                home_probs.append(probs.iloc[i, j])

    sum_draw_probs = round(sum(draw_probs), 2)
    away_win_probs = round(sum(away_probs), 2)
    home_win_probs = round(sum(home_probs), 2)
    
    # Define the statistics
    home_win_percentage = home_win_probs * 100
    away_win_percentage = away_win_probs * 100
    draw_percentage = sum_draw_probs * 100
    total_probability = sum_draw_probs + away_win_probs + home_win_probs

    # Create a dictionary with the statistics
    data = {
        'Team': [home_squad, away_squad, 'Draw', 'Total'],
        'Win Chance (%)': [home_win_percentage, away_win_percentage, draw_percentage, total_probability],
        'Odds': [round(1/home_win_probs, 2), round(1/away_win_probs, 2), round(1/sum_draw_probs, 2), '-']
    }

    # Create a DataFrame
    df = pd.DataFrame(data)

    # Print the DataFrame
    print(df)
    return df


---

In [37]:
print(lg_table.columns)


Index(['Link', 'Date', 'Stage', 'Home Team', 'Away Team', 'Home Goals',
       'Away Goals', 'Home xG', 'Away xG', 'Shots'],
      dtype='object')
