# Calling Necessary Libraries AND Getting Today's Date

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from dateutil import parser
from scipy.stats import poisson
import warnings

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Get today's date
given_date = "2024-10-14" #year-month-day
threshold = 80 #threshold for percentages to highlight in final dataframe (between 1 and 100)

wanted_leagues = ['argentina', 'austria', 'belgium',
                  'brazil', 'denmark', 'portugal2',
                  'england', 'england2', 'england3', 'england4', 'england5', 'france', 'france2',
                  'germany', 'germany2', 'greece', 'italy', 'italy2',
                   'mexico', 'netherlands', 'norway','poland', 'portugal',
                  'scotland', 'spain', 'spain2', 'sweden',
                  'switzerland', 'turkey', 'usa']

In [2]:
#Calculating Days between given date and today
# Today's date
today = datetime.now().date()

# Specific date
specific_date = datetime.strptime(given_date, "%Y-%m-%d").date()

# Calculate the difference in days
difference = specific_date - today

# Add one day to the difference
days_until_specific_date = difference.days + 1

# Scraping Today's Matches and Leagues

In [3]:
URL = "https://www.soccerstats.com/matches.asp?matchday=" + str(days_until_specific_date) + "&listing=2"
page = requests.get(URL)
liqa = []
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find(id="btable")
sth = results.find_all("tr", attrs={'height': '34'})

In [4]:
# Initialize lists to store extracted data
league_list = []
home_team_list = []
away_team_list = []

# Iterate through each <tr> element
for tr_element in sth:
    # Find the <td> element with sorttable_customkey attribute
    td_element = tr_element.find('td', attrs={'sorttable_customkey': True})
    
    # Check if <td> element exists and contains an <img> element with the alt attribute
    if td_element and td_element.find('img', alt=True):
        # Extract league info
        league = td_element['sorttable_customkey']
        if league in wanted_leagues:
            league_list.append(league)
        
        # Find <td> elements with class "steam"
        td_elements_steam = tr_element.find_all('td', class_='steam')
        
        # Extract home and away team info
        if len(td_elements_steam) == 2 and league in wanted_leagues:
            home_team = td_elements_steam[0].get_text(strip=True)
            away_team = td_elements_steam[1].get_text(strip=True)
            home_team_list.append(home_team)
            away_team_list.append(away_team)
        else:
            continue
    else:
        continue

# Create DataFrame
matches = pd.DataFrame({
    'league': league_list,
    'home_team': home_team_list,
    'away_team': away_team_list
})

# Display DataFrame
matches.head()

Unnamed: 0,league,home_team,away_team


# Converting Date and Collecting Leagues for Analysis

In [5]:
from dateutil import parser

day_abbreviations = {0: "Mo", 1: "Tu", 2: "We", 3: "Th", 4: "Fr", 5: "Sa", 6: "Su"}
given_date_parsed = parser.parse(given_date)

# Manually format the day to remove leading zeros
day = given_date_parsed.day
month = given_date_parsed.strftime('%b')

# Format the date as "Su 1 Oct" or "Tu 10 Oct" without leading zero for single-digit days
formatted_date = f"{day_abbreviations[given_date_parsed.weekday()]} {day} {month}"

# Scraping the Web for the League Statistics

In [6]:
final =  pd.DataFrame()
liqa = ''
unique_leagues = wanted_leagues#matches['league'].unique().tolist()
next_matches = pd.DataFrame()

for i in unique_leagues:
    URL = "https://www.soccerstats.com/results.asp?league=" + i + "&pmtype=bydate"
    page = requests.get(URL)
    liqa = i
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="btable")
    sth = results.find_all("tr", class_="odd")
    sth


    date, league, home, away, ft, ht = [], [], [], [], [],[]
    for i in sth:
        date.append(i.find_all("td", align = 'right')[0].get_text(strip=True))
        league.append(liqa.capitalize())
        home.append(i.find_all("td", align = 'right')[1].get_text(strip=True))
        away.append(i.find("td", align = "left").get_text(strip = True))
        ft.append(i.find_all("td", align = 'center')[0].get_text(strip = True))
        try:
            ht.append(i.find_all("td", align = 'center')[2].get_text(strip = True))
        except IndexError as e:
            ht.append('NA')#print("Last output before error occurred:", i.find_all("td", align = 'center'))

    data = {'Date': date, 'League': league,'Home': home, 'Away': away, 'FT': ft, 'HT': ht}

# Create a DataFrame from the dictionary
    df = pd.DataFrame(data)

# Replace empty strings with NaN
    next_df = df[(df['Date'] == formatted_date) & (df['HT'] == '')]
    next_matches = pd.concat([next_matches, next_df], ignore_index = True)
    df.replace('', pd.NA, inplace=True)

# Drop rows with NaN values
    df_cleaned = df.dropna()

#For Half-Time Results
    hthg, htag = [], []
    for i in df_cleaned['HT']:
        if i == 'NA':
            hthg.append('NA')
            htag.append('NA')
        elif i == '+' or i == '-':
            hthg.append('NA')
            htag.append('NA')
        else:
            try:
                hthg.append(int(i[1]))
                htag.append(int(i[3]))
            except IndexError as e:
                print("Last output before error occurred:", i)



#For Full-Time Results
    hg, ag, tg = [], [], []
    for i in df_cleaned['FT']:
        if len(i) < 5 or ':' in i:
            hg.append('NA')
            ag.append('NA')
            tg.append('NA')
        else:
            try:
                hghg = int(i.split(' - ')[0])
                hg.append(hghg)
                agag = int(i.split(' - ')[1])
                ag.append(agag)
                tg.append(hghg + agag)
            except:
                print(hghg + agag)

    
    df_cleaned['FTHG'], df_cleaned['FTAG'], df_cleaned['FTTG'] = hg, ag, tg
    df_cleaned['HTHG'], df_cleaned['HTAG'] = hthg, htag
    df_cleaned['HTTG'] = df_cleaned['HTHG'] + df_cleaned['HTAG']
    
    final = pd.concat([final, df_cleaned], ignore_index=True)
    
final = final[final['HT'] != 'NA']
combined_df = pd.concat([final.head(), final.tail()])
combined_df

Unnamed: 0,Date,League,Home,Away,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG
0,Fr 10 May,Argentina,Sarmiento,Instituto,1 - 2,(0-1),1,2,3,0,1,1
1,Sa 11 May,Argentina,Argentinos Jrs,Rosario Central,3 - 2,(2-2),3,2,5,2,2,4
2,Sa 11 May,Argentina,Newells,Platense,2 - 0,(0-0),2,0,2,0,0,0
3,Sa 11 May,Argentina,Huracan,Defensa y J.,3 - 1,(1-0),3,1,4,1,0,1
4,Sa 11 May,Argentina,Godoy Cruz,Barracas C.,0 - 1,(0-0),0,1,1,0,0,0
3385,Su 6 Oct,Usa,LA Galaxy,Austin,2 - 1,(1-0),2,1,3,1,0,1
3386,Su 6 Oct,Usa,SJ Earthquakes,Real Salt Lake,0 - 1,(0-0),0,1,1,0,0,0
3387,Su 6 Oct,Usa,New York City,Nashville SC,3 - 1,(3-0),3,1,4,3,0,3
3388,Mo 7 Oct,Usa,Portland,Dallas,0 - 0,(0-0),0,0,0,0,0,0
3389,Su 13 Oct,Usa,Columbus Crew,New England,4 - 0,(2-0),4,0,4,2,0,2


In [7]:
next_leagues = next_matches['League'].unique().tolist()
pd.concat([next_matches.head(), next_matches.tail()])

Unnamed: 0,Date,League,Home,Away,FT,HT
0,Mo 14 Oct,Portugal2,Portimonense,Benfica B,18:00,
1,Mo 14 Oct,Spain2,Sporting Gijon,Castellon,19:30,
2,Mo 14 Oct,Usa,Vancouver,Los Angeles FC,00:30,
0,Mo 14 Oct,Portugal2,Portimonense,Benfica B,18:00,
1,Mo 14 Oct,Spain2,Sporting Gijon,Castellon,19:30,
2,Mo 14 Oct,Usa,Vancouver,Los Angeles FC,00:30,


# Calculating Functions Needed for Dixon-Coles Model

In [8]:
from scipy.optimize import minimize
from scipy.stats import poisson

def rho_correction(x, y, lambda_x, mu_y, rho):
    if x==0 and y==0:
        return 1- (lambda_x * mu_y * rho)
    elif x==0 and y==1:
        return 1 + (lambda_x * rho)
    elif x==1 and y==0:
        return 1 + (mu_y * rho)
    elif x==1 and y==1:
        return 1 - rho
    else:
        return 1.0

def dc_log_like(x, y, alpha_x, beta_x, alpha_y, beta_y, rho, gamma):
    lambda_x, mu_y = np.exp(alpha_x + beta_y + gamma), np.exp(alpha_y + beta_x) 
    return (np.log(rho_correction(x, y, lambda_x, mu_y, rho)) + 
            np.log(poisson.pmf(x, lambda_x)) + np.log(poisson.pmf(y, mu_y)))



def solve_parameters(dataset, half_or_full = 'full', debug = False, init_vals=None, options={'disp': True, 'maxiter':100},
                     constraints = [{'type':'eq', 'fun': lambda x: sum(x[:20])-20}] , **kwargs):
    teams = np.sort(dataset['Home'].unique())
    # check for no weirdness in dataset
    away_teams = np.sort(dataset['Away'].unique())
    if not np.array_equal(teams, away_teams):
        raise ValueError("Something's not right")
    n_teams = len(teams)
    if init_vals is None:
        # random initialisation of model parameters
        init_vals = np.concatenate((np.random.uniform(0,1,(n_teams)), # attack strength
                                      np.random.uniform(0,-1,(n_teams)), # defence strength
                                      np.array([0, 1.0]) # rho (score correction), gamma (home advantage)
                                     ))

    def estimate_paramters(params):
        score_coefs = dict(zip(teams, params[:n_teams]))
        defend_coefs = dict(zip(teams, params[n_teams:(2*n_teams)]))
        rho, gamma = params[-2:]
        if half_or_full == 'full':
            log_like = [dc_log_like(row.FTHG, row.FTAG, score_coefs[row.Home], defend_coefs[row.Home],
                        score_coefs[row.Away], defend_coefs[row.Away], rho, gamma) for row in dataset.itertuples()]
        elif half_or_full == 'half':
            log_like = [dc_log_like(row.HTHG, row.HTAG, score_coefs[row.Home], defend_coefs[row.Home],
                        score_coefs[row.Away], defend_coefs[row.Away], rho, gamma) for row in dataset.itertuples()]

        return -sum(log_like)
    opt_output = minimize(estimate_paramters, init_vals, options=options, constraints = constraints, **kwargs)
    if debug:
        # sort of hacky way to investigate the output of the optimisation process
        return opt_output
    else:
        return dict(zip(["attack_"+team for team in teams] + 
                        ["defence_"+team for team in teams] +
                        ['rho', 'home_adv'],
                        opt_output.x))


# Calculating Lambda Values for Dixon-Coles Model

In [9]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
stats_df = pd.DataFrame()
full_time_models = []
half_time_models = []

for league in next_leagues:
    league_df = final[final['League'] == league.capitalize()]
    
    full_time_estimates = solve_parameters(league_df, half_or_full = 'full')
    full_time_models.append(full_time_estimates)

    half_time_estimates = solve_parameters(league_df, half_or_full = 'half')
    half_time_models.append(half_time_estimates)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 172.76255526686867
            Iterations: 42
            Function evaluations: 1683
            Gradient evaluations: 42
Optimization terminated successfully    (Exit mode 0)
            Current function value: 119.88549022837012
            Iterations: 29
            Function evaluations: 1162
            Gradient evaluations: 29
Optimization terminated successfully    (Exit mode 0)
            Current function value: 238.2931510367431
            Iterations: 73
            Function evaluations: 3529
            Gradient evaluations: 73
Optimization terminated successfully    (Exit mode 0)
            Current function value: 146.6649923795641
            Iterations: 40
            Function evaluations: 1932
            Gradient evaluations: 40
Optimization terminated successfully    (Exit mode 0)
            Current function value: 1426.21153064343
            Iterations: 83
            Function

In [10]:
full_time_models[-1]

{'attack_Atlanta Utd': 0.8366063035147298,
 'attack_Austin': 0.7054100374773052,
 'attack_CF Montreal': 0.9087846808615054,
 'attack_Charlotte': 0.8112494844247118,
 'attack_Chicago Fire': 0.7586131455111425,
 'attack_Cincinnati': 1.0717505132299852,
 'attack_Colorado Rapids': 1.1836066989340301,
 'attack_Columbus Crew': 1.295190894774223,
 'attack_DC United': 1.026908016322505,
 'attack_Dallas': 1.0766862022414696,
 'attack_Houston Dynamo': 0.8917804422466356,
 'attack_Inter Miami': 1.3545496881886046,
 'attack_LA Galaxy': 1.340176634537932,
 'attack_Los Angeles FC': 1.2224153536055744,
 'attack_Minnesota Utd': 1.1143012920456494,
 'attack_Nashville SC': 0.6153879512441682,
 'attack_New England': 0.6234631743179383,
 'attack_New York City': 1.0206565436989614,
 'attack_New York RB': 1.0314677867623516,
 'attack_Orlando City': 1.1109951560605782,
 'attack_Philadelphia': 1.193962272251305,
 'attack_Portland': 1.2912641411540169,
 'attack_Real Salt Lake': 1.251469160392484,
 'attack_SJ E

# Calculating Probability Matrices for Half/Full Time

In [11]:
#First Function needs work to make it more understandable and a df rather than matrix!
def dixon_coles_simulate_match(params_dict, homeTeam, awayTeam, max_goals=10):
    team_avgs = [np.exp(params_dict['attack_'+homeTeam] + params_dict['defence_'+awayTeam] + params_dict['home_adv']),
                 np.exp(params_dict['defence_'+homeTeam] + params_dict['attack_'+awayTeam])]
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in team_avgs]
    output_matrix = np.outer(np.array(team_pred[0]), np.array(team_pred[1]))
    correction_matrix = np.array([[rho_correction(home_goals, away_goals, team_avgs[0],
                                                   team_avgs[1], params_dict['rho']) for away_goals in range(2)]
                                   for home_goals in range(2)])
    output_matrix[:2,:2] = output_matrix[:2,:2] * correction_matrix
    return output_matrix

full_time_matrices = []
half_time_matrices = []

for i in range(len(next_matches)):
    my_league = next_matches['League'].iloc[i]
    league_index = next_leagues.index(my_league)
    ft_match_score_matrix = dixon_coles_simulate_match(full_time_models[league_index], 
                                                       next_matches['Home'].iloc[i], next_matches['Away'].iloc[i], max_goals = 8)
    ht_match_score_matrix = dixon_coles_simulate_match(half_time_models[league_index], 
                                                       next_matches['Home'].iloc[i], next_matches['Away'].iloc[i], max_goals = 4)
    full_time_matrices.append(ft_match_score_matrix)
    half_time_matrices.append(ht_match_score_matrix)

full_time_matrices[0]

array([[1.95142746e-02, 3.06875731e-02, 7.09180801e-02, 8.52983435e-02,
        7.69459007e-02, 5.55290656e-02, 3.33944271e-02, 1.72139620e-02,
        7.76418247e-03],
       [1.30637219e-03, 4.44399549e-02, 6.46239498e-02, 7.77279342e-02,
        7.01167885e-02, 5.06007430e-02, 3.04306007e-02, 1.56861862e-02,
        7.07509478e-03],
       [4.52293413e-03, 1.63201875e-02, 2.94442184e-02, 3.54147074e-02,
        3.19468872e-02, 2.30549098e-02, 1.38649101e-02, 7.14700194e-03,
        3.22358254e-03],
       [1.37383804e-03, 4.95724540e-03, 8.94366050e-03, 1.07571923e-02,
        9.70384437e-03, 7.00291250e-03, 4.21145662e-03, 2.17089678e-03,
        9.79160913e-04],
       [3.12976748e-04, 1.12931983e-03, 2.03747290e-03, 2.45061715e-03,
        2.21065189e-03, 1.59534728e-03, 9.59420224e-04, 4.94556268e-04,
        2.23064575e-04],
       [5.70398793e-05, 2.05818060e-04, 3.71328571e-04, 4.46623935e-04,
        4.02890367e-04, 2.90751364e-04, 1.74853928e-04, 9.01326697e-05,
        4.0

# Calculating Probabilities of Dixon-Coles Model

In [12]:
ft1, ftx, ft2, ft_score = [], [], [], []
over_15, over_25, under_35, under_45, btts = [], [], [], [], []
ht1, htx, ht2, ht_score, ht_over05, ht_under15 = [], [], [], [], [], []
ho05, ao05, ho15, ao15, hu25, au25 = [], [], [], [], [], []

# Helper function to calculate total goals for each score
def total_goals(i, j):
    return i + j

for i in range(len(next_matches)):
    my_matrix = full_time_matrices[i]
    ht_matrix = half_time_matrices[i]

    ft1.append(round(np.sum(np.tril(my_matrix, k=-1)) * 100, 2)) # Sum of lower triangular values (home win)
    ftx.append(round(np.sum(np.diag(my_matrix)) * 100, 2)) # Sum of diagonal values (draw)
    ft2.append(round(np.sum(np.triu(my_matrix, k=1)) * 100, 2)) # Sum of higher triangular values (away_win)
    
    max_score = np.unravel_index(np.argmax(my_matrix), my_matrix.shape) # Find the index of the maximum score
    home_goals, away_goals = max_score
    ft_score.append(f"{home_goals}-{away_goals}") # Format the score as 'home-away'

    # Calculate the probabilities
    over_15.append(round(np.sum([my_matrix[i, j] for i in range(my_matrix.shape[0]) for j in range(my_matrix.shape[1]) if total_goals(i, j) > 1.5]) * 100, 2))
    over_25.append(round(np.sum([my_matrix[i, j] for i in range(my_matrix.shape[0]) for j in range(my_matrix.shape[1]) if total_goals(i, j) > 2.5]) * 100, 2))
    under_35.append(round(np.sum([my_matrix[i, j] for i in range(my_matrix.shape[0]) for j in range(my_matrix.shape[1]) if total_goals(i, j) <= 3.5]) * 100, 2))
    under_45.append(round(np.sum([my_matrix[i, j] for i in range(my_matrix.shape[0]) for j in range(my_matrix.shape[1]) if total_goals(i, j) <= 4.5]) * 100, 2))

    # Calculate BTTS (both teams to score and goals != 0)
    btts.append(round(np.sum([my_matrix[i, j] for i in range(1, my_matrix.shape[0]) for j in range(1, my_matrix.shape[1])]) * 100, 2)) 

    # Calculate statistics for Half Time
    ht1.append(round(np.sum(np.tril(ht_matrix, k=-1)) * 100, 2)) # Sum of lower triangular values (home win)
    htx.append(round(np.sum(np.diag(ht_matrix)) * 100, 2)) # Sum of diagonal values (draw)
    ht2.append(round(np.sum(np.triu(ht_matrix, k=1)) * 100, 2)) # Sum of higher triangular values (away_win)

    ht_max_score = np.unravel_index(np.argmax(ht_matrix), ht_matrix.shape) # Find the index of the maximum score
    ht_hogs, ht_awgs = ht_max_score
    ht_score.append(f"{ht_hogs}-{ht_awgs}") # Format the score as 'home-away'

    ht_over05.append(round(np.sum([ht_matrix[i, j] for i in range(ht_matrix.shape[0]) for j in range(ht_matrix.shape[1]) if total_goals(i, j) > 0.5]) * 100, 2))   
    ht_under15.append(round(np.sum([ht_matrix[i, j] for i in range(ht_matrix.shape[0]) for j in range(ht_matrix.shape[1]) if total_goals(i, j) < 1.5]) * 100, 2)) 

    ho05.append(round(np.sum(my_matrix[1:,:]) * 100, 2))
    ao05.append(round(np.sum(my_matrix[:,1:]) * 100, 2))
    ho15.append(round(np.sum(my_matrix[2:,:]) * 100, 2))
    ao15.append(round(np.sum(my_matrix[:,2:]) * 100, 2))
    hu25.append(round(np.sum(my_matrix[:3,:]) * 100, 2))
    au25.append(round(np.sum(my_matrix[:,:3]) * 100, 2))
    

# Combine lists into a DataFrame
final_results = pd.DataFrame({
    'League': next_matches['League'], 'Home': next_matches['Home'], 'Away': next_matches['Away'],
    'FT1': ft1, 'FTX': ftx, 'FT2': ft2, 'FTR': ft_score,
    'DC1X': [x + y for x, y in zip(ft1, ftx)], 'DC12': [x + y for x, y in zip(ft1, ft2)], 'DCX2': [x + y for x, y in zip(ftx, ft2)],
    '1.5O': over_15, '2.5O': over_25, '3.5U': under_35, '4.5U': under_45, 'BTTS': btts,
    'HT1': ht1, 'HTX': htx, 'HT2': ht2, 'HTR': ht_score,
    'HTDC1X': [x + y for x, y in zip(ht1, htx)], 'HTDC12': [x + y for x, y in zip(ht1, ht2)], 'HTDCX2': [x + y for x, y in zip(htx, ht2)],
    'HT0.5O': ht_over05, 'HT1.5U': ht_under15, 'H0.5O':ho05, 'A0.5O':ao05, 'H1.5O':ho15, 'A1.5O':ao15, 'H2.5U':hu25, 'A2.5U':au25
})

# Function to highlight values higher than threshold
def highlight_values(value):
    if isinstance(value, str):
        return ''  # Return empty string for NaN values
    elif value > threshold:
    #color = 'red'
        return 'background-color: red'
    else:
        return ''

# Apply the style
with pd.option_context('display.precision', 2):
    styled_df = final_results.style.applymap(highlight_values)
styled_df.to_excel(given_date + ".xlsx", index = False)
# Display the styled DataFrame
from IPython.display import display, HTML
display(styled_df)

Unnamed: 0,League,Home,Away,FT1,FTX,FT2,FTR,DC1X,DC12,DCX2,1.5O,2.5O,3.5U,4.5U,BTTS,HT1,HTX,HT2,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O,HT1.5U,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U
0,Portugal2,Portimonense,Benfica B,4.52,10.67,83.63,0-3,15.19,88.15,94.3,93.67,81.68,33.9,52.84,58.33,4.58,15.56,74.78,0-1,20.14,79.36,90.34,84.86,35.24,59.09,96.11,22.89,86.33,92.42,30.13
1,Spain2,Sporting Gijon,Castellon,46.85,18.83,34.32,1-0,65.68,81.17,53.15,59.96,38.12,81.64,92.59,39.37,34.65,32.74,32.54,0-0,67.39,67.19,65.28,73.55,72.89,70.84,62.55,34.91,25.77,87.24,92.29
2,Usa,Vancouver,Los Angeles FC,33.2,28.0,38.79,1-1,61.2,71.99,66.79,79.63,54.71,67.62,83.65,59.53,26.21,34.9,38.67,0-0,61.11,64.88,73.57,77.64,58.11,74.66,77.63,39.87,44.13,84.0,80.94
