# Reading Model Predictions and Bet365 Odds

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings
from rapidfuzz import process
from datetime import datetime

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

csv_url = "https://docs.google.com/spreadsheets/d/1WfEG-1icUjj6k7TGePJQEXH-w0TLEIcN/export?format=csv"
uefa = pd.read_csv(csv_url, dtype={'FTHG': 'Int64', 'FTAG': 'Int64', 'HTHG': 'Int64', 'HTAG': 'Int64'})
uefa['FT'] = uefa['FTHG'].astype(str) + ' - ' + uefa['FTAG'].astype(str)
uefa['HT'] = '(' + uefa['HTHG'].astype(str) + '-' + uefa['HTAG'].astype(str) + ')'
uefa['FTTG'] = uefa['FTHG'] + uefa['FTAG']
uefa['HTTG'] = uefa['HTHG'] + uefa['HTAG']

predictions = pd.read_excel('C:/Users/99451/Desktop/MODEL/2025/dixon_coles_model_predictions/_predictions.xlsx')
bet365_odds = pd.read_excel('C:/Users/99451/Desktop/MODEL/2025/dixon_coles_model_predictions/final_odds.xlsx')

print(f"Games found: {len(predictions)} in predictions and {len(bet365_odds)} in odds dataset.")
bet365_odds.tail()

Games found: 5766 in predictions and 5931 in odds dataset.


Unnamed: 0,Home,Away,FT1,FTX,FT2,DC1X,DC12,DCX2,HT1,HTX,HT2,HT1X,HT12,HTX2,BTTS,OTTS,1.5O,1.5U,2.5O,2.5U,3.5O,3.5U,4.5O,4.5U,HT0.5O,HT0.5U,HT1.5O,HT1.5U
5926,Portland Timbers,St. Louis City,1.65,4.1,5.0,1.18,1.22,2.1,2.2,2.5,4.5,1.17,1.5,1.62,1.53,2.38,1.14,5.5,1.53,2.4,2.25,1.57,4.0,1.22,1.25,3.75,2.2,1.62
5927,Junior,Ind. Medellin,2.5,2.9,2.9,1.4,1.4,1.5,3.1,2.0,3.6,1.29,1.73,1.36,1.91,1.8,1.4,2.75,2.25,1.62,4.0,1.22,10.0,1.06,1.5,2.5,3.4,1.3
5928,Los Angeles FC,Sporting Kansas City,1.44,4.75,6.5,1.11,1.18,2.63,1.91,2.63,6.0,1.11,1.44,1.8,1.67,2.1,1.14,5.5,1.48,2.6,2.2,1.62,3.75,1.25,1.25,3.75,2.1,1.67
5929,Vancouver Whitecaps,Seattle Sounders,3.2,3.4,2.15,1.67,1.33,1.29,3.75,2.15,2.7,1.4,1.62,1.25,1.7,2.05,1.29,3.5,1.93,1.88,3.25,1.33,6.5,1.11,1.36,3.0,2.75,1.4
5930,Once Caldas,Millonarios,2.75,2.9,2.88,1.44,1.38,1.48,3.4,1.95,3.5,1.25,1.83,1.33,2.0,1.73,1.44,2.63,2.4,1.53,4.5,1.18,11.0,1.05,1.53,2.38,3.5,1.29


# Merging 2 DataFrames for similarity of values

In [2]:
# Use predictions key values as the canonical list
home_keys = predictions['Home'].unique().tolist()
away_keys = predictions['Away'].unique().tolist()

def get_canonical(val, canonical_list, threshold=85):
    """
    For a given value from df2, find the best matching canonical value from df1
    using a fuzzy matching score. If no match meets the threshold, return the original value.
    """
    match = process.extractOne(val, canonical_list, score_cutoff=threshold)
    if match:
        return match[0]
    return val

# Replace values in bet365_odds Home and Away columns using the canonical mapping from predictions
bet365_odds['Home'] = bet365_odds['Home'].apply(lambda x: get_canonical(x, home_keys))
bet365_odds['Away'] = bet365_odds['Away'].apply(lambda x: get_canonical(x, away_keys))

# Merge 2 dataframes on Home and Away columns
merged_df = pd.merge(predictions, bet365_odds, on=['Home', 'Away'], how='inner')
merged_df = merged_df.drop_duplicates(subset=['League','Home', 'Away'], keep='last')
merged_df.tail()

Unnamed: 0,League,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y
5020,Sweden,Sirius,AIK,11.63,19.51,68.83,0-1,31.14,80.46,88.34,72.72,47.91,73.8,87.87,40.19,11.61,54.96,33.38,0-0,66.57,44.99,88.34,55.68,74.35,47.26,85.77,13.52,58.04,97.26,68.99,3.1,3.4,2.25,1.62,1.3,1.36,3.75,2.1,2.88,1.4,1.67,1.25,1.8,1.95,1.33,3.25,2.05,1.75,3.75,1.25,8.0,1.08,1.4,2.75,3.0,1.36
5021,Turkey,Bodrumspor,Besiktas,20.82,37.07,42.11,0-0,57.89,62.93,79.18,45.77,19.67,93.15,98.04,27.62,16.76,48.06,35.14,0-0,64.82,51.9,83.2,58.9,78.73,43.86,61.13,11.45,24.4,97.91,92.95,4.2,3.9,1.75,2.0,1.25,1.22,4.33,2.4,2.3,1.57,1.55,1.22,1.62,2.2,1.18,4.5,1.62,2.25,2.5,1.5,4.33,1.2,1.29,3.5,2.25,1.57
5022,Turkey,Samsunspor,Kayserispor,58.65,24.44,16.89,1-0,83.09,75.54,41.33,74.73,49.12,72.75,87.19,48.53,45.71,40.6,13.54,0-0,86.31,59.25,54.14,67.01,70.77,83.2,57.43,53.25,21.08,73.47,94.44,1.55,3.9,6.5,1.11,1.25,2.38,2.0,2.5,5.5,1.13,1.5,1.73,1.62,2.2,1.17,5.0,1.57,2.35,2.25,1.57,4.0,1.22,1.25,3.75,2.25,1.57
5023,Usa,Portland,St. Louis City,53.56,29.06,17.37,1-0,82.62,70.93,46.43,66.58,38.5,81.37,92.44,41.92,22.37,55.32,22.29,0-0,77.69,44.66,77.61,54.21,78.29,77.35,52.46,43.72,17.11,81.26,96.03,1.65,4.1,5.0,1.18,1.22,2.1,2.2,2.5,4.5,1.17,1.5,1.62,1.53,2.38,1.14,5.5,1.53,2.4,2.25,1.57,4.0,1.22,1.25,3.75,2.2,1.62
5025,Usa,Vancouver,Seattle,56.9,25.28,17.8,1-1,82.18,74.7,43.08,77.11,51.66,70.47,85.65,51.88,55.42,35.02,8.72,1-0,90.44,64.14,43.74,76.48,50.73,83.66,60.54,54.07,23.85,72.73,93.2,3.2,3.4,2.15,1.67,1.33,1.29,3.75,2.15,2.7,1.4,1.62,1.25,1.7,2.05,1.29,3.5,1.93,1.88,3.25,1.33,6.5,1.11,1.36,3.0,2.75,1.4


# Scraping SoccerStats For Match Results

In [11]:
final =  pd.DataFrame()
liqa = ''
unique_leagues = predictions['League'].unique().tolist()

# Convert to lowercase and exclude UEFA competitions
uefa_list = ['unl', 'uel', 'ucl', 'ufcl', 'colombia', 'mexico2']
list_2024 = ['norway_2024', 'sweden_2024', 'usa_2024']
unique_leagues = list(set([league.lower() for league in unique_leagues if league.lower() not in uefa_list]))
#unique_leagues.clear('colombia') #was having server error

# Append list_2024 to unique_leagues
unique_leagues.extend(list_2024)

for i in unique_leagues:
    URL = "https://www.soccerstats.com/results.asp?league=" + i + "&pmtype=bydate"
    page = requests.get(URL)
    liqa = i
    #print(liqa)
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="btable")
    sth = results.find_all("tr", class_="odd")
    sth


    date, league, home, away, ft, ht = [], [], [], [], [],[]
    for i in sth:
        date.append(i.find_all("td", align = 'right')[0].get_text(strip=True))
        league.append(liqa.capitalize())
        home.append(i.find_all("td", align = 'right')[1].get_text(strip=True))
        away.append(i.find("td", align = "left").get_text(strip = True))
        ft.append(i.find_all("td", align = 'center')[0].get_text(strip = True))
        try:
            ht.append(i.find_all("td", align = 'center')[2].get_text(strip = True))
        except IndexError as e:
            ht.append('NA')#print("Last output before error occurred:", i.find_all("td", align = 'center'))

    data = {'Date': date, 'League': league,'Home': home, 'Away': away, 'FT': ft, 'HT': ht}

# Create a DataFrame from the dictionary
    df = pd.DataFrame(data)

# Replace empty strings with NaN
    #next_df = df[(df['Date'] == formatted_date) & (df['HT'] == '')]
    df.replace('', pd.NA, inplace=True)

# Drop rows with NaN values
    df_cleaned = df.dropna()

#For Half-Time Results
    hthg, htag = [], []
    for i in df_cleaned['HT']:
        if i == 'NA':
            hthg.append('NA')
            htag.append('NA')
        elif i == '+' or i == '-':
            hthg.append('NA')
            htag.append('NA')
        else:
            try:
                hthg.append(int(i[1]))
                htag.append(int(i[3]))
            except IndexError as e:
                print("Last output before error occurred:", i)



#For Full-Time Results
    hg, ag, tg = [], [], []
    for i in df_cleaned['FT']:
        if len(i) < 5 or ':' in i:
            hg.append('NA')
            ag.append('NA')
            tg.append('NA')
        else:
            try:
                hghg = int(i.split(' - ')[0])
                hg.append(hghg)
                agag = int(i.split(' - ')[1])
                ag.append(agag)
                tg.append(hghg + agag)
            except:
                print(hghg + agag)

    
    df_cleaned['FTHG'], df_cleaned['FTAG'], df_cleaned['FTTG'] = hg, ag, tg
    df_cleaned['HTHG'], df_cleaned['HTAG'] = hthg, htag
    df_cleaned['HTTG'] = df_cleaned['HTHG'] + df_cleaned['HTAG']
    
    final = pd.concat([final, df_cleaned], ignore_index=True)
    
final = final[final['HT'] != 'NA']

# Get the date for today
today = datetime.now()

# Append the correct year based on if the date already occurred in 2025
def assign_year(date_str):
    if pd.isna(date_str):  # Handle NaN values
        return None
    
    # Check if the same day and month already occurred in 2025
    date_in_2025 = datetime.strptime(date_str + ' 2025', "%d %b %Y")

    # If this day and month already occurred in 2025, append 2025, otherwise 2024
    if date_in_2025 <= today:
        return f"{date_str} 2025"
    else:
        return f"{date_str} 2024"

# Example: Fix the 'date' column by removing the weekday and extra punctuation
final['Date'] = final['Date'].str.extract(r'(\d{1,2} \w{3})')  # Extract day and month part
final['Date'] = final['Date'].apply(assign_year)

# Convert to datetime format
final['Date'] = pd.to_datetime(final['Date'], format='%d %b %Y', errors='coerce')

# Filter rows before September 17th, 2024
final_filtered = final[final['Date'] >= pd.Timestamp('2024-09-17')]

# Remove "_YYYY" (4-digit year) at the end of usa, norway and sweden but keep other numbers
final_filtered['League'] = final_filtered['League'].str.replace(r'_\d{4}$', '', regex=True)

# Align columns of uefa to match final_filtered
uefa = uefa[final_filtered.columns]

# Concatenate
final_filtered = pd.concat([uefa, final_filtered], ignore_index=True)
combined = pd.concat([final_filtered.head(), final_filtered.tail()])
combined

Unnamed: 0,Date,League,Home,Away,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG
0,9/17/2024,UCL,Juventus,PSV,3 - 1,(2-0),3,1,4,2,0,2
1,9/17/2024,UCL,Young Boys,Aston Villa,0 - 3,(0-2),0,3,3,0,2,2
2,9/17/2024,UCL,Bayern,Dinamo Zagreb,9 - 2,(3-0),9,2,11,3,0,3
3,9/17/2024,UCL,Milan,Liverpool,1 - 3,(1-2),1,3,4,1,2,3
4,9/17/2024,UCL,Real Madrid,Stuttgart,3 - 1,(0-0),3,1,4,0,0,0
5677,2024-10-20 00:00:00,Usa,Houston Dynamo,LA Galaxy,2 - 1,(1-0),2,1,3,1,0,1
5678,2024-10-20 00:00:00,Usa,Los Angeles FC,SJ Earthquakes,3 - 1,(0-1),3,1,4,0,1,1
5679,2024-10-20 00:00:00,Usa,Minnesota Utd,St. Louis City,4 - 1,(1-0),4,1,5,1,0,1
5680,2024-10-20 00:00:00,Usa,Real Salt Lake,Vancouver,2 - 1,(0-0),2,1,3,0,0,0
5681,2024-10-20 00:00:00,Usa,Seattle,Portland,1 - 1,(1-0),1,1,2,1,0,1


# Merging with Predictions + Odds Dataframes

In [12]:
# Merge on multiple columns
final_df = pd.merge(merged_df, final_filtered, on=['Home', 'Away'], how='inner')

# Drop duplicates based on the 'Home' and 'Away' columns
final_df_unique = final_df[~final_df.duplicated(subset=['League_x', 'Home', 'Away'], keep='last')]
final_df_unique = final_df_unique.dropna()

# Reset the index afterward
final_df_unique.reset_index(drop=True, inplace=True)

print('Number of games matched: ', len(final_df_unique))
final_df_unique.tail()

Number of games matched:  1027


Unnamed: 0,League_x,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y,Date,League_y,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG
1022,Sweden,Malmo FF,Hacken,52.6,23.93,23.47,1-0,76.53,76.07,47.4,72.96,48.34,73.44,87.64,49.88,42.18,54.29,3.48,0-0,96.47,45.66,57.77,49.7,82.71,80.21,62.73,48.16,25.95,77.81,92.2,1.62,4.2,4.75,1.18,1.22,2.25,2.1,2.5,4.75,1.17,1.5,1.67,1.62,2.2,1.17,5.0,1.57,2.35,2.38,1.53,4.0,1.22,1.25,3.75,2.25,1.57,2025-09-22 00:00:00,Sweden,4 - 0,(1-0),4,0,4,1,0,1
1023,Sweden,Mjallby,Varnamo,88.43,7.22,3.34,3-0,95.65,91.77,10.56,90.54,76.87,41.07,60.58,43.72,72.63,6.11,-0.08,3-0,78.74,72.55,6.03,74.3,15.82,96.0,45.21,85.5,12.4,31.94,96.61,1.5,4.33,6.0,1.13,1.2,2.5,2.0,2.4,6.0,1.11,1.53,1.73,1.8,1.95,1.22,4.0,1.7,2.1,2.63,1.44,5.0,1.17,1.3,3.4,2.5,1.5,2024-09-25 00:00:00,Sweden,1 - 1,(0-1),1,1,2,0,1,1
1024,Sweden,Sirius,AIK,11.63,19.51,68.83,0-1,31.14,80.46,88.34,72.72,47.91,73.8,87.87,40.19,11.61,54.96,33.38,0-0,66.57,44.99,88.34,55.68,74.35,47.26,85.77,13.52,58.04,97.26,68.99,3.1,3.4,2.25,1.62,1.3,1.36,3.75,2.1,2.88,1.4,1.67,1.25,1.8,1.95,1.33,3.25,2.05,1.75,3.75,1.25,8.0,1.08,1.4,2.75,3.0,1.36,2024-09-25 00:00:00,Sweden,0 - 1,(0-0),0,1,1,0,0,0
1025,Usa,Portland,St. Louis City,53.56,29.06,17.37,1-0,82.62,70.93,46.43,66.58,38.5,81.37,92.44,41.92,22.37,55.32,22.29,0-0,77.69,44.66,77.61,54.21,78.29,77.35,52.46,43.72,17.11,81.26,96.03,1.65,4.1,5.0,1.18,1.22,2.1,2.2,2.5,4.5,1.17,1.5,1.62,1.53,2.38,1.14,5.5,1.53,2.4,2.25,1.57,4.0,1.22,1.25,3.75,2.2,1.62,2025-08-25 00:00:00,Usa,4 - 4,(1-3),4,4,8,1,3,4
1026,Usa,Vancouver,Seattle,56.9,25.28,17.8,1-1,82.18,74.7,43.08,77.11,51.66,70.47,85.65,51.88,55.42,35.02,8.72,1-0,90.44,64.14,43.74,76.48,50.73,83.66,60.54,54.07,23.85,72.73,93.2,3.2,3.4,2.15,1.67,1.33,1.29,3.75,2.15,2.7,1.4,1.62,1.25,1.7,2.05,1.29,3.5,1.93,1.88,3.25,1.33,6.5,1.11,1.36,3.0,2.75,1.4,2024-10-03 00:00:00,Usa,0 - 3,(0-1),0,3,3,0,1,1


# Creating Results Columns

In [13]:
import numpy as np

# Add the new columns based on the condition
final_df_unique['FT1'] = np.where(final_df_unique['FTHG'] > final_df_unique['FTAG'], 1, 0)
final_df_unique['FTX'] = np.where(final_df_unique['FTHG'] == final_df_unique['FTAG'], 1, 0)
final_df_unique['FT2'] = np.where(final_df_unique['FTHG'] < final_df_unique['FTAG'], 1, 0)

final_df_unique['FT1X'] = np.where(final_df_unique['FTHG'] >= final_df_unique['FTAG'], 1, 0)
final_df_unique['FT12'] = np.where(final_df_unique['FTX'] == 0, 1, 0)
final_df_unique['FTX2'] = np.where(final_df_unique['FTHG'] <= final_df_unique['FTAG'], 1, 0)

final_df_unique['1.5O'] = np.where(final_df_unique['FTTG'] > 1.5, 1, 0)
final_df_unique['1.5U_y'] = np.where(final_df_unique['FTTG'] < 1.5, 1, 0)
final_df_unique['2.5O'] = np.where(final_df_unique['FTTG'] > 2.5, 1, 0)
final_df_unique['2.5U_y'] = np.where(final_df_unique['FTTG'] < 2.5, 1, 0)
final_df_unique['3.5O_y'] = np.where(final_df_unique['FTTG'] > 3.5, 1, 0)
final_df_unique['3.5U'] = np.where(final_df_unique['FTTG'] < 3.5, 1, 0)
final_df_unique['4.5O_y'] = np.where(final_df_unique['FTTG'] > 4.5, 1, 0)
final_df_unique['4.5U'] = np.where(final_df_unique['FTTG'] < 4.5, 1, 0)

final_df_unique['BTTS'] = np.where((final_df_unique['FTHG'] != 0) & (final_df_unique['FTAG'] != 0), 1, 0)
final_df_unique['OTTS_y'] = np.where(final_df_unique['BTTS'] == 0, 1, 0)

final_df_unique['HT1'] = np.where(final_df_unique['HTHG'] > final_df_unique['HTAG'], 1, 0)
final_df_unique['HTX'] = np.where(final_df_unique['HTHG'] == final_df_unique['HTAG'], 1, 0)
final_df_unique['HT2'] = np.where(final_df_unique['HTHG'] < final_df_unique['HTAG'], 1, 0)

final_df_unique['HT1X_y'] = np.where(final_df_unique['HTHG'] >= final_df_unique['HTAG'], 1, 0)
final_df_unique['HT12_y'] = np.where(final_df_unique['HTX'] == 0, 1, 0)
final_df_unique['HTX2_y'] = np.where(final_df_unique['HTHG'] <= final_df_unique['HTAG'], 1, 0)

final_df_unique['HT0.5O'] = np.where(final_df_unique['HTTG'] > 0.5, 1, 0)
final_df_unique['HT0.5U_y'] = np.where(final_df_unique['HTTG'] < 0.5, 1, 0)
final_df_unique['HT1.5O_y'] = np.where(final_df_unique['HTTG'] > 1.5, 1, 0)
final_df_unique['HT1.5U'] = np.where(final_df_unique['HTTG'] < 1.5, 1, 0)

print('Games Found: ', len(final_df_unique))
final_df_unique.tail()

Games Found:  1027


Unnamed: 0,League_x,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y,Date,League_y,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG,FT1,FTX,FT2,FT1X,FT12,FTX2,1.5O,1.5U_y,2.5O,2.5U_y,3.5O_y,3.5U,4.5O_y,4.5U,BTTS,OTTS_y,HT1,HTX,HT2,HT1X_y,HT12_y,HTX2_y,HT0.5O,HT0.5U_y,HT1.5O_y,HT1.5U
1022,Sweden,Malmo FF,Hacken,52.6,23.93,23.47,1-0,76.53,76.07,47.4,72.96,48.34,73.44,87.64,49.88,42.18,54.29,3.48,0-0,96.47,45.66,57.77,49.7,82.71,80.21,62.73,48.16,25.95,77.81,92.2,1.62,4.2,4.75,1.18,1.22,2.25,2.1,2.5,4.75,1.17,1.5,1.67,1.62,2.2,1.17,5.0,1.57,2.35,2.38,1.53,4.0,1.22,1.25,3.75,2.25,1.57,2025-09-22 00:00:00,Sweden,4 - 0,(1-0),4,0,4,1,0,1,1,0,0,1,1,0,1,0,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1
1023,Sweden,Mjallby,Varnamo,88.43,7.22,3.34,3-0,95.65,91.77,10.56,90.54,76.87,41.07,60.58,43.72,72.63,6.11,-0.08,3-0,78.74,72.55,6.03,74.3,15.82,96.0,45.21,85.5,12.4,31.94,96.61,1.5,4.33,6.0,1.13,1.2,2.5,2.0,2.4,6.0,1.11,1.53,1.73,1.8,1.95,1.22,4.0,1.7,2.1,2.63,1.44,5.0,1.17,1.3,3.4,2.5,1.5,2024-09-25 00:00:00,Sweden,1 - 1,(0-1),1,1,2,0,1,1,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,0,1,0,1,1,1,0,0,1
1024,Sweden,Sirius,AIK,11.63,19.51,68.83,0-1,31.14,80.46,88.34,72.72,47.91,73.8,87.87,40.19,11.61,54.96,33.38,0-0,66.57,44.99,88.34,55.68,74.35,47.26,85.77,13.52,58.04,97.26,68.99,3.1,3.4,2.25,1.62,1.3,1.36,3.75,2.1,2.88,1.4,1.67,1.25,1.8,1.95,1.33,3.25,2.05,1.75,3.75,1.25,8.0,1.08,1.4,2.75,3.0,1.36,2024-09-25 00:00:00,Sweden,0 - 1,(0-0),0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
1025,Usa,Portland,St. Louis City,53.56,29.06,17.37,1-0,82.62,70.93,46.43,66.58,38.5,81.37,92.44,41.92,22.37,55.32,22.29,0-0,77.69,44.66,77.61,54.21,78.29,77.35,52.46,43.72,17.11,81.26,96.03,1.65,4.1,5.0,1.18,1.22,2.1,2.2,2.5,4.5,1.17,1.5,1.62,1.53,2.38,1.14,5.5,1.53,2.4,2.25,1.57,4.0,1.22,1.25,3.75,2.2,1.62,2025-08-25 00:00:00,Usa,4 - 4,(1-3),4,4,8,1,3,4,0,1,0,1,0,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,1,1,0,1,0
1026,Usa,Vancouver,Seattle,56.9,25.28,17.8,1-1,82.18,74.7,43.08,77.11,51.66,70.47,85.65,51.88,55.42,35.02,8.72,1-0,90.44,64.14,43.74,76.48,50.73,83.66,60.54,54.07,23.85,72.73,93.2,3.2,3.4,2.15,1.67,1.33,1.29,3.75,2.15,2.7,1.4,1.62,1.25,1.7,2.05,1.29,3.5,1.93,1.88,3.25,1.33,6.5,1.11,1.36,3.0,2.75,1.4,2024-10-03 00:00:00,Usa,0 - 3,(0-1),0,3,3,0,1,1,0,0,1,0,1,1,1,0,1,0,0,1,0,1,0,1,0,0,1,0,1,1,1,0,0,1


# Creating Profit Columns for Initial Model Predictions

In [14]:
ft1p, ftxp, ft2p, ft1xp, ft12p, ftx2p = [], [], [], [], [], []
over15, under15, over25, under25, over35, under35, over45, under45 = [], [], [], [], [], [], [], []
btts, otts, ht1p, htxp, ht2p, ht1xp, ht12p, htx2p = [], [], [], [], [], [], [], []
htover05, htunder05, htover15, htunder15 = [], [], [], []

#if prediction == result -> coefficient, elif prediction != result -> 0, else -> -1
for i in range(len(final_df_unique)):
    ft_list = [final_df_unique['FT1_x'].iloc[i], final_df_unique['FTX_x'].iloc[i], final_df_unique['FT2_x'].iloc[i]]
    if (ft_list.index(max(ft_list)) == 0) and (final_df_unique['FT1'].iloc[i] == 1):
        ft1p.append(final_df_unique['FT1_y'].iloc[i])
    elif (ft_list.index(max(ft_list)) == 0) and (final_df_unique['FT1'].iloc[i] != 1):
        ft1p.append(0)
    else:
        ft1p.append(-1)
    
    if (ft_list.index(max(ft_list)) == 1) and (final_df_unique['FTX'].iloc[i] == 1):
        ftxp.append(final_df_unique['FTX_y'].iloc[i])
    elif (ft_list.index(max(ft_list)) == 1) and (final_df_unique['FTX'].iloc[i] != 1):
        ftxp.append(0)
    else:
        ftxp.append(-1)
    
    if (ft_list.index(max(ft_list)) == 2) and (final_df_unique['FT2'].iloc[i] == 1):
        ft2p.append(final_df_unique['FT2_y'].iloc[i])
    elif (ft_list.index(max(ft_list)) == 2) and (final_df_unique['FT2'].iloc[i] != 1):
        ft2p.append(0)
    else:
        ft2p.append(-1)

    dc_list = [final_df_unique['DC1X_x'].iloc[i], final_df_unique['DC12_x'].iloc[i], final_df_unique['DCX2_x'].iloc[i]]
    if (dc_list.index(max(dc_list)) == 0) and (final_df_unique['FT1X'].iloc[i] == 1):
        ft1xp.append(final_df_unique['DC1X_y'].iloc[i])
    elif (dc_list.index(max(dc_list)) == 0) and (final_df_unique['FT1X'].iloc[i] != 1):
        ft1xp.append(0)
    else:
        ft1xp.append(-1)
    
    if (dc_list.index(max(dc_list)) == 1) and (final_df_unique['FT12'].iloc[i] == 1):
        ft12p.append(final_df_unique['DC12_y'].iloc[i])
    elif (dc_list.index(max(dc_list)) == 1) and (final_df_unique['FT12'].iloc[i] != 1):
        ft12p.append(0)
    else:
        ft12p.append(-1)
    
    if (dc_list.index(max(dc_list)) == 2) and (final_df_unique['FTX2'].iloc[i] == 1):
        ftx2p.append(final_df_unique['DCX2_y'].iloc[i])
    elif (dc_list.index(max(dc_list)) == 2) and (final_df_unique['FTX2'].iloc[i] != 1):
        ftx2p.append(0)
    else:
        ftx2p.append(-1)
    
    if (final_df_unique['1.5O_x'].iloc[i] >= 50) and (final_df_unique['1.5O'].iloc[i] == 1):
        over15.append(final_df_unique['1.5O_y'].iloc[i])
    elif (final_df_unique['1.5O_x'].iloc[i] >= 50) and (final_df_unique['1.5O'].iloc[i] != 1):
        over15.append(0)
    else:
        over15.append(-1)
    
    if (final_df_unique['1.5O_x'].iloc[i] < 50) and (final_df_unique['1.5O'].iloc[i] != 1):
        under15.append(final_df_unique['1.5U'].iloc[i])
    elif (final_df_unique['1.5O_x'].iloc[i] < 50) and (final_df_unique['1.5O'].iloc[i] == 1):
        under15.append(0)
    else:
        under15.append(-1)
    
    if (final_df_unique['2.5O_x'].iloc[i] >= 50) and (final_df_unique['2.5O'].iloc[i] == 1):
        over25.append(final_df_unique['2.5O_y'].iloc[i])
    elif (final_df_unique['2.5O_x'].iloc[i] >= 50) and (final_df_unique['2.5O'].iloc[i] != 1):
        over25.append(0)
    else:
        over25.append(-1)
    
    if (final_df_unique['2.5O_x'].iloc[i] < 50) and (final_df_unique['2.5O'].iloc[i] != 1):
        under25.append(final_df_unique['2.5U'].iloc[i])
    elif (final_df_unique['2.5O_x'].iloc[i] < 50) and (final_df_unique['2.5O'].iloc[i] == 1):
        under25.append(0)
    else:
        under25.append(-1)
    
    if (final_df_unique['3.5U_x'].iloc[i] <= 50) and (final_df_unique['3.5U'].iloc[i] != 1):
        over35.append(final_df_unique['3.5O'].iloc[i])
    elif (final_df_unique['3.5U_x'].iloc[i] <= 50) and (final_df_unique['3.5U'].iloc[i] == 1):
        over35.append(0)
    else:
        over35.append(-1)
    
    if (final_df_unique['3.5U_x'].iloc[i] > 50) and (final_df_unique['3.5U'].iloc[i] == 1):
        under35.append(final_df_unique['3.5U_y'].iloc[i])
    elif (final_df_unique['3.5U_x'].iloc[i] > 50) and (final_df_unique['3.5U'].iloc[i] != 1):
        under35.append(0)
    else:
        under35.append(-1)
    
    if (final_df_unique['4.5U_x'].iloc[i] <= 50) and (final_df_unique['4.5U'].iloc[i] != 1):
        over45.append(final_df_unique['4.5O'].iloc[i])
    elif (final_df_unique['4.5U_x'].iloc[i] <= 50) and (final_df_unique['4.5U'].iloc[i] == 1):
        over45.append(0)
    else:
        over45.append(-1)
    
    if (final_df_unique['4.5U_x'].iloc[i] > 50) and (final_df_unique['4.5U'].iloc[i] == 1):
        under45.append(final_df_unique['4.5U_y'].iloc[i])
    elif (final_df_unique['4.5U_x'].iloc[i] > 50) and (final_df_unique['4.5U'].iloc[i] != 1):
        under45.append(0)
    else:
        under45.append(-1)
    
    if (final_df_unique['BTTS_x'].iloc[i] >= 50) and (final_df_unique['BTTS'].iloc[i] == 1):
        btts.append(final_df_unique['BTTS_y'].iloc[i])
    elif (final_df_unique['BTTS_x'].iloc[i] >= 50) and (final_df_unique['BTTS'].iloc[i] != 1):
        btts.append(0)
    else:
        btts.append(-1)
    
    if (final_df_unique['BTTS_x'].iloc[i] < 50) and (final_df_unique['BTTS'].iloc[i] != 1):
        otts.append(final_df_unique['OTTS'].iloc[i])
    elif (final_df_unique['BTTS_x'].iloc[i] < 50) and (final_df_unique['BTTS'].iloc[i] == 1):
        otts.append(0)
    else:
        otts.append(-1)
    
    ht_list = [final_df_unique['HT1_x'].iloc[i], final_df_unique['HTX_x'].iloc[i], final_df_unique['HT2_x'].iloc[i]]
    if (ht_list.index(max(ht_list)) == 0) and (final_df_unique['HT1'].iloc[i] == 1):
        ht1p.append(final_df_unique['HT1_y'].iloc[i])
    elif (ht_list.index(max(ht_list)) == 0) and (final_df_unique['HT1'].iloc[i] != 1):
        ht1p.append(0)
    else:
        ht1p.append(-1)
    
    if (ht_list.index(max(ht_list)) == 1) and (final_df_unique['HTX'].iloc[i] == 1):
        htxp.append(final_df_unique['HTX_y'].iloc[i])
    elif (ht_list.index(max(ht_list)) == 1) and (final_df_unique['HTX'].iloc[i] != 1):
        htxp.append(0)
    else:
        htxp.append(-1)
    
    if (ht_list.index(max(ht_list)) == 2) and (final_df_unique['HT2'].iloc[i] == 1):
        ht2p.append(final_df_unique['HT2_y'].iloc[i])
    elif (ht_list.index(max(ht_list)) == 2) and (final_df_unique['HT2'].iloc[i] != 1):
        ht2p.append(0)
    else:
        ht2p.append(-1)
    
    htdc_list = [final_df_unique['HTDC1X'].iloc[i], final_df_unique['HTDC12'].iloc[i], final_df_unique['HTDCX2'].iloc[i]]
    if (htdc_list.index(max(htdc_list)) == 0) and (final_df_unique['HT1X_y'].iloc[i] == 1):
        ht1xp.append(final_df_unique['HT1X'].iloc[i])
    elif (htdc_list.index(max(htdc_list)) == 0) and (final_df_unique['HT1X_y'].iloc[i] != 1):
        ht1xp.append(0)
    else:
        ht1xp.append(-1)
    
    if (htdc_list.index(max(htdc_list)) == 1) and (final_df_unique['HT12_y'].iloc[i] == 1):
        ht12p.append(final_df_unique['HT12'].iloc[i])
    elif (htdc_list.index(max(htdc_list)) == 1) and (final_df_unique['HT12_y'].iloc[i] != 1):
        ht12p.append(0)
    else:
        ht12p.append(-1)
    
    if (htdc_list.index(max(htdc_list)) == 2) and (final_df_unique['HTX2_y'].iloc[i] == 1):
        htx2p.append(final_df_unique['HTX2'].iloc[i])
    elif (htdc_list.index(max(htdc_list)) == 2) and (final_df_unique['HTX2_y'].iloc[i] != 1):
        htx2p.append(0)
    else:
        htx2p.append(-1)
    
    if (final_df_unique['HT0.5O_x'].iloc[i] >= 50) and (final_df_unique['HT0.5O'].iloc[i] == 1):
        htover05.append(final_df_unique['HT0.5O_y'].iloc[i])
    elif (final_df_unique['HT0.5O_x'].iloc[i] >= 50) and (final_df_unique['HT0.5O'].iloc[i] != 1):
        htover05.append(0)
    else:
        htover05.append(-1)
    
    if (final_df_unique['HT0.5O_x'].iloc[i] < 50) and (final_df_unique['HT0.5O'].iloc[i] != 1):
        htunder05.append(final_df_unique['HT0.5U'].iloc[i])
    elif (final_df_unique['HT0.5O_x'].iloc[i] < 50) and (final_df_unique['HT0.5O'].iloc[i] == 1):
        htunder05.append(0)
    else:
        htunder05.append(-1)
    
    if (final_df_unique['HT1.5U_x'].iloc[i] < 50) and (final_df_unique['HT1.5U'].iloc[i] != 1):
        htover15.append(final_df_unique['HT1.5O'].iloc[i])
    elif (final_df_unique['HT1.5U_x'].iloc[i] < 50) and (final_df_unique['HT1.5U'].iloc[i] == 1):
        htover15.append(0)
    else:
        htover15.append(-1)
    
    if (final_df_unique['HT1.5U_x'].iloc[i] >= 50) and (final_df_unique['HT1.5U'].iloc[i] == 1):
        htunder15.append(final_df_unique['HT1.5U_y'].iloc[i])
    elif (final_df_unique['HT1.5U_x'].iloc[i] >= 50) and (final_df_unique['HT1.5U'].iloc[i] != 1):
        htunder15.append(0)
    else:
        htunder15.append(-1)

final_df_unique['FT1P'], final_df_unique['FTXP'], final_df_unique['FT2P'] = ft1p, ftxp, ft2p
final_df_unique['FT1XP'], final_df_unique['FT12P'], final_df_unique['FTX2P'] = ft1xp, ft12p, ftx2p
final_df_unique['1.5OP'], final_df_unique['1.5UP'], final_df_unique['2.5OP'], final_df_unique['2.5UP'] = over15, under15, over25, under25
final_df_unique['3.5OP'], final_df_unique['3.5UP'], final_df_unique['4.5OP'], final_df_unique['4.5UP'] = over35, under35, over45, under45
final_df_unique['BTTSP'], final_df_unique['OTTSP'] = btts, otts
final_df_unique['HT1P'], final_df_unique['HTXP'], final_df_unique['HT2P'] = ht1p, htxp, ht2p
final_df_unique['HT1XP'], final_df_unique['HT12P'], final_df_unique['HTX2P'] = ht1xp, ht12p, htx2p
final_df_unique['HT0.5OP'], final_df_unique['HT0.5UP'] = htover05, htunder05
final_df_unique['HT1.5OP'], final_df_unique['HT1.5UP'] = htover15, htunder15

print('Games Found: ', len(final_df_unique))
final_df_unique.tail()

Games Found:  1027


Unnamed: 0,League_x,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y,Date,League_y,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG,FT1,FTX,FT2,FT1X,FT12,FTX2,1.5O,1.5U_y,2.5O,2.5U_y,3.5O_y,3.5U,4.5O_y,4.5U,BTTS,OTTS_y,HT1,HTX,HT2,HT1X_y,HT12_y,HTX2_y,HT0.5O,HT0.5U_y,HT1.5O_y,HT1.5U,FT1P,FTXP,FT2P,FT1XP,FT12P,FTX2P,1.5OP,1.5UP,2.5OP,2.5UP,3.5OP,3.5UP,4.5OP,4.5UP,BTTSP,OTTSP,HT1P,HTXP,HT2P,HT1XP,HT12P,HTX2P,HT0.5OP,HT0.5UP,HT1.5OP,HT1.5UP
1022,Sweden,Malmo FF,Hacken,52.6,23.93,23.47,1-0,76.53,76.07,47.4,72.96,48.34,73.44,87.64,49.88,42.18,54.29,3.48,0-0,96.47,45.66,57.77,49.7,82.71,80.21,62.73,48.16,25.95,77.81,92.2,1.62,4.2,4.75,1.18,1.22,2.25,2.1,2.5,4.75,1.17,1.5,1.67,1.62,2.2,1.17,5.0,1.57,2.35,2.38,1.53,4.0,1.22,1.25,3.75,2.25,1.57,2025-09-22 00:00:00,Sweden,4 - 0,(1-0),4,0,4,1,0,1,1,0,0,1,1,0,1,0,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,1.62,-1.0,-1.0,1.18,-1.0,-1.0,1.17,-1.0,-1.0,0.0,-1.0,0.0,-1.0,1.22,-1.0,2.2,-1.0,0.0,-1.0,1.17,-1.0,-1.0,-1.0,0.0,-1.0,1.57
1023,Sweden,Mjallby,Varnamo,88.43,7.22,3.34,3-0,95.65,91.77,10.56,90.54,76.87,41.07,60.58,43.72,72.63,6.11,-0.08,3-0,78.74,72.55,6.03,74.3,15.82,96.0,45.21,85.5,12.4,31.94,96.61,1.5,4.33,6.0,1.13,1.2,2.5,2.0,2.4,6.0,1.11,1.53,1.73,1.8,1.95,1.22,4.0,1.7,2.1,2.63,1.44,5.0,1.17,1.3,3.4,2.5,1.5,2024-09-25 00:00:00,Sweden,1 - 1,(0-1),1,1,2,0,1,1,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,0,1,0,1,1,1,0,0,1,0.0,-1.0,-1.0,1.13,-1.0,-1.0,1.22,-1.0,0.0,-1.0,0.0,-1.0,-1.0,1.17,-1.0,0.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,1.3,-1.0,0.0,-1.0
1024,Sweden,Sirius,AIK,11.63,19.51,68.83,0-1,31.14,80.46,88.34,72.72,47.91,73.8,87.87,40.19,11.61,54.96,33.38,0-0,66.57,44.99,88.34,55.68,74.35,47.26,85.77,13.52,58.04,97.26,68.99,3.1,3.4,2.25,1.62,1.3,1.36,3.75,2.1,2.88,1.4,1.67,1.25,1.8,1.95,1.33,3.25,2.05,1.75,3.75,1.25,8.0,1.08,1.4,2.75,3.0,1.36,2024-09-25 00:00:00,Sweden,0 - 1,(0-0),0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,-1.0,-1.0,2.25,-1.0,-1.0,1.36,0.0,-1.0,-1.0,1.75,-1.0,1.25,-1.0,1.08,-1.0,1.95,-1.0,2.1,-1.0,-1.0,-1.0,1.25,0.0,-1.0,-1.0,1.36
1025,Usa,Portland,St. Louis City,53.56,29.06,17.37,1-0,82.62,70.93,46.43,66.58,38.5,81.37,92.44,41.92,22.37,55.32,22.29,0-0,77.69,44.66,77.61,54.21,78.29,77.35,52.46,43.72,17.11,81.26,96.03,1.65,4.1,5.0,1.18,1.22,2.1,2.2,2.5,4.5,1.17,1.5,1.62,1.53,2.38,1.14,5.5,1.53,2.4,2.25,1.57,4.0,1.22,1.25,3.75,2.2,1.62,2025-08-25 00:00:00,Usa,4 - 4,(1-3),4,4,8,1,3,4,0,1,0,1,0,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,1,1,0,1,0,0.0,-1.0,-1.0,1.18,-1.0,-1.0,1.14,-1.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,-1.0,1.25,-1.0,-1.0,0.0
1026,Usa,Vancouver,Seattle,56.9,25.28,17.8,1-1,82.18,74.7,43.08,77.11,51.66,70.47,85.65,51.88,55.42,35.02,8.72,1-0,90.44,64.14,43.74,76.48,50.73,83.66,60.54,54.07,23.85,72.73,93.2,3.2,3.4,2.15,1.67,1.33,1.29,3.75,2.15,2.7,1.4,1.62,1.25,1.7,2.05,1.29,3.5,1.93,1.88,3.25,1.33,6.5,1.11,1.36,3.0,2.75,1.4,2024-10-03 00:00:00,Usa,0 - 3,(0-1),0,3,3,0,1,1,0,0,1,0,1,1,1,0,1,0,0,1,0,1,0,1,0,0,1,0,1,1,1,0,0,1,0.0,-1.0,-1.0,0.0,-1.0,-1.0,1.29,-1.0,1.93,-1.0,-1.0,1.33,-1.0,1.11,0.0,-1.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,1.36,-1.0,-1.0,1.4


# Checking For ROI of Profit Columns

In [15]:
# Select columns that end with 'P'
columns = [col for col in final_df_unique.columns if col.endswith('P')]

# Initialize lists to store results and games
results, games_list = [],  []

for col in columns:
    my_df = final_df_unique[final_df_unique[col] >= 0]
    numerator = np.sum(my_df[col]) - len(my_df)
    result = round(numerator / len(my_df) * 100, 2)
    
    # Append results and games to respective lists
    results.append(result)
    games_list.append(len(my_df))

# Convert results to a DataFrame with an additional column for Games
results_df = pd.DataFrame({
    'Column': columns,
    'ROI': results,
    'Games': games_list
})
results_df

Unnamed: 0,Column,ROI,Games
0,FT1P,-5.29,634
1,FTXP,-9.9,51
2,FT2P,1.93,342
3,FT1XP,-9.2,443
4,FT12P,-4.79,372
5,FTX2P,-3.42,212
6,1.5OP,-5.26,948
7,1.5UP,-8.89,79
8,2.5OP,-9.67,525
9,2.5UP,-7.39,502


# ROI of Profit Columns According To Leagues

In [16]:
# Step 1: Filter leagues with at least 10 games
league_counts = final_df_unique['League_x'].value_counts()
leagues_with_10_games = league_counts[league_counts >= 10].index
filtered_df = final_df_unique[final_df_unique['League_x'].isin(leagues_with_10_games)]

# Group by 'League' and calculate results for each group
grouped_results = {}
for league, group in filtered_df.groupby('League_x'):
    group_results = {}
    for col in columns:
        my_df = group[group[col] >= 0]
        numerator = np.sum(my_df[col]) - len(my_df)
        group_results[col] = round(numerator / len(my_df) * 100, 2)
    # Add the number of games for this league
    group_results['Games'] = round(len(group),2)
    grouped_results[league] = group_results

# Convert grouped results to a DataFrame for better visualization
grouped_results_df = pd.DataFrame(grouped_results).T

# Define a function to apply conditional formatting
def highlight_positive(val):
    # Highlight background to red if the value is positive
    color = 'background-color: red' if isinstance(val, (int, float)) and val > 0 else ''
    return color

# Apply the function to the DataFrame
styled_df = (
    grouped_results_df.style
    .applymap(highlight_positive)
    .format("{:.2f}")  # Format only numeric columns, excluding 'Games'
)

# Save the styled DataFrame to Excel
styled_df.to_excel("ROI_leagues.xlsx", index=True)

# Display the styled DataFrame
styled_df

Unnamed: 0,FT1P,FTXP,FT2P,FT1XP,FT12P,FTX2P,1.5OP,1.5UP,2.5OP,2.5UP,3.5OP,3.5UP,4.5OP,4.5UP,BTTSP,OTTSP,HT1P,HTXP,HT2P,HT1XP,HT12P,HTX2P,HT0.5OP,HT0.5UP,HT1.5OP,HT1.5UP,Games
Argentina,-33.08,8.67,-29.11,-4.36,-66.0,-4.65,0.16,-11.13,-2.08,-5.17,-100.0,-4.6,,-2.43,-2.22,-1.12,-46.82,14.33,78.83,-5.31,-100.0,7.75,9.92,12.57,-100.0,-15.88,60.0
Australia,-44.59,,-18.77,2.78,-12.07,10.71,-3.17,,6.52,51.22,-34.33,-34.86,,-12.17,9.26,-24.55,0.3,-1.58,-11.5,2.47,,-20.73,-7.23,,20.43,-25.7,30.0
Austria,16.5,,29.17,-30.29,-14.5,-19.33,17.19,,8.1,22.67,15.75,10.5,-100.0,-3.2,-48.8,-35.67,200.0,-3.64,165.0,-0.11,,31.86,16.14,-100.0,47.0,-29.42,16.0
Belgium,-17.29,-100.0,9.0,-24.85,-35.5,15.0,3.89,,-18.5,-23.38,-100.0,0.67,,1.74,29.8,-4.67,-3.0,-24.17,-40.0,-11.7,57.0,-17.0,7.16,,75.33,15.62,19.0
Brazil,-7.31,-100.0,-27.2,-11.29,-45.9,-37.89,-8.79,31.5,-11.44,-12.32,,-9.44,,-8.96,3.69,-1.3,37.6,-24.1,225.0,-10.34,,-25.44,16.87,-25.0,-100.0,-27.33,50.0
Denmark,40.62,,18.75,-10.0,5.0,-23.4,-16.31,,-24.63,,-100.0,5.75,-100.0,0.47,-36.62,,-4.37,18.33,163.0,-22.78,51.5,24.4,8.0,,-50.0,-27.33,16.0
England,-26.62,,-17.86,-26.2,-1.2,-25.2,-5.73,,-17.64,-8.25,150.0,36.64,,15.6,-8.4,-27.0,14.78,12.5,15.0,-21.25,48.25,-24.0,18.43,-100.0,60.25,24.09,15.0
England2,-31.25,-17.5,45.0,-38.42,10.0,-50.0,-2.1,275.0,-36.17,-23.47,-100.0,-22.68,,-14.05,-49.57,-13.29,-50.0,7.5,,-48.21,,15.14,4.21,-100.0,156.5,-13.16,21.0
England3,24.69,225.0,-68.75,-3.2,5.55,-29.0,-1.17,-100.0,-30.25,-17.38,-40.5,-13.05,-100.0,-7.29,-22.46,-37.92,-62.5,-40.71,-40.0,-7.5,,-16.09,-7.71,34.5,-20.67,-20.32,25.0
England4,13.39,,-52.0,-17.17,-24.92,-100.0,-13.41,-100.0,-14.67,-17.58,-100.0,-19.11,,-1.93,13.38,-25.2,31.29,9.76,,-3.62,,10.0,-5.12,0.0,-100.0,-9.52,28.0


# Creating Optimum Threshold for Each Prediction Column

In [17]:
# Assuming `df` is your DataFrame and it contains the columns for percentages and correctness
def calculate_threshold(percentages, predictions):
    # Ensure inputs are pandas Series
    percentages = pd.Series(percentages)
    predictions = pd.Series(predictions)
    
    thresholds = percentages.unique()
    best_threshold = 0
    best_j_stat = -np.inf  # Start with negative infinity for comparison
    
    for threshold in thresholds:
        # Predict 1s based on the threshold
        predicted_1s = (percentages >= threshold).astype(int)
        
        # Calculate true positives, true negatives, false positives, false negatives
        true_positives = ((predicted_1s == 1) & (predictions == 1)).sum()
        true_negatives = ((predicted_1s == 0) & (predictions == 0)).sum()
        false_positives = ((predicted_1s == 1) & (predictions == 0)).sum()
        false_negatives = ((predicted_1s == 0) & (predictions == 1)).sum()
        
        # Calculate Sensitivity (Recall) and Specificity
        sensitivity = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        specificity = true_negatives / (true_negatives + false_positives) if (true_negatives + false_positives) > 0 else 0
        
        # Calculate Youden's J Statistic
        j_stat = sensitivity + specificity - 1
        
        # Update best threshold if J statistic improves
        if j_stat > best_j_stat:
            best_j_stat = j_stat
            best_threshold = threshold
    
    return best_threshold, round(best_j_stat, 2)

# Select only numeric columns
numeric_columns = final_df_unique.select_dtypes(include=[np.number])

# Remove rows where any numeric value is greater than 100
final_df_unique = final_df_unique[(numeric_columns <= 100).all(axis=1)]

#Selecting dataframes with model predictions
ft1df = final_df_unique[final_df_unique['FT1_x'] >= final_df_unique[['FTX_x', 'FT2_x']].max(axis=1)]
ftxdf = final_df_unique[final_df_unique['FTX_x'] >= final_df_unique[['FT1_x', 'FT2_x']].max(axis=1)]
ft2df = final_df_unique[final_df_unique['FT2_x'] >= final_df_unique[['FTX_x', 'FT1_x']].max(axis=1)]
dc1xdf = final_df_unique[final_df_unique['DC1X_x'] >= final_df_unique[['DC12_x', 'DCX2_x']].max(axis=1)]
dc12df = final_df_unique[final_df_unique['DC12_x'] >= final_df_unique[['DC1X_x', 'DCX2_x']].max(axis=1)]
dcx2df = final_df_unique[final_df_unique['DCX2_x'] >= final_df_unique[['DC1X_x', 'DC12_x']].max(axis=1)]
over15df, under15df = final_df_unique[final_df_unique['1.5O_x'] >= 50], final_df_unique[final_df_unique['1.5O_x'] < 50]
over25df, under25df = final_df_unique[final_df_unique['2.5O_x'] >= 50], final_df_unique[final_df_unique['2.5O_x'] < 50]
over35df, under35df = final_df_unique[final_df_unique['3.5U_x'] < 50], final_df_unique[final_df_unique['3.5U_x'] >= 50]
over45df, under45df = final_df_unique[final_df_unique['4.5U_x'] < 50], final_df_unique[final_df_unique['4.5U_x'] >= 50]
bttsdf, ottsdf = final_df_unique[final_df_unique['BTTS_x'] >= 50], final_df_unique[final_df_unique['BTTS_x'] < 50]
ht1df = final_df_unique[final_df_unique['HT1_x'] >= final_df_unique[['HTX_x', 'HT2_x']].max(axis=1)]
htxdf = final_df_unique[final_df_unique['HTX_x'] >= final_df_unique[['HT1_x', 'HT2_x']].max(axis=1)]
ht2df = final_df_unique[final_df_unique['HT2_x'] >= final_df_unique[['HT1_x', 'HTX_x']].max(axis=1)]
ht1xdf = final_df_unique[final_df_unique['HTDC1X'] >= final_df_unique[['HTDC12', 'HTDCX2']].max(axis=1)]
ht12df = final_df_unique[final_df_unique['HTDC12'] >= final_df_unique[['HTDC1X', 'HTDCX2']].max(axis=1)]
htx2df = final_df_unique[final_df_unique['HTDCX2'] >= final_df_unique[['HTDC1X', 'HTDC12']].max(axis=1)]
htover05df, htunder05df = final_df_unique[final_df_unique['HT0.5O_x'] >= 50], final_df_unique[final_df_unique['HT0.5O_x'] < 50]
htover15df, htunder15df = final_df_unique[final_df_unique['HT1.5U_x'] < 50], final_df_unique[final_df_unique['HT1.5U_x'] >= 50]

ft1t, ft1a = calculate_threshold(ft1df['FT1_x'], ft1df['FT1'])
ftxt, ftxa = calculate_threshold(ftxdf['FTX_x'], ftxdf['FTX'])
ft2t, ft2a = calculate_threshold(ft2df['FT2_x'], ft2df['FT2'])
ft1xt, ft1xa = calculate_threshold(dc1xdf['DC1X_x'], dc1xdf['FT1X'])
ft12t, ft12a = calculate_threshold(dc12df['DC12_x'], dc12df['FT12'])
ftx2t, ftx2a = calculate_threshold(dcx2df['DCX2_x'], dcx2df['FTX2'])
over15t, over15a = calculate_threshold(over15df['1.5O_x'], over15df['1.5O'])
under15t, under15a = calculate_threshold(under15df['1.5O_x'], under15df['1.5U'])
over25t, over25a = calculate_threshold(over25df['2.5O_x'], over25df['2.5O'])
under25t, under25a = calculate_threshold(under25df['2.5O_x'], under25df['2.5U'])
over35t, over35a = calculate_threshold(over35df['3.5U_x'], over35df['3.5O'])
under35t, under35a = calculate_threshold(under35df['3.5U_x'], under35df['3.5U'])
over45t, over45a = calculate_threshold(over45df['4.5U_x'], over45df['4.5O'])
under45t, under45a = calculate_threshold(under45df['4.5U_x'], under45df['4.5U'])
bttst, bttsa = calculate_threshold(bttsdf['BTTS_x'], bttsdf['BTTS'])
ottst, ottsa = calculate_threshold(ottsdf['BTTS_x'], ottsdf['OTTS'])
ht1t, ht1a = calculate_threshold(ht1df['HT1_x'], ht1df['HT1'])
htxt, htxa = calculate_threshold(htxdf['HTX_x'], htxdf['HTX'])
ht2t, ht2a = calculate_threshold(ht2df['HT2_x'], ht2df['HT2'])
ht1xt, ht1xa = calculate_threshold(ht1xdf['HTDC1X'], ht1xdf['HT1X'])
ht12t, ht12a = calculate_threshold(ht12df['HTDC12'], ht12df['HT12'])
htx2t, htx2a = calculate_threshold(htx2df['HTDCX2'], htx2df['HTX2'])
htover05t, htover05a = calculate_threshold(htover05df['HT0.5O_x'], htover05df['HT0.5O'])
htunder05t, htunder05a = calculate_threshold(htunder05df['HT0.5O_x'], htunder05df['HT0.5U'])
htover15t, htover15a = calculate_threshold(htover15df['HT1.5U_x'], htover15df['HT1.5O'])
htunder15t, htunder15a = calculate_threshold(htunder15df['HT1.5U_x'], htunder15df['HT1.5U'])

new_ft1df, new_ftxdf, new_ft2df = ft1df[ft1df['FT1_x'] >= ft1t],ftxdf[ftxdf['FTX_x'] >= ftxt],ft2df[ft2df['FT2_x'] >= ft2t]
new_ft1xdf, new_ft12df, new_ftx2df = dc1xdf[dc1xdf['DC1X_x'] >= ft1xt],dc12df[dc12df['DC12_x'] >= ft12t],dcx2df[dcx2df['DCX2_x'] >= ftx2t] 
new_over15, new_under15 = over15df[over15df['1.5O_x'] >= over15t], under15df[under15df['1.5O_x'] <= under15t]
new_over25, new_under25 = over25df[over25df['2.5O_x'] >= over25t], under25df[under25df['2.5O_x'] <= under25t]
new_over35, new_under35 = over35df[over35df['3.5U_x'] <= over35t], under35df[under35df['3.5U_x'] >= under35t]
new_over45, new_under45 = over45df[over45df['4.5U_x'] <= over45t], under45df[under45df['4.5U_x'] >= under45t]
new_btts, new_otts = bttsdf[bttsdf['BTTS_x'] >= bttst], ottsdf[ottsdf['BTTS_x'] <= ottst]
new_ht1df, new_htxdf, new_ht2df = ht1df[ht1df['HT1_x'] >= ht1t],htxdf[htxdf['HTX_x'] >= htxt],ht2df[ht2df['HT2_x'] >= ht2t]
new_ht1xdf, new_ht12df, new_htx2df = ht1xdf[ht1xdf['HTDC1X'] >= ht1xt],ht12df[ht12df['HTDC12'] >= ht12t],htx2df[htx2df['HTDCX2'] >= htx2t]
new_htover05, new_htunder05 = htover05df[htover05df['HT0.5O_x'] >= htover05t], htunder05df[htunder05df['HT0.5O_x'] <= htunder05t]
new_htover15, new_htunder15 = htover15df[htover15df['HT1.5U_x'] <= htover15t], htunder15df[htunder15df['HT1.5U_x'] >= htunder15t]

# Store the results in a list
results = [
('FT1', ft1t, ft1a, len(new_ft1df), round(len(new_ft1df)/len(ft1df)*100,2), np.sum(new_ft1df['FT1P']) - len(new_ft1df)),
('FTX', ftxt, ftxa, len(new_ftxdf), round(len(new_ftxdf)/len(ftxdf)*100,2), np.sum(new_ftxdf['FTXP']) - len(new_ftxdf)),
('FT2', ft2t, ft2a, len(new_ft2df), round(len(new_ft2df)/len(ft2df)*100,2), np.sum(new_ft2df['FT2P']) - len(new_ft2df)),
('FT1X', ft1xt, ft1xa, len(new_ft1xdf), round(len(new_ft1xdf)/len(dc1xdf)*100,2), np.sum(new_ft1xdf['FT1XP']) - len(new_ft1xdf)),
('FT12', ft12t, ft12a, len(new_ft12df), round(len(new_ft12df)/len(dc12df)*100,2), np.sum(new_ft12df['FT12P']) - len(new_ft12df)),
('FTX2', ftx2t, ftx2a, len(new_ftx2df), round(len(new_ftx2df)/len(dcx2df)*100,2), np.sum(new_ftx2df['FTX2P']) - len(new_ftx2df)),
('1.5O', over15t, over15a, len(new_over15), round(len(new_over15)/len(over15df)*100,2), np.sum(new_over15['1.5OP'])-len(new_over15)),
('1.5U', under15t, under15a, len(new_under15), round(len(new_under15)/len(under15df)*100,2), np.sum(new_under15['1.5UP'])-len(new_under15)),
('2.5O', over25t, over25a, len(new_over25), round(len(new_over25)/len(over25df)*100,2), np.sum(new_over25['2.5OP'])-len(new_over25)),
('2.5U', under25t, under25a, len(new_under25), round(len(new_under25)/len(under25df)*100,2), np.sum(new_under25['2.5UP'])-len(new_under25)),
('3.5O', over35t, over35a, len(new_over35), round(len(new_over35)/len(over35df)*100,2), np.sum(new_over35['3.5OP'])-len(new_over35)),
('3.5U', under35t, under35a, len(new_under35), round(len(new_under35)/len(under35df)*100,2), np.sum(new_under35['3.5UP'])-len(new_under35)),
('4.5O', over45t, over45a, len(new_over45), round(len(new_over45)/len(over45df)*100,2), np.sum(new_over45['4.5OP'])-len(new_over45)),
('4.5U', under45t, under45a, len(new_under45), round(len(new_under45)/len(under45df)*100,2), np.sum(new_under45['4.5UP'])-len(new_under45)),
('BTTS', bttst, bttsa, len(new_btts), round(len(new_btts)/len(bttsdf)*100,2), np.sum(new_btts['BTTSP'])-len(new_btts)),
('OTTS', ottst, ottsa, len(new_otts), round(len(new_otts)/len(ottsdf)*100,2), np.sum(new_otts['OTTSP'])-len(new_otts)),
('HT1', ht1t, ht1a, len(new_ht1df), round(len(new_ht1df)/len(ht1df)*100,2), np.sum(new_ht1df['HT1P']) - len(new_ht1df)),
('HTX', htxt, htxa, len(new_htxdf), round(len(new_htxdf)/len(htxdf)*100,2), np.sum(new_htxdf['HTXP']) - len(new_htxdf)),
('HT2', ht2t, ht2a, len(new_ht2df), round(len(new_ht2df)/len(ht2df)*100,2), np.sum(new_ht2df['HT2P']) - len(new_ht2df)),
('HT1X', ht1xt, ht1xa, len(new_ht1xdf), round(len(new_ht1xdf)/len(ht1xdf)*100,2), np.sum(new_ht1xdf['HT1XP']) - len(new_ht1xdf)),
('HT12', ht12t, ht12a, len(new_ht12df), round(len(new_ht12df)/len(ht12df)*100,2), np.sum(new_ht12df['HT12P']) - len(new_ht12df)),
('HTX2', htx2t, htx2a, len(new_htx2df), round(len(new_htx2df)/len(htx2df)*100,2), np.sum(new_htx2df['HTX2P']) - len(new_htx2df)),
('HT0.5O', htover05t, htover05a, len(new_htover05), round(len(new_htover05)/len(htover05df)*100,2), np.sum(new_htover05['HT0.5OP'])-len(new_htover05)),
('HT0.5U', htunder05t, htunder05a, len(new_htunder05), round(len(new_htunder05)/len(htunder05df)*100,2), np.sum(new_htunder05['HT0.5UP'])-len(new_htunder05)),
('HT1.5O', htover15t, htover15a, len(new_htover15), round(len(new_htover15)/len(htover15df)*100,2), np.sum(new_htover15['HT1.5OP'])-len(new_htover15)),
('HT1.5U', htunder15t, htunder15a, len(new_htunder15), round(len(new_htunder15)/len(htunder15df)*100,2), np.sum(new_htunder15['HT1.5UP'])-len(new_htunder15))
]

# Create a DataFrame from the results
results_df = pd.DataFrame(results, columns=['Prediction', 'Threshold', 'J-Stat', 'Games', 'Games%', 'Profit'])
results_df['ROI'] = round(results_df['Profit'] / results_df['Games'] * 100, 2)
print('Number of matches: ', len(final_df_unique))
results_df

Number of matches:  1023


Unnamed: 0,Prediction,Threshold,J-Stat,Games,Games%,Profit,ROI
0,FT1,63.04,0.19,193,30.59,7.5,3.89
1,FTX,36.2,0.19,43,86.0,2.95,6.86
2,FT2,48.71,0.18,175,51.02,22.56,12.89
3,FT1X,79.04,0.14,317,72.05,-24.19,-7.63
4,FT12,72.19,0.17,314,84.41,-0.1,-0.03
5,FTX2,80.75,0.25,118,55.92,6.11,5.18
6,1.5O,72.02,0.06,587,62.18,-33.9,-5.78
7,1.5U,31.1,-1.0,23,29.11,-3.16,-13.74
8,2.5O,53.05,0.03,448,85.66,-41.77,-9.32
9,2.5U,27.51,-1.0,101,20.2,-10.13,-10.03


# Testing Best / Most Profitable Model Predictions

In [18]:
# Select columns from predictions table
predictions = ['FT1_x', 'FTX_x', 'FT2_x', 'DC1X_x', 'DC12_x', 'DCX2_x', 
               '1.5O_x', '2.5O_x', '3.5U_x', '4.5U_x', 'BTTS_x', 
               'HT1_x', 'HTX_x', 'HT2_x', 'HTDC1X', 'HTDC12', 'HTDCX2', 
               'HT0.5O_x', 'HT1.5U_x']

# Select columns from betting odds table
results = ['FT1', 'FTX', 'FT2', 'FT1X', 'FT12', 'FTX2', 
           '1.5O', '2.5O', '3.5U', '4.5U', 'BTTS',
            'HT1', 'HTX', 'HT2', 'HT1X', 'HT12', 'HTX2', 
            'HT0.5O', 'df2_HT1.5U']

# Select columns ending with 'P' (profit columns)
profits = ['FT1P', 'FTXP', 'FT2P', 'FT1XP', 'FT12P', 'FTX2P', 
           '1.5OP', '2.5OP', '3.5UP', '4.5UP', 'BTTSP',
            'HT1P', 'HTXP', 'HT2P', 'HT1XP', 'HT12P', 'HTX2P', 
            'HT0.5OP', 'HT1.5UP']

bet, percentage, profit = [], [], []

for i in range(len(final_df_unique)):
    my_list = []
    for j in predictions:
        my_list.append(final_df_unique[j].iloc[i])
    percentage.append(max(my_list))
    max_index = my_list.index(max(my_list))
    bet.append(results[max_index])
    profit_column = profits[max_index]
    profit.append(final_df_unique[profit_column].iloc[i])

# Create a DataFrame
model_recs = pd.DataFrame({
    'League': final_df_unique['League_x'],
    'Home': final_df_unique['Home'],
    'Away': final_df_unique['Away'],
    'BET': bet,
    'Percentage': percentage,
    'Profit': profit
})


print('Matches found: ', len(final_df_unique))
print(f"Correct Predictions: {len(model_recs[model_recs['Profit'] > 0])/len(model_recs)*100}")
print(f"Profit: {round(sum(model_recs['Profit']) - len(model_recs),2)} ROI: {round((sum(model_recs['Profit']) - len(model_recs)) / len(model_recs) * 100, 2)}%")
model_recs.tail()

Matches found:  1023
Correct Predictions: 81.32942326490713
Profit: -61.42 ROI: -6.0%


Unnamed: 0,League,Home,Away,BET,Percentage,Profit
1022,Sweden,Malmo FF,Hacken,HT1X,96.47,1.17
1023,Sweden,Mjallby,Varnamo,FT1X,95.65,1.13
1024,Sweden,Sirius,AIK,FTX2,88.34,1.36
1025,Usa,Portland,St. Louis City,4.5U,92.44,0.0
1026,Usa,Vancouver,Seattle,HT1X,90.44,0.0


In [19]:
final_df_unique['OTTS_x'] = 100 - final_df_unique['BTTS_x']
final_df_unique['1.5U_x'] = 100 - final_df_unique['1.5O_x']
final_df_unique['2.5U_x'] = 100 - final_df_unique['2.5O_x']
final_df_unique['3.5O_x'] = 100 - final_df_unique['3.5U_x']
final_df_unique['4.5O_x'] = 100 - final_df_unique['4.5U_x']
final_df_unique['HT0.5U_x'] = 100 - final_df_unique['HT0.5O_x']
final_df_unique['HT1.5O_x'] = 100 - final_df_unique['HT1.5U_x'] 

# Select columns starting with 'df1_'
predictions = ['FT1_x', 'FTX_x', 'FT2_x', 'DC1X_x', 'DC12_x', 'DCX2_x', 
               '1.5O_x', '1.5U_x', '2.5O_x','2.5U_x','3.5O_x', '3.5U_x', 
               '4.5O_x', '4.5U_x', 'BTTS_x', 'OTTS_x',
               'HT1_x', 'HTX_x', 'HT2_x', 'HTDC1X', 'HTDC12', 'HTDCX2', 
               'HT0.5O_x', 'HT0.5U_x', 'HT1.5O_x', 'HT1.5U_x']

# Select columns starting with 'df2_'
odds = ['FT1_y', 'FTX_y', 'FT2_y', 'DC1X_y', 'DC12_y', 'DCX2_y', 
           '1.5O_y', '1.5U', '2.5O_y','2.5U', '3.5O','3.5U_y', 
           '4.5O', '4.5U_y', 'BTTS_y', 'OTTS',
            'HT1_y', 'HTX_y', 'HT2_y', 'HT1X', 'HT12', 'HTX2', 
            'HT0.5O_y', 'HT0.5U', 'HT1.5O', 'HT1.5U_y']

# Select columns ending with 'P'
profit = ['FT1P', 'FTXP', 'FT2P', 'FT1XP', 'FT12P', 'FTX2P', 
           '1.5OP', '1.5UP', '2.5OP', '2.5UP', '3.5OP', '3.5UP', '4.5OP', '4.5UP', 
           'BTTSP', 'OTTSP', 'HT1P', 'HTXP', 'HT2P', 'HT1XP', 'HT12P', 'HTX2P', 
            'HT0.5OP', 'HT0.5UP', 'HT1.5OP','HT1.5UP']

bets, percentages, profits, difference = [], [], [], [] 
for i in range(len(final_df_unique)):
    my_list = []
    valid_indices = []  # To keep track of indices where profit is not negative
    for j in range(len(predictions)):
        pred_column = predictions[j]
        odds_column = odds[j]
        profit_column = profit[j]  # Corresponding profit column
        
        # Calculate the value
        my_value = (100 / final_df_unique[pred_column].iloc[i]) - final_df_unique[odds_column].iloc[i]
        
        # Only add to the list if the corresponding profit is non-negative
        if final_df_unique[profit_column].iloc[i] >= 0:
            my_list.append(my_value)
            valid_indices.append(j)
        else:
            my_list.append(float('-inf'))  # Set to negative infinity to ignore in max()
    
    if valid_indices:  # Ensure there is at least one valid index
        max_index = my_list.index(max(my_list))
        rec_bet = profit[max_index]
        percent_bet = predictions[max_index]
        bets.append(rec_bet)
        percentages.append(final_df_unique[percent_bet].iloc[i])
        rec_profit = profit[max_index]
        profits.append(final_df_unique[rec_profit].iloc[i])
        difference.append(round(max(my_list), 2))
    else:
        # Handle case where no valid profits are found for this match
        bets.append(None)
        percentages.append(None)
        profits.append(None)
        difference.append(None)

# Create a DataFrame
model_recs = pd.DataFrame({
    'League': final_df_unique['League_x'],
    'Home': final_df_unique['Home'],
    'Away': final_df_unique['Away'],
    'BET': bets,
    'Percentage': percentages,
    'Profit': profits,
    'Difference': difference
}).dropna()  # Drop rows with None values

print('Matches found: ', len(final_df_unique))
print(f"Correct Predictions: {len(model_recs[model_recs['Profit'] > 0])/len(model_recs)*100}")
print(f"Profit: {round(sum(model_recs['Profit']) - len(model_recs), 2)} ROI: {round((sum(model_recs['Profit']) - len(model_recs)) / len(model_recs) * 100, 2)}%")
model_recs.tail()

Matches found:  1023
Correct Predictions: 69.30596285434996
Profit: -18.6 ROI: -1.82%


Unnamed: 0,League,Home,Away,BET,Percentage,Profit,Difference
1022,Sweden,Malmo FF,Hacken,FT1P,52.6,1.62,0.28
1023,Sweden,Mjallby,Varnamo,4.5UP,60.58,1.17,0.48
1024,Sweden,Sirius,AIK,HT0.5OP,55.68,0.0,0.4
1025,Usa,Portland,St. Louis City,HT0.5OP,54.21,1.25,0.59
1026,Usa,Vancouver,Seattle,HT1.5UP,50.73,1.4,0.57


## Checking the Betting Strategy in Article 

probability(model) / probability(bookies) > r (between 1 and 1.5)

In [20]:
def find_best_threshold_for_column(df, r_col, p_col, start=1.0, stop=1.5, step=0.01):
    """
    For a given r column and probability column in df, loop through thresholds
    from start to stop (inclusive) in step increments. Compute the score for each threshold,
    and return the threshold that gives the maximum score along with that score.
    
    Score is defined as:
      (sum(probabilities) - count(probabilities)) / count(probabilities) * 100
    """
    best_score = -np.inf
    best_threshold = None
    # Create thresholds from start to stop inclusive
    for thr in np.arange(start, stop + step, step):
        # Filter the rows based on the current threshold and probability condition
        subset = df[(df[r_col] > thr) & (df[p_col] >= 0)]
        if len(subset) == 0:
            continue  # skip if no data for this threshold
        # Convert probability values from percentage to decimals by dividing by 100.
        score = ((subset[p_col]).sum() - len(subset)) / len(subset) * 100
        if score > best_score:
            best_score = score
            best_threshold = thr
            games = len(subset)
    return best_threshold, best_score, games

# Step 1: Compute r score columns (if not already computed)
for pred, odd in zip(predictions, odds):
    r_col = f'{pred}_r'
    final_df_unique[r_col] = (final_df_unique[pred] / 100) / (1 / final_df_unique[odd])

# Dictionary to store the best threshold and score for each column
results = {}

for prof, pred in zip(profit, predictions):
    r_col = f'{pred}_r'
    # profit column is taken directly from the profit list.
    profit_col = prof
    
    # Optional: check if columns exist
    if r_col not in final_df_unique.columns:
        print(f"Column {r_col} not found, skipping.")
        continue
    if profit_col not in final_df_unique.columns:
        print(f"Column {profit_col} not found, skipping.")
        continue

    best_thr, best_score, games = find_best_threshold_for_column(final_df_unique, r_col, profit_col)
    results[prof] = {'best_threshold': best_thr, 'best_score': best_score, 'games': games}

# Convert the results dictionary to a DataFrame for viewing.
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values(by="best_score", ascending=False)
results_df

Unnamed: 0,best_threshold,best_score,games
HT0.5OP,1.37,78.0,1.0
1.5OP,1.32,67.0,1.0
BTTSP,1.35,46.636364,11.0
4.5UP,1.3,36.0,1.0
FTX2P,1.48,32.4,15.0
HT12P,1.16,28.8,5.0
HT2P,1.43,25.046875,64.0
3.5UP,1.39,20.428571,14.0
FT2P,1.49,18.384615,78.0
FT12P,1.09,18.0,9.0


# Combining Different Bets

In [21]:
# Define the list of base columns and multiplier columns
ft_result_columns = ['FT1P', 'FTXP', 'FT2P', 'FT1XP', 'FT12P', 'FTX2P'] 
ft_goals_columns = ['1.5OP', '1.5UP', '2.5OP', '2.5UP', '3.5OP', '3.5UP', '4.5OP', '4.5UP', 'BTTSP', 'OTTSP']
ht_result_columns = ['HT1P', 'HTXP', 'HT2P', 'HT1XP', 'HT12P', 'HTX2P']
ht_goals_columns = ['HT0.5OP', 'HT0.5UP', 'HT1.5OP','HT1.5UP']

# List to store summary information for each new column.
summary_list = []

# Multiply each FT result column with each FT goals column.
for ft_result in ft_result_columns:
    for ft_goal in ft_goals_columns:
        new_col = f"{ft_result}_{ft_goal}"  # e.g., "FT1P_1.5OP"
        # Multiply only if both factors are >= 0; otherwise assign -1.
        final_df_unique[new_col] = np.where(
            (final_df_unique[ft_result] >= 0) & (final_df_unique[ft_goal] >= 0),
            final_df_unique[ft_result] * final_df_unique[ft_goal],
            -1
        )
        # Select only valid (>= 0) product values.
        valid_values = final_df_unique.loc[final_df_unique[new_col] >= 0, new_col]
        count_valid = len(valid_values)
        summary_value = (valid_values.sum() - count_valid) / count_valid if count_valid > 0 else np.nan
        summary_list.append({'Combination': new_col, 'ROI': summary_value, 'Games': count_valid})
        # Add a new column to store the count of valid games.
        final_df_unique[f"{new_col}_games"] = count_valid

# Multiply each HT result column with each HT goals column.
for ht_result in ht_result_columns:
    for ht_goal in ht_goals_columns:
        new_col = f"{ht_result}_{ht_goal}"  # e.g., "HT1P_HT0.5OP"
        final_df_unique[new_col] = np.where(
            (final_df_unique[ht_result] >= 0) & (final_df_unique[ht_goal] >= 0),
            final_df_unique[ht_result] * final_df_unique[ht_goal],
            -1
        )
        valid_values = final_df_unique.loc[final_df_unique[new_col] >= 0, new_col]
        count_valid = len(valid_values)
        summary_value = (valid_values.sum() - count_valid) / count_valid if count_valid > 0 else np.nan
        summary_list.append({'Combination': new_col, 'ROI': summary_value, 'Games': count_valid})
        final_df_unique[f"{new_col}_games"] = count_valid

# Extra combinations: BTTSP with 2.5OP and BTTSP with 4.5UP.
extra_combinations = [("BTTSP_2.5OP", "2.5OP"), ("BTTSP_4.5UP", "4.5UP")]

for new_col, other_col in extra_combinations:
    final_df_unique[new_col] = np.where(
        (final_df_unique["BTTSP"] >= 0) & (final_df_unique[other_col] >= 0),
        final_df_unique["BTTSP"] * final_df_unique[other_col],
        -1
    )
    valid_values = final_df_unique.loc[final_df_unique[new_col] >= 0, new_col]
    count_valid = len(valid_values)
    summary_value = (valid_values.sum() - count_valid) / count_valid if count_valid > 0 else np.nan
    summary_list.append({'Combination': new_col, 'ROI': summary_value, 'Games': count_valid})
    final_df_unique[f"{new_col}_games"] = count_valid

# Create a summary DataFrame to display the summary values and game counts.
summary_df = pd.DataFrame(summary_list)
summary_df = summary_df.sort_values(by="ROI", ascending=False)
summary_df

Unnamed: 0,Combination,ROI,Games
65,HTXP_HT0.5UP,1.175265,117
6,FT1P_4.5OP,0.5448,25
81,HTX2P_HT0.5UP,0.476296,69
76,HT12P_HT0.5OP,0.465114,22
62,HT1P_HT1.5OP,0.440058,116
68,HT2P_HT0.5OP,0.428474,142
78,HT12P_HT1.5OP,0.329643,21
60,HT1P_HT0.5OP,0.305127,302
70,HT2P_HT1.5OP,0.299457,65
84,BTTSP_2.5OP,0.251971,408


# Checking Dixon Coles Model Accuracy for FT Results
### Performs significantly worse than Classificatin Models

In [22]:
# Accuracy for each class
cols = ['FT1P', 'FTXP', 'FT2P']
ratios = {}

for col in cols:
    num_positive = (final_df_unique[col] > 0).sum()
    num_non_negative = (final_df_unique[col] >= 0).sum()
    ratio = num_positive / num_non_negative if num_non_negative != 0 else None
    ratios[col] = round(ratio, 4) if ratio is not None else None

print(ratios)

# FT1P (Subclass A) Overall Accuracy
a_preds = final_df_unique['FT1P']
a_valid = a_preds >= 0
a_correct = a_preds > 0

a_total = a_valid.sum()
a_correct_total = a_correct.sum()

# FTXP + FT2P (Subclass B)
b1 = final_df_unique['FTXP']
b2 = final_df_unique['FT2P']

b_valid = (b1 >= 0) | (b2 >= 0)
b_correct = (b1 > 0) | (b2 > 0)

b_total = b_valid.sum()
b_correct_total = b_correct.sum()

# Overall
overall_total = a_total + b_total
overall_correct = a_correct_total + b_correct_total

overall_accuracy = overall_correct / overall_total if overall_total else None

# Print all
print(f"FT1P Accuracy: {a_correct_total}/{a_total} = {a_correct_total / a_total:.4f}")
print(f"Combined FTXP+FT2P Accuracy: {b_correct_total}/{b_total} = {b_correct_total / b_total:.4f}")
print(f"Overall Accuracy: {overall_correct}/{overall_total} = {overall_accuracy:.4f}")


{'FT1P': 0.5198, 'FTXP': 0.28, 'FT2P': 0.4444}
FT1P Accuracy: 328/631 = 0.5198
Combined FTXP+FT2P Accuracy: 166/392 = 0.4235
Overall Accuracy: 494/1023 = 0.4829
