# Reading Model Predictions and Bet365 Odds

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings
from rapidfuzz import process
from datetime import datetime

# Notebook-wide settings: silence warnings (including pandas' chained-assignment
# warning) and let pandas display every row and column of a frame.
warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# UEFA results are read from a Google Sheet exported as CSV. The goal columns use
# the nullable Int64 dtype so missing scores do not turn them into floats.
csv_url = "https://docs.google.com/spreadsheets/d/1WfEG-1icUjj6k7TGePJQEXH-w0TLEIcN/export?format=csv"
uefa = pd.read_csv(
    csv_url,
    dtype=dict.fromkeys(['FTHG', 'FTAG', 'HTHG', 'HTAG'], 'Int64'),
).assign(
    # Text scores in the same layout as the scraped results: "h - a" and "(h-a)"
    FT=lambda frame: frame['FTHG'].astype(str) + ' - ' + frame['FTAG'].astype(str),
    HT=lambda frame: '(' + frame['HTHG'].astype(str) + '-' + frame['HTAG'].astype(str) + ')',
    # Total goals at full time and at half time
    FTTG=lambda frame: frame['FTHG'] + frame['FTAG'],
    HTTG=lambda frame: frame['HTHG'] + frame['HTAG'],
)

# Model output and bookmaker odds, both stored as Excel workbooks
predictions = pd.read_excel('C:/Users/99451/Desktop/MODEL/2025/dixon_coles_model_predictions/_predictions.xlsx')
bet365_odds = pd.read_excel('C:/Users/99451/Desktop/MODEL/2025/dixon_coles_model_predictions/final_odds.xlsx')

print(f"Games found: {len(predictions)} in predictions and {len(bet365_odds)} in odds dataset.")
bet365_odds.tail()

Games found: 4939 in predictions and 5167 in odds dataset.


Unnamed: 0,Home,Away,FT1,FTX,FT2,DC1X,DC12,DCX2,HT1,HTX,HT2,HT1X,HT12,HTX2,BTTS,OTTS,1.5O,1.5U,2.5O,2.5U,3.5O,3.5U,4.5O,4.5U,HT0.5O,HT0.5U,HT1.5O,HT1.5U
5162,Celje,Lugano,2.63,3.4,2.55,1.5,1.3,1.44,3.1,2.3,3.0,1.36,1.57,1.36,1.5,2.5,1.18,4.5,1.62,2.25,2.5,1.5,4.5,1.18,1.3,3.4,2.5,1.5
5163,Jagiellonia,Cercle Brugge KSV,2.05,3.4,3.5,1.29,1.3,1.73,2.63,2.3,3.75,1.25,1.57,1.44,1.53,2.38,1.2,4.33,1.7,2.1,2.63,1.44,5.0,1.17,1.33,3.25,2.5,1.5
5164,Ajax,Eintracht Frankfurt,2.45,3.75,2.63,1.44,1.25,1.53,3.0,2.3,3.2,1.36,1.57,1.36,1.53,2.38,1.2,4.5,1.67,2.2,2.63,1.5,4.5,1.2,1.3,3.4,2.5,1.5
5165,Plzen,Lazio,3.9,3.8,1.85,1.91,1.25,1.25,4.33,2.25,2.5,1.5,1.57,1.2,1.7,2.05,1.25,4.0,1.84,2.06,3.0,1.4,5.5,1.14,1.36,3.0,2.63,1.44
5166,Bodo/Glimt,Olympiacos Piraeus,2.1,3.5,3.4,1.3,1.29,1.7,2.75,2.2,3.75,1.25,1.62,1.4,1.7,2.05,1.25,4.0,1.88,2.02,3.2,1.36,5.5,1.14,1.36,3.0,2.75,1.4


# Merging the Two DataFrames via Fuzzy Team-Name Matching

In [2]:
# The team spellings found in the predictions file are the canonical ones;
# one list of distinct names per side of the fixture.
home_keys, away_keys = (
    predictions[side].unique().tolist() for side in ('Home', 'Away')
)

def get_canonical(val, canonical_list, threshold=85):
    """
    Map ``val`` onto its closest entry in ``canonical_list``.

    The best candidate is looked up with ``process.extractOne`` using
    ``threshold`` as the score cutoff. When a candidate is returned its
    text is used; otherwise ``val`` is handed back unchanged.
    """
    best = process.extractOne(val, canonical_list, score_cutoff=threshold)
    return best[0] if best else val

# Rewrite the odds file's team names to the spelling used in predictions.
# Each distinct name is fuzzy-matched once and the result is reused for every
# row carrying that name, instead of repeating the search row by row.
for side, canonical_names in (('Home', home_keys), ('Away', away_keys)):
    canonical_by_name = {
        name: get_canonical(name, canonical_names)
        for name in bet365_odds[side].unique()
    }
    bet365_odds[side] = bet365_odds[side].map(canonical_by_name)

# Inner-join predictions (suffix _x) with odds (suffix _y) on the fixture, then
# keep only the last row for each League/Home/Away combination.
merged_df = pd.merge(predictions, bet365_odds, on=['Home', 'Away'], how='inner')
merged_df = merged_df.drop_duplicates(subset=['League', 'Home', 'Away'], keep='last')
merged_df.tail()

Unnamed: 0,League,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y
4361,England5,Tamworth,Wealdstone,43.32,26.99,29.69,1-1,70.31,73.01,56.68,79.36,54.84,67.5,83.57,58.8,37.52,37.84,24.53,0-0,75.36,62.05,62.37,72.35,66.51,79.61,72.34,47.2,36.8,78.58,86.04,2.35,3.2,3.0,1.36,1.36,1.57,3.0,2.2,3.5,1.29,1.62,1.36,1.67,2.1,1.25,3.75,1.88,1.93,3.25,1.33,6.0,1.13,1.36,3.0,2.75,1.4
4362,England5,York City,Altrincham,53.69,24.91,21.38,1-1,78.6,75.07,46.29,80.55,56.72,65.67,82.23,57.51,41.1,32.0,26.6,1-0,73.1,67.7,58.6,80.32,55.75,84.3,66.89,55.24,30.3,71.66,89.91,1.73,3.4,4.75,1.18,1.3,2.0,2.38,2.25,4.75,1.17,1.57,1.53,1.75,2.0,1.25,3.75,1.83,1.98,3.0,1.36,6.0,1.13,1.36,3.0,2.75,1.4
4363,Spain2,Racing Ferrol,Burgos,24.31,37.0,38.7,0-0,61.31,63.01,75.7,47.89,21.22,92.35,97.72,30.12,22.27,60.67,17.05,0-0,82.94,39.32,77.72,43.61,89.18,48.61,60.03,14.4,23.38,96.99,93.43,3.0,2.7,2.8,1.4,1.44,1.36,4.0,1.8,3.75,1.25,1.91,1.22,2.25,1.57,1.62,2.2,3.1,1.36,6.5,1.11,17.0,1.03,1.67,2.1,4.33,1.2
4364,Saudiarabia,Al Kholood,Al Fateh,57.31,17.15,25.5,2-1,74.46,82.81,42.65,82.98,65.43,56.47,74.96,61.52,44.02,36.38,19.37,0-0,80.4,63.39,55.75,74.67,61.32,87.42,72.6,61.37,37.15,65.63,85.79,2.5,3.4,2.5,1.5,1.3,1.5,3.1,2.2,3.0,1.36,1.62,1.36,1.62,2.2,1.22,4.0,1.75,2.05,2.75,1.4,5.0,1.14,1.36,3.0,2.63,1.44
4365,Saudiarabia,Al Quadisiya,Al Ittihad,35.48,21.49,43.03,0-1,56.97,78.51,64.52,68.45,45.49,75.86,89.18,47.9,36.97,35.25,27.56,0-0,72.22,64.53,62.81,77.93,56.94,68.91,73.46,32.59,38.26,88.63,85.09,2.63,3.4,2.35,1.5,1.25,1.4,3.1,2.3,2.88,1.4,1.57,1.36,1.53,2.38,1.17,4.5,1.62,2.25,2.5,1.5,4.33,1.2,1.3,3.4,2.5,1.5


# Scraping SoccerStats For Match Results

In [3]:
liqa = ''
unique_leagues = predictions['League'].unique().tolist()

# Lower-cased league codes that are excluded from the scrape (UEFA competitions)
uefa_list = ['unl', 'uel', 'ucl', 'ufcl']
# Extra league codes with a "_2024" suffix, scraped on top of the predicted leagues
list_2024 = ['norway_2024', 'sweden_2024', 'usa_2024']
unique_leagues = list({league.lower() for league in unique_leagues if league.lower() not in uefa_list})
unique_leagues.extend(list_2024)

SCRAPED_COLUMNS = ['Date', 'League', 'Home', 'Away', 'FT', 'HT']
REQUEST_TIMEOUT_SECONDS = 30  # never let one stalled request hang the whole notebook


def _parse_full_time(score):
    """Return (home, away, total) goals from an 'h - a' string, or three 'NA' markers.

    Strings shorter than five characters or containing ':' are treated as
    "no score" without any message. Anything else that cannot be parsed is
    reported and also mapped to 'NA', so exactly one triple is produced per row.
    """
    if len(score) < 5 or ':' in score:
        return 'NA', 'NA', 'NA'
    try:
        parts = score.split(' - ')
        home_goals, away_goals = int(parts[0]), int(parts[1])
    except (IndexError, ValueError):
        print("Could not parse full-time score:", score)
        return 'NA', 'NA', 'NA'
    return home_goals, away_goals, home_goals + away_goals


def _parse_half_time(score):
    """Return (home, away, total) goals from a '(h-a)' string, or three 'NA' markers.

    'NA', '+' and '-' are known "no half-time score" markers. The score is
    split on '-' rather than read by character position, so two-digit values
    parse as well. Unparseable text is reported and mapped to 'NA'.
    Assumes the '(h-a)' layout seen in the scraped data - confirm if the site changes.
    """
    if score in ('NA', '+', '-'):
        return 'NA', 'NA', 'NA'
    try:
        home_text, away_text = score.strip('()').split('-')
        home_goals, away_goals = int(home_text), int(away_text)
    except ValueError:
        print("Last output before error occurred:", score)
        return 'NA', 'NA', 'NA'
    return home_goals, away_goals, home_goals + away_goals


def _extract_matches(table, league_label):
    """Turn the 'odd'-class rows of a results table into a list of row dicts."""
    records = []
    for row in table.find_all("tr", class_="odd"):
        right_cells = row.find_all("td", align='right')
        left_cell = row.find("td", align="left")
        centre_cells = row.find_all("td", align='center')
        # Rows without date/home/away/score cells cannot be a match row: skip them
        if len(right_cells) < 2 or left_cell is None or not centre_cells:
            continue
        records.append({
            'Date': right_cells[0].get_text(strip=True),
            'League': league_label,
            'Home': right_cells[1].get_text(strip=True),
            'Away': left_cell.get_text(strip=True),
            'FT': centre_cells[0].get_text(strip=True),
            # The half-time score is the third centred cell when it exists
            'HT': centre_cells[2].get_text(strip=True) if len(centre_cells) > 2 else 'NA',
        })
    return records


league_frames = []
for liqa in unique_leagues:
    URL = "https://www.soccerstats.com/results.asp?league=" + liqa + "&pmtype=bydate"
    page = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()  # fail with the HTTP status instead of parsing an error page
    soup = BeautifulSoup(page.content, "html.parser")
    results_table = soup.find(id="btable")
    if results_table is None:
        raise RuntimeError(f"No results table (id='btable') found for league '{liqa}' at {URL}")

    df = pd.DataFrame(_extract_matches(results_table, liqa.capitalize()), columns=SCRAPED_COLUMNS)

    # Empty cells become missing values; rows with any missing field are dropped
    df_cleaned = df.replace('', pd.NA).dropna().copy()

    # One parsed triple per remaining row keeps the new columns aligned with the frame
    full_time = [_parse_full_time(score) for score in df_cleaned['FT']]
    half_time = [_parse_half_time(score) for score in df_cleaned['HT']]

    df_cleaned['FTHG'] = [goals[0] for goals in full_time]
    df_cleaned['FTAG'] = [goals[1] for goals in full_time]
    df_cleaned['FTTG'] = [goals[2] for goals in full_time]
    df_cleaned['HTHG'] = [goals[0] for goals in half_time]
    df_cleaned['HTAG'] = [goals[1] for goals in half_time]
    # Unknown half-time scores stay 'NA' here rather than the string sum 'NANA'
    df_cleaned['HTTG'] = [goals[2] for goals in half_time]

    league_frames.append(df_cleaned)

# Build the combined frame once instead of re-copying it for every league
final = pd.concat(league_frames, ignore_index=True)

# Matches whose row had no half-time cell are discarded
final = final[final['HT'] != 'NA']

# Moment the notebook is run; default reference point for assign_year
today = datetime.now()

def assign_year(date_str, season_end_year=2025, reference=None):
    """
    Append a year to a "<day> <month abbreviation>" string such as "04 Mar".

    The date is first placed in ``season_end_year``. If that day is not later
    than ``reference`` the result is "<date_str> <season_end_year>", otherwise
    "<date_str> <season_end_year - 1>".

    Parameters
    ----------
    date_str : str or missing value
        Day and abbreviated month, parsed with the "%d %b" format.
    season_end_year : int, default 2025
        The later of the two calendar years the date may belong to.
    reference : datetime, optional
        Cut-off to compare against; defaults to the module-level ``today``.

    Returns
    -------
    str or None
        ``None`` for a missing ``date_str`` or one that is not a valid date in
        either year. A date that exists only in the earlier year (29 Feb when
        ``season_end_year`` is not a leap year) is assigned to the earlier year.
    """
    if pd.isna(date_str):  # Handle NaN values
        return None

    cutoff = today if reference is None else reference
    previous_year = season_end_year - 1

    try:
        candidate = datetime.strptime(f"{date_str} {season_end_year}", "%d %b %Y")
    except ValueError:
        # Not a valid day in season_end_year (e.g. "29 Feb"): only the earlier year can fit
        try:
            datetime.strptime(f"{date_str} {previous_year}", "%d %b %Y")
        except ValueError:
            return None
        return f"{date_str} {previous_year}"

    # A day that has already happened in season_end_year belongs to it;
    # a day still in the future must have been played the year before.
    chosen_year = season_end_year if candidate <= cutoff else previous_year
    return f"{date_str} {chosen_year}"

# Keep only the "<day> <month abbreviation>" part of the scraped date text
# (this drops the weekday and punctuation), then attach the right year.
final['Date'] = final['Date'].str.extract(r'(\d{1,2} \w{3})', expand=False)
final['Date'] = final['Date'].apply(assign_year)

# Convert to datetime; anything that still cannot be parsed becomes NaT
final['Date'] = pd.to_datetime(final['Date'], format='%d %b %Y', errors='coerce')

# Keep matches played on or after 17 September 2024 (NaT rows are dropped too).
# Work on a copy so the League clean-up below does not write into a slice of `final`.
final_filtered = final[final['Date'] >= pd.Timestamp('2024-09-17')].copy()

# Remove a trailing "_YYYY" (4-digit year) from league names such as usa, norway
# and sweden, while keeping other digits in league names
final_filtered['League'] = final_filtered['League'].str.replace(r'_\d{4}$', '', regex=True)

# Align the UEFA columns with the scraped results
uefa = uefa[final_filtered.columns].copy()
# Parse the UEFA dates so the combined Date column holds timestamps throughout
# instead of a mix of strings and timestamps.
# NOTE(review): assumes month/day/year text such as "9/17/2024" - confirm against the sheet;
# a differently formatted value raises here rather than being dropped silently.
uefa['Date'] = pd.to_datetime(uefa['Date'], format='%m/%d/%Y')

# UEFA matches first, scraped league matches after them
final_filtered = pd.concat([uefa, final_filtered], ignore_index=True)
combined = pd.concat([final_filtered.head(), final_filtered.tail()])
combined

Unnamed: 0,Date,League,Home,Away,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG
0,9/17/2024,UCL,Juventus,PSV,3 - 1,(2-0),3.0,1.0,4.0,2.0,0.0,2.0
1,9/17/2024,UCL,Young Boys,Aston Villa,0 - 3,(0-2),0.0,3.0,3.0,0.0,2.0,2.0
2,9/17/2024,UCL,Bayern,Dinamo Zagreb,9 - 2,(3-0),9.0,2.0,11.0,3.0,0.0,3.0
3,9/17/2024,UCL,Milan,Liverpool,1 - 3,(1-2),1.0,3.0,4.0,1.0,2.0,3.0
4,9/17/2024,UCL,Real Madrid,Stuttgart,3 - 1,(0-0),3.0,1.0,4.0,0.0,0.0,0.0
7598,2024-10-20 00:00:00,Usa,Houston Dynamo,LA Galaxy,2 - 1,(1-0),2.0,1.0,3.0,1.0,0.0,1.0
7599,2024-10-20 00:00:00,Usa,Los Angeles FC,SJ Earthquakes,3 - 1,(0-1),3.0,1.0,4.0,0.0,1.0,1.0
7600,2024-10-20 00:00:00,Usa,Minnesota Utd,St. Louis City,4 - 1,(1-0),4.0,1.0,5.0,1.0,0.0,1.0
7601,2024-10-20 00:00:00,Usa,Real Salt Lake,Vancouver,2 - 1,(0-0),2.0,1.0,3.0,0.0,0.0,0.0
7602,2024-10-20 00:00:00,Usa,Seattle,Portland,1 - 1,(1-0),1.0,1.0,2.0,1.0,0.0,1.0


# Merging with Predictions + Odds Dataframes

In [4]:
# Attach the actual results to each prediction/odds row by matching the fixture
final_df = merged_df.merge(final_filtered, on=['Home', 'Away'], how='inner')

# One row per League_x/Home/Away (the last occurrence wins), no rows with
# missing values, and a fresh 0..n-1 index.
final_df_unique = (
    final_df
    .drop_duplicates(subset=['League_x', 'Home', 'Away'], keep='last')
    .dropna()
    .reset_index(drop=True)
)

print('Number of games matched: ', len(final_df_unique))
final_df_unique.tail()

Number of games matched:  3985


Unnamed: 0,League_x,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y,Date,League_y,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG
3980,England5,Solihull Moors,Sutton Utd,38.6,27.26,34.14,1-1,65.86,72.74,61.4,79.49,55.02,67.33,83.44,59.45,19.99,39.87,40.05,0-0,59.86,60.04,79.92,68.93,70.17,77.5,75.14,43.94,40.53,81.09,83.54,2.35,3.4,2.8,1.4,1.3,1.57,3.1,2.1,3.6,1.29,1.67,1.36,1.8,1.95,1.33,3.25,2.05,1.75,3.5,1.29,8.0,1.08,1.44,2.63,3.25,1.33,2025-03-04 00:00:00,England5,1 - 1,(1-1),1.0,1.0,2.0,1.0,1.0,2.0
3981,England5,Southend Utd,Woking,49.1,31.2,19.7,1-0,80.3,68.8,50.9,62.35,34.0,84.58,94.16,39.69,46.43,43.22,10.24,0-0,89.65,56.67,53.46,61.94,76.35,73.27,52.39,38.01,17.06,85.26,96.05,1.73,3.6,4.5,1.18,1.29,2.05,2.4,2.1,5.0,1.14,1.67,1.53,2.0,1.75,1.33,3.25,2.08,1.73,3.75,1.25,8.0,1.08,1.44,2.63,3.25,1.33,2025-03-04 00:00:00,England5,2 - 2,(1-1),2.0,2.0,4.0,1.0,1.0,2.0
3982,England5,Tamworth,Wealdstone,43.32,26.99,29.69,1-1,70.31,73.01,56.68,79.36,54.84,67.5,83.57,58.8,37.52,37.84,24.53,0-0,75.36,62.05,62.37,72.35,66.51,79.61,72.34,47.2,36.8,78.58,86.04,2.35,3.2,3.0,1.36,1.36,1.57,3.0,2.2,3.5,1.29,1.62,1.36,1.67,2.1,1.25,3.75,1.88,1.93,3.25,1.33,6.0,1.13,1.36,3.0,2.75,1.4,2025-03-04 00:00:00,England5,4 - 1,(0-0),4.0,1.0,5.0,0.0,0.0,0.0
3983,England5,York City,Altrincham,53.69,24.91,21.38,1-1,78.6,75.07,46.29,80.55,56.72,65.67,82.23,57.51,41.1,32.0,26.6,1-0,73.1,67.7,58.6,80.32,55.75,84.3,66.89,55.24,30.3,71.66,89.91,1.73,3.4,4.75,1.18,1.3,2.0,2.38,2.25,4.75,1.17,1.57,1.53,1.75,2.0,1.25,3.75,1.83,1.98,3.0,1.36,6.0,1.13,1.36,3.0,2.75,1.4,2025-03-04 00:00:00,England5,1 - 2,(0-1),1.0,2.0,3.0,0.0,1.0,1.0
3984,Spain2,Racing Ferrol,Burgos,24.31,37.0,38.7,0-0,61.31,63.01,75.7,47.89,21.22,92.35,97.72,30.12,22.27,60.67,17.05,0-0,82.94,39.32,77.72,43.61,89.18,48.61,60.03,14.4,23.38,96.99,93.43,3.0,2.7,2.8,1.4,1.44,1.36,4.0,1.8,3.75,1.25,1.91,1.22,2.25,1.57,1.62,2.2,3.1,1.36,6.5,1.11,17.0,1.03,1.67,2.1,4.33,1.2,2025-03-05 00:00:00,Spain2,0 - 1,(0-1),0.0,1.0,1.0,0.0,1.0,1.0


# Creating Results Columns

In [5]:
import numpy as np

# Each entry is (new column, rule). A rule receives the frame and returns a
# boolean mask; the column is 1 where the mask holds and 0 elsewhere.
# Rules are applied in order, so later rules may read columns created by
# earlier ones (FT12 reads FTX, OTTS_y reads BTTS, HT12_y reads HTX).
result_rules = [
    # Full-time result and double chance
    ('FT1', lambda m: m['FTHG'] > m['FTAG']),
    ('FTX', lambda m: m['FTHG'] == m['FTAG']),
    ('FT2', lambda m: m['FTHG'] < m['FTAG']),
    ('FT1X', lambda m: m['FTHG'] >= m['FTAG']),
    ('FT12', lambda m: m['FTX'] == 0),
    ('FTX2', lambda m: m['FTHG'] <= m['FTAG']),
    # Full-time total goals over/under
    ('1.5O', lambda m: m['FTTG'] > 1.5),
    ('1.5U_y', lambda m: m['FTTG'] < 1.5),
    ('2.5O', lambda m: m['FTTG'] > 2.5),
    ('2.5U_y', lambda m: m['FTTG'] < 2.5),
    ('3.5O_y', lambda m: m['FTTG'] > 3.5),
    ('3.5U', lambda m: m['FTTG'] < 3.5),
    ('4.5O_y', lambda m: m['FTTG'] > 4.5),
    ('4.5U', lambda m: m['FTTG'] < 4.5),
    # Both teams scored / at most one team scored
    ('BTTS', lambda m: (m['FTHG'] != 0) & (m['FTAG'] != 0)),
    ('OTTS_y', lambda m: m['BTTS'] == 0),
    # Half-time result and double chance
    ('HT1', lambda m: m['HTHG'] > m['HTAG']),
    ('HTX', lambda m: m['HTHG'] == m['HTAG']),
    ('HT2', lambda m: m['HTHG'] < m['HTAG']),
    ('HT1X_y', lambda m: m['HTHG'] >= m['HTAG']),
    ('HT12_y', lambda m: m['HTX'] == 0),
    ('HTX2_y', lambda m: m['HTHG'] <= m['HTAG']),
    # Half-time total goals over/under
    ('HT0.5O', lambda m: m['HTTG'] > 0.5),
    ('HT0.5U_y', lambda m: m['HTTG'] < 0.5),
    ('HT1.5O_y', lambda m: m['HTTG'] > 1.5),
    ('HT1.5U', lambda m: m['HTTG'] < 1.5),
]

for result_column, rule in result_rules:
    final_df_unique[result_column] = np.where(rule(final_df_unique), 1, 0)

print('Games Found: ', len(final_df_unique))
final_df_unique.tail()

Games Found:  3985


Unnamed: 0,League_x,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y,Date,League_y,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG,FT1,FTX,FT2,FT1X,FT12,FTX2,1.5O,1.5U_y,2.5O,2.5U_y,3.5O_y,3.5U,4.5O_y,4.5U,BTTS,OTTS_y,HT1,HTX,HT2,HT1X_y,HT12_y,HTX2_y,HT0.5O,HT0.5U_y,HT1.5O_y,HT1.5U
3980,England5,Solihull Moors,Sutton Utd,38.6,27.26,34.14,1-1,65.86,72.74,61.4,79.49,55.02,67.33,83.44,59.45,19.99,39.87,40.05,0-0,59.86,60.04,79.92,68.93,70.17,77.5,75.14,43.94,40.53,81.09,83.54,2.35,3.4,2.8,1.4,1.3,1.57,3.1,2.1,3.6,1.29,1.67,1.36,1.8,1.95,1.33,3.25,2.05,1.75,3.5,1.29,8.0,1.08,1.44,2.63,3.25,1.33,2025-03-04 00:00:00,England5,1 - 1,(1-1),1.0,1.0,2.0,1.0,1.0,2.0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1,0,1,0
3981,England5,Southend Utd,Woking,49.1,31.2,19.7,1-0,80.3,68.8,50.9,62.35,34.0,84.58,94.16,39.69,46.43,43.22,10.24,0-0,89.65,56.67,53.46,61.94,76.35,73.27,52.39,38.01,17.06,85.26,96.05,1.73,3.6,4.5,1.18,1.29,2.05,2.4,2.1,5.0,1.14,1.67,1.53,2.0,1.75,1.33,3.25,2.08,1.73,3.75,1.25,8.0,1.08,1.44,2.63,3.25,1.33,2025-03-04 00:00:00,England5,2 - 2,(1-1),2.0,2.0,4.0,1.0,1.0,2.0,0,1,0,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1,0
3982,England5,Tamworth,Wealdstone,43.32,26.99,29.69,1-1,70.31,73.01,56.68,79.36,54.84,67.5,83.57,58.8,37.52,37.84,24.53,0-0,75.36,62.05,62.37,72.35,66.51,79.61,72.34,47.2,36.8,78.58,86.04,2.35,3.2,3.0,1.36,1.36,1.57,3.0,2.2,3.5,1.29,1.62,1.36,1.67,2.1,1.25,3.75,1.88,1.93,3.25,1.33,6.0,1.13,1.36,3.0,2.75,1.4,2025-03-04 00:00:00,England5,4 - 1,(0-0),4.0,1.0,5.0,0.0,0.0,0.0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1
3983,England5,York City,Altrincham,53.69,24.91,21.38,1-1,78.6,75.07,46.29,80.55,56.72,65.67,82.23,57.51,41.1,32.0,26.6,1-0,73.1,67.7,58.6,80.32,55.75,84.3,66.89,55.24,30.3,71.66,89.91,1.73,3.4,4.75,1.18,1.3,2.0,2.38,2.25,4.75,1.17,1.57,1.53,1.75,2.0,1.25,3.75,1.83,1.98,3.0,1.36,6.0,1.13,1.36,3.0,2.75,1.4,2025-03-04 00:00:00,England5,1 - 2,(0-1),1.0,2.0,3.0,0.0,1.0,1.0,0,0,1,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1,0,1,1,1,0,0,1
3984,Spain2,Racing Ferrol,Burgos,24.31,37.0,38.7,0-0,61.31,63.01,75.7,47.89,21.22,92.35,97.72,30.12,22.27,60.67,17.05,0-0,82.94,39.32,77.72,43.61,89.18,48.61,60.03,14.4,23.38,96.99,93.43,3.0,2.7,2.8,1.4,1.44,1.36,4.0,1.8,3.75,1.25,1.91,1.22,2.25,1.57,1.62,2.2,3.1,1.36,6.5,1.11,17.0,1.03,1.67,2.1,4.33,1.2,2025-03-05 00:00:00,Spain2,0 - 1,(0-1),0.0,1.0,1.0,0.0,1.0,1.0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,1,0,0,1


# Creating Profit Columns for Initial Model Predictions

In [6]:
# Profit columns for the model's picks. For every market outcome the value is
#   the decimal odds  when the model picked that outcome and it happened,
#   0                 when the model picked it and it did not happen,
#   -1                when the model did not pick that outcome.
# Computed column-wise from small market tables instead of one if/elif chain
# per outcome inside a row-by-row loop.

def _settle(picked, won, odds):
    """Return odds where picked and won, 0.0 where picked and lost, -1.0 where not picked."""
    return np.where(picked, np.where(won, odds, 0.0), -1.0)


def _three_way_profits(frame, probability_columns, outcomes):
    """Settle a three-outcome market.

    The pick is the outcome with the highest model probability; on ties the
    first listed outcome is taken. ``outcomes`` lists, in the same order as
    ``probability_columns``, tuples of (result column, odds column, profit column).
    """
    pick = frame[probability_columns].to_numpy(dtype=float).argmax(axis=1)
    return {
        profit_column: _settle(
            pick == position,
            frame[result_column].to_numpy() == 1,
            frame[odds_column].to_numpy(),
        )
        for position, (result_column, odds_column, profit_column) in enumerate(outcomes)
    }


def _binary_profits(frame, markets):
    """Settle threshold markets.

    ``markets`` holds tuples of (profit column, picked mask, won mask, odds column).
    """
    return {
        profit_column: _settle(picked.to_numpy(), won.to_numpy(), frame[odds_column].to_numpy())
        for profit_column, picked, won, odds_column in markets
    }


games = final_df_unique
profits = {}  # insertion order is the order in which the columns are added

# Full-time result
profits.update(_three_way_profits(
    games, ['FT1_x', 'FTX_x', 'FT2_x'],
    [('FT1', 'FT1_y', 'FT1P'), ('FTX', 'FTX_y', 'FTXP'), ('FT2', 'FT2_y', 'FT2P')],
))

# Full-time double chance
profits.update(_three_way_profits(
    games, ['DC1X_x', 'DC12_x', 'DCX2_x'],
    [('FT1X', 'DC1X_y', 'FT1XP'), ('FT12', 'DC12_y', 'FT12P'), ('FTX2', 'DCX2_y', 'FTX2P')],
))

# Full-time goal lines and both-teams-to-score. The model probability is in
# percent and 50 is the pick threshold; which side receives exactly 50 differs
# between markets and is kept as it was.
profits.update(_binary_profits(games, [
    ('1.5OP', games['1.5O_x'] >= 50, games['1.5O'] == 1, '1.5O_y'),
    ('1.5UP', games['1.5O_x'] < 50, games['1.5O'] != 1, '1.5U'),
    ('2.5OP', games['2.5O_x'] >= 50, games['2.5O'] == 1, '2.5O_y'),
    ('2.5UP', games['2.5O_x'] < 50, games['2.5O'] != 1, '2.5U'),
    ('3.5OP', games['3.5U_x'] <= 50, games['3.5U'] != 1, '3.5O'),
    ('3.5UP', games['3.5U_x'] > 50, games['3.5U'] == 1, '3.5U_y'),
    ('4.5OP', games['4.5U_x'] <= 50, games['4.5U'] != 1, '4.5O'),
    ('4.5UP', games['4.5U_x'] > 50, games['4.5U'] == 1, '4.5U_y'),
    ('BTTSP', games['BTTS_x'] >= 50, games['BTTS'] == 1, 'BTTS_y'),
    ('OTTSP', games['BTTS_x'] < 50, games['BTTS'] != 1, 'OTTS'),
]))

# Half-time result
profits.update(_three_way_profits(
    games, ['HT1_x', 'HTX_x', 'HT2_x'],
    [('HT1', 'HT1_y', 'HT1P'), ('HTX', 'HTX_y', 'HTXP'), ('HT2', 'HT2_y', 'HT2P')],
))

# Half-time double chance (here the "_y" columns are the results and the
# unsuffixed columns are the odds)
profits.update(_three_way_profits(
    games, ['HTDC1X', 'HTDC12', 'HTDCX2'],
    [('HT1X_y', 'HT1X', 'HT1XP'), ('HT12_y', 'HT12', 'HT12P'), ('HTX2_y', 'HTX2', 'HTX2P')],
))

# Half-time goal lines
profits.update(_binary_profits(games, [
    ('HT0.5OP', games['HT0.5O_x'] >= 50, games['HT0.5O'] == 1, 'HT0.5O_y'),
    ('HT0.5UP', games['HT0.5O_x'] < 50, games['HT0.5O'] != 1, 'HT0.5U'),
    ('HT1.5OP', games['HT1.5U_x'] < 50, games['HT1.5U'] != 1, 'HT1.5O'),
    ('HT1.5UP', games['HT1.5U_x'] >= 50, games['HT1.5U'] == 1, 'HT1.5U_y'),
]))

for profit_column, values in profits.items():
    final_df_unique[profit_column] = values

print('Games Found: ', len(final_df_unique))
final_df_unique.tail()

Games Found:  3985


Unnamed: 0,League_x,Home,Away,FT1_x,FTX_x,FT2_x,FTR,DC1X_x,DC12_x,DCX2_x,1.5O_x,2.5O_x,3.5U_x,4.5U_x,BTTS_x,HT1_x,HTX_x,HT2_x,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O_x,HT1.5U_x,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,FT1_y,FTX_y,FT2_y,DC1X_y,DC12_y,DCX2_y,HT1_y,HTX_y,HT2_y,HT1X,HT12,HTX2,BTTS_y,OTTS,1.5O_y,1.5U,2.5O_y,2.5U,3.5O,3.5U_y,4.5O,4.5U_y,HT0.5O_y,HT0.5U,HT1.5O,HT1.5U_y,Date,League_y,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG,FT1,FTX,FT2,FT1X,FT12,FTX2,1.5O,1.5U_y,2.5O,2.5U_y,3.5O_y,3.5U,4.5O_y,4.5U,BTTS,OTTS_y,HT1,HTX,HT2,HT1X_y,HT12_y,HTX2_y,HT0.5O,HT0.5U_y,HT1.5O_y,HT1.5U,FT1P,FTXP,FT2P,FT1XP,FT12P,FTX2P,1.5OP,1.5UP,2.5OP,2.5UP,3.5OP,3.5UP,4.5OP,4.5UP,BTTSP,OTTSP,HT1P,HTXP,HT2P,HT1XP,HT12P,HTX2P,HT0.5OP,HT0.5UP,HT1.5OP,HT1.5UP
3980,England5,Solihull Moors,Sutton Utd,38.6,27.26,34.14,1-1,65.86,72.74,61.4,79.49,55.02,67.33,83.44,59.45,19.99,39.87,40.05,0-0,59.86,60.04,79.92,68.93,70.17,77.5,75.14,43.94,40.53,81.09,83.54,2.35,3.4,2.8,1.4,1.3,1.57,3.1,2.1,3.6,1.29,1.67,1.36,1.8,1.95,1.33,3.25,2.05,1.75,3.5,1.29,8.0,1.08,1.44,2.63,3.25,1.33,2025-03-04 00:00:00,England5,1 - 1,(1-1),1.0,1.0,2.0,1.0,1.0,2.0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1,0,0,1,0,1,0,1,1,0,1,0,0.0,-1.0,-1.0,-1.0,0.0,-1.0,1.33,-1.0,0.0,-1.0,-1.0,1.29,-1.0,1.08,1.8,-1.0,-1.0,-1.0,0.0,-1.0,-1.0,1.36,1.44,-1.0,-1.0,0.0
3981,England5,Southend Utd,Woking,49.1,31.2,19.7,1-0,80.3,68.8,50.9,62.35,34.0,84.58,94.16,39.69,46.43,43.22,10.24,0-0,89.65,56.67,53.46,61.94,76.35,73.27,52.39,38.01,17.06,85.26,96.05,1.73,3.6,4.5,1.18,1.29,2.05,2.4,2.1,5.0,1.14,1.67,1.53,2.0,1.75,1.33,3.25,2.08,1.73,3.75,1.25,8.0,1.08,1.44,2.63,3.25,1.33,2025-03-04 00:00:00,England5,2 - 2,(1-1),2.0,2.0,4.0,1.0,1.0,2.0,0,1,0,1,0,1,1,0,1,0,1,0,0,1,1,0,0,1,0,1,0,1,1,0,1,0,0.0,-1.0,-1.0,1.18,-1.0,-1.0,1.33,-1.0,-1.0,0.0,-1.0,0.0,-1.0,1.08,-1.0,0.0,0.0,-1.0,-1.0,1.14,-1.0,-1.0,1.44,-1.0,-1.0,0.0
3982,England5,Tamworth,Wealdstone,43.32,26.99,29.69,1-1,70.31,73.01,56.68,79.36,54.84,67.5,83.57,58.8,37.52,37.84,24.53,0-0,75.36,62.05,62.37,72.35,66.51,79.61,72.34,47.2,36.8,78.58,86.04,2.35,3.2,3.0,1.36,1.36,1.57,3.0,2.2,3.5,1.29,1.62,1.36,1.67,2.1,1.25,3.75,1.88,1.93,3.25,1.33,6.0,1.13,1.36,3.0,2.75,1.4,2025-03-04 00:00:00,England5,4 - 1,(0-0),4.0,1.0,5.0,0.0,0.0,0.0,1,0,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,0,1,0,1,2.35,-1.0,-1.0,-1.0,1.36,-1.0,1.25,-1.0,1.88,-1.0,-1.0,0.0,-1.0,0.0,1.67,-1.0,-1.0,2.2,-1.0,1.29,-1.0,-1.0,0.0,-1.0,-1.0,1.4
3983,England5,York City,Altrincham,53.69,24.91,21.38,1-1,78.6,75.07,46.29,80.55,56.72,65.67,82.23,57.51,41.1,32.0,26.6,1-0,73.1,67.7,58.6,80.32,55.75,84.3,66.89,55.24,30.3,71.66,89.91,1.73,3.4,4.75,1.18,1.3,2.0,2.38,2.25,4.75,1.17,1.57,1.53,1.75,2.0,1.25,3.75,1.83,1.98,3.0,1.36,6.0,1.13,1.36,3.0,2.75,1.4,2025-03-04 00:00:00,England5,1 - 2,(0-1),1.0,2.0,3.0,0.0,1.0,1.0,0,0,1,0,1,1,1,0,1,0,0,1,0,1,1,0,0,0,1,0,1,1,1,0,0,1,0.0,-1.0,-1.0,0.0,-1.0,-1.0,1.25,-1.0,1.83,-1.0,-1.0,1.36,-1.0,1.13,1.75,-1.0,0.0,-1.0,-1.0,0.0,-1.0,-1.0,1.36,-1.0,-1.0,1.4
3984,Spain2,Racing Ferrol,Burgos,24.31,37.0,38.7,0-0,61.31,63.01,75.7,47.89,21.22,92.35,97.72,30.12,22.27,60.67,17.05,0-0,82.94,39.32,77.72,43.61,89.18,48.61,60.03,14.4,23.38,96.99,93.43,3.0,2.7,2.8,1.4,1.44,1.36,4.0,1.8,3.75,1.25,1.91,1.22,2.25,1.57,1.62,2.2,3.1,1.36,6.5,1.11,17.0,1.03,1.67,2.1,4.33,1.2,2025-03-05 00:00:00,Spain2,0 - 1,(0-1),0.0,1.0,1.0,0.0,1.0,1.0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,0,1,0,1,1,1,0,0,1,-1.0,-1.0,2.8,-1.0,-1.0,1.36,-1.0,2.2,-1.0,1.36,-1.0,1.11,-1.0,1.03,-1.0,1.57,-1.0,0.0,-1.0,0.0,-1.0,-1.0,-1.0,0.0,-1.0,1.2


# Checking For ROI of Profit Columns

In [7]:
# Profit columns are the ones whose name ends with 'P'
columns = [name for name in final_df_unique.columns if name.endswith('P')]

# For each profit column keep only the games where a bet was placed (value >= 0;
# -1 marks "not picked")
placed_bets = {
    col: final_df_unique.loc[final_df_unique[col] >= 0, col]
    for col in columns
}

# Games = number of bets placed.
# ROI   = (total returned - number of unit stakes) / number of unit stakes, in percent.
games_list = [len(bets) for bets in placed_bets.values()]
results = [
    round((np.sum(bets) - len(bets)) / len(bets) * 100, 2)
    for bets in placed_bets.values()
]

# One row per profit column
results_df = pd.DataFrame({
    'Column': columns,
    'ROI': results,
    'Games': games_list
})
results_df

Unnamed: 0,Column,ROI,Games
0,FT1P,-5.4,2267
1,FTXP,-5.97,271
2,FT2P,-10.82,1447
3,FT1XP,-3.24,1762
4,FT12P,-6.06,1159
5,FTX2P,-6.49,1064
6,1.5OP,-5.95,3669
7,1.5UP,0.09,316
8,2.5OP,-6.15,1865
9,2.5UP,-5.15,2120


# ROI of Profit Columns According To Leagues

In [8]:
# Only leagues with at least 10 matched games are analysed
league_counts = final_df_unique['League_x'].value_counts()
leagues_with_10_games = league_counts.index[league_counts >= 10]
filtered_df = final_df_unique[final_df_unique['League_x'].isin(leagues_with_10_games)]

# Per league: ROI (in percent) of every profit column over the games where a
# bet was placed (value >= 0), followed by the league's number of games.
grouped_results = {}
for league, group in filtered_df.groupby('League_x'):
    league_bets = {col: group.loc[group[col] >= 0, col] for col in columns}
    group_results = {
        col: round((np.sum(bets) - len(bets)) / len(bets) * 100, 2)
        for col, bets in league_bets.items()
    }
    group_results['Games'] = len(group)
    grouped_results[league] = group_results

# Leagues as rows, profit columns (plus Games) as columns
grouped_results_df = pd.DataFrame(grouped_results).T

# Define a function to apply conditional formatting
def highlight_positive(val):
    """Return the CSS for one cell: red background for positive numbers, else no style."""
    # Non-numeric values and NaN never satisfy the check and stay unstyled.
    if isinstance(val, (int, float)) and val > 0:
        return 'background-color: red'
    return ''

# Style only the ROI columns. 'Games' is a row count that is always positive,
# so highlighting it (or printing it with two decimals) would make every
# league look like it has a positive result in that column.
roi_columns = [name for name in grouped_results_df.columns if name != 'Games']

styled_df = (
    grouped_results_df.style
    .applymap(highlight_positive, subset=roi_columns)  # red background for positive ROI
    .format("{:.2f}", subset=roi_columns)              # ROI with two decimals
    .format("{:.0f}", subset=['Games'])                # game counts as whole numbers
)

# Save the styled DataFrame to Excel (written to the current working directory).
styled_df.to_excel("ROI_leagues.xlsx", index=True)

# Display the styled DataFrame
styled_df

Unnamed: 0,FT1P,FTXP,FT2P,FT1XP,FT12P,FTX2P,1.5OP,1.5UP,2.5OP,2.5UP,3.5OP,3.5UP,4.5OP,4.5UP,BTTSP,OTTSP,HT1P,HTXP,HT2P,HT1XP,HT12P,HTX2P,HT0.5OP,HT0.5UP,HT1.5OP,HT1.5UP,Games
Argentina,-38.38,0.0,275.0,-5.57,-100.0,-23.5,5.15,-43.75,-100.0,-8.5,,-5.12,,-8.0,-100.0,10.38,-100.0,-4.58,333.0,-7.77,,38.75,21.08,39.6,,-12.53,17.0
Australia,-29.33,,-15.1,18.2,-3.33,30.6,-8.11,,5.0,76.2,-41.29,-25.5,,-0.63,-21.62,-35.0,-0.33,13.33,45.43,7.67,,-6.1,-20.0,,24.6,-10.79,19.0
Austria,-36.55,-22.5,-25.47,-5.92,9.11,7.0,1.51,300.0,26.1,5.21,16.5,6.85,-100.0,-2.16,-17.14,-24.71,-50.09,-5.63,-23.57,0.07,,-7.96,2.1,62.78,16.64,5.92,50.0
Belgium,8.02,5.38,-40.11,8.58,-36.12,-7.46,-8.61,-35.0,-14.29,4.21,-31.42,6.75,-100.0,0.44,-22.58,-5.81,-6.23,-20.6,-26.88,3.96,-100.0,-11.04,-6.2,23.29,-77.37,-0.83,105.0
Brazil,-24.6,-100.0,-19.82,-3.09,5.46,-10.22,-0.64,175.0,-5.52,-12.92,,-4.05,,-0.03,9.0,-22.83,-2.62,2.58,,-10.99,,4.86,0.29,41.29,-100.0,-10.0,106.0
Denmark,8.76,75.0,-22.68,12.32,-29.56,6.94,-0.67,,-3.9,12.36,5.33,-17.06,-100.0,-7.41,25.63,12.2,43.41,29.21,-38.0,3.33,-24.25,19.71,-11.74,0.0,49.5,17.38,45.0
England,-3.94,-26.67,-16.41,-9.13,17.71,-2.57,-0.8,-100.0,-8.63,-15.32,-60.33,-9.95,-100.0,0.5,-2.04,-13.63,-9.21,-8.97,38.76,-7.27,34.0,-18.69,6.69,-2.78,4.39,-0.9,119.0
England2,-8.84,13.46,-19.95,2.22,-9.62,2.73,-4.84,14.46,-12.7,7.22,-46.55,0.5,,1.17,-6.6,-5.79,6.32,-9.37,-67.5,-1.11,,-13.57,-4.92,16.96,39.87,-4.8,235.0
England3,1.09,49.85,-9.01,-5.64,-6.9,-9.44,-7.55,4.64,-2.67,2.27,-30.32,0.29,-46.43,-1.99,2.21,3.88,-26.65,-9.69,2.27,-3.98,15.0,-11.11,-8.54,-14.7,0.0,-4.25,243.0
England4,-5.05,61.67,-17.88,-4.4,-4.12,6.29,-12.58,-28.18,-14.3,-4.67,9.58,1.85,116.67,0.13,-20.7,-15.1,-12.14,-0.89,-51.58,-8.44,-16.0,-7.34,-12.24,-28.91,-21.85,0.6,219.0


# Creating Optimum Threshold for Each Prediction Column

In [9]:
# Threshold search used below to decide from which model percentage a bet is worth taking.
def calculate_threshold(percentages, predictions, higher_is_positive=True):
    """Find the cut-off on `percentages` that maximises Youden's J statistic.

    Every distinct value of `percentages` is tried as a threshold. Rows on the
    "positive" side of the threshold are treated as predicted 1, the rest as
    predicted 0, and J = sensitivity + specificity - 1 is computed against
    `predictions`.

    Parameters
    ----------
    percentages : array-like
        Scores to threshold (converted to a pandas Series).
    predictions : array-like
        Ground-truth labels; only values equal to 1 (positive) and 0 (negative)
        are counted. If the labels contain neither 0 nor 1, both rates fall
        back to 0 and the returned J is -1.0.
    higher_is_positive : bool, default True
        True  -> a row is predicted positive when ``percentage >= threshold``.
        False -> a row is predicted positive when ``percentage <= threshold``
        (for scores where a lower value means the outcome is more likely).

    Returns
    -------
    tuple
        ``(best_threshold, best_j)`` with J rounded to 2 decimals. The first
        threshold reaching the maximum J wins. For empty input the result is
        ``(0, -inf)``.
    """
    # Ensure inputs are pandas Series
    percentages = pd.Series(percentages)
    predictions = pd.Series(predictions)

    # Label masks and class sizes do not depend on the threshold: compute once.
    is_hit = predictions == 1
    is_miss = predictions == 0
    n_hits = is_hit.sum()
    n_misses = is_miss.sum()

    best_threshold = 0
    best_j_stat = -np.inf  # Start with negative infinity for comparison

    for threshold in percentages.unique():
        # Rows predicted as 1 for this candidate threshold.
        if higher_is_positive:
            flagged = percentages >= threshold
        else:
            flagged = percentages <= threshold

        true_positives = (flagged & is_hit).sum()
        true_negatives = (~flagged & is_miss).sum()

        # Sensitivity (recall) and specificity; 0 when the class is absent.
        sensitivity = true_positives / n_hits if n_hits > 0 else 0
        specificity = true_negatives / n_misses if n_misses > 0 else 0

        # Youden's J statistic
        j_stat = sensitivity + specificity - 1

        # Strict '>' keeps the first threshold that reaches the best J.
        if j_stat > best_j_stat:
            best_j_stat = j_stat
            best_threshold = threshold

    return best_threshold, round(best_j_stat, 2)

# Select only numeric columns
numeric_columns = final_df_unique.select_dtypes(include=[np.number])

# Remove rows where any numeric value is greater than 100.
# Rows with a missing numeric value are removed as well (NaN <= 100 is False).
# NOTE(review): this inspects *every* numeric column. The "Combining Different
# Bets" cell adds '<combo>_games' count columns to final_df_unique; re-running
# this cell afterwards would drop all rows wherever such a count exceeds 100.
final_df_unique = final_df_unique[(numeric_columns <= 100).all(axis=1)]


def _low_side_threshold(percentages, outcomes):
    """Youden threshold for markets that are selected with `percentage <= threshold`.

    calculate_threshold flags a row when `value >= threshold`. Feeding it the
    negated percentages turns that rule into `percentage <= threshold`, which
    is the rule used when the rows are filtered further down. Negation is
    exact in floating point, so the returned threshold is one of the observed
    percentages.
    """
    negated_threshold, j_stat = calculate_threshold(-percentages, outcomes)
    return -negated_threshold, j_stat


def _summary_row(label, threshold, j_stat, selected, pool, profit_col):
    """Build one result tuple: (label, threshold, J, games, games %, net profit)."""
    # Share of the model's picks that survive the threshold; NaN for an empty pool.
    share = round(len(selected) / len(pool) * 100, 2) if len(pool) else np.nan
    # One unit is staked per selected game.
    net_profit = np.sum(selected[profit_col]) - len(selected)
    return (label, threshold, j_stat, len(selected), share, net_profit)


#Selecting dataframes with model predictions
ft1df = final_df_unique[final_df_unique['FT1_x'] >= final_df_unique[['FTX_x', 'FT2_x']].max(axis=1)]
ftxdf = final_df_unique[final_df_unique['FTX_x'] >= final_df_unique[['FT1_x', 'FT2_x']].max(axis=1)]
ft2df = final_df_unique[final_df_unique['FT2_x'] >= final_df_unique[['FTX_x', 'FT1_x']].max(axis=1)]
dc1xdf = final_df_unique[final_df_unique['DC1X_x'] >= final_df_unique[['DC12_x', 'DCX2_x']].max(axis=1)]
dc12df = final_df_unique[final_df_unique['DC12_x'] >= final_df_unique[['DC1X_x', 'DCX2_x']].max(axis=1)]
dcx2df = final_df_unique[final_df_unique['DCX2_x'] >= final_df_unique[['DC1X_x', 'DC12_x']].max(axis=1)]
over15df, under15df = final_df_unique[final_df_unique['1.5O_x'] >= 50], final_df_unique[final_df_unique['1.5O_x'] < 50]
over25df, under25df = final_df_unique[final_df_unique['2.5O_x'] >= 50], final_df_unique[final_df_unique['2.5O_x'] < 50]
over35df, under35df = final_df_unique[final_df_unique['3.5U_x'] < 50], final_df_unique[final_df_unique['3.5U_x'] >= 50]
over45df, under45df = final_df_unique[final_df_unique['4.5U_x'] < 50], final_df_unique[final_df_unique['4.5U_x'] >= 50]
bttsdf, ottsdf = final_df_unique[final_df_unique['BTTS_x'] >= 50], final_df_unique[final_df_unique['BTTS_x'] < 50]
ht1df = final_df_unique[final_df_unique['HT1_x'] >= final_df_unique[['HTX_x', 'HT2_x']].max(axis=1)]
htxdf = final_df_unique[final_df_unique['HTX_x'] >= final_df_unique[['HT1_x', 'HT2_x']].max(axis=1)]
ht2df = final_df_unique[final_df_unique['HT2_x'] >= final_df_unique[['HT1_x', 'HTX_x']].max(axis=1)]
ht1xdf = final_df_unique[final_df_unique['HTDC1X'] >= final_df_unique[['HTDC12', 'HTDCX2']].max(axis=1)]
ht12df = final_df_unique[final_df_unique['HTDC12'] >= final_df_unique[['HTDC1X', 'HTDCX2']].max(axis=1)]
htx2df = final_df_unique[final_df_unique['HTDCX2'] >= final_df_unique[['HTDC1X', 'HTDC12']].max(axis=1)]
htover05df, htunder05df = final_df_unique[final_df_unique['HT0.5O_x'] >= 50], final_df_unique[final_df_unique['HT0.5O_x'] < 50]
htover15df, htunder15df = final_df_unique[final_df_unique['HT1.5U_x'] < 50], final_df_unique[final_df_unique['HT1.5U_x'] >= 50]

# Thresholds. Markets whose rows are later kept with `>= threshold` use
# calculate_threshold directly; markets kept with `<= threshold` (the model
# percentage is for the opposite outcome) use _low_side_threshold so that the
# threshold is optimised for the same rule that is applied when filtering.
# NOTE(review): the recorded output shows J-Stat = -1.0 for 1.5U and 2.5U,
# which only happens when the label column contains neither 0 nor 1. The same
# un-suffixed names ('1.5U', '2.5U', '3.5O', '4.5O', 'OTTS', 'HT1X', 'HT12',
# 'HTX2', 'HT0.5U', 'HT1.5O') are read as bookmaker odds in a later cell -
# confirm which columns hold the 0/1 outcomes before trusting these rows.
ft1t, ft1a = calculate_threshold(ft1df['FT1_x'], ft1df['FT1'])
ftxt, ftxa = calculate_threshold(ftxdf['FTX_x'], ftxdf['FTX'])
ft2t, ft2a = calculate_threshold(ft2df['FT2_x'], ft2df['FT2'])
ft1xt, ft1xa = calculate_threshold(dc1xdf['DC1X_x'], dc1xdf['FT1X'])
ft12t, ft12a = calculate_threshold(dc12df['DC12_x'], dc12df['FT12'])
ftx2t, ftx2a = calculate_threshold(dcx2df['DCX2_x'], dcx2df['FTX2'])
over15t, over15a = calculate_threshold(over15df['1.5O_x'], over15df['1.5O'])
under15t, under15a = _low_side_threshold(under15df['1.5O_x'], under15df['1.5U'])
over25t, over25a = calculate_threshold(over25df['2.5O_x'], over25df['2.5O'])
under25t, under25a = _low_side_threshold(under25df['2.5O_x'], under25df['2.5U'])
over35t, over35a = _low_side_threshold(over35df['3.5U_x'], over35df['3.5O'])
under35t, under35a = calculate_threshold(under35df['3.5U_x'], under35df['3.5U'])
over45t, over45a = _low_side_threshold(over45df['4.5U_x'], over45df['4.5O'])
under45t, under45a = calculate_threshold(under45df['4.5U_x'], under45df['4.5U'])
bttst, bttsa = calculate_threshold(bttsdf['BTTS_x'], bttsdf['BTTS'])
ottst, ottsa = _low_side_threshold(ottsdf['BTTS_x'], ottsdf['OTTS'])
ht1t, ht1a = calculate_threshold(ht1df['HT1_x'], ht1df['HT1'])
htxt, htxa = calculate_threshold(htxdf['HTX_x'], htxdf['HTX'])
ht2t, ht2a = calculate_threshold(ht2df['HT2_x'], ht2df['HT2'])
ht1xt, ht1xa = calculate_threshold(ht1xdf['HTDC1X'], ht1xdf['HT1X'])
ht12t, ht12a = calculate_threshold(ht12df['HTDC12'], ht12df['HT12'])
htx2t, htx2a = calculate_threshold(htx2df['HTDCX2'], htx2df['HTX2'])
htover05t, htover05a = calculate_threshold(htover05df['HT0.5O_x'], htover05df['HT0.5O'])
htunder05t, htunder05a = _low_side_threshold(htunder05df['HT0.5O_x'], htunder05df['HT0.5U'])
htover15t, htover15a = _low_side_threshold(htover15df['HT1.5U_x'], htover15df['HT1.5O'])
htunder15t, htunder15a = calculate_threshold(htunder15df['HT1.5U_x'], htunder15df['HT1.5U'])

# Keep only the picks on the favourable side of each threshold.
new_ft1df, new_ftxdf, new_ft2df = ft1df[ft1df['FT1_x'] >= ft1t],ftxdf[ftxdf['FTX_x'] >= ftxt],ft2df[ft2df['FT2_x'] >= ft2t]
new_ft1xdf, new_ft12df, new_ftx2df = dc1xdf[dc1xdf['DC1X_x'] >= ft1xt],dc12df[dc12df['DC12_x'] >= ft12t],dcx2df[dcx2df['DCX2_x'] >= ftx2t]
new_over15, new_under15 = over15df[over15df['1.5O_x'] >= over15t], under15df[under15df['1.5O_x'] <= under15t]
new_over25, new_under25 = over25df[over25df['2.5O_x'] >= over25t], under25df[under25df['2.5O_x'] <= under25t]
new_over35, new_under35 = over35df[over35df['3.5U_x'] <= over35t], under35df[under35df['3.5U_x'] >= under35t]
new_over45, new_under45 = over45df[over45df['4.5U_x'] <= over45t], under45df[under45df['4.5U_x'] >= under45t]
new_btts, new_otts = bttsdf[bttsdf['BTTS_x'] >= bttst], ottsdf[ottsdf['BTTS_x'] <= ottst]
new_ht1df, new_htxdf, new_ht2df = ht1df[ht1df['HT1_x'] >= ht1t],htxdf[htxdf['HTX_x'] >= htxt],ht2df[ht2df['HT2_x'] >= ht2t]
new_ht1xdf, new_ht12df, new_htx2df = ht1xdf[ht1xdf['HTDC1X'] >= ht1xt],ht12df[ht12df['HTDC12'] >= ht12t],htx2df[htx2df['HTDCX2'] >= htx2t]
new_htover05, new_htunder05 = htover05df[htover05df['HT0.5O_x'] >= htover05t], htunder05df[htunder05df['HT0.5O_x'] <= htunder05t]
new_htover15, new_htunder15 = htover15df[htover15df['HT1.5U_x'] <= htover15t], htunder15df[htunder15df['HT1.5U_x'] >= htunder15t]

# Store the results in a list: one row per market.
results = [
    _summary_row('FT1', ft1t, ft1a, new_ft1df, ft1df, 'FT1P'),
    _summary_row('FTX', ftxt, ftxa, new_ftxdf, ftxdf, 'FTXP'),
    _summary_row('FT2', ft2t, ft2a, new_ft2df, ft2df, 'FT2P'),
    _summary_row('FT1X', ft1xt, ft1xa, new_ft1xdf, dc1xdf, 'FT1XP'),
    _summary_row('FT12', ft12t, ft12a, new_ft12df, dc12df, 'FT12P'),
    _summary_row('FTX2', ftx2t, ftx2a, new_ftx2df, dcx2df, 'FTX2P'),
    _summary_row('1.5O', over15t, over15a, new_over15, over15df, '1.5OP'),
    _summary_row('1.5U', under15t, under15a, new_under15, under15df, '1.5UP'),
    _summary_row('2.5O', over25t, over25a, new_over25, over25df, '2.5OP'),
    _summary_row('2.5U', under25t, under25a, new_under25, under25df, '2.5UP'),
    _summary_row('3.5O', over35t, over35a, new_over35, over35df, '3.5OP'),
    _summary_row('3.5U', under35t, under35a, new_under35, under35df, '3.5UP'),
    _summary_row('4.5O', over45t, over45a, new_over45, over45df, '4.5OP'),
    _summary_row('4.5U', under45t, under45a, new_under45, under45df, '4.5UP'),
    _summary_row('BTTS', bttst, bttsa, new_btts, bttsdf, 'BTTSP'),
    _summary_row('OTTS', ottst, ottsa, new_otts, ottsdf, 'OTTSP'),
    _summary_row('HT1', ht1t, ht1a, new_ht1df, ht1df, 'HT1P'),
    _summary_row('HTX', htxt, htxa, new_htxdf, htxdf, 'HTXP'),
    _summary_row('HT2', ht2t, ht2a, new_ht2df, ht2df, 'HT2P'),
    _summary_row('HT1X', ht1xt, ht1xa, new_ht1xdf, ht1xdf, 'HT1XP'),
    _summary_row('HT12', ht12t, ht12a, new_ht12df, ht12df, 'HT12P'),
    _summary_row('HTX2', htx2t, htx2a, new_htx2df, htx2df, 'HTX2P'),
    _summary_row('HT0.5O', htover05t, htover05a, new_htover05, htover05df, 'HT0.5OP'),
    _summary_row('HT0.5U', htunder05t, htunder05a, new_htunder05, htunder05df, 'HT0.5UP'),
    _summary_row('HT1.5O', htover15t, htover15a, new_htover15, htover15df, 'HT1.5OP'),
    _summary_row('HT1.5U', htunder15t, htunder15a, new_htunder15, htunder15df, 'HT1.5UP'),
]

# Create a DataFrame from the results
results_df = pd.DataFrame(results, columns=['Prediction', 'Threshold', 'J-Stat', 'Games', 'Games%', 'Profit'])
results_df['ROI'] = round(results_df['Profit'] / results_df['Games'] * 100, 2)
print('Number of matches: ', len(final_df_unique))
results_df

Number of matches:  3943


Unnamed: 0,Prediction,Threshold,J-Stat,Games,Games%,Profit,ROI
0,FT1,59.23,0.18,891,39.64,15.66,1.76
1,FTX,51.02,0.05,42,16.47,6.95,16.55
2,FT2,48.26,0.1,769,53.33,-62.58,-8.14
3,FT1X,79.03,0.14,1208,69.71,-26.38,-2.18
4,FT12,75.59,0.07,643,55.53,-21.84,-3.4
5,FTX2,74.65,0.11,802,76.24,-36.68,-4.57
6,1.5O,76.26,0.07,1710,47.11,-98.32,-5.75
7,1.5U,30.21,-1.0,46,14.7,4.06,8.83
8,2.5O,64.8,0.06,765,41.4,-36.42,-4.76
9,2.5U,7.67,-1.0,34,1.62,-1.49,-4.38


# Testing Best / Most Profitable Model Predictions

In [10]:
# Model percentage columns that compete for the "most confident" pick.
predictions = ['FT1_x', 'FTX_x', 'FT2_x', 'DC1X_x', 'DC12_x', 'DCX2_x', 
               '1.5O_x', '2.5O_x', '3.5U_x', '4.5U_x', 'BTTS_x', 
               'HT1_x', 'HTX_x', 'HT2_x', 'HTDC1X', 'HTDC12', 'HTDCX2', 
               'HT0.5O_x', 'HT1.5U_x']

# Bet labels, positionally aligned with `predictions` (used only for display).
# The last label used to be the stale name 'df2_HT1.5U'.
results = ['FT1', 'FTX', 'FT2', 'FT1X', 'FT12', 'FTX2', 
           '1.5O', '2.5O', '3.5U', '4.5U', 'BTTS',
            'HT1', 'HTX', 'HT2', 'HT1X', 'HT12', 'HTX2', 
            'HT0.5O', 'HT1.5U']

# Profit columns, positionally aligned with `predictions`.
profits = ['FT1P', 'FTXP', 'FT2P', 'FT1XP', 'FT12P', 'FTX2P', 
           '1.5OP', '2.5OP', '3.5UP', '4.5UP', 'BTTSP',
            'HT1P', 'HTXP', 'HT2P', 'HT1XP', 'HT12P', 'HTX2P', 
            'HT0.5OP', 'HT1.5UP']

# For every match pick the market with the highest model percentage.
# argmax returns the first column on ties, the same choice the previous
# row-by-row `list.index(max(...))` made.
# NOTE(review): assumes the prediction columns contain no NaN (rows with
# missing numeric values are filtered out in the threshold cell above).
prediction_matrix = final_df_unique[predictions].to_numpy()
profit_matrix = final_df_unique[profits].to_numpy()
best_position = prediction_matrix.argmax(axis=1)
row_numbers = np.arange(len(final_df_unique))

bet = [results[position] for position in best_position]
percentage = prediction_matrix[row_numbers, best_position].tolist()
profit = profit_matrix[row_numbers, best_position].tolist()

# Create a DataFrame
model_recs = pd.DataFrame({
    'League': final_df_unique['League_x'],
    'Home': final_df_unique['Home'],
    'Away': final_df_unique['Away'],
    'BET': bet,
    'Percentage': percentage,
    'Profit': profit
})


print('Matches found: ', len(final_df_unique))
print(f"Correct Predictions: {len(model_recs[model_recs['Profit'] > 0])/len(model_recs)*100}")
print(f"Profit: {round(sum(model_recs['Profit']) - len(model_recs),2)} ROI: {round((sum(model_recs['Profit']) - len(model_recs)) / len(model_recs) * 100, 2)}%")
model_recs.tail()

Matches found:  3943
Correct Predictions: 83.84478823231042
Profit: -157.01 ROI: -3.98%


Unnamed: 0,League,Home,Away,BET,Percentage,Profit
3980,England5,Solihull Moors,Sutton Utd,4.5U,83.44,1.08
3981,England5,Southend Utd,Woking,4.5U,94.16,1.08
3982,England5,Tamworth,Wealdstone,4.5U,83.57,0.0
3983,England5,York City,Altrincham,4.5U,82.23,1.13
3984,Spain2,Racing Ferrol,Burgos,4.5U,97.72,1.03


In [11]:
# Derive the complementary model percentages (100 - the modelled side).
complement_pairs = [
    ('OTTS_x', 'BTTS_x'),
    ('1.5U_x', '1.5O_x'),
    ('2.5U_x', '2.5O_x'),
    ('3.5O_x', '3.5U_x'),
    ('4.5O_x', '4.5U_x'),
    ('HT0.5U_x', 'HT0.5O_x'),
    ('HT1.5O_x', 'HT1.5U_x'),
]
for derived_col, source_col in complement_pairs:
    final_df_unique[derived_col] = 100 - final_df_unique[source_col]

# Model percentage columns, one per market.
predictions = ['FT1_x', 'FTX_x', 'FT2_x', 'DC1X_x', 'DC12_x', 'DCX2_x', 
               '1.5O_x', '1.5U_x', '2.5O_x','2.5U_x','3.5O_x', '3.5U_x', 
               '4.5O_x', '4.5U_x', 'BTTS_x', 'OTTS_x',
               'HT1_x', 'HTX_x', 'HT2_x', 'HTDC1X', 'HTDC12', 'HTDCX2', 
               'HT0.5O_x', 'HT0.5U_x', 'HT1.5O_x', 'HT1.5U_x']

# Columns subtracted from the model's fair odds, aligned with `predictions`.
# NOTE(review): entries without a '_y' suffix are used as bookmaker odds here;
# confirm they are not the 0/1 outcome columns of the same name.
odds = ['FT1_y', 'FTX_y', 'FT2_y', 'DC1X_y', 'DC12_y', 'DCX2_y', 
           '1.5O_y', '1.5U', '2.5O_y','2.5U', '3.5O','3.5U_y', 
           '4.5O', '4.5U_y', 'BTTS_y', 'OTTS',
            'HT1_y', 'HTX_y', 'HT2_y', 'HT1X', 'HT12', 'HTX2', 
            'HT0.5O_y', 'HT0.5U', 'HT1.5O', 'HT1.5U_y']

# Profit columns, aligned with `predictions`.
profit = ['FT1P', 'FTXP', 'FT2P', 'FT1XP', 'FT12P', 'FTX2P', 
           '1.5OP', '1.5UP', '2.5OP', '2.5UP', '3.5OP', '3.5UP', '4.5OP', '4.5UP', 
           'BTTSP', 'OTTSP', 'HT1P', 'HTXP', 'HT2P', 'HT1XP', 'HT12P', 'HTX2P', 
            'HT0.5OP', 'HT0.5UP', 'HT1.5OP','HT1.5UP']

# Pull each column out once so the row loop works on plain arrays.
prediction_arrays = [final_df_unique[name].to_numpy() for name in predictions]
odds_arrays = [final_df_unique[name].to_numpy() for name in odds]
profit_arrays = [final_df_unique[name].to_numpy() for name in profit]

bets, percentages, profits, difference = [], [], [], []
for row in range(len(final_df_unique)):
    gaps = []
    has_candidate = False  # becomes True once a market with profit >= 0 is seen
    for pred_values, odds_values, profit_values in zip(prediction_arrays, odds_arrays, profit_arrays):
        # Gap between the model's fair odds (100 / percentage) and the odds column.
        gap = (100 / pred_values[row]) - odds_values[row]

        if profit_values[row] >= 0:
            gaps.append(gap)
            has_candidate = True
        else:
            # Markets with a negative profit entry can never win the max().
            gaps.append(float('-inf'))

    if not has_candidate:
        # No eligible market for this match; the row is dropped further down.
        bets.append(None)
        percentages.append(None)
        profits.append(None)
        difference.append(None)
        continue

    # Market with the largest gap; the first one wins on ties.
    largest_gap = max(gaps)
    chosen = gaps.index(largest_gap)
    bets.append(profit[chosen])  # the bet is labelled by its profit column name
    percentages.append(prediction_arrays[chosen][row])
    profits.append(profit_arrays[chosen][row])
    difference.append(round(largest_gap, 2))

# Create a DataFrame
model_recs = pd.DataFrame({
    'League': final_df_unique['League_x'],
    'Home': final_df_unique['Home'],
    'Away': final_df_unique['Away'],
    'BET': bets,
    'Percentage': percentages,
    'Profit': profits,
    'Difference': difference
}).dropna()  # Drop rows with None values

print('Matches found: ', len(final_df_unique))
print(f"Correct Predictions: {len(model_recs[model_recs['Profit'] > 0])/len(model_recs)*100}")
print(f"Profit: {round(sum(model_recs['Profit']) - len(model_recs), 2)} ROI: {round((sum(model_recs['Profit']) - len(model_recs)) / len(model_recs) * 100, 2)}%")
model_recs.tail()

Matches found:  3943
Correct Predictions: 67.66421506467157
Profit: -152.32 ROI: -3.86%


Unnamed: 0,League,Home,Away,BET,Percentage,Profit,Difference
3980,England5,Solihull Moors,Sutton Utd,FT1P,38.6,0.0,0.24
3981,England5,Southend Utd,Woking,FT1P,49.1,0.0,0.31
3982,England5,Tamworth,Wealdstone,HTXP,37.84,2.2,0.44
3983,England5,York City,Altrincham,HT1.5UP,55.75,1.4,0.39
3984,Spain2,Racing Ferrol,Burgos,4.5UP,97.72,1.03,-0.01


## Checking the Betting Strategy in Article 

Bet only when probability(model) / probability(bookmaker) > r, where the threshold r is searched between 1.0 and 1.5.

In [12]:
def find_best_threshold_for_column(df, r_col, p_col, start=1.0, stop=1.5, step=0.01):
    """
    Scan thresholds on a ratio column and return the one with the best ROI.

    For every threshold from `start` to `stop` (inclusive) in `step`
    increments, the rows with ``df[r_col] > threshold`` and ``df[p_col] >= 0``
    are selected and scored as:

      (sum(profit) - count) / count * 100

    i.e. the ROI in percent when one unit is staked on every selected row.

    Parameters
    ----------
    df : DataFrame containing `r_col` and `p_col`.
    r_col : name of the ratio column compared against the threshold.
    p_col : name of the profit column; rows with a negative value are ignored.
    start, stop, step : threshold grid. `step` must be positive.

    Returns
    -------
    (best_threshold, best_score, games)
        The first threshold reaching the highest score, that score, and the
        number of rows behind it. When no threshold selects any row the
        result is ``(None, -inf, 0)``.

    Raises
    ------
    ValueError
        If `step` is not positive.
    """
    if step <= 0:
        raise ValueError("step must be positive")

    best_score = -np.inf
    best_threshold = None
    games = 0  # stays 0 when no threshold selects any row

    # Build the grid from an integer step count instead of a float-step
    # np.arange: the latter can overshoot `stop` and yields values such as
    # 1.4400000000000004. The small epsilon absorbs division round-off.
    n_steps = int(np.floor((stop - start) / step + 1e-9))
    for k in range(n_steps + 1):
        thr = round(start + k * step, 10)
        # Rows above the current threshold that also have a non-negative profit entry.
        subset = df[(df[r_col] > thr) & (df[p_col] >= 0)]
        if len(subset) == 0:
            continue  # skip if no data for this threshold
        score = (subset[p_col].sum() - len(subset)) / len(subset) * 100
        # Strict '>' keeps the lowest threshold among equal scores.
        if score > best_score:
            best_score = score
            best_threshold = thr
            games = len(subset)
    return best_threshold, best_score, games

# Ratio r = model probability / probability implied by the odds column,
# stored next to each prediction column as '<prediction>_r'.
for pred, odd in zip(predictions, odds):
    model_probability = final_df_unique[pred] / 100
    implied_probability = 1 / final_df_unique[odd]
    final_df_unique[f'{pred}_r'] = model_probability / implied_probability

# Best threshold, its score and the number of games, keyed by profit column.
results = {}

for profit_col, pred in zip(profit, predictions):
    r_col = f'{pred}_r'

    # Skip (with a message) any market whose columns are missing.
    if r_col not in final_df_unique.columns:
        print(f"Column {r_col} not found, skipping.")
        continue
    if profit_col not in final_df_unique.columns:
        print(f"Column {profit_col} not found, skipping.")
        continue

    threshold, score, n_games = find_best_threshold_for_column(final_df_unique, r_col, profit_col)
    results[profit_col] = {
        'best_threshold': threshold,
        'best_score': score,
        'games': n_games,
    }

# One row per profit column, best score first.
results_df = pd.DataFrame(results).T.sort_values(by="best_score", ascending=False)
results_df

Unnamed: 0,best_threshold,best_score,games
HT12P,1.44,45.333333,6.0
4.5UP,1.31,36.0,1.0
HT0.5OP,1.36,33.142857,14.0
3.5UP,1.48,19.923077,13.0
1.5OP,1.27,14.714286,21.0
FT1P,1.47,8.794326,141.0
FTXP,1.49,6.091954,87.0
BTTSP,1.46,5.866667,15.0
HT1.5OP,1.5,5.085308,422.0
HTX2P,1.38,4.857143,49.0


# Combining Different Bets

In [19]:
# Profit columns grouped by market family.
ft_result_columns = ['FT1P', 'FTXP', 'FT2P', 'FT1XP', 'FT12P', 'FTX2P'] 
ft_goals_columns = ['1.5OP', '1.5UP', '2.5OP', '2.5UP', '3.5OP', '3.5UP', '4.5OP', '4.5UP', 'BTTSP', 'OTTSP']
ht_result_columns = ['HT1P', 'HTXP', 'HT2P', 'HT1XP', 'HT12P', 'HTX2P']
ht_goals_columns = ['HT0.5OP', 'HT0.5UP', 'HT1.5OP','HT1.5UP']

# Extra combinations: BTTSP with 2.5OP and BTTSP with 4.5UP.
extra_combinations = [("BTTSP_2.5OP", "2.5OP"), ("BTTSP_4.5UP", "4.5UP")]

# Every (new column, left factor, right factor) to build, in this order:
# FT result x FT goals, then HT result x HT goals, then the BTTSP extras.
combination_specs = (
    [(f"{result_col}_{goal_col}", result_col, goal_col)   # e.g. "FT1P_1.5OP"
     for result_col in ft_result_columns for goal_col in ft_goals_columns]
    + [(f"{result_col}_{goal_col}", result_col, goal_col)  # e.g. "HT1P_HT0.5OP"
       for result_col in ht_result_columns for goal_col in ht_goals_columns]
    + [(combo_name, "BTTSP", other_col) for combo_name, other_col in extra_combinations]
)

# One summary record per combined column.
summary_list = []

for new_col, left_col, right_col in combination_specs:
    left_values = final_df_unique[left_col]
    right_values = final_df_unique[right_col]

    # The product is stored only when both factors are >= 0; otherwise -1.
    both_valid = (left_values >= 0) & (right_values >= 0)
    final_df_unique[new_col] = np.where(both_valid, left_values * right_values, -1)

    # Summary over the valid (>= 0) products: (sum - count) / count.
    valid_values = final_df_unique.loc[final_df_unique[new_col] >= 0, new_col]
    count_valid = len(valid_values)
    if count_valid > 0:
        summary_value = (valid_values.sum() - count_valid) / count_valid
    else:
        summary_value = np.nan
    summary_list.append({'Combination': new_col, 'Summary': summary_value, 'Games': count_valid})

    # The valid-game count is also stored as a constant column next to the product.
    final_df_unique[f"{new_col}_games"] = count_valid

# Summary values and game counts, best combination first.
summary_df = pd.DataFrame(summary_list).sort_values(by="Summary", ascending=False)
summary_df

Unnamed: 0,Combination,Summary,Games
65,HTXP_HT0.5UP,0.810498,461
76,HT12P_HT0.5OP,0.333452,130
84,BTTSP_2.5OP,0.313846,1447
78,HT12P_HT1.5OP,0.297313,97
46,FT12P_4.5OP,0.290141,37
60,HT1P_HT0.5OP,0.263062,1150
18,FTXP_BTTSP,0.2396,30
13,FTXP_2.5UP,0.235098,246
11,FTXP_1.5UP,0.211633,120
26,FT2P_4.5OP,0.192494,79
