# Reading Model Predictions and Bet365 Odds

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import warnings

warnings.filterwarnings('ignore')
pd.options.mode.chained_assignment = None
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

csv_url = "https://docs.google.com/spreadsheets/d/1WfEG-1icUjj6k7TGePJQEXH-w0TLEIcN/export?format=csv"
uefa = pd.read_csv(csv_url, dtype={'FTHG': 'Int64', 'FTAG': 'Int64', 'HTHG': 'Int64', 'HTAG': 'Int64'})
uefa['FT'] = uefa['FTHG'].astype(str) + ' - ' + uefa['FTAG'].astype(str)
uefa['HT'] = '(' + uefa['HTHG'].astype(str) + '-' + uefa['HTAG'].astype(str) + ')'
uefa['FTTG'] = uefa['FTHG'] + uefa['FTAG']
uefa['HTTG'] = uefa['HTHG'] + uefa['HTAG']

predictions = pd.read_excel('C:/Users/99451/Desktop/MODEL/2025/dixon_coles_model_predictions/_predictions.xlsx')
bet365_odds = pd.read_excel('C:/Users/99451/Desktop/MODEL/2025/dixon_coles_model_predictions/final_odds.xlsx')
bet365_odds.tail()

Unnamed: 0,Home,Away,FT1,FTX,FT2,DC1X,DC12,DCX2,HT1,HTX,HT2,HT1X,HT12,HTX2,BTTS,OTTS,1.5O,1.5U,2.5O,2.5U,3.5O,3.5U,4.5O,4.5U,HT0.5O,HT0.5U,HT1.5O,HT1.5U
2424,Malaga,Racing Santander,2.75,3.2,2.55,1.5,1.36,1.4,3.5,2.0,3.4,1.3,1.73,1.29,1.83,1.83,1.36,3.0,2.1,1.7,3.75,1.25,8.0,1.08,1.5,2.5,3.4,1.3
2425,Tigre,Instituto,2.6,3.0,3.0,1.36,1.36,1.5,3.4,1.95,3.75,1.25,1.8,1.3,2.0,1.75,1.44,2.63,2.5,1.5,5.0,1.17,11.0,1.05,1.57,2.25,3.75,1.25
2426,Botafogo RJ,Vitoria,1.36,4.75,9.0,1.07,1.18,3.0,1.83,2.4,8.0,1.06,1.53,1.83,2.1,1.67,1.25,4.0,1.8,2.0,3.0,1.4,5.5,1.14,1.36,3.0,2.63,1.44
2427,Atletico GO,Palmeiras,11.0,5.0,1.3,3.4,1.17,1.05,9.0,2.4,1.8,1.91,1.53,1.04,2.2,1.62,1.25,4.0,1.8,2.0,3.0,1.4,5.5,1.14,1.36,3.0,2.63,1.44
2428,Juventude,Cuiaba,1.8,3.3,5.0,1.17,1.33,1.95,2.5,2.0,5.5,1.13,1.73,1.5,2.2,1.62,1.44,2.75,2.4,1.53,4.5,1.2,10.0,1.06,1.53,2.38,3.5,1.29


# Merging 2 DataFrames for similarity of values

In [2]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Function for fuzzy matching on both columns
def fuzzy_merge_on_two_columns(df1, df2, key1_home, key1_away, key2_home, key2_away, threshold=80):
    """
    Merge two DataFrames based on fuzzy matching of both Home and Away columns.
    - df1, df2: DataFrames to merge
    - key1_home, key1_away: column names for 'Home' and 'Away' in df1
    - key2_home, key2_away: column names for 'Home' and 'Away' in df2
    - threshold: minimum similarity score for a match
    """
    matches = []
    
    for i, row1 in df1.iterrows():
        home_team1, away_team1 = row1[key1_home], row1[key1_away]
        
        # Find the best match in df2 for both Home and Away teams
        best_match = None
        best_score = 0
        
        for j, row2 in df2.iterrows():
            home_team2, away_team2 = row2[key2_home], row2[key2_away]
            
            # Calculate similarity for both Home and Away columns
            home_score = fuzz.ratio(home_team1, home_team2)
            away_score = fuzz.ratio(away_team1, away_team2)
            
            # Average similarity score for the pair
            avg_score = (home_score + away_score) / 2
            
            # Check if this is the best match
            if avg_score >= threshold and avg_score > best_score:
                best_match = j
                best_score = avg_score
        
        # If a match was found above the threshold, save the indices
        if best_match is not None:
            matches.append((i, best_match))

    # Create matched DataFrames based on indices
    matched_df1 = df1.loc[[i for i, _ in matches]].reset_index(drop=True)
    matched_df2 = df2.loc[[j for _, j in matches]].reset_index(drop=True)
    
    # Concatenate the matched data side by side
    return pd.concat([matched_df1, matched_df2], axis=1, keys=["df1", "df2"])

# Use the function to merge
merged_df = fuzzy_merge_on_two_columns(predictions, bet365_odds, 'Home', 'Away', 'Home', 'Away', threshold=80)
merged_df.tail()

Unnamed: 0_level_0,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df1,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2,df2
Unnamed: 0_level_1,League,Home,Away,FT1,FTX,FT2,FTR,DC1X,DC12,DCX2,1.5O,2.5O,3.5U,4.5U,BTTS,HT1,HTX,HT2,HTR,HTDC1X,HTDC12,HTDCX2,HT0.5O,HT1.5U,H0.5O,A0.5O,H1.5O,A1.5O,H2.5U,A2.5U,Home,Away,FT1,FTX,FT2,DC1X,DC12,DCX2,HT1,HTX,HT2,HT1X,HT12,HTX2,BTTS,OTTS,1.5O,1.5U,2.5O,2.5U,3.5O,3.5U,4.5O,4.5U,HT0.5O,HT0.5U,HT1.5O,HT1.5U
2025,Switzerland,Lausanne Sport,Sion,45.91,23.26,30.82,1-0,69.17,76.73,54.08,71.9,48.15,73.6,87.74,50.84,34.1,38.78,26.99,0-0,72.88,61.09,65.77,73.51,62.49,76.81,67.96,42.93,31.49,81.84,89.26,Lausanne,Sion,1.91,3.5,4.0,1.22,1.29,1.83,2.5,2.25,4.33,1.2,1.57,1.5,1.67,2.1,1.22,4.0,1.8,2.0,2.75,1.4,5.5,1.14,1.36,3.0,2.63,1.44
2026,Turkey,Eyupspor,Rizespor,66.61,25.6,7.77,2-0,92.21,74.38,33.37,70.26,41.7,78.92,91.03,36.56,37.26,52.32,10.38,0-0,89.58,47.64,62.7,55.12,78.39,83.83,40.96,54.38,9.85,72.45,98.34,Eyupspor,Rizespor,1.95,3.4,3.8,1.25,1.3,1.8,2.63,2.2,4.0,1.22,1.62,1.44,1.73,2.0,1.25,3.75,1.93,1.93,3.25,1.33,6.5,1.11,1.4,2.75,2.75,1.4
2027,Turkey,Gaziantep,Basaksehir,35.55,28.11,36.33,1-1,63.66,71.88,64.44,86.16,63.32,58.83,76.92,67.18,33.87,41.78,24.11,0-0,75.65,57.98,65.89,75.16,52.21,80.25,80.62,48.23,48.82,77.74,77.26,Gaziantep,Basaksehir,2.6,3.4,2.6,1.5,1.3,1.5,3.25,2.2,3.25,1.33,1.62,1.33,1.67,2.1,1.25,3.75,1.9,1.95,3.25,1.33,6.0,1.13,1.4,2.75,2.75,1.4
2028,Turkey,Kayserispor,Fenerbahce,6.66,20.17,73.1,0-2,26.83,79.76,93.27,80.83,56.02,66.3,82.7,43.98,3.21,35.55,60.71,0-1,38.76,63.92,96.26,68.88,64.88,46.8,89.89,13.23,66.8,97.3,59.66,Kayserispor,Fenerbahce,7.0,4.33,1.44,2.63,1.2,1.1,6.5,2.5,1.95,1.8,1.5,1.11,1.83,1.83,1.18,4.5,1.65,2.2,2.63,1.44,4.5,1.18,1.3,3.4,2.5,1.5
2029,Turkey,Bodrumspor,Galatasaray,4.86,16.84,78.17,0-2,21.7,83.03,95.01,83.58,60.7,61.5,79.05,42.57,1.98,22.63,72.76,0-1,24.61,74.74,95.39,79.76,45.78,44.45,92.1,11.81,72.25,97.67,52.97,Bodrumspor,Galatasaray,7.5,4.5,1.42,2.75,1.18,1.1,6.5,2.4,1.91,1.8,1.53,1.1,1.83,1.83,1.2,4.33,1.67,2.15,2.63,1.44,5.0,1.17,1.33,3.25,2.5,1.5


# Scraping SoccerStats For Match Results

In [3]:
final =  pd.DataFrame()
liqa = ''
unique_leagues = predictions['League'].unique().tolist()

# Convert to lowercase and remove 'UNL'
unique_leagues = [league.lower() for league in unique_leagues if league.lower() != 'unl']

for i in unique_leagues:
    URL = "https://www.soccerstats.com/results.asp?league=" + i + "&pmtype=bydate"
    page = requests.get(URL)
    liqa = i
    soup = BeautifulSoup(page.content, "html.parser")
    results = soup.find(id="btable")
    sth = results.find_all("tr", class_="odd")
    sth


    date, league, home, away, ft, ht = [], [], [], [], [],[]
    for i in sth:
        date.append(i.find_all("td", align = 'right')[0].get_text(strip=True))
        league.append(liqa.capitalize())
        home.append(i.find_all("td", align = 'right')[1].get_text(strip=True))
        away.append(i.find("td", align = "left").get_text(strip = True))
        ft.append(i.find_all("td", align = 'center')[0].get_text(strip = True))
        try:
            ht.append(i.find_all("td", align = 'center')[2].get_text(strip = True))
        except IndexError as e:
            ht.append('NA')#print("Last output before error occurred:", i.find_all("td", align = 'center'))

    data = {'Date': date, 'League': league,'Home': home, 'Away': away, 'FT': ft, 'HT': ht}

# Create a DataFrame from the dictionary
    df = pd.DataFrame(data)

# Replace empty strings with NaN
    #next_df = df[(df['Date'] == formatted_date) & (df['HT'] == '')]
    df.replace('', pd.NA, inplace=True)

# Drop rows with NaN values
    df_cleaned = df.dropna()

#For Half-Time Results
    hthg, htag = [], []
    for i in df_cleaned['HT']:
        if i == 'NA':
            hthg.append('NA')
            htag.append('NA')
        elif i == '+' or i == '-':
            hthg.append('NA')
            htag.append('NA')
        else:
            try:
                hthg.append(int(i[1]))
                htag.append(int(i[3]))
            except IndexError as e:
                print("Last output before error occurred:", i)



#For Full-Time Results
    hg, ag, tg = [], [], []
    for i in df_cleaned['FT']:
        if len(i) < 5 or ':' in i:
            hg.append('NA')
            ag.append('NA')
            tg.append('NA')
        else:
            try:
                hghg = int(i.split(' - ')[0])
                hg.append(hghg)
                agag = int(i.split(' - ')[1])
                ag.append(agag)
                tg.append(hghg + agag)
            except:
                print(hghg + agag)

    
    df_cleaned['FTHG'], df_cleaned['FTAG'], df_cleaned['FTTG'] = hg, ag, tg
    df_cleaned['HTHG'], df_cleaned['HTAG'] = hthg, htag
    df_cleaned['HTTG'] = df_cleaned['HTHG'] + df_cleaned['HTAG']
    
    final = pd.concat([final, df_cleaned], ignore_index=True)
    
final = final[final['HT'] != 'NA']

# Example: Fix the 'date' column by removing the weekday and extra punctuation
final['Date'] = final['Date'].str.extract(r'(\d{1,2} \w{3})')  # Extract day and month part
final['Date'] = final['Date'] + ' 2024'  # Append the year

# Convert to datetime format
final['Date'] = pd.to_datetime(final['Date'], format='%d %b %Y', errors='coerce')

# Filter rows before September 6th, 2024
final_filtered = final[final['Date'] >= pd.Timestamp('2024-09-17')]

# Align columns of uefa to match final_filtered
uefa = uefa[final_filtered.columns]

# Concatenate
final_filtered = pd.concat([uefa, final_filtered], ignore_index=True)

combined = pd.concat([final_filtered.head(), final_filtered.tail()])

combined

Unnamed: 0,Date,League,Home,Away,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG
0,9/17/2024,UCL,Juventus,PSV,3 - 1,(2-0),3,1,4,2,0,2
1,9/17/2024,UCL,Young Boys,Aston Villa,0 - 3,(0-2),0,3,3,0,2,2
2,9/17/2024,UCL,Bayern,Dinamo Zagreb,9 - 2,(3-0),9,2,11,3,0,3
3,9/17/2024,UCL,Milan,Liverpool,1 - 3,(1-2),1,3,4,1,2,3
4,9/17/2024,UCL,Real Madrid,Stuttgart,3 - 1,(0-0),3,1,4,0,0,0
5298,2024-11-10 00:00:00,Portugal2,Benfica B,Alverca,2 - 1,(1-1),2,1,3,1,1,2
5299,2024-11-10 00:00:00,Portugal2,Torreense,Uniao de Leiria,2 - 1,(1-1),2,1,3,1,1,2
5300,2024-11-23 00:00:00,Portugal2,Portimonense,Benfica B,0 - 2,(0-1),0,2,2,0,1,1
5301,2024-11-23 00:00:00,Portugal2,Feirense,Maritimo,1 - 0,(0-0),1,0,1,0,0,0
5302,2024-11-24 00:00:00,Portugal2,Vizela,Oliveirense,0 - 0,(0-0),0,0,0,0,0,0


# Merging with Predictions + Odds Dataframes

In [4]:
# Flatten multi-level columns
merged_df.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in merged_df.columns]

# Rename two columns
merged_df = merged_df.rename(columns={'df1_Home': 'Home', 'df1_Away': 'Away'})

# Merge on multiple columns
final_df = pd.merge(merged_df, final_filtered, on=['Home', 'Away'], how='inner')

# Drop duplicates based on the 'Home' and 'Away' columns
final_df_unique = final_df.drop_duplicates(subset=['Home', 'Away'])

# Reset the index afterward
final_df_unique.reset_index(drop=True, inplace=True)

print('Number of games matched: ', len(final_df_unique))
final_df_unique.tail()

Number of games matched:  2003


Unnamed: 0,df1_League,Home,Away,df1_FT1,df1_FTX,df1_FT2,df1_FTR,df1_DC1X,df1_DC12,df1_DCX2,df1_1.5O,df1_2.5O,df1_3.5U,df1_4.5U,df1_BTTS,df1_HT1,df1_HTX,df1_HT2,df1_HTR,df1_HTDC1X,df1_HTDC12,df1_HTDCX2,df1_HT0.5O,df1_HT1.5U,df1_H0.5O,df1_A0.5O,df1_H1.5O,df1_A1.5O,df1_H2.5U,df1_A2.5U,df2_Home,df2_Away,df2_FT1,df2_FTX,df2_FT2,df2_DC1X,df2_DC12,df2_DCX2,df2_HT1,df2_HTX,df2_HT2,df2_HT1X,df2_HT12,df2_HTX2,df2_BTTS,df2_OTTS,df2_1.5O,df2_1.5U,df2_2.5O,df2_2.5U,df2_3.5O,df2_3.5U,df2_4.5O,df2_4.5U,df2_HT0.5O,df2_HT0.5U,df2_HT1.5O,df2_HT1.5U,Date,League,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG
1998,Switzerland,Lausanne Sport,Sion,45.91,23.26,30.82,1-0,69.17,76.73,54.08,71.9,48.15,73.6,87.74,50.84,34.1,38.78,26.99,0-0,72.88,61.09,65.77,73.51,62.49,76.81,67.96,42.93,31.49,81.84,89.26,Lausanne,Sion,1.91,3.5,4.0,1.22,1.29,1.83,2.5,2.25,4.33,1.2,1.57,1.5,1.67,2.1,1.22,4.0,1.8,2.0,2.75,1.4,5.5,1.14,1.36,3.0,2.63,1.44,2024-11-23 00:00:00,Switzerland,1 - 0,(0-0),1,0,1,0,0,0
1999,Turkey,Eyupspor,Rizespor,66.61,25.6,7.77,2-0,92.21,74.38,33.37,70.26,41.7,78.92,91.03,36.56,37.26,52.32,10.38,0-0,89.58,47.64,62.7,55.12,78.39,83.83,40.96,54.38,9.85,72.45,98.34,Eyupspor,Rizespor,1.95,3.4,3.8,1.25,1.3,1.8,2.63,2.2,4.0,1.22,1.62,1.44,1.73,2.0,1.25,3.75,1.93,1.93,3.25,1.33,6.5,1.11,1.4,2.75,2.75,1.4,2024-11-23 00:00:00,Turkey,1 - 2,(1-0),1,2,3,1,0,1
2000,Turkey,Gaziantep,Basaksehir,35.55,28.11,36.33,1-1,63.66,71.88,64.44,86.16,63.32,58.83,76.92,67.18,33.87,41.78,24.11,0-0,75.65,57.98,65.89,75.16,52.21,80.25,80.62,48.23,48.82,77.74,77.26,Gaziantep,Basaksehir,2.6,3.4,2.6,1.5,1.3,1.5,3.25,2.2,3.25,1.33,1.62,1.33,1.67,2.1,1.25,3.75,1.9,1.95,3.25,1.33,6.0,1.13,1.4,2.75,2.75,1.4,2024-11-23 00:00:00,Turkey,3 - 0,(2-0),3,0,3,2,0,2
2001,Turkey,Kayserispor,Fenerbahce,6.66,20.17,73.1,0-2,26.83,79.76,93.27,80.83,56.02,66.3,82.7,43.98,3.21,35.55,60.71,0-1,38.76,63.92,96.26,68.88,64.88,46.8,89.89,13.23,66.8,97.3,59.66,Kayserispor,Fenerbahce,7.0,4.33,1.44,2.63,1.2,1.1,6.5,2.5,1.95,1.8,1.5,1.11,1.83,1.83,1.18,4.5,1.65,2.2,2.63,1.44,4.5,1.18,1.3,3.4,2.5,1.5,2024-11-23 00:00:00,Turkey,2 - 6,(1-3),2,6,8,1,3,4
2002,Turkey,Bodrumspor,Galatasaray,4.86,16.84,78.17,0-2,21.7,83.03,95.01,83.58,60.7,61.5,79.05,42.57,1.98,22.63,72.76,0-1,24.61,74.74,95.39,79.76,45.78,44.45,92.1,11.81,72.25,97.67,52.97,Bodrumspor,Galatasaray,7.5,4.5,1.42,2.75,1.18,1.1,6.5,2.4,1.91,1.8,1.53,1.1,1.83,1.83,1.2,4.33,1.67,2.15,2.63,1.44,5.0,1.17,1.33,3.25,2.5,1.5,2024-11-23 00:00:00,Turkey,0 - 1,(0-0),0,1,1,0,0,0


# Creating Results Columns

In [5]:
import numpy as np

# Add the new columns based on the condition
final_df_unique['FT1'] = np.where(final_df_unique['FTHG'] > final_df_unique['FTAG'], 1, 0)
final_df_unique['FTX'] = np.where(final_df_unique['FTHG'] == final_df_unique['FTAG'], 1, 0)
final_df_unique['FT2'] = np.where(final_df_unique['FTHG'] < final_df_unique['FTAG'], 1, 0)

final_df_unique['FT1X'] = np.where(final_df_unique['FTHG'] >= final_df_unique['FTAG'], 1, 0)
final_df_unique['FT12'] = np.where(final_df_unique['FTX'] == 0, 1, 0)
final_df_unique['FTX2'] = np.where(final_df_unique['FTHG'] <= final_df_unique['FTAG'], 1, 0)

final_df_unique['1.5O'] = np.where(final_df_unique['FTTG'] > 1.5, 1, 0)
final_df_unique['1.5U'] = np.where(final_df_unique['FTTG'] < 1.5, 1, 0)
final_df_unique['2.5O'] = np.where(final_df_unique['FTTG'] > 2.5, 1, 0)
final_df_unique['2.5U'] = np.where(final_df_unique['FTTG'] < 2.5, 1, 0)
final_df_unique['3.5O'] = np.where(final_df_unique['FTTG'] > 3.5, 1, 0)
final_df_unique['3.5U'] = np.where(final_df_unique['FTTG'] < 3.5, 1, 0)
final_df_unique['4.5O'] = np.where(final_df_unique['FTTG'] > 4.5, 1, 0)
final_df_unique['4.5U'] = np.where(final_df_unique['FTTG'] < 4.5, 1, 0)

final_df_unique['BTTS'] = np.where((final_df_unique['FTHG'] != 0) & (final_df_unique['FTAG'] != 0), 1, 0)
final_df_unique['OTTS'] = np.where(final_df_unique['BTTS'] == 0, 1, 0)

final_df_unique['HT1'] = np.where(final_df_unique['HTHG'] > final_df_unique['HTAG'], 1, 0)
final_df_unique['HTX'] = np.where(final_df_unique['HTHG'] == final_df_unique['HTAG'], 1, 0)
final_df_unique['HT2'] = np.where(final_df_unique['HTHG'] < final_df_unique['HTAG'], 1, 0)

final_df_unique['HT1X'] = np.where(final_df_unique['HTHG'] >= final_df_unique['HTAG'], 1, 0)
final_df_unique['HT12'] = np.where(final_df_unique['HTX'] == 0, 1, 0)
final_df_unique['HTX2'] = np.where(final_df_unique['HTHG'] <= final_df_unique['HTAG'], 1, 0)

final_df_unique['HT0.5O'] = np.where(final_df_unique['HTTG'] > 0.5, 1, 0)
final_df_unique['HT0.5U'] = np.where(final_df_unique['HTTG'] < 0.5, 1, 0)
final_df_unique['HT1.5O'] = np.where(final_df_unique['HTTG'] > 1.5, 1, 0)
final_df_unique['HT1.5U'] = np.where(final_df_unique['HTTG'] < 1.5, 1, 0)

print('Games Found: ', len(final_df_unique))
final_df_unique.tail()

Games Found:  2003


Unnamed: 0,df1_League,Home,Away,df1_FT1,df1_FTX,df1_FT2,df1_FTR,df1_DC1X,df1_DC12,df1_DCX2,df1_1.5O,df1_2.5O,df1_3.5U,df1_4.5U,df1_BTTS,df1_HT1,df1_HTX,df1_HT2,df1_HTR,df1_HTDC1X,df1_HTDC12,df1_HTDCX2,df1_HT0.5O,df1_HT1.5U,df1_H0.5O,df1_A0.5O,df1_H1.5O,df1_A1.5O,df1_H2.5U,df1_A2.5U,df2_Home,df2_Away,df2_FT1,df2_FTX,df2_FT2,df2_DC1X,df2_DC12,df2_DCX2,df2_HT1,df2_HTX,df2_HT2,df2_HT1X,df2_HT12,df2_HTX2,df2_BTTS,df2_OTTS,df2_1.5O,df2_1.5U,df2_2.5O,df2_2.5U,df2_3.5O,df2_3.5U,df2_4.5O,df2_4.5U,df2_HT0.5O,df2_HT0.5U,df2_HT1.5O,df2_HT1.5U,Date,League,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG,FT1,FTX,FT2,FT1X,FT12,FTX2,1.5O,1.5U,2.5O,2.5U,3.5O,3.5U,4.5O,4.5U,BTTS,OTTS,HT1,HTX,HT2,HT1X,HT12,HTX2,HT0.5O,HT0.5U,HT1.5O,HT1.5U
1998,Switzerland,Lausanne Sport,Sion,45.91,23.26,30.82,1-0,69.17,76.73,54.08,71.9,48.15,73.6,87.74,50.84,34.1,38.78,26.99,0-0,72.88,61.09,65.77,73.51,62.49,76.81,67.96,42.93,31.49,81.84,89.26,Lausanne,Sion,1.91,3.5,4.0,1.22,1.29,1.83,2.5,2.25,4.33,1.2,1.57,1.5,1.67,2.1,1.22,4.0,1.8,2.0,2.75,1.4,5.5,1.14,1.36,3.0,2.63,1.44,2024-11-23 00:00:00,Switzerland,1 - 0,(0-0),1,0,1,0,0,0,1,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
1999,Turkey,Eyupspor,Rizespor,66.61,25.6,7.77,2-0,92.21,74.38,33.37,70.26,41.7,78.92,91.03,36.56,37.26,52.32,10.38,0-0,89.58,47.64,62.7,55.12,78.39,83.83,40.96,54.38,9.85,72.45,98.34,Eyupspor,Rizespor,1.95,3.4,3.8,1.25,1.3,1.8,2.63,2.2,4.0,1.22,1.62,1.44,1.73,2.0,1.25,3.75,1.93,1.93,3.25,1.33,6.5,1.11,1.4,2.75,2.75,1.4,2024-11-23 00:00:00,Turkey,1 - 2,(1-0),1,2,3,1,0,1,0,0,1,0,1,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,1,0,0,1
2000,Turkey,Gaziantep,Basaksehir,35.55,28.11,36.33,1-1,63.66,71.88,64.44,86.16,63.32,58.83,76.92,67.18,33.87,41.78,24.11,0-0,75.65,57.98,65.89,75.16,52.21,80.25,80.62,48.23,48.82,77.74,77.26,Gaziantep,Basaksehir,2.6,3.4,2.6,1.5,1.3,1.5,3.25,2.2,3.25,1.33,1.62,1.33,1.67,2.1,1.25,3.75,1.9,1.95,3.25,1.33,6.0,1.13,1.4,2.75,2.75,1.4,2024-11-23 00:00:00,Turkey,3 - 0,(2-0),3,0,3,2,0,2,1,0,0,1,1,0,1,0,1,0,0,1,0,1,0,1,1,0,0,1,1,0,1,0,1,0
2001,Turkey,Kayserispor,Fenerbahce,6.66,20.17,73.1,0-2,26.83,79.76,93.27,80.83,56.02,66.3,82.7,43.98,3.21,35.55,60.71,0-1,38.76,63.92,96.26,68.88,64.88,46.8,89.89,13.23,66.8,97.3,59.66,Kayserispor,Fenerbahce,7.0,4.33,1.44,2.63,1.2,1.1,6.5,2.5,1.95,1.8,1.5,1.11,1.83,1.83,1.18,4.5,1.65,2.2,2.63,1.44,4.5,1.18,1.3,3.4,2.5,1.5,2024-11-23 00:00:00,Turkey,2 - 6,(1-3),2,6,8,1,3,4,0,0,1,0,1,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,1,1,0,1,0
2002,Turkey,Bodrumspor,Galatasaray,4.86,16.84,78.17,0-2,21.7,83.03,95.01,83.58,60.7,61.5,79.05,42.57,1.98,22.63,72.76,0-1,24.61,74.74,95.39,79.76,45.78,44.45,92.1,11.81,72.25,97.67,52.97,Bodrumspor,Galatasaray,7.5,4.5,1.42,2.75,1.18,1.1,6.5,2.4,1.91,1.8,1.53,1.1,1.83,1.83,1.2,4.33,1.67,2.15,2.63,1.44,5.0,1.17,1.33,3.25,2.5,1.5,2024-11-23 00:00:00,Turkey,0 - 1,(0-0),0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1


# Creating Profit Columns for Initial Model Predictions

In [6]:
ft1p, ftxp, ft2p, ft1xp, ft12p, ftx2p = [], [], [], [], [], []
over15, under15, over25, under25, over35, under35, over45, under45 = [], [], [], [], [], [], [], []
btts, otts, ht1p, htxp, ht2p, ht1xp, ht12p, htx2p = [], [], [], [], [], [], [], []
htover05, htunder05, htover15, htunder15 = [], [], [], []

#if prediction == result -> coefficient, elif prediction != result -> 0, else -> -1
for i in range(len(final_df_unique)):
    ft_list = [final_df_unique['df1_FT1'].iloc[i], final_df_unique['df1_FTX'].iloc[i], final_df_unique['df1_FT2'].iloc[i]]
    if (ft_list.index(max(ft_list)) == 0) and (final_df_unique['FT1'].iloc[i] == 1):
        ft1p.append(final_df_unique['df2_FT1'].iloc[i])
    elif (ft_list.index(max(ft_list)) == 0) and (final_df_unique['FT1'].iloc[i] != 1):
        ft1p.append(0)
    else:
        ft1p.append(-1)
    
    if (ft_list.index(max(ft_list)) == 1) and (final_df_unique['FTX'].iloc[i] == 1):
        ftxp.append(final_df_unique['df2_FTX'].iloc[i])
    elif (ft_list.index(max(ft_list)) == 1) and (final_df_unique['FTX'].iloc[i] != 1):
        ftxp.append(0)
    else:
        ftxp.append(-1)
    
    if (ft_list.index(max(ft_list)) == 2) and (final_df_unique['FT2'].iloc[i] == 1):
        ft2p.append(final_df_unique['df2_FT2'].iloc[i])
    elif (ft_list.index(max(ft_list)) == 2) and (final_df_unique['FT2'].iloc[i] != 1):
        ft2p.append(0)
    else:
        ft2p.append(-1)

    dc_list = [final_df_unique['df1_DC1X'].iloc[i], final_df_unique['df1_DC12'].iloc[i], final_df_unique['df1_DCX2'].iloc[i]]
    if (dc_list.index(max(dc_list)) == 0) and (final_df_unique['FT1X'].iloc[i] == 1):
        ft1xp.append(final_df_unique['df2_DC1X'].iloc[i])
    elif (dc_list.index(max(dc_list)) == 0) and (final_df_unique['FT1X'].iloc[i] != 1):
        ft1xp.append(0)
    else:
        ft1xp.append(-1)
    
    if (dc_list.index(max(dc_list)) == 1) and (final_df_unique['FT12'].iloc[i] == 1):
        ft12p.append(final_df_unique['df2_DC12'].iloc[i])
    elif (dc_list.index(max(dc_list)) == 1) and (final_df_unique['FT12'].iloc[i] != 1):
        ft12p.append(0)
    else:
        ft12p.append(-1)
    
    if (dc_list.index(max(dc_list)) == 2) and (final_df_unique['FTX2'].iloc[i] == 1):
        ftx2p.append(final_df_unique['df2_DCX2'].iloc[i])
    elif (dc_list.index(max(dc_list)) == 2) and (final_df_unique['FTX2'].iloc[i] != 1):
        ftx2p.append(0)
    else:
        ftx2p.append(-1)
    
    if (final_df_unique['df1_1.5O'].iloc[i] >= 50) and (final_df_unique['1.5O'].iloc[i] == 1):
        over15.append(final_df_unique['df2_1.5O'].iloc[i])
    elif (final_df_unique['df1_1.5O'].iloc[i] >= 50) and (final_df_unique['1.5O'].iloc[i] != 1):
        over15.append(0)
    else:
        over15.append(-1)
    
    if (final_df_unique['df1_1.5O'].iloc[i] < 50) and (final_df_unique['1.5O'].iloc[i] != 1):
        under15.append(final_df_unique['df2_1.5U'].iloc[i])
    elif (final_df_unique['df1_1.5O'].iloc[i] < 50) and (final_df_unique['1.5O'].iloc[i] == 1):
        under15.append(0)
    else:
        under15.append(-1)
    
    if (final_df_unique['df1_2.5O'].iloc[i] >= 50) and (final_df_unique['2.5O'].iloc[i] == 1):
        over25.append(final_df_unique['df2_2.5O'].iloc[i])
    elif (final_df_unique['df1_2.5O'].iloc[i] >= 50) and (final_df_unique['2.5O'].iloc[i] != 1):
        over25.append(0)
    else:
        over25.append(-1)
    
    if (final_df_unique['df1_2.5O'].iloc[i] < 50) and (final_df_unique['2.5O'].iloc[i] != 1):
        under25.append(final_df_unique['df2_2.5U'].iloc[i])
    elif (final_df_unique['df1_2.5O'].iloc[i] < 50) and (final_df_unique['2.5O'].iloc[i] == 1):
        under25.append(0)
    else:
        under25.append(-1)
    
    if (final_df_unique['df1_3.5U'].iloc[i] <= 50) and (final_df_unique['3.5O'].iloc[i] == 1):
        over35.append(final_df_unique['df2_3.5O'].iloc[i])
    elif (final_df_unique['df1_3.5U'].iloc[i] <= 50) and (final_df_unique['3.5O'].iloc[i] != 1):
        over35.append(0)
    else:
        over35.append(-1)
    
    if (final_df_unique['df1_3.5U'].iloc[i] > 50) and (final_df_unique['3.5O'].iloc[i] != 1):
        under35.append(final_df_unique['df2_3.5U'].iloc[i])
    elif (final_df_unique['df1_3.5U'].iloc[i] > 50) and (final_df_unique['3.5O'].iloc[i] == 1):
        under35.append(0)
    else:
        under35.append(-1)
    
    if (final_df_unique['df1_4.5U'].iloc[i] <= 50) and (final_df_unique['4.5O'].iloc[i] == 1):
        over45.append(final_df_unique['df2_4.5O'].iloc[i])
    elif (final_df_unique['df1_4.5U'].iloc[i] <= 50) and (final_df_unique['4.5O'].iloc[i] != 1):
        over45.append(0)
    else:
        over45.append(-1)
    
    if (final_df_unique['df1_4.5U'].iloc[i] > 50) and (final_df_unique['4.5O'].iloc[i] != 1):
        under45.append(final_df_unique['df2_4.5U'].iloc[i])
    elif (final_df_unique['df1_4.5U'].iloc[i] > 50) and (final_df_unique['4.5O'].iloc[i] == 1):
        under45.append(0)
    else:
        under45.append(-1)
    
    if (final_df_unique['df1_BTTS'].iloc[i] >= 50) and (final_df_unique['BTTS'].iloc[i] == 1):
        btts.append(final_df_unique['df2_BTTS'].iloc[i])
    elif (final_df_unique['df1_BTTS'].iloc[i] >= 50) and (final_df_unique['BTTS'].iloc[i] != 1):
        btts.append(0)
    else:
        btts.append(-1)
    
    if (final_df_unique['df1_BTTS'].iloc[i] < 50) and (final_df_unique['BTTS'].iloc[i] != 1):
        otts.append(final_df_unique['df2_OTTS'].iloc[i])
    elif (final_df_unique['df1_BTTS'].iloc[i] < 50) and (final_df_unique['BTTS'].iloc[i] == 1):
        otts.append(0)
    else:
        otts.append(-1)
    
    ht_list = [final_df_unique['df1_HT1'].iloc[i], final_df_unique['df1_HTX'].iloc[i], final_df_unique['df1_HT2'].iloc[i]]
    if (ht_list.index(max(ht_list)) == 0) and (final_df_unique['HT1'].iloc[i] == 1):
        ht1p.append(final_df_unique['df2_HT1'].iloc[i])
    elif (ht_list.index(max(ht_list)) == 0) and (final_df_unique['HT1'].iloc[i] != 1):
        ht1p.append(0)
    else:
        ht1p.append(-1)
    
    if (ht_list.index(max(ht_list)) == 1) and (final_df_unique['HTX'].iloc[i] == 1):
        htxp.append(final_df_unique['df2_HTX'].iloc[i])
    elif (ht_list.index(max(ht_list)) == 1) and (final_df_unique['HTX'].iloc[i] != 1):
        htxp.append(0)
    else:
        htxp.append(-1)
    
    if (ht_list.index(max(ht_list)) == 2) and (final_df_unique['HT2'].iloc[i] == 1):
        ht2p.append(final_df_unique['df2_HT2'].iloc[i])
    elif (ht_list.index(max(ht_list)) == 2) and (final_df_unique['HT2'].iloc[i] != 1):
        ht2p.append(0)
    else:
        ht2p.append(-1)
    
    htdc_list = [final_df_unique['df1_HTDC1X'].iloc[i], final_df_unique['df1_HTDC12'].iloc[i], final_df_unique['df1_HTDCX2'].iloc[i]]
    if (htdc_list.index(max(htdc_list)) == 0) and (final_df_unique['HT1X'].iloc[i] == 1):
        ht1xp.append(final_df_unique['df2_HT1X'].iloc[i])
    elif (htdc_list.index(max(htdc_list)) == 0) and (final_df_unique['HT1X'].iloc[i] != 1):
        ht1xp.append(0)
    else:
        ht1xp.append(-1)
    
    if (htdc_list.index(max(htdc_list)) == 1) and (final_df_unique['HT12'].iloc[i] == 1):
        ht12p.append(final_df_unique['df2_HT12'].iloc[i])
    elif (htdc_list.index(max(htdc_list)) == 1) and (final_df_unique['HT12'].iloc[i] != 1):
        ht12p.append(0)
    else:
        ht12p.append(-1)
    
    if (htdc_list.index(max(htdc_list)) == 2) and (final_df_unique['HTX2'].iloc[i] == 1):
        htx2p.append(final_df_unique['df2_HTX2'].iloc[i])
    elif (htdc_list.index(max(htdc_list)) == 2) and (final_df_unique['HTX2'].iloc[i] != 1):
        htx2p.append(0)
    else:
        htx2p.append(-1)
    
    if (final_df_unique['df1_HT0.5O'].iloc[i] >= 50) and (final_df_unique['HT0.5O'].iloc[i] == 1):
        htover05.append(final_df_unique['df2_HT0.5O'].iloc[i])
    elif (final_df_unique['df1_HT0.5O'].iloc[i] >= 50) and (final_df_unique['HT0.5O'].iloc[i] != 1):
        htover05.append(0)
    else:
        htover05.append(-1)
    
    if (final_df_unique['df1_HT0.5O'].iloc[i] < 50) and (final_df_unique['HT0.5O'].iloc[i] != 1):
        htunder05.append(final_df_unique['df2_HT0.5U'].iloc[i])
    elif (final_df_unique['df1_HT0.5O'].iloc[i] < 50) and (final_df_unique['HT0.5O'].iloc[i] == 1):
        htunder05.append(0)
    else:
        htunder05.append(-1)
    
    if (final_df_unique['df1_HT1.5U'].iloc[i] < 50) and (final_df_unique['HT1.5O'].iloc[i] == 1):
        htover15.append(final_df_unique['df2_HT1.5O'].iloc[i])
    elif (final_df_unique['df1_HT1.5U'].iloc[i] < 50) and (final_df_unique['HT1.5O'].iloc[i] != 1):
        htover15.append(0)
    else:
        htover15.append(-1)
    
    if (final_df_unique['df1_HT1.5U'].iloc[i] >= 50) and (final_df_unique['HT1.5O'].iloc[i] != 1):
        htunder15.append(final_df_unique['df2_HT1.5U'].iloc[i])
    elif (final_df_unique['df1_HT1.5U'].iloc[i] >= 50) and (final_df_unique['HT1.5O'].iloc[i] == 1):
        htunder15.append(0)
    else:
        htunder15.append(-1)

final_df_unique['FT1P'], final_df_unique['FTXP'], final_df_unique['FT2P'] = ft1p, ftxp, ft2p
final_df_unique['FT1XP'], final_df_unique['FT12P'], final_df_unique['FTX2P'] = ft1xp, ft12p, ftx2p
final_df_unique['1.5OP'], final_df_unique['1.5UP'], final_df_unique['2.5OP'], final_df_unique['2.5UP'] = over15, under15, over25, under25
final_df_unique['3.5OP'], final_df_unique['3.5UP'], final_df_unique['4.5OP'], final_df_unique['4.5UP'] = over35, under35, over45, under45
final_df_unique['BTTSP'], final_df_unique['OTTSP'] = btts, otts
final_df_unique['HT1P'], final_df_unique['HTXP'], final_df_unique['HT2P'] = ht1p, htxp, ht2p
final_df_unique['HT1XP'], final_df_unique['HT12P'], final_df_unique['HTX2P'] = ht1xp, ht12p, htx2p
final_df_unique['HT0.5OP'], final_df_unique['HT0.5UP'] = htover05, htunder05
final_df_unique['HT1.5OP'], final_df_unique['HT1.5UP'] = htover15, htunder15

print('Games Found: ', len(final_df_unique))
final_df_unique.tail()

Games Found:  2003


Unnamed: 0,df1_League,Home,Away,df1_FT1,df1_FTX,df1_FT2,df1_FTR,df1_DC1X,df1_DC12,df1_DCX2,df1_1.5O,df1_2.5O,df1_3.5U,df1_4.5U,df1_BTTS,df1_HT1,df1_HTX,df1_HT2,df1_HTR,df1_HTDC1X,df1_HTDC12,df1_HTDCX2,df1_HT0.5O,df1_HT1.5U,df1_H0.5O,df1_A0.5O,df1_H1.5O,df1_A1.5O,df1_H2.5U,df1_A2.5U,df2_Home,df2_Away,df2_FT1,df2_FTX,df2_FT2,df2_DC1X,df2_DC12,df2_DCX2,df2_HT1,df2_HTX,df2_HT2,df2_HT1X,df2_HT12,df2_HTX2,df2_BTTS,df2_OTTS,df2_1.5O,df2_1.5U,df2_2.5O,df2_2.5U,df2_3.5O,df2_3.5U,df2_4.5O,df2_4.5U,df2_HT0.5O,df2_HT0.5U,df2_HT1.5O,df2_HT1.5U,Date,League,FT,HT,FTHG,FTAG,FTTG,HTHG,HTAG,HTTG,FT1,FTX,FT2,FT1X,FT12,FTX2,1.5O,1.5U,2.5O,2.5U,3.5O,3.5U,4.5O,4.5U,BTTS,OTTS,HT1,HTX,HT2,HT1X,HT12,HTX2,HT0.5O,HT0.5U,HT1.5O,HT1.5U,FT1P,FTXP,FT2P,FT1XP,FT12P,FTX2P,1.5OP,1.5UP,2.5OP,2.5UP,3.5OP,3.5UP,4.5OP,4.5UP,BTTSP,OTTSP,HT1P,HTXP,HT2P,HT1XP,HT12P,HTX2P,HT0.5OP,HT0.5UP,HT1.5OP,HT1.5UP
1998,Switzerland,Lausanne Sport,Sion,45.91,23.26,30.82,1-0,69.17,76.73,54.08,71.9,48.15,73.6,87.74,50.84,34.1,38.78,26.99,0-0,72.88,61.09,65.77,73.51,62.49,76.81,67.96,42.93,31.49,81.84,89.26,Lausanne,Sion,1.91,3.5,4.0,1.22,1.29,1.83,2.5,2.25,4.33,1.2,1.57,1.5,1.67,2.1,1.22,4.0,1.8,2.0,2.75,1.4,5.5,1.14,1.36,3.0,2.63,1.44,2024-11-23 00:00:00,Switzerland,1 - 0,(0-0),1,0,1,0,0,0,1,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,1.91,-1.0,-1.0,-1.0,1.29,-1.0,0.0,-1.0,-1.0,2.0,-1.0,1.4,-1.0,1.14,0.0,-1.0,-1.0,2.25,-1.0,1.2,-1.0,-1.0,0.0,-1.0,-1.0,1.44
1999,Turkey,Eyupspor,Rizespor,66.61,25.6,7.77,2-0,92.21,74.38,33.37,70.26,41.7,78.92,91.03,36.56,37.26,52.32,10.38,0-0,89.58,47.64,62.7,55.12,78.39,83.83,40.96,54.38,9.85,72.45,98.34,Eyupspor,Rizespor,1.95,3.4,3.8,1.25,1.3,1.8,2.63,2.2,4.0,1.22,1.62,1.44,1.73,2.0,1.25,3.75,1.93,1.93,3.25,1.33,6.5,1.11,1.4,2.75,2.75,1.4,2024-11-23 00:00:00,Turkey,1 - 2,(1-0),1,2,3,1,0,1,0,0,1,0,1,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,1,0,0,1,0.0,-1.0,-1.0,0.0,-1.0,-1.0,1.25,-1.0,-1.0,0.0,-1.0,1.33,-1.0,1.11,-1.0,0.0,-1.0,0.0,-1.0,1.22,-1.0,-1.0,1.4,-1.0,-1.0,1.4
2000,Turkey,Gaziantep,Basaksehir,35.55,28.11,36.33,1-1,63.66,71.88,64.44,86.16,63.32,58.83,76.92,67.18,33.87,41.78,24.11,0-0,75.65,57.98,65.89,75.16,52.21,80.25,80.62,48.23,48.82,77.74,77.26,Gaziantep,Basaksehir,2.6,3.4,2.6,1.5,1.3,1.5,3.25,2.2,3.25,1.33,1.62,1.33,1.67,2.1,1.25,3.75,1.9,1.95,3.25,1.33,6.0,1.13,1.4,2.75,2.75,1.4,2024-11-23 00:00:00,Turkey,3 - 0,(2-0),3,0,3,2,0,2,1,0,0,1,1,0,1,0,1,0,0,1,0,1,0,1,1,0,0,1,1,0,1,0,1,0,-1.0,-1.0,0.0,-1.0,1.3,-1.0,1.25,-1.0,1.9,-1.0,-1.0,1.33,-1.0,1.13,0.0,-1.0,-1.0,0.0,-1.0,1.33,-1.0,-1.0,1.4,-1.0,-1.0,0.0
2001,Turkey,Kayserispor,Fenerbahce,6.66,20.17,73.1,0-2,26.83,79.76,93.27,80.83,56.02,66.3,82.7,43.98,3.21,35.55,60.71,0-1,38.76,63.92,96.26,68.88,64.88,46.8,89.89,13.23,66.8,97.3,59.66,Kayserispor,Fenerbahce,7.0,4.33,1.44,2.63,1.2,1.1,6.5,2.5,1.95,1.8,1.5,1.11,1.83,1.83,1.18,4.5,1.65,2.2,2.63,1.44,4.5,1.18,1.3,3.4,2.5,1.5,2024-11-23 00:00:00,Turkey,2 - 6,(1-3),2,6,8,1,3,4,0,0,1,0,1,1,1,0,1,0,1,0,1,0,1,0,0,0,1,0,1,1,1,0,1,0,-1.0,-1.0,1.44,-1.0,-1.0,1.1,1.18,-1.0,1.65,-1.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,-1.0,1.95,-1.0,-1.0,1.11,1.3,-1.0,-1.0,0.0
2002,Turkey,Bodrumspor,Galatasaray,4.86,16.84,78.17,0-2,21.7,83.03,95.01,83.58,60.7,61.5,79.05,42.57,1.98,22.63,72.76,0-1,24.61,74.74,95.39,79.76,45.78,44.45,92.1,11.81,72.25,97.67,52.97,Bodrumspor,Galatasaray,7.5,4.5,1.42,2.75,1.18,1.1,6.5,2.4,1.91,1.8,1.53,1.1,1.83,1.83,1.2,4.33,1.67,2.15,2.63,1.44,5.0,1.17,1.33,3.25,2.5,1.5,2024-11-23 00:00:00,Turkey,0 - 1,(0-0),0,1,1,0,0,0,0,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,-1.0,-1.0,1.42,-1.0,-1.0,1.1,0.0,-1.0,0.0,-1.0,-1.0,1.44,-1.0,1.17,-1.0,1.83,-1.0,-1.0,0.0,-1.0,-1.0,1.1,0.0,-1.0,0.0,-1.0


# Checking For ROI of Profit Columns

In [7]:
# Select columns that end with 'P'
columns = [col for col in final_df_unique.columns if col.endswith('P')]

# Initialize lists to store results and games
results, games_list = [],  []

for col in columns:
    my_df = final_df_unique[final_df_unique[col] >= 0]
    numerator = np.sum(my_df[col]) - len(my_df)
    result = round(numerator / len(my_df) * 100, 2)
    
    # Append results and games to respective lists
    results.append(result)
    games_list.append(len(my_df))

# Convert results to a DataFrame with an additional column for Games
results_df = pd.DataFrame({
    'Column': columns,
    'ROI': results,
    'Games': games_list
})
results_df

Unnamed: 0,Column,ROI,Games
0,FT1P,-7.51,1127
1,FTXP,0.27,203
2,FT2P,-16.75,673
3,FT1XP,-3.42,907
4,FT12P,-5.53,587
5,FTX2P,-9.84,509
6,1.5OP,-6.69,1792
7,1.5UP,-5.96,211
8,2.5OP,-5.33,957
9,2.5UP,-4.43,1046


# ROI of Profit Columns According To Leagues

In [8]:
# Step 1: Filter leagues with at least 10 games
league_counts = final_df_unique['League'].value_counts()
leagues_with_10_games = league_counts[league_counts >= 10].index
filtered_df = final_df_unique[final_df_unique['League'].isin(leagues_with_10_games)]

# Group by 'League' and calculate results for each group
grouped_results = {}
for league, group in filtered_df.groupby('League'):
    group_results = {}
    for col in columns:
        my_df = group[group[col] >= 0]
        numerator = np.sum(my_df[col]) - len(my_df)
        group_results[col] = round(numerator / len(my_df) * 100, 2)
    # Add the number of games for this league
    group_results['Games'] = round(len(group),2)
    grouped_results[league] = group_results

# Convert grouped results to a DataFrame for better visualization
grouped_results_df = pd.DataFrame(grouped_results).T

# Define a function to apply conditional formatting
def highlight_positive(val):
    # Highlight background to red if the value is positive
    color = 'background-color: red' if isinstance(val, (int, float)) and val > 0 else ''
    return color

# Apply the function to the DataFrame
styled_df = (
    grouped_results_df.style
    .applymap(highlight_positive)
    .format("{:.2f}")  # Format only numeric columns, excluding 'Games'
)

# Save the styled DataFrame to Excel
styled_df.to_excel("ROI_leagues.xlsx", index=True)

# Display the styled DataFrame
styled_df

Unnamed: 0,FT1P,FTXP,FT2P,FT1XP,FT12P,FTX2P,1.5OP,1.5UP,2.5OP,2.5UP,3.5OP,3.5UP,4.5OP,4.5UP,BTTSP,OTTSP,HT1P,HTXP,HT2P,HT1XP,HT12P,HTX2P,HT0.5OP,HT0.5UP,HT1.5OP,HT1.5UP,Games
Argentina,-15.38,71.58,-75.0,-7.93,2.0,-10.33,-10.06,-3.29,-46.15,-2.41,,1.81,,-2.32,-53.71,-19.41,-43.21,26.46,-100.0,0.33,,5.59,-5.62,32.35,37.5,-11.88,95.0
Austria,-42.39,-100.0,-6.43,-14.5,9.43,-64.0,-7.89,300.0,5.0,-18.88,-16.0,9.34,-100.0,1.65,-24.42,-30.33,-52.92,20.35,90.0,6.61,,7.75,-13.62,21.5,-27.77,18.31,39.0
Belgium,-9.17,5.71,-45.31,5.59,-7.64,-26.07,1.67,-53.57,-12.71,-10.82,-40.5,11.47,-100.0,1.77,-33.56,6.21,11.0,-11.76,7.33,-2.0,-70.0,-4.04,-6.85,-21.55,-79.05,-3.61,59.0
Brazil,-14.23,-100.0,-22.92,0.84,-1.95,2.19,-9.74,150.0,-23.21,-12.37,,-5.15,,1.05,3.26,-14.42,7.8,22.35,,0.05,,15.28,-7.3,81.5,-100.0,-8.79,73.0
Denmark,-1.53,75.0,-68.61,17.92,-24.5,-18.77,2.46,,-7.15,-13.0,0.44,-24.04,33.33,-9.0,14.88,-15.1,73.93,-2.14,-100.0,22.11,-24.25,-0.75,6.67,50.0,22.88,-10.06,35.0
England,-5.21,-26.0,-34.95,3.6,25.33,-11.91,-8.43,-100.0,-12.14,-18.32,-100.0,0.11,-100.0,2.12,3.55,-16.62,-54.3,13.19,72.45,-15.19,18.6,-6.89,-7.33,-40.0,-10.14,5.75,50.0
England2,-8.79,-2.65,-52.54,5.84,8.83,-10.68,-12.5,44.2,-18.94,4.23,-100.0,-2.86,-100.0,1.29,-12.6,2.68,19.59,-20.67,-100.0,-10.27,,-23.24,-8.9,-6.29,16.3,0.43,93.0
England3,22.78,-2.0,-21.37,-10.89,1.64,-12.16,-3.47,-4.93,11.75,13.93,-27.47,-5.77,-58.33,-9.04,-0.45,3.54,-6.55,-32.08,9.33,-11.04,9.0,-8.51,1.2,-32.84,7.59,-5.49,103.0
England4,-40.1,118.33,-7.75,-22.5,-11.3,-4.89,-18.46,-10.44,-22.32,4.09,9.58,8.11,116.67,4.66,-30.58,-15.2,-30.46,-9.87,-52.31,-18.81,-33.14,-19.05,-14.59,-33.25,-23.53,0.52,92.0
England5,-17.53,-100.0,-19.35,-2.41,-3.54,-19.11,0.1,-59.38,4.26,-27.62,-48.7,-13.83,-100.0,-3.68,-1.74,-30.24,-3.57,-22.62,20.43,-10.86,-33.0,-18.88,2.02,-53.08,-39.0,-9.75,97.0


# Creating Optimum Threshold for Each Prediction Column

In [12]:
# Assuming `df` is your DataFrame and it contains the columns for percentages and correctness
def calculate_threshold(percentages, predictions):
    # Ensure inputs are pandas Series
    percentages = pd.Series(percentages)
    predictions = pd.Series(predictions)
    
    thresholds = percentages.unique()
    best_threshold = 0
    best_j_stat = -np.inf  # Start with negative infinity for comparison
    
    for threshold in thresholds:
        # Predict 1s based on the threshold
        predicted_1s = (percentages >= threshold).astype(int)
        
        # Calculate true positives, true negatives, false positives, false negatives
        true_positives = ((predicted_1s == 1) & (predictions == 1)).sum()
        true_negatives = ((predicted_1s == 0) & (predictions == 0)).sum()
        false_positives = ((predicted_1s == 1) & (predictions == 0)).sum()
        false_negatives = ((predicted_1s == 0) & (predictions == 1)).sum()
        
        # Calculate Sensitivity (Recall) and Specificity
        sensitivity = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        specificity = true_negatives / (true_negatives + false_positives) if (true_negatives + false_positives) > 0 else 0
        
        # Calculate Youden's J Statistic
        j_stat = sensitivity + specificity - 1
        
        # Update best threshold if J statistic improves
        if j_stat > best_j_stat:
            best_j_stat = j_stat
            best_threshold = threshold
    
    return best_threshold, round(best_j_stat, 2)

# Select only numeric columns
numeric_columns = final_df_unique.select_dtypes(include=[np.number])

# Remove rows where any numeric value is greater than 100
final_df_unique = final_df_unique[(numeric_columns <= 100).all(axis=1)]

#Selecting dataframes with model predictions
ft1df = final_df_unique[final_df_unique['df1_FT1'] >= final_df_unique[['df1_FTX', 'df1_FT2']].max(axis=1)]
ftxdf = final_df_unique[final_df_unique['df1_FTX'] >= final_df_unique[['df1_FT1', 'df1_FT2']].max(axis=1)]
ft2df = final_df_unique[final_df_unique['df1_FT2'] >= final_df_unique[['df1_FTX', 'df1_FT1']].max(axis=1)]
dc1xdf = final_df_unique[final_df_unique['df1_DC1X'] >= final_df_unique[['df1_DC12', 'df1_DCX2']].max(axis=1)]
dc12df = final_df_unique[final_df_unique['df1_DC12'] >= final_df_unique[['df1_DC1X', 'df1_DCX2']].max(axis=1)]
dcx2df = final_df_unique[final_df_unique['df1_DCX2'] >= final_df_unique[['df1_DC1X', 'df1_DC12']].max(axis=1)]
over15df, under15df = final_df_unique[final_df_unique['df1_1.5O'] >= 50], final_df_unique[final_df_unique['df1_1.5O'] < 50]
over25df, under25df = final_df_unique[final_df_unique['df1_2.5O'] >= 50], final_df_unique[final_df_unique['df1_2.5O'] < 50]
over35df, under35df = final_df_unique[final_df_unique['df1_3.5U'] < 50], final_df_unique[final_df_unique['df1_3.5U'] >= 50]
over45df, under45df = final_df_unique[final_df_unique['df1_4.5U'] < 50], final_df_unique[final_df_unique['df1_4.5U'] >= 50]
bttsdf, ottsdf = final_df_unique[final_df_unique['df1_BTTS'] >= 50], final_df_unique[final_df_unique['df1_BTTS'] < 50]
ht1df = final_df_unique[final_df_unique['df1_HT1'] >= final_df_unique[['df1_HTX', 'df1_HT2']].max(axis=1)]
htxdf = final_df_unique[final_df_unique['df1_HTX'] >= final_df_unique[['df1_HT1', 'df1_HT2']].max(axis=1)]
ht2df = final_df_unique[final_df_unique['df1_HT2'] >= final_df_unique[['df1_HT1', 'df1_HTX']].max(axis=1)]
ht1xdf = final_df_unique[final_df_unique['df1_HTDC1X'] >= final_df_unique[['df1_HTDC12', 'df1_HTDCX2']].max(axis=1)]
ht12df = final_df_unique[final_df_unique['df1_HTDC12'] >= final_df_unique[['df1_HTDC1X', 'df1_HTDCX2']].max(axis=1)]
htx2df = final_df_unique[final_df_unique['df1_HTDCX2'] >= final_df_unique[['df1_HTDC1X', 'df1_HTDC12']].max(axis=1)]
htover05df, htunder05df = final_df_unique[final_df_unique['df1_HT0.5O'] >= 50], final_df_unique[final_df_unique['df1_HT0.5O'] < 50]
htover15df, htunder15df = final_df_unique[final_df_unique['df1_HT1.5U'] < 50], final_df_unique[final_df_unique['df1_HT1.5U'] >= 50]

ft1t, ft1a = calculate_threshold(ft1df['df1_FT1'], ft1df['FT1'])
ftxt, ftxa = calculate_threshold(ftxdf['df1_FTX'], ftxdf['FTX'])
ft2t, ft2a = calculate_threshold(ft2df['df1_FT2'], ft2df['FT2'])
ft1xt, ft1xa = calculate_threshold(dc1xdf['df1_DC1X'], dc1xdf['FT1X'])
ft12t, ft12a = calculate_threshold(dc12df['df1_DC12'], dc12df['FT12'])
ftx2t, ftx2a = calculate_threshold(dcx2df['df1_DCX2'], dcx2df['FTX2'])
over15t, over15a = calculate_threshold(over15df['df1_1.5O'], over15df['1.5O'])
under15t, under15a = calculate_threshold(under15df['df1_1.5O'], under15df['1.5U'])
over25t, over25a = calculate_threshold(over25df['df1_2.5O'], over25df['2.5O'])
under25t, under25a = calculate_threshold(under25df['df1_2.5O'], under25df['2.5U'])
over35t, over35a = calculate_threshold(over35df['df1_3.5U'], over35df['3.5O'])
under35t, under35a = calculate_threshold(under35df['df1_3.5U'], under35df['3.5U'])
over45t, over45a = calculate_threshold(over45df['df1_4.5U'], over45df['4.5O'])
under45t, under45a = calculate_threshold(under45df['df1_4.5U'], under45df['4.5U'])
bttst, bttsa = calculate_threshold(bttsdf['df1_BTTS'], bttsdf['BTTS'])
ottst, ottsa = calculate_threshold(ottsdf['df1_BTTS'], ottsdf['OTTS'])
ht1t, ht1a = calculate_threshold(ht1df['df1_HT1'], ht1df['HT1'])
htxt, htxa = calculate_threshold(htxdf['df1_HTX'], htxdf['HTX'])
ht2t, ht2a = calculate_threshold(ht2df['df1_HT2'], ht2df['HT2'])
ht1xt, ht1xa = calculate_threshold(ht1xdf['df1_HTDC1X'], ht1xdf['HT1X'])
ht12t, ht12a = calculate_threshold(ht12df['df1_HTDC12'], ht12df['HT12'])
htx2t, htx2a = calculate_threshold(htx2df['df1_HTDCX2'], htx2df['HTX2'])
htover05t, htover05a = calculate_threshold(htover05df['df1_HT0.5O'], htover05df['HT0.5O'])
htunder05t, htunder05a = calculate_threshold(htunder05df['df1_HT0.5O'], htunder05df['HT0.5U'])
htover15t, htover15a = calculate_threshold(htover15df['df1_HT1.5U'], htover15df['HT1.5O'])
htunder15t, htunder15a = calculate_threshold(htunder15df['df1_HT1.5U'], htunder15df['HT1.5U'])

new_ft1df, new_ftxdf, new_ft2df = ft1df[ft1df['df1_FT1'] >= ft1t],ftxdf[ftxdf['df1_FTX'] >= ftxt],ft2df[ft2df['df1_FT2'] >= ft2t]
new_ft1xdf, new_ft12df, new_ftx2df = dc1xdf[dc1xdf['df1_DC1X'] >= ft1xt],dc12df[dc12df['df1_DC12'] >= ft12t],dcx2df[dcx2df['df1_DCX2'] >= ftx2t] 
new_over15, new_under15 = over15df[over15df['df1_1.5O'] >= over15t], under15df[under15df['df1_1.5O'] <= under15t]
new_over25, new_under25 = over25df[over25df['df1_2.5O'] >= over25t], under25df[under25df['df1_2.5O'] <= under25t]
new_over35, new_under35 = over35df[over35df['df1_3.5U'] <= over35t], under35df[under35df['df1_3.5U'] >= under35t]
new_over45, new_under45 = over45df[over45df['df1_4.5U'] <= over45t], under45df[under45df['df1_4.5U'] >= under45t]
new_btts, new_otts = bttsdf[bttsdf['df1_BTTS'] >= bttst], ottsdf[ottsdf['df1_BTTS'] <= ottst]
new_ht1df, new_htxdf, new_ht2df = ht1df[ht1df['df1_HT1'] >= ht1t],htxdf[htxdf['df1_HTX'] >= htxt],ht2df[ht2df['df1_HT2'] >= ht2t]
new_ht1xdf, new_ht12df, new_htx2df = ht1xdf[ht1xdf['df1_HTDC1X'] >= ht1xt],ht12df[ht12df['df1_HTDC12'] >= ht12t],htx2df[htx2df['df1_HTDCX2'] >= htx2t]
new_htover05, new_htunder05 = htover05df[htover05df['df1_HT0.5O'] >= htover05t], htunder05df[htunder05df['df1_HT0.5O'] <= htunder05t]
new_htover15, new_htunder15 = htover15df[htover15df['df1_HT1.5U'] <= htover15t], htunder15df[htunder15df['df1_HT1.5U'] >= htunder15t]

# Store the results in a list
results = [
('FT1', ft1t, ft1a, len(new_ft1df), round(len(new_ft1df)/len(ft1df)*100,2), np.sum(new_ft1df['FT1P']) - len(new_ft1df)),
('FTX', ftxt, ftxa, len(new_ftxdf), round(len(new_ftxdf)/len(ftxdf)*100,2), np.sum(new_ftxdf['FTXP']) - len(new_ftxdf)),
('FT2', ft2t, ft2a, len(new_ft2df), round(len(new_ft2df)/len(ft2df)*100,2), np.sum(new_ft2df['FT2P']) - len(new_ft2df)),
('FT1X', ft1xt, ft1xa, len(new_ft1xdf), round(len(new_ft1xdf)/len(dc1xdf)*100,2), np.sum(new_ft1xdf['FT1XP']) - len(new_ft1xdf)),
('FT12', ft12t, ft12a, len(new_ft12df), round(len(new_ft12df)/len(dc12df)*100,2), np.sum(new_ft12df['FT12P']) - len(new_ft12df)),
('FTX2', ftx2t, ftx2a, len(new_ftx2df), round(len(new_ftx2df)/len(dcx2df)*100,2), np.sum(new_ftx2df['FTX2P']) - len(new_ftx2df)),
('1.5O', over15t, over15a, len(new_over15), round(len(new_over15)/len(over15df)*100,2), np.sum(new_over15['1.5OP'])-len(new_over15)),
('1.5U', under15t, under15a, len(new_under15), round(len(new_under15)/len(under15df)*100,2), np.sum(new_under15['1.5UP'])-len(new_under15)),
('2.5O', over25t, over25a, len(new_over25), round(len(new_over25)/len(over25df)*100,2), np.sum(new_over25['2.5OP'])-len(new_over25)),
('2.5U', under25t, under25a, len(new_under25), round(len(new_under25)/len(under25df)*100,2), np.sum(new_under25['2.5UP'])-len(new_under25)),
('3.5O', over35t, over35a, len(new_over35), round(len(new_over35)/len(over35df)*100,2), np.sum(new_over35['3.5OP'])-len(new_over35)),
('3.5U', under35t, under35a, len(new_under35), round(len(new_under35)/len(under35df)*100,2), np.sum(new_under35['3.5UP'])-len(new_under35)),
('4.5O', over45t, over45a, len(new_over45), round(len(new_over45)/len(over45df)*100,2), np.sum(new_over45['4.5OP'])-len(new_over45)),
('4.5U', under45t, under45a, len(new_under45), round(len(new_under45)/len(under45df)*100,2), np.sum(new_under45['4.5UP'])-len(new_under45)),
('BTTS', bttst, bttsa, len(new_btts), round(len(new_btts)/len(bttsdf)*100,2), np.sum(new_btts['BTTSP'])-len(new_btts)),
('OTTS', ottst, ottsa, len(new_otts), round(len(new_otts)/len(ottsdf)*100,2), np.sum(new_otts['OTTSP'])-len(new_otts)),
('HT1', ht1t, ht1a, len(new_ht1df), round(len(new_ht1df)/len(ht1df)*100,2), np.sum(new_ht1df['HT1P']) - len(new_ht1df)),
('HTX', htxt, htxa, len(new_htxdf), round(len(new_htxdf)/len(htxdf)*100,2), np.sum(new_htxdf['HTXP']) - len(new_htxdf)),
('HT2', ht2t, ht2a, len(new_ht2df), round(len(new_ht2df)/len(ht2df)*100,2), np.sum(new_ht2df['HT2P']) - len(new_ht2df)),
('HT1X', ht1xt, ht1xa, len(new_ht1xdf), round(len(new_ht1xdf)/len(ht1xdf)*100,2), np.sum(new_ht1xdf['HT1XP']) - len(new_ht1xdf)),
('HT12', ht12t, ht12a, len(new_ht12df), round(len(new_ht12df)/len(ht12df)*100,2), np.sum(new_ht12df['HT12P']) - len(new_ht12df)),
('HTX2', htx2t, htx2a, len(new_htx2df), round(len(new_htx2df)/len(htx2df)*100,2), np.sum(new_htx2df['HTX2P']) - len(new_htx2df)),
('HT0.5O', htover05t, htover05a, len(new_htover05), round(len(new_htover05)/len(htover05df)*100,2), np.sum(new_htover05['HT0.5OP'])-len(new_htover05)),
('HT0.5U', htunder05t, htunder05a, len(new_htunder05), round(len(new_htunder05)/len(htunder05df)*100,2), np.sum(new_htunder05['HT0.5UP'])-len(new_htunder05)),
('HT1.5O', htover15t, htover15a, len(new_htover15), round(len(new_htover15)/len(htover15df)*100,2), np.sum(new_htover15['HT1.5OP'])-len(new_htover15)),
('HT1.5U', htunder15t, htunder15a, len(new_htunder15), round(len(new_htunder15)/len(htunder15df)*100,2), np.sum(new_htunder15['HT1.5UP'])-len(new_htunder15))
]

# Create a DataFrame from the results
results_df = pd.DataFrame(results, columns=['Prediction', 'Threshold', 'J-Stat', 'Games', 'Games%', 'Profit'])
results_df['ROI'] = round(results_df['Profit'] / results_df['Games'] * 100, 2)
print('Number of matches: ', len(final_df_unique))
results_df

Number of matches:  1957


Unnamed: 0,Prediction,Threshold,J-Stat,Games,Games%,Profit,ROI
0,FT1,59.31,0.19,469,42.41,0.04,0.01
1,FTX,37.66,0.08,163,87.63,8.54,5.24
2,FT2,60.99,0.1,181,27.18,-26.52,-14.65
3,FT1X,87.6,0.13,351,40.07,-8.25,-2.35
4,FT12,77.24,0.06,298,50.85,-8.31,-2.79
5,FTX2,82.4,0.08,269,54.34,-27.14,-10.09
6,1.5O,78.36,0.09,785,44.86,-37.82,-4.82
7,1.5U,49.76,0.03,203,98.07,-9.95,-4.9
8,2.5O,60.44,0.05,565,60.23,-26.23,-4.64
9,2.5U,49.43,0.01,998,97.94,-53.19,-5.33
