In [1]:
import pymc as pm
import numpy as np
import pandas as pd
import os
import requests
import io
from datetime import datetime, timedelta
import multiprocessing
import arviz as az
import logging
import tqdm



In [2]:
# Read the data-service key from the environment and report only whether it is set:
# printing the value itself would store the secret in the saved notebook output.
API_KEY = os.getenv("API_KEY")
print("API_KEY loaded" if API_KEY else "API_KEY is not set")

[REDACTED]


In [2]:
# Data-service connection settings: endpoint URL plus the API-key header
# that is sent with every request.
API_KEY = os.environ.get("API_KEY")
url = "https://data-service.beatthebookie.blog/data"
headers = {
    "x-api-key": API_KEY,
}

# Function to fetch data for a specific division and season
def fetch_data(division, season, timeout=30):
    """Download the matches of one division/season from the data service.

    Args:
        division: Division name, sent as the ``division`` query parameter.
        season: Season identifier, sent as the ``season`` query parameter.
        timeout: Seconds to wait for the server (passed to ``requests.get``).
            Without a timeout a stalled connection would block the notebook
            indefinitely.

    Returns:
        A DataFrame parsed from the JSON response body on HTTP 200. On a
        non-200 status or a network error the problem is printed and an empty
        DataFrame is returned, so callers can skip the combination.
    """
    params = {
        'division': division,
        'season': season
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Report network failures the same way as HTTP errors: print and return empty
        print(f"Error fetching {division} {season}: {exc}")
        return pd.DataFrame()

    if response.status_code == 200:
        return pd.read_json(io.StringIO(response.content.decode('utf-8')))

    print(f"Error fetching {division} {season}: {response.status_code}")
    print(response.content.decode('utf-8'))
    return pd.DataFrame()

# Fetch data for all combinations
seasons = ['2024_2025', '2023_2024']
divisions = ['Premier League', 'Championship']
dataframes = []

# Each response is kept under its own name so that `df` only ever refers to
# the combined data set built below.
for division in divisions:
    for season in seasons:
        season_df = fetch_data(division, season)
        if not season_df.empty:
            dataframes.append(season_df)

# Stop here with a clear message when nothing came back, instead of letting
# later cells fail on a missing or empty `df`.
if not dataframes:
    raise RuntimeError("No match data could be fetched; check API_KEY and the data service.")

# Combine all dataframes
df = pd.concat(dataframes, ignore_index=True)

# Convert match_date to datetime
df['match_date'] = pd.to_datetime(df['match_date'])

In [3]:
# Preview the combined match data built in the previous cell
df

Unnamed: 0,division_id,division,season_id,season,match_date,match_teams,home_team_id,home_team,away_team_id,away_team,...,away_xgoals,home_deep,away_deep,home_ppda,away_ppda,bet365_home_odds,bet365_draw_odds,bet365_away_odds,bet365_u25_odds,bet365_o25_odds
0,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-16,Man United - Fulham,f2b82cdbdadf9d3ec47c3a6be66dcfad,Man United,8cd5e94668b139c1f42a89a1e130f3cf,Fulham,...,0.418711,7.0,3.0,7.379310,10.833333,1.60,4.20,5.25,2.50,1.53
1,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Everton - Brighton,6414a61d98ab23b6d757e888ab17a66a,Everton,0d84883ca72c88cb53c8a38262efdcbc,Brighton,...,1.790830,4.0,5.0,18.333333,7.916667,2.63,3.30,2.63,2.00,1.80
2,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Ipswich - Liverpool,e4f63bf6d6d2cd121e6c8e59bef68209,Ipswich,afce84ff226407a47c9782a742ba02f7,Liverpool,...,3.929060,2.0,13.0,18.777778,8.739130,8.50,5.50,1.33,3.00,1.40
3,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Nott'm Forest - Bournemouth,9a8e1e9fad8766fc3d69a0c26d98b928,Nott'm Forest,b436d55f36cfbe8a085c8b75fb7fe98a,Bournemouth,...,1.909150,10.0,4.0,8.653846,9.954545,2.45,3.50,2.80,2.10,1.73
4,98b8784f6685b7289f583e0ce4b4f6f2,Premier League,3ac445d3cc1d404987efdfcfa42f3bcd,20242025,2024-08-17,Newcastle - Southampton,78e9266876e7649e0a12e3840f5be006,Newcastle,5a884401673693b0bdf379fefb7ec2b2,Southampton,...,1.954830,4.0,13.0,16.250000,3.789474,1.36,5.25,8.00,3.00,1.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1569,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Sunderland - Sheffield Weds,1820e986a8a213df782c5cf5ad0b65e1,Sunderland,ec5c8ee83966ae3a295abeee6ab46871,Sheffield Weds,...,1.300000,,,,,2.70,3.40,2.55,1.80,2.00
1570,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Rotherham - Cardiff,095ea4179483dfdd7fd4ee61a25954f4,Rotherham,288a6430a6f57a76e823d74e0dd750d2,Cardiff,...,1.300000,,,,,3.40,3.40,2.20,1.73,2.10
1571,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Leicester - Blackburn,a06360acafe964d244e424cbd50862e8,Leicester,8178de69a5e17f7bbe3a74331977feb1,Blackburn,...,1.700000,,,,,1.70,3.75,5.25,2.50,1.53
1572,c4781225ef2d5018f7a9df4b6cb1c1fe,Championship,18b28eb3ae0ef75bc47858e438602442,20232024,2024-05-04,Plymouth Argyle - Hull,a0111c6b8f3812f2e36ababf2c032d8c,Plymouth Argyle,63b0fc998685327831f96bbe7b07b7e2,Hull,...,0.500000,,,,,3.50,3.75,2.00,2.30,1.62


In [4]:
# build model
import psutil


def build_bayesian_model(home_teams, away_teams, home_goals, away_goals, home_xg, away_xg, dates, leagues,
                         reference_league="Premier League"):
    """Build the hierarchical PyMC model of team attack/defense strengths.

    Goals get a Poisson likelihood and xG a Gamma likelihood, both driven by the
    same per-match expected-goals terms (team attack minus opponent defense, plus
    a home advantage for the home side, scaled by an exponential time-decay factor).

    Args:
        home_teams, away_teams: Per-match team names (any iterable, e.g. a Series).
        home_goals, away_goals: Per-match observed goals.
        home_xg, away_xg: Per-match observed xG; a small epsilon is added to them,
            so they must support ``+ float`` (e.g. NumPy arrays).
        dates: pandas Series of match datetimes (``.min``, ``.max`` and ``.dt`` are used).
        leagues: Per-match league name.
        reference_league: League whose strength is fixed at 0; every other league
            gets a free strength parameter with prior mean -0.5.

    Returns:
        ``(model, team_indices, league_indices)`` where the two dicts map each
        team / league name to its position in the model's parameter vectors.

    Raises:
        KeyError: If ``reference_league`` does not occur in ``leagues``.
    """
    print("Building Bayesian model...")
    print(f"Dataset size: {len(home_teams)} matches")
    print(f"Time span: {dates.min()} to {dates.max()}")

    # Work on plain lists: `x in series` on a pandas Series tests the index
    # labels rather than the values, which is not what the lookups below need.
    home_team_names = list(home_teams)
    away_team_names = list(away_teams)
    match_leagues = list(leagues)

    # get unique teams and leagues
    teams = sorted(set(home_team_names) | set(away_team_names))  # alphabetically sorted, de-duplicated team names
    unique_leagues = sorted(set(match_leagues))

    # sets index values for each team/league within a dict
    team_indices = {team: idx for idx, team in enumerate(teams)}
    league_indices = {league: idx for idx, league in enumerate(unique_leagues)}

    # convert date into time differences (days before the most recent match)
    max_date = np.max(dates)
    time_diffs = (max_date - dates).dt.days.to_numpy()

    # convert team names to index vals
    home_idx = [team_indices[team] for team in home_team_names]
    away_idx = [team_indices[team] for team in away_team_names]

    # League index of every match (home and away side share the match's league)
    match_league_idx = [league_indices[league] for league in match_leagues]

    # League of each team: taken from its first home match, or from its first
    # away match when it never plays at home in the data.
    first_home_league = {}
    for team, league_idx in zip(home_team_names, match_league_idx):
        first_home_league.setdefault(team, league_idx)
    first_away_league = {}
    for team, league_idx in zip(away_team_names, match_league_idx):
        first_away_league.setdefault(team, league_idx)
    team_league_idx = np.array(
        [first_home_league[team] if team in first_home_league else first_away_league[team]
         for team in teams],
        dtype=int,
    )

    with pm.Model() as model:
        # league level parameters for league strengths
        league_attack_mu = pm.Normal("league_attack_mu", mu=0, sigma=0.5)  # average attack value
        league_attack_sigma = pm.HalfNormal("league_attack_sigma", sigma=0.5)  # attack spread; half normal as a std must be positive
        league_defense_mu = pm.Normal("league_defense_mu", mu=0, sigma=0.5)
        league_defense_sigma = pm.HalfNormal("league_defense_sigma", sigma=0.5)

        # raw league strengths for all leagues EXCEPT the reference league
        reference_idx = league_indices[reference_league]
        # mu=-0.5 as the other leagues are expected to be weaker; one entry fewer
        # than the number of leagues because the reference league is pinned to 0
        league_strength_raw = pm.Normal("league_strength_raw", mu=-0.5, sigma=0.3, shape=len(unique_leagues) - 1)
        league_strength = pm.Deterministic(  # deterministic: derived from other random variables
            "league_strength",
            pm.math.concatenate([
                league_strength_raw[:reference_idx],
                pm.math.zeros(1),  # the reference league sits at its own index with strength 0
                league_strength_raw[reference_idx:]
            ])
        )

        # team strength initialisation: standard-normal raw strengths
        attack_raw = pm.Normal("attack_raw", mu=0, sigma=1, shape=len(teams))
        defense_raw = pm.Normal('defense_raw', mu=0, sigma=1, shape=len(teams))

        # scale team strengths and shift them by the strength of the team's league
        attack = pm.Deterministic(
            "attack",
            attack_raw * league_attack_sigma + league_attack_mu + league_strength[team_league_idx]
        )
        defense = pm.Deterministic(
            "defense",
            defense_raw * league_defense_sigma + league_defense_mu + league_strength[team_league_idx]
        )

        # time decay parameter; prior scale divided by 365 to express a daily rate
        decay_rate = pm.HalfNormal("decay_rate", sigma=1.5 / 365)

        # home advantage, prior centred on 0.2 with std 0.1
        home_advantage = pm.Normal("home_advantage", mu=0.2, sigma=0.1)

        # time decay factor applied to expected goals
        # NOTE(review): this factor multiplies the expected goals themselves, so older
        # matches are modelled with lower scoring rates rather than being down-weighted
        # in the likelihood — confirm this is the intended form of time decay.
        time_factor = pm.math.exp(-decay_rate * time_diffs)

        # expected goals for both xG and goals; exp keeps them positive and makes
        # team strengths multiplicative
        home_theta = time_factor * pm.math.exp(attack[home_idx] - defense[away_idx] + home_advantage)
        away_theta = time_factor * pm.math.exp(attack[away_idx] - defense[home_idx])

        # goals likelihood (poisson for actual goals)
        pm.Poisson("home_goals", mu=home_theta, observed=home_goals)
        pm.Poisson("away_goals", mu=away_theta, observed=away_goals)

        # xG likelihood (gamma for expected goals)
        xg_alpha = pm.HalfNormal("xg_alpha", sigma=1.0)  # shape parameter, must be positive
        home_xg_beta = xg_alpha / home_theta  # rate = alpha / theta, so the Gamma mean equals theta
        away_xg_beta = xg_alpha / away_theta

        # add small constant to not allow 0s which breaks Gamma dist
        epsilon = 0.00001
        home_xg_adj = home_xg + epsilon
        away_xg_adj = away_xg + epsilon

        pm.Gamma("home_xg", alpha=xg_alpha, beta=home_xg_beta, observed=home_xg_adj)
        pm.Gamma("away_xg", alpha=xg_alpha, beta=away_xg_beta, observed=away_xg_adj)

        print("Model building completed!")

    return model, team_indices, league_indices

def fit_bayesian_model(model, draws=500, tunes=500):
    """Sample the posterior of ``model`` with NUTS and report divergences.

    Args:
        model: The PyMC model to sample; it is entered as a context manager.
        draws: Posterior draws per chain.
        tunes: Tuning steps per chain.

    Returns:
        The result of ``pm.sample`` (requested with ``return_inferencedata=True``).
    """
    # Up to 4 chains, leaving one core free, but never fewer than one:
    # cpu_count() - 1 is 0 on a single-core machine.
    n_cores = max(1, min(4, multiprocessing.cpu_count() - 1))

    print(f"Starting model fitting with {n_cores} cores...")
    print(f"Planning {draws} draws with {tunes} tuning steps...")

    with model:
        trace = pm.sample(
            draws=draws,
            tune=tunes,
            chains=n_cores,
            cores=n_cores,
            progressbar=True,
            return_inferencedata=True,
            init='adapt_diag',
            target_accept=0.95,
            nuts={"max_treedepth": 15}  # NUTS-specific option, passed under the step-method name
        )

        # Print sampling diagnostics
        print("\nSampling Statistics:")
        print(f"Number of divergences: {trace.sample_stats.diverging.sum().values}")

        return trace
    
# Function to monitor memory usage
def print_memory_usage():
    """Print the RSS memory reported by psutil for this process, in MB."""
    rss_bytes = psutil.Process().memory_info().rss
    rss_megabytes = rss_bytes / 1024 / 1024
    print(f"Memory usage: {rss_megabytes:.2f} MB")

# Setup logging: let the 'pymc' logger emit INFO-level messages
_pymc_logger = logging.getLogger('pymc')
_pymc_logger.setLevel(logging.INFO)



def get_league_strengths(trace, league_indices):
    """Tabulate the posterior-mean strength of every league, strongest first.

    Args:
        trace: Sampling result exposing ``posterior['league_strength']``.
        league_indices: Mapping of league name to index; its key order is used
            to label the entries of the strength vector.

    Returns:
        DataFrame with columns ``league`` and ``league_strength`` (rounded to 3
        decimals), sorted by strength in descending order.
    """
    posterior_means = trace.posterior['league_strength'].mean(dim=['chain', 'draw']).values

    table = pd.DataFrame({
        'league': list(league_indices),
        'league_strength': posterior_means,
    })
    table = table.round(3)

    return table.sort_values('league_strength', ascending=False)

def get_hierarchical_team_strengths(trace, team_indices, league_indices, team_leagues, current_teams):
    """Summarise posterior-mean team strengths for the current teams.

    Args:
        trace: Sampling result exposing ``posterior['attack']``,
            ``posterior['defense']`` and ``posterior['home_advantage']``.
        team_indices: Mapping of team name to index; its key order labels the rows.
        league_indices: Mapping of league name to index (forwarded to
            ``get_league_strengths``).
        team_leagues: Mapping of team name to league; teams missing from it are
            labelled ``'Unknown'``.
        current_teams: Teams to keep in the result.

    Returns:
        ``(results, home_adv)``: the per-team table (rounded to 3 decimals, sorted
        by ``overall_strength`` descending, with the league strength merged in)
        and the posterior-mean home advantage.
    """
    posterior = trace.posterior
    team_names = list(team_indices.keys())

    attack_means = posterior['attack'].mean(dim=['chain', 'draw']).values
    defense_means = posterior['defense'].mean(dim=['chain', 'draw']).values
    home_adv = posterior['home_advantage'].mean(dim=['chain', 'draw']).values

    # Get league strengths for reference
    league_strengths = get_league_strengths(trace, league_indices)

    # Overall strength: exp(attack - mean defense) minus exp(mean attack - defense)
    scoring_term = np.exp(attack_means - np.mean(defense_means))
    conceding_term = np.exp(np.mean(attack_means) - defense_means)

    results = pd.DataFrame({
        'team': team_names,
        'league': [team_leagues.get(name, 'Unknown') for name in team_names],
        'attack_strength': attack_means,
        'defense_strength': defense_means,
        'overall_strength': scoring_term - conceding_term,
        'home_advantage': home_adv
    })

    # Merge with league strengths
    results = pd.merge(
        results,
        league_strengths,
        left_on='league',
        right_on='league',
        how='left'
    )

    # Filter current teams and sort
    is_current = results['team'].isin(current_teams)
    results = results[is_current].round(3)
    results = results.sort_values('overall_strength', ascending=False)

    return results, home_adv

def analyze_league_strengths(trace, league_indices, team_indices, team_leagues):
    """Build a per-league summary table from the posterior.

    Args:
        trace: Sampling result exposing ``posterior['league_strength']``,
            ``posterior['league_attack_sigma']`` and ``posterior['league_defense_sigma']``.
        league_indices: Mapping of league name to its index in ``league_strength``.
        team_indices: Unused; kept so existing calls keep working.
        team_leagues: Mapping of team name to league name.

    Returns:
        DataFrame (rounded to 3 decimals, sorted by ``base_strength`` descending)
        with columns ``league``, ``base_strength``, ``attack_variation``,
        ``defense_variation``, ``num_teams``, ``teams`` (first five names) and
        ``expected_goals_vs_avg`` (``exp(base_strength) - 1``).
    """
    posterior = trace.posterior
    league_strength_means = posterior['league_strength'].mean(dim=['chain', 'draw']).values

    # The sigma parameters are shared by all leagues in the model. Convert them to plain
    # floats so the variation columns are numeric and get rounded with the rest.
    attack_sigma = float(posterior['league_attack_sigma'].mean(dim=['chain', 'draw']).values)
    defense_sigma = float(posterior['league_defense_sigma'].mean(dim=['chain', 'draw']).values)

    columns = ['league', 'base_strength', 'attack_variation', 'defense_variation', 'num_teams', 'teams']
    rows = []
    for league, league_idx in league_indices.items():
        league_teams = sorted(team for team, team_league in team_leagues.items() if team_league == league)
        rows.append({
            'league': league,
            'base_strength': league_strength_means[league_idx],
            'attack_variation': attack_sigma,
            'defense_variation': defense_sigma,
            'num_teams': len(league_teams),
            'teams': ', '.join(league_teams[:5]) + ('...' if len(league_teams) > 5 else '')
        })

    results_df = pd.DataFrame(rows, columns=columns)

    # Expected goals adjustment implied by each league's base strength
    results_df['expected_goals_vs_avg'] = np.exp(results_df['base_strength'].astype(float)) - 1

    return results_df.round(3).sort_values('base_strength', ascending=False)
    

In [5]:
# Columns the model needs, restricted to matches from the previous 365 days
model_columns = ["home_team", "away_team", "home_goals", "away_goals",
                 "home_xgoals", "away_xgoals", "match_date", "division"]
cutoff = datetime.now() - timedelta(days=365)
data = df.loc[df["match_date"] > cutoff, model_columns]

# get list of current teams
current_teams = df.loc[df["season"] == 20242025, "home_team"].unique()

# get list of leagues
team_leagues = {team: division for team, division in zip(df["home_team"], df["division"])}

# Build model
model, team_indices, league_indices = build_bayesian_model(
    home_teams=data['home_team'],
    away_teams=data['away_team'],
    home_goals=data['home_goals'].to_numpy(),
    away_goals=data['away_goals'].to_numpy(),
    home_xg=data["home_xgoals"].to_numpy(),
    away_xg=data["away_xgoals"].to_numpy(),
    dates=data["match_date"],
    leagues=data["division"],
)

# Fit model, reporting memory use before and after sampling
print_memory_usage()
trace = fit_bayesian_model(model, draws=10, tunes=10)
print_memory_usage()




Only 10 samples per chain. Reliable r-hat and ESS diagnostics require longer chains for accurate estimate.


Building Bayesian model...
Dataset size: 911 matches
Time span: 2024-02-25 00:00:00 to 2025-02-17 00:00:00
Model building completed!
Memory usage: 270.83 MB
Starting model fitting with 4 cores...
Planning 10 draws with 500 tuning steps...


  warn(
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
  warn(
  warn(
  warn(
  warn(
  warn(
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [league_attack_mu, league_attack_sigma, league_defense_mu, league_defense_sigma, league_strength_raw, attack_raw, defense_raw, decay_rate, home_advantage, xg_alpha]


Sampling 4 chains for 10 tune and 10 draw iterations (40 + 40 draws total) took 110 seconds.
The number of samples is too small to check convergence reliably.



Sampling Statistics:
Number of divergences: 0
Memory usage: 302.29 MB


In [6]:
# Create a dictionary mapping each team to its league based on the most recent season.
# Season ids look like 20242025, so "latest - 1" is not a valid id; pick the two most
# recent ids that are actually present in the data instead.
season_ids = sorted(df["season"].unique())
latest_season = season_ids[-1]
previous_season = season_ids[-2] if len(season_ids) > 1 else None

# Combine previous and current season data. The latest season goes last so that its
# division wins in the dict below when a team appears in both seasons.
combined_df = pd.concat([df[df["season"] == season_id] for season_id in season_ids[-2:]])

# Create a dictionary mapping each team to its league
team_leagues = dict(zip(combined_df["home_team"], combined_df["division"]))

# Get results
team_strengths, home_advantage = get_hierarchical_team_strengths(
    trace=trace,
    team_indices=team_indices,
    league_indices=league_indices,
    team_leagues=team_leagues,
    current_teams=current_teams
)

# Analyze league strengths
league_analysis = analyze_league_strengths(
    trace=trace,
    league_indices=league_indices,
    team_indices=team_indices,
    team_leagues=team_leagues
)

# Print results
print("\nTeam Strengths:")
print(team_strengths)

print("\nLeague Analysis:")
print(league_analysis)


Team Strengths:
                team          league  attack_strength  defense_strength  \
22         Liverpool  Premier League            0.545             0.131   
0            Arsenal  Premier League            0.411             0.404   
24          Man City  Premier League            0.510             0.038   
10           Chelsea  Premier League            0.447            -0.179   
28         Newcastle  Premier League            0.366            -0.093   
4        Bournemouth  Premier League            0.334            -0.127   
42         Tottenham  Premier League            0.370            -0.207   
16            Fulham  Premier League            0.149            -0.024   
30     Nott'm Forest  Premier League            0.125            -0.011   
12    Crystal Palace  Premier League            0.153            -0.067   
5          Brentford  Premier League            0.199            -0.227   
1        Aston Villa  Premier League            0.164            -0.201   
6       

In [7]:
import pickle

def save_model_results(trace, team_indices, league_indices, team_strengths, league_analysis, filename=None):
    """Pickle the fitted trace and its summary tables into a single file.

    When ``filename`` is None the file is named ``model_results_<YYYYMMDD>.pkl``
    using today's date. The objects are stored in a dict under the keys
    ``trace``, ``team_indices``, ``league_indices``, ``team_strengths`` and
    ``league_analysis``.
    """
    if filename is None:
        filename = f'model_results_{datetime.now().strftime("%Y%m%d")}.pkl'

    payload = dict(
        trace=trace,
        team_indices=team_indices,
        league_indices=league_indices,
        team_strengths=team_strengths,
        league_analysis=league_analysis,
    )

    with open(filename, 'wb') as handle:
        pickle.dump(payload, handle)

    print(f"Results saved to {filename}")

def load_model_results(filename):
    """Read back a results file written by ``save_model_results``.

    Returns the tuple ``(trace, team_indices, league_indices, team_strengths,
    league_analysis)``.

    Unpickling can execute arbitrary code, so only load files you created yourself.
    """
    with open(filename, 'rb') as handle:
        payload = pickle.load(handle)

    wanted_keys = ('trace', 'team_indices', 'league_indices',
                   'team_strengths', 'league_analysis')
    return tuple(payload[key] for key in wanted_keys)

# Persist the current fit under a file name stamped with today's date
filename = f'model_results_{datetime.now().strftime("%Y%m%d")}.pkl'
save_model_results(
    trace=trace,
    team_indices=team_indices,
    league_indices=league_indices,
    team_strengths=team_strengths,
    league_analysis=league_analysis,
    filename=filename,
)

Results saved to model_results_20250223.pkl


In [14]:
# Reload a previously saved fit. This rebinds trace, team_indices, league_indices,
# team_strengths and league_analysis to the contents of the file.
filename = 'model_results_20250208.pkl'

trace, team_indices, league_indices, team_strengths, league_analysis = load_model_results(filename)

# Print results
print("\nTeam Strengths:")
print(team_strengths)

print("\nLeague Analysis:")
print(league_analysis)


Team Strengths:
                team          league  attack_strength  defense_strength  \
22         Liverpool  Premier League            0.606             0.121   
0            Arsenal  Premier League            0.484             0.424   
24          Man City  Premier League            0.497             0.094   
10           Chelsea  Premier League            0.500            -0.132   
28         Newcastle  Premier League            0.384            -0.084   
4        Bournemouth  Premier League            0.364            -0.108   
42         Tottenham  Premier League            0.388            -0.206   
16            Fulham  Premier League            0.163            -0.054   
12    Crystal Palace  Premier League            0.158            -0.058   
30     Nott'm Forest  Premier League            0.125            -0.006   
1        Aston Villa  Premier League            0.240            -0.215   
5          Brentford  Premier League            0.226            -0.210   
6       

In [16]:
def predict_match(home_team, away_team, trace, team_indices, n_sims=1000, random_seed=None):
    """Predict expected goals and 1X2 probabilities for a single match.

    The posterior samples of attack, defense and home advantage are turned into
    per-sample expected goals, averaged, and the match is then simulated with two
    independent Poisson draws per simulation.

    Args:
        home_team, away_team: Team names; both must be keys of ``team_indices``.
        trace: Sampling result exposing ``posterior['attack']``,
            ``posterior['defense']`` and ``posterior['home_advantage']``.
        team_indices: Mapping of team name to index on the last axis of the
            attack/defense samples.
        n_sims: Number of simulated matches.
        random_seed: Seed for a private random generator. With ``None`` the
            global ``np.random`` state is used, as before.

    Returns:
        Dict with ``home_xg``, ``away_xg`` (rounded to 2 decimals) and
        ``home_win_prob``, ``draw_prob``, ``away_win_prob`` in percent (1 decimal).

    Raises:
        KeyError: If a team is not in ``team_indices``.
    """
    home_idx = team_indices[home_team]
    away_idx = team_indices[away_team]

    # posterior samples of the strengths and of the home advantage
    attack_samples = trace.posterior["attack"].values
    defense_samples = trace.posterior["defense"].values
    home_advantage = trace.posterior["home_advantage"].values

    # `...` keeps every leading (chain, draw) axis and selects one team on the last axis
    home_theta = np.exp(attack_samples[..., home_idx] -
                        defense_samples[..., away_idx] +
                        home_advantage)
    away_theta = np.exp(attack_samples[..., away_idx] -
                        defense_samples[..., home_idx])

    # calculate mean expected goals from above samples
    home_xg = float(home_theta.mean())
    away_xg = float(away_theta.mean())

    # simulate match many times using Poisson distribution
    rng = np.random if random_seed is None else np.random.RandomState(random_seed)
    home_goals = rng.poisson(home_xg, n_sims)
    away_goals = rng.poisson(away_xg, n_sims)

    # Calculate match outcome probabilities
    home_wins = np.mean(home_goals > away_goals)
    draws = np.mean(home_goals == away_goals)
    away_wins = np.mean(home_goals < away_goals)

    return {
        'home_xg': round(home_xg, 2),
        'away_xg': round(away_xg, 2),
        'home_win_prob': round(home_wins * 100, 1),
        'draw_prob': round(draws * 100, 1),
        'away_win_prob': round(away_wins * 100, 1)
    }

def print_prediction(home_team, away_team, prediction):
    """Print a five-line, human-readable summary of a ``predict_match`` result."""
    report_lines = [
        f"\nMatch Prediction: {home_team} (H) vs {away_team} (A)",
        f"Expected Goals: {home_team} {prediction['home_xg']} - {prediction['away_xg']} {away_team}",
        f"Win Probability: {home_team}: {prediction['home_win_prob']}%",
        f"Draw Probability: {prediction['draw_prob']}%",
        f"Win Probability: {away_team}: {prediction['away_win_prob']}%",
    ]
    for report_line in report_lines:
        print(report_line)



# Example fixture: Leicester at home against Brentford
home_side, away_side = "Leicester", "Brentford"
prediction = predict_match(home_side, away_side, trace, team_indices)
print_prediction(home_side, away_side, prediction)


Match Prediction: Leicester (H) vs Brentford (A)
Expected Goals: Leicester 1.44 - 1.82 Brentford
Win Probability: Leicester: 30.0%
Draw Probability: 23.0%
Win Probability: Brentford: 47.0%


In [10]:
def predict_asian_handicap(home_team, away_team, trace, team_indices, handicaps=None, n_sims=50000, vig=0.05, random_seed=42):
    """
    Predict Asian handicap odds where favorite always gets minus handicap

    Matches are simulated by drawing posterior samples of the expected goals and
    then Poisson goals for each side. The favorite is the side with the higher
    mean expected goals; negative handicaps are priced for the favorite and
    non-negative ones for the underdog. Pushes count as half a win, and quarter
    lines are the average of the two neighbouring half/whole lines.

    Args:
        home_team, away_team: Team names; both must be keys of ``team_indices``.
        trace: Sampling result exposing ``posterior['attack']``,
            ``posterior['defense']`` and ``posterior['home_advantage']``.
        team_indices: Mapping of team name to index on the last axis of the
            attack/defense samples.
        handicaps: Lines to price; defaults to -3.0 ... 3.0 in steps of 0.25.
        n_sims: Number of simulated matches.
        vig: Margin; every price is ``sqrt(1 + vig) / prob``.
        random_seed: Seed of the private random generator used for the simulation.

    Returns:
        DataFrame with columns ``line``, ``odds`` and ``prob``, sorted by ``line``
        (string order). It always contains the favorite's 0.0 line.
    """
    # Private generator: produces the same stream as np.random.seed(random_seed)
    # followed by the global functions, but leaves the global RNG state untouched.
    rng = np.random.RandomState(random_seed)

    home_idx = team_indices[home_team]
    away_idx = team_indices[away_team]

    # Get posterior samples
    posterior = trace.posterior
    attack_samples = posterior["attack"].values
    defense_samples = posterior["defense"].values
    home_advantage = float(posterior["home_advantage"].values.mean())

    # Team-specific samples: pick the team on the last axis, flatten chains and draws
    attack_home = attack_samples[..., home_idx].reshape(-1)
    attack_away = attack_samples[..., away_idx].reshape(-1)
    defense_home = defense_samples[..., home_idx].reshape(-1)
    defense_away = defense_samples[..., away_idx].reshape(-1)

    # Calculate expected goals
    home_theta = np.exp(attack_home - defense_away + home_advantage)
    away_theta = np.exp(attack_away - defense_home)

    # Mean expected goals decide who is the favorite (ties go to the home team)
    home_xg = float(home_theta.mean())
    away_xg = float(away_theta.mean())
    is_away_favorite = away_xg > home_xg
    if is_away_favorite:
        favorite, underdog = away_team, home_team
    else:
        favorite, underdog = home_team, away_team

    # Simulate matches: one posterior sample per simulated match
    sample_indices = rng.randint(0, len(home_theta), size=n_sims)
    home_goals = rng.poisson(home_theta[sample_indices])
    away_goals = rng.poisson(away_theta[sample_indices])

    # Calculate goal difference from favorite's perspective
    goal_diff = away_goals - home_goals if is_away_favorite else home_goals - away_goals

    if handicaps is None:
        handicaps = [-3.0, -2.75, -2.5, -2.25, -2.0, -1.75, -1.5, -1.25, -1.0, -0.75,
                    -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0,
                    2.25, 2.5, 2.75, 3.0]

    def _half_win_count(line, for_favorite):
        """Number of simulations covering `line`, with pushes counted as half a win."""
        if for_favorite:
            # Favorite -X: win by more than X, push on exactly X
            wins = np.sum(goal_diff > -line)
            pushes = np.sum(goal_diff == -line)
        else:
            # Underdog +X: favorite wins by less than X, push on exactly X
            wins = np.sum(goal_diff < line)
            pushes = np.sum(goal_diff == line)
        return wins + 0.5 * pushes

    def _line_probability(handicap, for_favorite):
        """Cover probability of a line; quarter lines average their two neighbours."""
        if handicap % 0.5 == 0.25:
            lower = np.floor(handicap * 2) / 2
            upper = np.ceil(handicap * 2) / 2
            return (_half_win_count(lower, for_favorite) + _half_win_count(upper, for_favorite)) / (2 * n_sims)
        return _half_win_count(handicap, for_favorite) / n_sims

    # NOTE(review): the factor multiplies the fair odds 1/prob, so the quoted prices are
    # longer than fair — confirm this is the intended direction for `vig`.
    margin_factor = np.sqrt(1 + vig)

    def _price(prob):
        """Odds including the margin; 999.99 for lines with at most 1% probability."""
        return round(margin_factor / prob, 2) if prob > 0.01 else 999.99

    results = []

    # Add favorite 0.0 line first
    prob = _line_probability(0.0, True)
    results.append({
        "line": f"{favorite} 0.0",
        "odds": _price(prob),
        "prob": prob
    })

    for handicap in handicaps:
        if handicap < 0:
            # For favorite lines
            prob = _line_probability(handicap, True)
            line = f"{favorite} {handicap}"
        else:
            # For underdog lines
            prob = _line_probability(handicap, False)
            line = f"{underdog} +{handicap}" if handicap > 0 else f"{underdog} +0.0"

        results.append({
            "line": line,
            "odds": _price(prob),
            "prob": prob
        })

    return pd.DataFrame(results).sort_values('line')
    

# Price the Crystal Palace v Everton lines with a fixed random seed for reproducibility
odds = predict_asian_handicap(
    "Crystal Palace",
    "Everton",
    trace,
    team_indices,
    vig=0.05,
    random_seed=26,
)

# Print results with probabilities for verification
shown_columns = ['line', 'odds', 'prob']
print(odds[shown_columns].to_string(index=False))

                line  odds     prob
Crystal Palace -0.25  1.75 0.587155
 Crystal Palace -0.5  1.96 0.523580
Crystal Palace -0.75  2.21 0.462630
 Crystal Palace -1.0  2.55 0.401680
Crystal Palace -1.25  3.01 0.340730
 Crystal Palace -1.5  3.66 0.279780
Crystal Palace -1.75  4.29 0.239105
 Crystal Palace -2.0  5.16 0.198430
Crystal Palace -2.25  6.50 0.157755
 Crystal Palace -2.5  8.75 0.117080
Crystal Palace -2.75 10.42 0.098365
 Crystal Palace -3.0 12.86 0.079650
  Crystal Palace 0.0  1.57 0.650730
        Everton +0.0  2.93 0.349270
       Everton +0.25  2.48 0.412845
        Everton +0.5  2.15 0.476420
       Everton +0.75  1.91 0.537370
        Everton +1.0  1.71 0.598320
       Everton +1.25  1.55 0.659270
        Everton +1.5  1.42 0.720220
       Everton +1.75  1.35 0.760895
        Everton +2.0  1.28 0.801570
       Everton +2.25  1.22 0.842245
        Everton +2.5  1.16 0.882920
       Everton +2.75  1.14 0.901635
        Everton +3.0  1.11 0.920350


In [40]:
def predict_asian_handicap_from_xg(home_team, away_team, home_xg, away_xg, requested_lines=None, handicaps=None, n_sims=50000, random_seed=42):
    """Fair Asian handicap probabilities from two expected-goals values.

    Both sides' goals are simulated as independent Poisson draws. The favorite is
    the side with the higher xG (ties go to the home team). Every line is priced
    as a pair: the favorite at ``-X`` and the underdog at ``+X``, whose
    probabilities sum to 1. Pushes count as half a win and quarter lines are the
    average of their two neighbouring lines.

    Args:
        home_team, away_team: Labels used in the ``line`` column.
        home_xg, away_xg: Poisson means for the two sides.
        requested_lines: Optional collection of handicaps to keep. A line given
            with either sign yields its favorite/underdog pair once.
        handicaps: Candidate lines; defaults to -3.0 ... 3.0 in steps of 0.25.
        n_sims: Number of simulated matches.
        random_seed: Seed of the private random generator used for the simulation.

    Returns:
        DataFrame with columns ``line`` and ``fair_prob`` sorted by ``line``
        (string order); empty, with the same columns, when no line is selected.
    """
    # Private generator: same stream as np.random.seed(random_seed) plus the global
    # functions, without changing the global RNG state.
    rng = np.random.RandomState(random_seed)

    # Determine favorite and underdog based on xG
    is_away_favorite = away_xg > home_xg
    if is_away_favorite:
        favorite, underdog = away_team, home_team
    else:
        favorite, underdog = home_team, away_team

    # Simulate matches
    home_goals = rng.poisson(home_xg, size=n_sims)
    away_goals = rng.poisson(away_xg, size=n_sims)

    # Calculate goal difference from favorite's perspective
    goal_diff = away_goals - home_goals if is_away_favorite else home_goals - away_goals

    if handicaps is None:
        handicaps = [-3.0, -2.75, -2.5, -2.25, -2.0, -1.75, -1.5, -1.25, -1.0, -0.75,
                    -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0,
                    2.25, 2.5, 2.75, 3.0]

    # Filter handicaps if requested_lines is provided
    if requested_lines is not None:
        handicaps = [h for h in handicaps if h in requested_lines]

    def _half_win_count(line):
        """Simulations in which the favorite covers `line` (<= 0); pushes count half."""
        return np.sum(goal_diff > -line) + 0.5 * np.sum(goal_diff == -line)

    results = []
    processed_handicaps = set()

    # Handle 0.0 lines if requested
    if requested_lines is None or 0.0 in requested_lines:
        fav_wins = np.sum(goal_diff > 0)
        fav_draws = np.sum(goal_diff == 0)
        dog_wins = np.sum(goal_diff < 0)

        fav_prob = (fav_wins + 0.5 * fav_draws) / n_sims
        dog_prob = (dog_wins + 0.5 * fav_draws) / n_sims

        results.extend([
            {"line": f"{favorite} 0.0", "fair_prob": fav_prob},
            {"line": f"{underdog} 0.0", "fair_prob": dog_prob},
        ])
        processed_handicaps.add(0.0)

    # Process remaining handicaps
    for handicap in handicaps:
        # Each magnitude is priced once, from the favorite's (negative) side; a
        # positive line is the underdog's view of that same pair.
        fav_line = -abs(handicap)
        if fav_line in processed_handicaps:
            continue

        if fav_line % 0.5 == 0.25:
            # Quarter line: average of the two neighbouring lines
            lower = np.floor(fav_line * 2) / 2
            upper = np.ceil(fav_line * 2) / 2
            fav_prob = (_half_win_count(lower) + _half_win_count(upper)) / (2 * n_sims)
        else:
            # Whole and half lines
            fav_prob = _half_win_count(fav_line) / n_sims
        dog_prob = 1 - fav_prob  # Complementary probability

        results.extend([
            {"line": f"{favorite} {fav_line}", "fair_prob": fav_prob},
            {"line": f"{underdog} +{abs(fav_line)}", "fair_prob": dog_prob},
        ])

        processed_handicaps.add(fav_line)
        processed_handicaps.add(-fav_line)

    # sort_values('line') would fail on a frame without columns
    if not results:
        return pd.DataFrame(columns=["line", "fair_prob"])

    return pd.DataFrame(results).sort_values('line')

# Two xG inputs per side (named elevenify and afpl), blended 65% / 35%
home_elevenify, away_elevenify = 1.59, 2.01
home_afpl, away_afpl = 1.61, 2.14

home_xg = (0.65 * home_elevenify) + (0.35 * home_afpl)
away_xg = (0.65 * away_elevenify) + (0.35 * away_afpl)

# NOTE(review): the blended home_xg / away_xg above are not passed to this call, which
# prices the lines from the literals 0.90 / 1.0 instead — confirm which inputs are intended.
odds = predict_asian_handicap_from_xg(
    "PRE",
    "BUR",
    home_xg=0.90,
    away_xg=1.0,
    requested_lines=[-0.25, 0.25],
)

print(odds)

        line  fair_prob
0  BUR -0.25    0.44929
1  PRE +0.25    0.55071


In [42]:
def predict_goal_line_from_xg(home_xg, away_xg, requested_lines=None, n_sims=50000, random_seed=42):
    """
    Predict Asian total (goal line) probabilities using expected goals.

    Home and away goals are simulated as independent Poisson draws with means
    ``home_xg`` and ``away_xg``; every requested line is priced from the
    simulated total goals.

    Parameters
    ----------
    home_xg, away_xg : float
        Poisson means used for the home / away goal counts.
    requested_lines : list, optional
        Lines to price. Each entry is either
        - a single line, given as a string or a number (e.g. ``"2.5"``, ``2.0``), or
        - a split line, given as a ``"lower, upper"`` string (e.g. ``"2.0, 2.5"``),
          where half the stake is placed on each leg.
        Defaults to the eight split lines from ``"0.5, 1.0"`` to ``"4.0, 4.5"``.
    n_sims : int
        Number of simulated matches.
    random_seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        global legacy RNG as a side effect.

    Returns
    -------
    pandas.DataFrame
        Two rows (Over / Under) per requested line, sorted by the ``line``
        label, with columns:
        - ``line``: ``"Over <line>"`` / ``"Under <line>"``.
        - ``fair_prob``: win probability plus half of the push probability of
          each leg, so Over and Under always sum to 1.
        - ``push_prob``: for a single line, P(total == line); for a split
          line, ``0.5 * (0.5 * P(total == lower) + 0.5 * P(total == upper))``.
        An empty ``requested_lines`` yields an empty frame with these columns.
    """
    np.random.seed(random_seed)

    # Simulate matches
    home_goals = np.random.poisson(home_xg, size=n_sims)
    away_goals = np.random.poisson(away_xg, size=n_sims)
    total_goals = home_goals + away_goals

    def single_line_probs(line):
        """Return (over, under, push) for one leg; a push counts half towards each side."""
        over_wins = np.sum(total_goals > line)
        pushes = np.sum(total_goals == line)  # always 0 for half-goal lines
        under_wins = np.sum(total_goals < line)

        push_prob = pushes / n_sims
        over_prob = (over_wins + 0.5 * pushes) / n_sims
        under_prob = (under_wins + 0.5 * pushes) / n_sims
        return over_prob, under_prob, push_prob

    def split_line_probs(lower_line, upper_line):
        """
        Return (over, under, push) for a split line: half the stake on each leg.

        Both legs are priced with the same rule, so a push on EITHER leg is
        accounted for, whichever of the two is the whole-number line.
        """
        low_over, low_under, low_push = single_line_probs(lower_line)
        up_over, up_under, up_push = single_line_probs(upper_line)

        over_prob = 0.5 * low_over + 0.5 * up_over
        under_prob = 0.5 * low_under + 0.5 * up_under
        push_prob = 0.5 * (0.5 * low_push + 0.5 * up_push)
        return over_prob, under_prob, push_prob

    if requested_lines is None:
        requested_lines = ["0.5, 1.0", "1.0, 1.5", "1.5, 2.0", "2.0, 2.5",
                           "2.5, 3.0", "3.0, 3.5", "3.5, 4.0", "4.0, 4.5"]

    results = []

    for line in requested_lines:
        # Only strings can describe a split line; numeric entries are single lines.
        if isinstance(line, str) and ',' in line:
            # Handle split lines (e.g., "2.5, 3.0")
            lower_line, upper_line = map(float, line.split(','))
            over_prob, under_prob, push_prob = split_line_probs(lower_line, upper_line)
            label = f"{lower_line}, {upper_line}"
        else:
            # Handle single lines (e.g., "2.5" or 2.0)
            line = float(line)
            over_prob, under_prob, push_prob = single_line_probs(line)
            label = f"{line}"

        results.extend([
            {
                "line": f"Over {label}",
                "fair_prob": over_prob,
                "push_prob": push_prob
            },
            {
                "line": f"Under {label}",
                "fair_prob": under_prob,
                "push_prob": push_prob
            }
        ])

    # Explicit columns keep sort_values('line') valid when nothing was requested.
    return pd.DataFrame(results, columns=["line", "fair_prob", "push_prob"]).sort_values('line')

# Example usage:
# Per-team xG inputs from two sources.
# NOTE(review): "elevenify" / "afpl" are presumably the names of two external
# xG providers -- confirm.
home_elevenify = 1.51
away_elevenify = 1.06

home_afpl = 1.45
away_afpl = 1.1

# Weighted blend of the two sources: 65% elevenify, 35% afpl.
home_xg = (0.65*home_elevenify) + (0.35*home_afpl)
away_xg = (0.65*away_elevenify) + (0.35*away_afpl) 

# Price the single whole-number goal line 2.0 (Over / Under).
# NOTE(review): the call passes literal xG values (0.90 / 1.0), NOT the blended
# home_xg / away_xg computed above -- confirm whether the override is intentional.
odds = predict_goal_line_from_xg( home_xg=0.90, away_xg=1.0, requested_lines=["2.0"])
print(odds)

        line  fair_prob  push_prob
0   Over 2.0    0.43313    0.26958
1  Under 2.0    0.56687    0.26958


In [45]:
def kelly_expected(probability, decimal_odds, fractiont = 1.0):
    """
    Compute the (fractional) Kelly stake and the expected value of a bet.

    Parameters
    ----------
    probability : float
        Estimated probability of winning the bet; must lie strictly in (0, 1).
    decimal_odds : float
        Decimal odds offered; must be greater than 1.
    fractiont : float, default 1.0
        Multiplier applied to the full Kelly stake (e.g. 0.5 for half Kelly).
        The parameter keeps its existing spelling so keyword callers still work.

    Returns
    -------
    tuple
        ``(kelly, ev)`` where ``kelly`` is the stake as a fraction of bankroll
        (floored at 0) and ``ev`` is the expected profit per unit staked, in
        percent. Returns ``(0, 0)`` when the inputs are outside the ranges above.
    """
    if decimal_odds <= 1 or probability <= 0 or probability >= 1:
        return 0, 0

    net_odds = decimal_odds - 1   # profit per unit staked on a win
    q = 1 - probability           # probability of losing

    # Expected profit per unit staked: win rate * profit minus probability of losing.
    edge = probability * net_odds - q

    # Kelly Criterion: edge divided by the potential profit.
    kelly = edge / net_odds
    # Apply fractional Kelly using the function's own parameter rather than a
    # same-named notebook global, so the function is self-contained.
    kelly = kelly * fractiont
    kelly = max(0, kelly)  # No negative bets

    # Expected Value, converted to a percentage
    ev = edge * 100

    return kelly, ev

# Example: size a bet with kelly_expected.
# NOTE(review): 0.56687 equals the "Under 2.0" fair_prob printed by the
# goal-line example -- presumably copied by hand; confirm.
prob =  0.56687
# Decimal price passed as `decimal_odds`. This rebinds the notebook-level name
# `odds`, which earlier cells use for a DataFrame of line probabilities.
odds =  1.83
# Multiplier for the Kelly stake, passed as the third positional argument.
fraction = 1

bet_size, ev = kelly_expected(prob, odds, fraction)
print(f"Optimal bet size: {bet_size:.1%}")
print(f"Expected Value: {ev:.1f}%")

Optimal bet size: 4.5%
Expected Value: 3.7%
