In [2]:
import pandas as pd
import sqlite3

db_path = r'/Users/admin/dev/algobetting/infra/data/db/fotmob.db'
conn = sqlite3.connect(db_path)

# Load match data
stats_df = pd.read_sql_query("""
    SELECT 
        *,
        home.team_name as home_team_name,
        away.team_name as away_team_name              
    FROM match_stats ms
        LEFT JOIN team_id_mapping home ON home.team_id = ms.home_team
        LEFT JOIN team_id_mapping away ON away.team_id = ms.away_team
    WHERE
       league_id IN ('Premier_League')
    ORDER BY match_date ASC
""", conn)

stats_df

Unnamed: 0,match_id,home_team,away_team,match_date,league_id,home_BallPossesion,away_BallPossesion,home_expected_goals,away_expected_goals,home_total_shots,...,away_yellow_cards,home_red_cards,away_red_cards,season,team_id,team_name,team_id.1,team_name.1,home_team_name,away_team_name
0,3609929,9937,9825,2021-08-13,Premier_League,35,65,1.24,1.28,8,...,0,0,0,2021-2022,9937,Brentford,9825,Arsenal,Brentford,Arsenal
1,3609936,9850,8650,2021-08-14,Premier_League,50,50,1.42,1.66,14,...,1,0,0,2021-2022,9850,Norwich,8650,Liverpool,Norwich,Liverpool
2,3609930,8191,10204,2021-08-14,Premier_League,36,64,1.45,1.05,14,...,1,0,0,2021-2022,8191,Burnley,10204,Brighton,Burnley,Brighton
3,3609931,8455,9826,2021-08-14,Premier_League,62,38,0.68,0.22,13,...,0,0,0,2021-2022,8455,Chelsea,9826,Crystal Palace,Chelsea,Crystal Palace
4,3609932,8668,8466,2021-08-14,Premier_League,48,52,2.42,0.85,14,...,0,0,0,2021-2022,8668,Everton,8466,Southampton,Everton,Southampton
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,4813580,8668,8602,2026-01-07,Premier_League,45,55,0.89,0.73,13,...,3,2,0,2025-2026,8668,Everton,8602,Wolves,Everton,Wolves
1346,4813583,10261,8463,2026-01-07,Premier_League,62,38,2.43,1.89,18,...,2,0,0,2025-2026,10261,Newcastle,8463,Leeds,Newcastle,Leeds
1347,4813577,9937,8472,2026-01-07,Premier_League,50,50,3.24,1.72,18,...,1,0,0,2025-2026,9937,Brentford,8472,Sunderland,Brentford,Sunderland
1348,4813582,8456,10204,2026-01-07,Premier_League,60,40,2.55,1.16,21,...,6,0,0,2025-2026,8456,Man City,10204,Brighton,Man City,Brighton


In [3]:
for col in stats_df.columns:
    print(col)

match_id
home_team
away_team
match_date
league_id
home_BallPossesion
away_BallPossesion
home_expected_goals
away_expected_goals
home_total_shots
away_total_shots
home_ShotsOnTarget
away_ShotsOnTarget
home_big_chance
away_big_chance
home_big_chance_missed_title
away_big_chance_missed_title
home_accurate_passes
home_accurate_passes_pct
away_accurate_passes
away_accurate_passes_pct
home_fouls
away_fouls
home_corners
away_corners
home_shots
away_shots
home_ShotsOffTarget
away_ShotsOffTarget
home_blocked_shots
away_blocked_shots
home_shots_woodwork
away_shots_woodwork
home_shots_inside_box
away_shots_inside_box
home_shots_outside_box
away_shots_outside_box
home_expected_goals_open_play
away_expected_goals_open_play
home_expected_goals_set_play
away_expected_goals_set_play
home_expected_goals_non_penalty
away_expected_goals_non_penalty
home_expected_goals_on_target
away_expected_goals_on_target
home_passes
away_passes
home_own_half_passes
away_own_half_passes
home_opposition_half_passes
away

In [4]:
def add_ema_features(df, home_col, away_col, span, feature_name):
    """
    Add EMA features for home and away teams based on a stat column.
    
    Parameters:
    -----------
    df : pd.DataFrame
        The main dataframe with match data
    home_col : str
        Column name for home team stat
    away_col : str
        Column name for away team stat
    span : int
        EMA span parameter
    feature_name : str
        Base name for the output columns (will add _home and _away suffixes)
    
    Returns:
    --------
    pd.DataFrame
        DataFrame with new EMA columns added
    """
    # Create a long format dataframe with all team performances
    home_stats = df[['match_date', 'home_team_name', home_col]].copy()
    home_stats.columns = ['match_date', 'team_name', 'stat_value']
    
    away_stats = df[['match_date', 'away_team_name', away_col]].copy()
    away_stats.columns = ['match_date', 'team_name', 'stat_value']
    
    # Combine home and away stats
    all_stats = pd.concat([home_stats, away_stats], ignore_index=True)
    all_stats = all_stats.sort_values(['team_name', 'match_date'])
    
    # Calculate EMA for each team
    all_stats['ema'] = all_stats.groupby('team_name')['stat_value'].transform(
        lambda x: x.ewm(span=span, adjust=False).mean()
    )
    
    # Shift to avoid data leakage
    all_stats['ema_shifted'] = all_stats.groupby('team_name')['ema'].shift(1)
    
    # Merge back for home team
    df = df.merge(
        all_stats[['match_date', 'team_name', 'ema_shifted']],
        left_on=['match_date', 'home_team_name'],
        right_on=['match_date', 'team_name'],
        how='left',
        suffixes=('', '_drop')
    )
    df = df.drop(columns=['team_name']).rename(columns={'ema_shifted': f'{feature_name}_home'})
    
    # Merge back for away team
    df = df.merge(
        all_stats[['match_date', 'team_name', 'ema_shifted']],
        left_on=['match_date', 'away_team_name'],
        right_on=['match_date', 'team_name'],
        how='left',
        suffixes=('', '_drop')
    )
    df = df.drop(columns=['team_name']).rename(columns={'ema_shifted': f'{feature_name}_away'})
    
    return df


# Usage example:
stats_df = add_ema_features(
    stats_df, 
    home_col='home_touches_opp_box',
    away_col='away_touches_opp_box',
    span=20,
    feature_name='tch_in_opp_box_ema20'
)

# Add multiple features easily:
stats_df = add_ema_features(stats_df, 'home_BallPossesion', 'away_BallPossesion', 20, 'poss_ema20')
stats_df = add_ema_features(stats_df, 'home_opposition_half_passes', 'away_opposition_half_passes', 20, 'opp_half_passes_ema20')

stats_df

Unnamed: 0,match_id,home_team,away_team,match_date,league_id,home_BallPossesion,away_BallPossesion,home_expected_goals,away_expected_goals,home_total_shots,...,team_id,home_team_name,away_team_name,team_name_drop,tch_in_opp_box_ema20_home,tch_in_opp_box_ema20_away,poss_ema20_home,poss_ema20_away,opp_half_passes_ema20_home,opp_half_passes_ema20_away
0,3609929,9937,9825,2021-08-13,Premier_League,35,65,1.24,1.28,8,...,9825,Brentford,Arsenal,Brentford,,,,,,
1,3609936,9850,8650,2021-08-14,Premier_League,50,50,1.42,1.66,14,...,8650,Norwich,Liverpool,Norwich,,,,,,
2,3609930,8191,10204,2021-08-14,Premier_League,36,64,1.45,1.05,14,...,10204,Burnley,Brighton,Burnley,,,,,,
3,3609931,8455,9826,2021-08-14,Premier_League,62,38,0.68,0.22,13,...,9826,Chelsea,Crystal Palace,Chelsea,,,,,,
4,3609932,8668,8466,2021-08-14,Premier_League,48,52,2.42,0.85,14,...,8466,Everton,Southampton,Everton,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1345,4813580,8668,8602,2026-01-07,Premier_League,45,55,0.89,0.73,13,...,8602,Everton,Wolves,Everton,23.545339,18.756032,41.811458,43.047892,151.076175,145.019893
1346,4813583,10261,8463,2026-01-07,Premier_League,62,38,2.43,1.89,18,...,8463,Newcastle,Leeds,Newcastle,27.586952,24.129101,53.559736,45.202809,199.017687,137.407472
1347,4813577,9937,8472,2026-01-07,Premier_League,50,50,3.24,1.72,18,...,8472,Brentford,Sunderland,Brentford,23.698790,18.669755,48.462897,41.204902,139.998168,128.383305
1348,4813582,8456,10204,2026-01-07,Premier_League,60,40,2.55,1.16,21,...,10204,Man City,Brighton,Man City,36.678596,28.614638,60.659827,55.060921,300.038576,191.450014


In [5]:
feature_df = stats_df[['match_id', 'match_date', 'home_team_name', 'tch_in_opp_box_ema20_home', 'poss_ema20_home', 'opp_half_passes_ema20_home',
                       'away_team_name', 'tch_in_opp_box_ema20_away', 'poss_ema20_away', 'opp_half_passes_ema20_away']]

feature_df.tail(8)

Unnamed: 0,match_id,match_date,home_team_name,tch_in_opp_box_ema20_home,poss_ema20_home,opp_half_passes_ema20_home,away_team_name,tch_in_opp_box_ema20_away,poss_ema20_away,opp_half_passes_ema20_away
1342,4813578,2026-01-07,Burnley,19.983641,44.388576,150.535606,Man United,28.454536,53.177435,192.477679
1343,4813575,2026-01-07,Bournemouth,27.093658,49.137205,171.048766,Tottenham,23.353782,50.432294,170.712473
1344,4813579,2026-01-07,Crystal Palace,26.120076,43.792561,139.90938,Aston Villa,25.235989,52.055682,169.183085
1345,4813580,2026-01-07,Everton,23.545339,41.811458,151.076175,Wolves,18.756032,43.047892,145.019893
1346,4813583,2026-01-07,Newcastle,27.586952,53.559736,199.017687,Leeds,24.129101,45.202809,137.407472
1347,4813577,2026-01-07,Brentford,23.69879,48.462897,139.998168,Sunderland,18.669755,41.204902,128.383305
1348,4813582,2026-01-07,Man City,36.678596,60.659827,300.038576,Brighton,28.614638,55.060921,191.450014
1349,4813576,2026-01-08,Arsenal,36.937289,57.879737,227.124945,Liverpool,34.650167,61.61927,297.542932


In [6]:
import pandas as pd
import sqlite3

db_path = r'/Users/admin/dev/algobetting/infra/data/db/fotmob.db'
conn = sqlite3.connect(db_path)

# Load match data
pen_df = pd.read_sql_query("""
    SELECT 
        *,
        home.team_name as home_team_name,
        away.team_name as away_team_name              
    FROM penalties ms
        LEFT JOIN team_id_mapping home ON home.team_id = ms.home_team
        LEFT JOIN team_id_mapping away ON away.team_id = ms.away_team
    WHERE
       league_id IN ('Premier_League')
    ORDER BY match_date ASC
""", conn)

pen_df = pen_df[['match_id', 'home_pens', 'away_pens']]

pen_df

Unnamed: 0,match_id,home_pens,away_pens
0,3609929,0,0
1,3609936,0,0
2,3609930,0,0
3,3609931,0,0
4,3609932,0,0
...,...,...,...
1345,4813580,0,0
1346,4813583,1,1
1347,4813577,0,1
1348,4813582,1,0


In [7]:
# Join feature dataset to y dataset
df = feature_df.merge(
    pen_df,
    on='match_id',
    how='left'  # adjust as needed
)

df = df[df['match_date'] > '2022-01-01']

df.head()

Unnamed: 0,match_id,match_date,home_team_name,tch_in_opp_box_ema20_home,poss_ema20_home,opp_half_passes_ema20_home,away_team_name,tch_in_opp_box_ema20_away,poss_ema20_away,opp_half_passes_ema20_away,home_pens,away_pens
186,3610131,2022-01-02,Brentford,22.301708,44.086016,117.773833,Aston Villa,26.399369,46.05535,145.816372,0,0
187,3610135,2022-01-02,Leeds,22.262632,52.779236,130.968789,Burnley,22.171608,39.451501,124.390683,0,0
188,3610134,2022-01-02,Everton,17.776875,39.400219,125.900213,Brighton,25.705855,57.465257,185.298508,1,0
189,3610132,2022-01-02,Chelsea,32.410091,60.605745,289.155843,Liverpool,37.226127,61.684425,281.441487,0,0
190,3610137,2022-01-03,Man United,23.63651,52.752712,201.070162,Wolves,14.489611,45.469925,153.356864,0,0


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import statsmodels.api as sm
from scipy.stats import poisson
import numpy as np
import pandas as pd

# Stack home and away data
home_data = df[['tch_in_opp_box_ema20_home', 'opp_half_passes_ema20_home', 'home_pens']].copy()
home_data.columns = ['box_touches_ema', 'opp_half_passes_ema', 'pens']
home_data['is_home'] = 1

away_data = df[['tch_in_opp_box_ema20_away', 'opp_half_passes_ema20_away', 'away_pens']].copy()
away_data.columns = ['box_touches_ema', 'opp_half_passes_ema', 'pens']
away_data['is_home'] = 0

# Combine
all_data = pd.concat([home_data, away_data], ignore_index=True)

# Features and target
X = all_data[['box_touches_ema', 'is_home']]
y = all_data['pens']

# Remove missing values
mask = X.notna().all(axis=1) & y.notna()
X_clean = X[mask]
y_clean = y[mask]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42
)

# Add constant for intercept
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)

# Fit Poisson regression
poisson_model = sm.GLM(y_train, X_train_sm, family=sm.families.Poisson())
poisson_results = poisson_model.fit()

# Make predictions
y_pred = poisson_results.predict(X_test_sm)

# ============================================================================
# SPLIT BY HOME/AWAY FOR COMPARISON
# ============================================================================

# Separate test data into home and away
home_mask_test = X_test['is_home'] == 1
away_mask_test = X_test['is_home'] == 0

y_test_home = y_test[home_mask_test]
y_test_away = y_test[away_mask_test]

y_pred_home = y_pred[home_mask_test]
y_pred_away = y_pred[away_mask_test]

# Calculate baseline predictions (historical averages from training data)
home_mask_train = X_train['is_home'] == 1
away_mask_train = X_train['is_home'] == 0

baseline_home = y_train[home_mask_train].mean()
baseline_away = y_train[away_mask_train].mean()

# Baseline predictions for test set
y_pred_baseline_home = np.full(len(y_test_home), baseline_home)
y_pred_baseline_away = np.full(len(y_test_away), baseline_away)

# ============================================================================
# COMPARISON TABLE
# ============================================================================

print("=" * 80)
print("HOME vs AWAY PERFORMANCE COMPARISON")
print("=" * 80)

print("\n" + "-" * 80)
print("HOME TEAMS")
print("-" * 80)
print(f"Sample size: {len(y_test_home)} matches")
print(f"Actual mean: {y_test_home.mean():.4f}")
print(f"Actual std: {y_test_home.std():.4f}")
print(f"Zero rate: {(y_test_home == 0).mean() * 100:.1f}%")
print()
print(f"BASELINE (historical avg = {baseline_home:.4f}):")
print(f"  MAE: {mean_absolute_error(y_test_home, y_pred_baseline_home):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test_home, y_pred_baseline_home)):.4f}")
print()
print(f"POISSON REGRESSION:")
print(f"  Predicted mean: {y_pred_home.mean():.4f}")
print(f"  Predicted std: {y_pred_home.std():.4f}")
print(f"  MAE: {mean_absolute_error(y_test_home, y_pred_home):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test_home, y_pred_home)):.4f}")
print()
mae_improvement_home = mean_absolute_error(y_test_home, y_pred_baseline_home) - mean_absolute_error(y_test_home, y_pred_home)
pct_improvement_home = (mae_improvement_home / mean_absolute_error(y_test_home, y_pred_baseline_home)) * 100
print(f"IMPROVEMENT: {mae_improvement_home:+.4f} MAE ({pct_improvement_home:+.1f}%)")

print("\n" + "-" * 80)
print("AWAY TEAMS")
print("-" * 80)
print(f"Sample size: {len(y_test_away)} matches")
print(f"Actual mean: {y_test_away.mean():.4f}")
print(f"Actual std: {y_test_away.std():.4f}")
print(f"Zero rate: {(y_test_away == 0).mean() * 100:.1f}%")
print()
print(f"BASELINE (historical avg = {baseline_away:.4f}):")
print(f"  MAE: {mean_absolute_error(y_test_away, y_pred_baseline_away):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test_away, y_pred_baseline_away)):.4f}")
print()
print(f"POISSON REGRESSION:")
print(f"  Predicted mean: {y_pred_away.mean():.4f}")
print(f"  Predicted std: {y_pred_away.std():.4f}")
print(f"  MAE: {mean_absolute_error(y_test_away, y_pred_away):.4f}")
print(f"  RMSE: {np.sqrt(mean_squared_error(y_test_away, y_pred_away)):.4f}")
print()
mae_improvement_away = mean_absolute_error(y_test_away, y_pred_baseline_away) - mean_absolute_error(y_test_away, y_pred_away)
pct_improvement_away = (mae_improvement_away / mean_absolute_error(y_test_away, y_pred_baseline_away)) * 100
print(f"IMPROVEMENT: {mae_improvement_away:+.4f} MAE ({pct_improvement_away:+.1f}%)")

print("\n" + "=" * 80)
print("OVERALL COMPARISON")
print("=" * 80)

# Overall metrics
mae_baseline_overall = mean_absolute_error(y_test, 
    np.concatenate([y_pred_baseline_home, y_pred_baseline_away]))
mae_poisson_overall = mean_absolute_error(y_test, y_pred)

print(f"\nCOMBINED (Home + Away):")
print(f"  Baseline MAE: {mae_baseline_overall:.4f}")
print(f"  Poisson MAE: {mae_poisson_overall:.4f}")
print(f"  Improvement: {mae_baseline_overall - mae_poisson_overall:+.4f} ({(mae_baseline_overall - mae_poisson_overall)/mae_baseline_overall * 100:+.1f}%)")

print("\n" + "-" * 80)
print("VERDICT")
print("-" * 80)

if abs(pct_improvement_home) < 5 and abs(pct_improvement_away) < 5:
    print("⚠️  Both home and away show <5% improvement")
    print("   → Poisson model adds minimal value over baseline")
    print("   → Consider using simple baseline for predictions")
elif pct_improvement_home > 5 and pct_improvement_away < 5:
    print("✓  Model helps for HOME teams but not AWAY")
    print("   → Consider separate models or hybrid approach")
elif pct_improvement_home < 5 and pct_improvement_away > 5:
    print("✓  Model helps for AWAY teams but not HOME")
    print("   → Consider separate models or hybrid approach")
else:
    print("✓  Model improves predictions for both home and away")
    print("   → Poisson regression is worthwhile")

# Show coefficient interpretation
print("\n" + "=" * 80)
print("MODEL COEFFICIENTS")
print("=" * 80)
print(poisson_results.summary().tables[1])

print("\nInterpretation:")
home_effect = np.exp(poisson_results.params['is_home']) - 1
print(f"  is_home coefficient: Home teams get {home_effect*100:+.1f}% more penalties")
print(f"  box_touches_ema coefficient: Each additional touch increases penalty rate by {(np.exp(poisson_results.params['box_touches_ema']) - 1)*100:.2f}%")

HOME vs AWAY PERFORMANCE COMPARISON

--------------------------------------------------------------------------------
HOME TEAMS
--------------------------------------------------------------------------------
Sample size: 230 matches
Actual mean: 0.1609
Actual std: 0.3912
Zero rate: 84.8%

BASELINE (historical avg = 0.1571):
  MAE: 0.2702
  RMSE: 0.3904

POISSON REGRESSION:
  Predicted mean: 0.1566
  Predicted std: 0.0443
  MAE: 0.2664
  RMSE: 0.3894

IMPROVEMENT: +0.0037 MAE (+1.4%)

--------------------------------------------------------------------------------
AWAY TEAMS
--------------------------------------------------------------------------------
Sample size: 232 matches
Actual mean: 0.1293
Actual std: 0.3729
Zero rate: 88.4%

BASELINE (historical avg = 0.1008):
  MAE: 0.2066
  RMSE: 0.3732

POISSON REGRESSION:
  Predicted mean: 0.0972
  Predicted std: 0.0258
  MAE: 0.2039
  RMSE: 0.3744

IMPROVEMENT: +0.0027 MAE (+1.3%)

OVERALL COMPARISON

COMBINED (Home + Away):
  Baseline 