In [11]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.special import factorial
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
import sqlite3

# NOTE(review): absolute, machine-specific path — the notebook will not run on
# another machine as-is; consider moving it to a configurable location.
conn = sqlite3.connect(r'C:\Users\Owner\dev\algobetting\infra\data\db\algobetting.db')

# Pull Premier League rows that have both the team's and the opponent's xG
# populated (presumably one row per team per match — confirm against the schema).
# The try/finally guarantees the connection is closed even if the query raises;
# sqlite3's own `with conn:` only manages transactions and does not close.
try:
    df = pd.read_sql_query("""
                SELECT 
                    team,
                    summary_xg as xg,
                    summary_goals as goals,
                    keeper_psxg as psxg,
                    match_date as date
                FROM fbref_match_all_columns
                WHERE division = 'Premier League'
                    AND summary_xg IS NOT NULL
                    AND opp_summary_xg IS NOT NULL
                       """, conn)
finally:
    conn.close()

df

Unnamed: 0,team,xg,goals,psxg,date
0,Tottenham,2.0,1.0,1.5,2025-05-25
1,Brighton,2.2,4.0,4.1,2025-05-25
2,Bournemouth,1.6,2.0,2.1,2025-05-25
3,Leicester City,0.3,0.0,0.0,2025-05-25
4,Newcastle Utd,1.2,0.0,1.7,2025-05-25
...,...,...,...,...,...
6071,Leeds United,0.3,3.0,0.5,2020-09-12
6072,Crystal Palace,1.1,1.0,0.8,2020-09-12
6073,Southampton,0.9,0.0,0.4,2020-09-12
6074,Fulham,0.1,0.0,0.1,2020-09-12


In [18]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

def match_level_correlation(df, games_per_period=20):
    """
    Test how well one period's goals, xG and PSxG track the next period's goals.

    Each team's matches are sorted by ``date`` and cut into consecutive,
    non-overlapping windows of ``games_per_period`` matches. Every pair of
    adjacent windows produces one row holding the per-game means of goals, xG
    and PSxG in the earlier window and the per-game mean of goals in the later
    window. Pearson correlations over all rows are then printed.

    Parameters
    ----------
    df : DataFrame with columns [team, date, goals, xg, psxg]
    games_per_period : int
        Number of matches in each window. A team with N matches contributes
        max(N // games_per_period - 1, 0) rows.

    Returns
    -------
    DataFrame with columns [team, p1_goals_per_game, p1_xg_per_game,
    p1_psxg_per_game, p2_goals_per_game]. If fewer than 30 rows are produced,
    a warning is printed and the frame is returned without correlations.

    Notes
    -----
    For a given team the "future" window of one row is the "past" window of
    the next row, so the rows are not independent samples.
    """

    results = []

    for team in df['team'].unique():
        team_data = df[df['team'] == team].sort_values('date')

        # `i` is the boundary between the earlier and later window. The stop
        # bound is exclusive, so `+ 1` is needed to keep the final complete
        # pair when the match count is an exact multiple of games_per_period
        # (e.g. 40 matches with 20 per period must yield one pair, not zero).
        last_boundary = len(team_data) - games_per_period
        for i in range(games_per_period, last_boundary + 1, games_per_period):
            period1 = team_data.iloc[i-games_per_period:i]
            period2 = team_data.iloc[i:i+games_per_period]

            results.append({
                'team': team,
                'p1_goals_per_game': period1['goals'].mean(),
                'p1_xg_per_game': period1['xg'].mean(),
                'p1_psxg_per_game': period1['psxg'].mean(),
                'p2_goals_per_game': period2['goals'].mean()
            })

    analysis_df = pd.DataFrame(results)

    # Too few window pairs for the correlations to be meaningful; hand back
    # whatever was collected so the caller can still inspect it.
    if len(analysis_df) < 30:
        print(f"Warning: Only {len(analysis_df)} periods found. Need more data.")
        return analysis_df

    # Correlate each earlier-window metric with later-window goals per game.
    goal_corr = pearsonr(analysis_df['p1_goals_per_game'], analysis_df['p2_goals_per_game'])[0]
    xg_corr = pearsonr(analysis_df['p1_xg_per_game'], analysis_df['p2_goals_per_game'])[0]
    psxg_corr = pearsonr(analysis_df['p1_psxg_per_game'], analysis_df['p2_goals_per_game'])[0]

    print(f"Goals -> Future Goals: {goal_corr:.3f}")
    print(f"xG -> Future Goals: {xg_corr:.3f}")
    print(f"PSxG -> Future Goals: {psxg_corr:.3f}")
    print(f"\nPSxG beats xG by: {psxg_corr - xg_corr:.3f}")
    print(f"Sample size: {len(analysis_df)} periods")

    return analysis_df

# Run the analysis on the loaded matches with the default window length; the
# returned per-period frame is the cell's last expression, so it is displayed.
match_level_correlation(df, games_per_period=20)

Goals -> Future Goals: 0.734
xG -> Future Goals: 0.745
PSxG -> Future Goals: 0.741

PSxG beats xG by: -0.004
Sample size: 258 periods


Unnamed: 0,team,p1_goals_per_game,p1_xg_per_game,p1_psxg_per_game,p2_goals_per_game
0,Tottenham,1.95,1.720,1.815,2.00
1,Tottenham,2.00,1.815,2.035,2.05
2,Tottenham,2.05,1.435,1.660,1.40
3,Tottenham,1.40,1.405,1.255,1.55
4,Tottenham,1.55,1.310,1.315,1.95
...,...,...,...,...,...
253,Norwich City,0.25,0.965,0.660,0.65
254,Leeds United,1.75,1.565,1.715,1.50
255,Leeds United,1.50,1.330,1.415,1.20
256,Leeds United,1.20,1.275,1.180,1.10
