In [19]:
import sqlite3 
import pandas as pd

# Absolute path to the local SQLite database that contains the
# model_v0_salary_regression table queried below.
# NOTE(review): this path is machine-specific; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
db_path  = r'C:\Users\Owner\dev\algobetting\infra\data\db\fotmob.db'

# Load the whole salary-regression table into a DataFrame. The connection is
# closed explicitly as soon as the read finishes (or fails): sqlite3
# connections are not closed automatically, so leaving it open keeps a handle
# on the database file for the lifetime of the kernel.
# NOTE(review): `conn` is left in the namespace but closed; any later cell that
# needs to query the database must open its own connection.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query("""

    SELECT * FROM model_v0_salary_regression

    """, conn)
finally:
    conn.close()

# Rescale `salary(infl)` by 1e6 so downstream cells work in millions.
df['salary_millions'] = df['salary(infl)'] / 1000000


df

Unnamed: 0,team,mean,salary(infl),year,type,salary_millions
0,Arsenal,0.217,126000000,2021,attack,126.0
1,Aston Villa,-0.080,91000000,2021,attack,91.0
2,Brentford,-0.059,30500000,2021,attack,30.5
3,Brighton,-0.118,53000000,2021,attack,53.0
4,Burnley,-0.251,43600000,2021,attack,43.6
...,...,...,...,...,...,...
155,Nottm Forest,-0.075,66000000,2024,defence,66.0
156,Southampton,0.425,54100000,2024,defence,54.1
157,Tottenham,0.286,121100000,2024,defence,121.1
158,West Ham,0.142,103100000,2024,defence,103.1


In [20]:
# Attack subset: keep only the rows whose `type` is "attack".
att_df = df.loc[df["type"].eq("attack")]

# Pairwise correlation between `mean` and salary (in millions) for that subset.
att_corr = att_df.loc[:, ['mean', 'salary_millions']].corr()

att_corr


Unnamed: 0,mean,salary_millions
mean,1.0,0.587915
salary_millions,0.587915,1.0


In [None]:
# Defence subset: keep only the rows whose `type` is "defence".
def_df = df.loc[df["type"].eq("defence")]

# Pairwise correlation between `mean` and salary (in millions) for that subset.
def_corr = def_df.loc[:, ['mean', 'salary_millions']].corr()

def_corr

Unnamed: 0,team,mean,salary(infl),year,type,salary_millions
20,Arsenal,-0.156,126000000,2021,defence,126.0
21,Aston Villa,-0.050,91000000,2021,defence,91.0
22,Brentford,-0.011,30500000,2021,defence,30.5
23,Brighton,-0.172,53000000,2021,defence,53.0
24,Burnley,0.053,43600000,2021,defence,43.6
...,...,...,...,...,...,...
155,Nottm Forest,-0.075,66000000,2024,defence,66.0
156,Southampton,0.425,54100000,2024,defence,54.1
157,Tottenham,0.286,121100000,2024,defence,121.1
158,West Ham,0.142,103100000,2024,defence,103.1


In [27]:
import statsmodels.api as sm
import numpy as np

def _fit_strength_on_salary(team_df):
    """Fit one OLS regression of `mean` on `salary_millions` with an intercept."""
    # has_constant='add' always inserts the 'const' column. The default
    # ('skip') leaves it out when the salary column is itself constant, and
    # predict_team_strength needs params['const'] to exist.
    design = sm.add_constant(team_df[['salary_millions']], has_constant='add')
    response = team_df['mean']  # Strength is the dependent variable
    return sm.OLS(response, design).fit()


def fit_strength_salary_models(att_df, def_df):
    """Fit models to predict strength from salary for both team types.

    Each frame is regressed independently as ``mean ~ const + salary_millions``
    using ordinary least squares.

    Parameters
    ----------
    att_df : pandas.DataFrame
        Attack rows; must contain the columns ``mean`` and ``salary_millions``.
    def_df : pandas.DataFrame
        Defence rows; must contain the columns ``mean`` and ``salary_millions``.

    Returns
    -------
    tuple
        ``(att_model, def_model)``: the fitted statsmodels OLS results for the
        attack and defence frames, in that order.
    """
    att_model = _fit_strength_on_salary(att_df)
    def_model = _fit_strength_on_salary(def_df)

    return att_model, def_model

def _approximate_mean_ci(model, salary_millions, predicted_strength, z_value=1.96):
    """Approximate CI for the fitted mean at one salary from the coefficient covariance.

    For the design row x = [1, salary] the variance of the fitted mean is
    x' Cov x = Var(const) + 2*salary*Cov(const, slope) + salary**2 * Var(slope).
    The interval uses a normal critical value (1.96 by default), so it is an
    approximation of the t-based interval.
    """
    cov = model.cov_params()
    variance = (
        cov.loc['const', 'const']
        + 2.0 * salary_millions * cov.loc['const', 'salary_millions']
        + salary_millions ** 2 * cov.loc['salary_millions', 'salary_millions']
    )
    # Clamp at zero: round-off can make a near-zero variance slightly negative.
    std_error = max(float(variance), 0.0) ** 0.5
    margin = z_value * std_error
    return predicted_strength - margin, predicted_strength + margin


def predict_team_strength(salary_millions, team_type="attack", att_model=None, def_model=None):
    """
    Predict team strength from salary spending.

    Parameters
    ----------
    salary_millions : float
        Salary value in the same unit as the ``salary_millions`` regressor.
    team_type : str
        ``"attack"`` selects ``att_model``; ``"defence"`` selects ``def_model``.
    att_model, def_model
        Fitted OLS results whose ``params`` contain ``'const'`` and
        ``'salary_millions'``. Only the one selected by ``team_type`` is used.

    Returns
    -------
    dict
        Keys: ``predicted_strength``, ``confidence_interval`` (lower, upper),
        ``salary_input``, ``team_type``, ``r_squared`` and ``equation``.

    Raises
    ------
    ValueError
        If ``team_type`` is not ``"attack"``/``"defence"``, or the selected
        model is ``None``.
    """

    if team_type == "attack":
        model = att_model
    elif team_type == "defence":
        model = def_model
    else:
        raise ValueError("team_type must be 'attack' or 'defence'")

    if model is None:
        raise ValueError(f"Must provide {team_type}_model parameter")

    # Point estimate straight from the fitted coefficients.
    const_coef = model.params['const']
    salary_coef = model.params['salary_millions']
    predicted_strength = const_coef + salary_coef * salary_millions

    # Preferred interval: let statsmodels compute it from a one-row design
    # matrix laid out like the training data (const column + salary column).
    try:
        pred_data = pd.DataFrame({'salary_millions': [salary_millions]})
        # 'add' is required here: a single-row column is constant, so the
        # default would skip the intercept column.
        pred_data_with_const = sm.add_constant(pred_data, has_constant='add')

        prediction_obj = model.get_prediction(pred_data_with_const)
        ci_lower, ci_upper = prediction_obj.conf_int()[0]

    except Exception as e:
        # Deliberate best-effort: report the failure and fall back to an
        # approximate interval. The fallback uses the full coefficient
        # covariance matrix; summing the two coefficient standard errors (the
        # previous approach) ignores their covariance and is not a valid
        # standard error for the prediction.
        print(f"Confidence interval calculation failed: {e}")
        ci_lower, ci_upper = _approximate_mean_ci(
            model, salary_millions, predicted_strength
        )

    return {
        'predicted_strength': predicted_strength,
        'confidence_interval': (ci_lower, ci_upper),
        'salary_input': salary_millions,
        'team_type': team_type,
        'r_squared': model.rsquared,
        'equation': f"Strength = {salary_coef:.6f} * Salary + {const_coef:.6f}"
    }

In [32]:
# Fit the attack and defence regressions on their respective subsets.
att_model, def_model = fit_strength_salary_models(att_df=att_df, def_df=def_df)

# Predict attack strength for a salary of 172.1 (in millions, the unit of
# the `salary_millions` regressor).
result = predict_team_strength(
    salary_millions=172.1,
    team_type="attack",
    att_model=att_model,
    def_model=def_model,
)

result

{'predicted_strength': 0.21083019173757922,
 'confidence_interval': (0.13435690342816337, 0.2873034800469951),
 'salary_input': 172.1,
 'team_type': 'attack',
 'r_squared': 0.3456434699203226,
 'equation': 'Strength = 0.002722 * Salary + -0.257691'}