In [1]:
"""
LA LIGA WINNER PREDICTION - MACHINE LEARNING MODEL
Complete implementation with Random Forest, Gradient Boosting, and Logistic Regression
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

def load_and_prepare_data(filepath='2015-2025.csv'):
    """Read the season statistics CSV at ``filepath`` into a DataFrame.

    The file is handed to ``pandas.read_csv`` with default options; no
    cleaning, filtering or type conversion is applied here.
    """
    return pd.read_csv(filepath)

def create_features(df):
    """
    Feature engineering: add derived performance columns to ``df``.

    Per-game rates, home/away splits, ELO summaries, form indicators and
    attack/defence ratios are written onto ``df`` itself (the input frame is
    modified in place) and that same object is returned. Denominators get a
    0.001 offset so that a zero count does not cause a division by zero.
    """
    epsilon = 0.001
    games = df['wins'] + df['draws'] + df['losses']
    games_safe = games + epsilon

    # Per-game scoring rates
    for target, source in (
        ('points_per_game', 'pts'),
        ('goals_per_game', 'goals_for'),
        ('goals_conceded_per_game', 'goals_against'),
    ):
        df[target] = df[source] / games_safe

    # Home/away split: share of points won at home, and away points relative
    # to the points available from half of the matches played
    all_points = df['home_pts'] + df['away_pts']
    df['home_dominance'] = df['home_pts'] / (all_points + epsilon)
    df['away_strength'] = df['away_pts'] / (games * 3 / 2 + epsilon)

    # ELO summaries
    mean_elo = df['elo_mean_of_season_elo']
    df['elo_change'] = df['elo_end_of_season_elo'] - df['elo_start_of_season_elo']
    df['elo_strength'] = mean_elo
    df['elo_normalized'] = (mean_elo - 1200) / 400

    # Form and consistency
    win_rate = df['win_rate']
    df['form_indicator'] = win_rate * df['goal_difference']
    df['consistency_score'] = win_rate - df['loss_rate']
    df['draw_tendency'] = df['draw_rate']

    # Attack/defence balance
    attack = df['goals_for'] / games_safe
    defense = df['goals_against'] / games_safe
    df['attack_rating'] = attack
    df['defense_rating'] = defense
    df['balance_score'] = attack / (defense + epsilon)

    # Goal difference per match
    df['gd_per_game'] = df['goal_difference'] / games_safe

    return df

def select_features():
    """Return the ordered list of column names used as model inputs."""
    outcome_rates = ['win_rate', 'draw_rate', 'loss_rate']
    goal_totals = ['goal_difference', 'goals_for', 'goals_against']
    per_game_rates = ['points_per_game', 'goals_per_game', 'goals_conceded_per_game']
    home_away = ['home_pts', 'away_pts', 'home_dominance', 'away_strength']
    elo = ['elo_strength', 'elo_change', 'elo_normalized']
    form = ['form_indicator', 'consistency_score', 'balance_score']
    ratings = ['attack_rating', 'defense_rating', 'gd_per_game']

    # Order matters: it fixes the column order of the model input matrix.
    return [
        *outcome_rates,
        *goal_totals,
        *per_game_rates,
        *home_away,
        *elo,
        *form,
        *ratings,
    ]

def train_models(X_train, y_train):
    """
    Fit the three classifiers on the training data and report their scores.

    For each model this prints the accuracy on the training set itself and
    the mean and standard deviation of a 5-fold ``cross_val_score`` run on
    the same data.

    Returns:
        dict mapping the model's display name to the fitted estimator.
    """
    banner = "=" * 80

    random_forest = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        random_state=42,
        class_weight='balanced',
    )
    gradient_boosting = GradientBoostingClassifier(
        n_estimators=150,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
    )
    logistic_regression = LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced',
        C=0.1,
    )
    candidates = [
        ('Random Forest', random_forest),
        ('Gradient Boosting', gradient_boosting),
        ('Logistic Regression', logistic_regression),
    ]

    print("\n" + banner)
    print("TRAINING MACHINE LEARNING MODELS")
    print(banner)

    fitted = {}
    for label, estimator in candidates:
        estimator.fit(X_train, y_train)
        fitted[label] = estimator

        # In-sample accuracy (measured on the data the model was fitted on)
        in_sample_accuracy = accuracy_score(y_train, estimator.predict(X_train))

        # 5-fold cross-validation on the training data
        fold_scores = cross_val_score(estimator, X_train, y_train, cv=5)
        cv_mean = fold_scores.mean()
        cv_spread = fold_scores.std()

        print(f"\n{label}:")
        print(f"  Training Accuracy: {in_sample_accuracy:.3f}")
        print(f"  Cross-Val Score: {cv_mean:.3f} (+/- {cv_spread:.3f})")

    return fitted

def get_feature_importance(model, feature_names):
    """Return a feature/importance table sorted from most to least important.

    Returns ``None`` when ``model`` has no ``feature_importances_`` attribute.
    Nothing is printed here; the caller decides how to display the table.
    """
    if not hasattr(model, 'feature_importances_'):
        return None

    table = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    return table.sort_values('importance', ascending=False)

def predict_current_season(trained_models, scaler, current_season, features, weights=None):
    """
    Score the current season with every trained model and blend the results.

    Args:
        trained_models: dict mapping a model name to a fitted estimator that
            exposes ``predict_proba``.
        scaler: fitted transformer exposing ``transform``; applied to the
            feature matrix before prediction.
        current_season: DataFrame holding the ``features`` columns plus
            ``team``, ``pts``, ``wins``, ``draws``, ``losses``,
            ``goal_difference`` and ``elo_strength``.
        features: list of feature column names, in the order the scaler and
            models were fitted with.
        weights: optional dict mapping model name to its ensemble weight.
            Defaults to 0.4 / 0.4 / 0.2 for 'Random Forest',
            'Gradient Boosting' and 'Logistic Regression'. The weights are
            used as given (not normalised), so they should sum to 1 for
            ``ensemble_probability`` to stay on the 0-100 scale.

    Returns:
        A new DataFrame (``current_season`` is not modified) with the summary
        columns, one ``'<name>_prob'`` column per model (percent) and the
        weighted ``ensemble_probability`` column.

    Raises:
        KeyError: If ``weights`` names a model missing from ``trained_models``.
    """
    if weights is None:
        weights = {
            'Random Forest': 0.4,
            'Gradient Boosting': 0.4,
            'Logistic Regression': 0.2,
        }

    # Fail early with a readable message instead of a bare column KeyError
    missing = [name for name in weights if name not in trained_models]
    if missing:
        raise KeyError(f"No trained model available for weighted name(s): {missing}")

    # Prepare current season data; missing feature values are treated as 0
    X_current = current_season[features].fillna(0)
    X_current_scaled = scaler.transform(X_current)

    # Summary columns carried through to the report
    results = current_season[['team', 'pts', 'wins', 'draws', 'losses',
                              'goal_difference', 'elo_strength']].copy()

    # Column 1 of predict_proba is taken as the champion class, as a percentage
    for name, model in trained_models.items():
        results[f'{name}_prob'] = model.predict_proba(X_current_scaled)[:, 1] * 100

    # Weighted average across the models
    results['ensemble_probability'] = sum(
        results[f'{name}_prob'] * weight for name, weight in weights.items()
    )

    return results

def project_final_standings(results, total_matches=38, champion_points=95):
    """
    Project end-of-season points from current form and the ensemble probability.

    Adds ``current_ppg``, ``projected_points_simple`` and
    ``ml_adjusted_points`` to ``results`` in place.

    Args:
        results: DataFrame with ``pts``, ``wins``, ``draws``, ``losses`` and
            ``ensemble_probability`` (percent) columns.
        total_matches: number of matches each team plays in a full season.
        champion_points: points total that the ensemble probability is scaled
            against in the ML-adjusted projection.

    Returns:
        Tuple ``(results, remaining)`` where ``remaining`` is the number of
        matches left for the FIRST row of ``results`` (other teams may have
        played a different number of matches). For an empty frame
        ``remaining`` is ``total_matches``.
    """
    # Calculate matches played
    matches_played = results['wins'] + results['draws'] + results['losses']
    remaining_matches = total_matches - matches_played

    # Current form. A team that has not played yet has no form: report 0 PPG
    # rather than the NaN/inf that dividing by zero would put into every
    # downstream projection.
    not_started = matches_played == 0
    points_per_game = results['pts'] / matches_played.mask(not_started)
    results['current_ppg'] = points_per_game.mask(not_started, 0.0)

    # Simple projection based on current form
    results['projected_points_simple'] = (
        results['pts'] + results['current_ppg'] * remaining_matches
    )

    # ML-adjusted projection: 60% form-based projection, 40% probability-scaled total
    results['ml_adjusted_points'] = (
        results['projected_points_simple'] * 0.6 +
        (results['ensemble_probability'] / 100 * champion_points) * 0.4
    )

    # An empty frame has no first row to read the remaining count from
    first_remaining = remaining_matches.iloc[0] if len(remaining_matches) else total_matches

    return results, first_remaining

def main():
    """
    Run the full pipeline: load data, train models, predict, print a report.

    Reads ``2015-2025.csv``, engineers features, trains the classifiers on
    every season except ``'25/26'``, scores the ``'25/26'`` rows and prints
    the probability rankings, the predicted champion, a per-model comparison
    and a season summary to stdout.

    Returns:
        The per-team results DataFrame for the current season, sorted by
        ``ensemble_probability`` in descending order.

    Raises:
        ValueError: If the dataset has no training rows or no ``'25/26'`` rows.
    """
    banner = "=" * 80
    season_matches = 38  # matches per team in a full season

    print(banner)
    print("LA LIGA 2025/26 WINNER PREDICTION - MACHINE LEARNING MODEL")
    print(banner)

    # Load data
    df = load_and_prepare_data('2015-2025.csv')

    # Feature engineering
    df = create_features(df)

    # Separate training and current season data
    is_current = df['season'] == '25/26'
    train_df = df[~is_current].copy()
    current_season = df[is_current].copy()

    print("\nDataset Summary:")
    print(f"  Training seasons: {train_df['season'].nunique()} seasons")
    print(f"  Training samples: {len(train_df)} team-seasons")
    print(f"  Champions in training: {train_df['is_champion'].sum()}")
    print(f"  Current season teams: {len(current_season)}")

    # Stop with a clear message instead of an opaque error deep in the pipeline
    if train_df.empty:
        raise ValueError("No training rows found (every row has season '25/26').")
    if current_season.empty:
        raise ValueError("No rows found for the current season '25/26'.")

    # Select features
    features = select_features()

    # Prepare training data; missing feature values are treated as 0
    X_train = train_df[features].fillna(0)
    y_train = train_df['is_champion']

    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train models
    trained_models = train_models(X_train_scaled, y_train)

    # Feature importance
    print("\n" + banner)
    print("FEATURE IMPORTANCE (Random Forest)")
    print(banner)
    importance_df = get_feature_importance(trained_models['Random Forest'], features)
    if importance_df is not None:
        print("\nTop 10 Most Important Features:")
        for _, row in importance_df.head(10).iterrows():
            # U+2588 FULL BLOCK, written as an escape so the bar survives any
            # re-encoding of this source file; one block per percentage point
            bar = '\u2588' * int(row['importance'] * 100)
            print(f"  {row['feature']:<30} {bar} {row['importance']:.4f}")

    # Make predictions
    print("\n" + banner)
    print("2025/26 SEASON PREDICTIONS")
    print(banner)

    results = predict_current_season(trained_models, scaler, current_season, features)
    results, remaining_matches = project_final_standings(results)

    # Sort by ensemble probability
    results = results.sort_values('ensemble_probability', ascending=False)

    # Display championship probabilities
    print("\n" + banner)
    print("CHAMPIONSHIP PROBABILITY RANKINGS")
    print(banner)
    print(f"\n{'#':<4}{'Team':<20}{'Pts':<8}{'W-D-L':<12}{'GD':<8}{'ML Prob':<12}{'Proj Pts':<10}")
    print("-" * 80)

    for rank, (_, row) in enumerate(results.iterrows(), 1):
        wdl = f"{int(row['wins'])}-{int(row['draws'])}-{int(row['losses'])}"
        print(f"{rank:<4}{row['team']:<20}{int(row['pts']):<8}{wdl:<12}"
              f"{int(row['goal_difference']):<8}{row['ensemble_probability']:<12.2f}"
              f"{row['ml_adjusted_points']:<10.1f}")

    # Winner prediction: the top row after sorting
    winner = results.iloc[0]
    top_probability = winner['ensemble_probability']

    if top_probability > 70:
        confidence = "VERY HIGH"
    elif top_probability > 50:
        confidence = "HIGH"
    else:
        confidence = "MODERATE"

    if top_probability < 50:
        competition_status = "WIDE OPEN"
    elif top_probability > 70:
        competition_status = "CLEAR FAVORITE"
    else:
        competition_status = "COMPETITIVE"

    print("\n" + banner)
    # U+1F3C6 TROPHY, written as an escape for the same reason as the bar glyph
    print("\U0001F3C6 PREDICTED LA LIGA CHAMPION 2025/26")
    print(banner)
    print(f"""
Team:                  {winner['team']}
Championship Prob:     {winner['ensemble_probability']:.2f}%
Current Points:        {int(winner['pts'])} points
Record:                {int(winner['wins'])}W-{int(winner['draws'])}D-{int(winner['losses'])}L
Goal Difference:       {int(winner['goal_difference']):+d}
ELO Rating:            {winner['elo_strength']:.1f}
Projected Final Pts:   {winner['ml_adjusted_points']:.1f}

Confidence Level:      {confidence}
""")

    # Model agreement
    print(banner)
    print("MODEL PREDICTIONS COMPARISON (Top 5)")
    print(banner)
    print(f"\n{'Team':<20}{'Random Forest':<18}{'Gradient Boost':<18}{'Log Regress':<18}{'Ensemble':<12}")
    print("-" * 86)

    top_five = results.head(5)
    for _, row in top_five.iterrows():
        print(f"{row['team']:<20}"
              f"{row['Random Forest_prob']:<18.2f}"
              f"{row['Gradient Boosting_prob']:<18.2f}"
              f"{row['Logistic Regression_prob']:<18.2f}"
              f"{row['ensemble_probability']:<12.2f}")

    # Top contenders
    print("\n" + banner)
    print("TOP CHAMPIONSHIP CONTENDERS")
    print(banner)

    for rank, (_, row) in enumerate(top_five.iterrows(), 1):
        status = "FAVORITE" if rank == 1 else "STRONG CONTENDER" if rank <= 3 else "CONTENDER"
        print(f"\n{rank}. {row['team']} - {status}")
        print(f"   Probability: {row['ensemble_probability']:.2f}%")
        print(f"   Projected Points: {row['ml_adjusted_points']:.1f}")
        print(f"   Current Form: {row['current_ppg']:.2f} PPG")

    # Season projection summary
    print("\n" + banner)
    print("SEASON PROJECTION SUMMARY")
    print(banner)
    print(f"""
Matches Played:        {season_matches - remaining_matches}
Matches Remaining:     {remaining_matches}
Competition Status:    {competition_status}

The ML model predicts {winner['team']} will win La Liga 2025/26 with 
{winner['ensemble_probability']:.1f}% probability, finishing with approximately 
{winner['ml_adjusted_points']:.0f} points.
""")

    print(banner)
    print("Analysis Complete!")
    print(banner)

    return results

# Run the full pipeline only when this file is executed as a script; the
# sorted results table is kept in a module-level name for later inspection.
if __name__ == "__main__":
    results = main()

LA LIGA 2025/26 WINNER PREDICTION - MACHINE LEARNING MODEL

Dataset Summary:
  Training seasons: 10 seasons
  Training samples: 200 team-seasons
  Champions in training: 10
  Current season teams: 20

TRAINING MACHINE LEARNING MODELS

Random Forest:
  Training Accuracy: 1.000
  Cross-Val Score: 0.980 (+/- 0.029)

Gradient Boosting:
  Training Accuracy: 1.000
  Cross-Val Score: 0.985 (+/- 0.020)

Logistic Regression:
  Training Accuracy: 0.940
  Cross-Val Score: 0.930 (+/- 0.091)

FEATURE IMPORTANCE (Random Forest)

Top 10 Most Important Features:
  win_rate                       ███████████ 0.1128
  points_per_game                ██████████ 0.1033
  form_indicator                 ██████████ 0.1022
  consistency_score              ██████████ 0.1020
  gd_per_game                    ████████ 0.0849
  goal_difference                ████████ 0.0839
  away_strength               