In [5]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple, Dict
import sys
from pathlib import Path
project_root = Path.cwd().parent.parent.parent 
print(project_root)
sys.path.append(str(project_root))


/media/l1t-w1n/E/footwork/FootWork


In [6]:

class FootballMatchPredictor:
    def __init__(self, data: pd.DataFrame):
        """
        Initialize the predictor with your football dataset.
        The class handles the entire modeling pipeline from preparation to evaluation.
        """
        self.data = data.copy()
        self.model = None
        self.feature_importance = None
        self.prepare_features()
        
    def prepare_features(self):
        """
        Prepare features for the model, excluding target variables and selecting
        relevant predictive features that would be available before a match.
        """
        # Select features that would be available before a match
        self.feature_columns = [
            col for col in self.data.columns 
            if not col.startswith('target_') and (
                'recent' in col or    # Recent performance metrics
                'avg' in col or       # Average statistics
                'ratio' in col or     # Performance ratios
                'venue' in col or     # Venue-specific stats
                'points' in col       # Points-related information
            )
        ]
        
        print(f"Selected {len(self.feature_columns)} features for prediction")
        print("\nFeature categories included:")
        categories = {
            'Recent Performance': len([col for col in self.feature_columns if 'recent' in col]),
            'Average Statistics': len([col for col in self.feature_columns if 'avg' in col]),
            'Performance Ratios': len([col for col in self.feature_columns if 'ratio' in col]),
            'Venue Statistics': len([col for col in self.feature_columns if 'venue' in col]),
            'Points Information': len([col for col in self.feature_columns if 'points' in col])
        }
        for category, count in categories.items():
            print(f"{category}: {count} features")
    
    def split_data(self) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        """
        Split the data into training and testing sets, ensuring balanced classes.
        """
        X = self.data[self.feature_columns]
        y = self.data['target_result']
        
        return train_test_split(
            X, y,
            test_size=0.2,
            random_state=42,
            stratify=y  # Ensure balanced classes in both sets
        )
    
    def train_model(self, verbose: bool = True) -> xgb.XGBClassifier:
        """
        Train the XGBoost model with carefully chosen parameters.
        """
        # Split the data
        X_train, X_test, y_train, y_test = self.split_data()
        
        # Initialize model with parameters suited for football prediction
        self.model = xgb.XGBClassifier(
            max_depth=6,             # Limit tree depth to prevent overfitting
            learning_rate=0.1,       # Conservative learning rate
            n_estimators=200,        # Number of trees
            subsample=0.8,           # Use 80% of samples per tree
            colsample_bytree=0.8,    # Use 80% of features per tree
            objective='multi:softprob',  # For multi-class probability output
            eval_metric='mlogloss',   # Multi-class log loss evaluation
            random_state=42
        )
        
        # Train the model with early stopping
        self.model.fit(
            X_train, y_train,
            eval_set=[(X_test, y_test)],
            early_stopping_rounds=20,
            verbose=verbose
        )
        
        # Store feature importance
        self.feature_importance = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        return X_test, y_test
    
    def evaluate_model(self, X_test: np.ndarray, y_test: np.ndarray):
        """
        Perform comprehensive model evaluation including classification metrics
        and visualizations.
        """
        # Make predictions
        y_pred = self.model.predict(X_test)
        y_pred_proba = self.model.predict_proba(X_test)
        
        # Print classification report
        print("\nModel Performance:")
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))
        
        # Create confusion matrix visualization
        plt.figure(figsize=(10, 8))
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('Confusion Matrix')
        plt.xlabel('Predicted Outcome')
        plt.ylabel('Actual Outcome')
        plt.show()
        
        # Plot feature importance
        plt.figure(figsize=(12, 6))
        top_features = self.feature_importance.head(20)
        sns.barplot(data=top_features, x='importance', y='feature')
        plt.title('Top 20 Most Important Features')
        plt.xlabel('Feature Importance')
        plt.tight_layout()
        plt.show()
    
    def cross_validate(self, n_splits: int = 5):
        """
        Perform cross-validation to get a more reliable estimate of model performance.
        """
        X = self.data[self.feature_columns]
        y = self.data['target_result']
        
        # Initialize cross-validation
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
        
        # Store scores for each fold
        scores = []
        for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # Train model on this fold
            model = xgb.XGBClassifier(
                max_depth=6,
                learning_rate=0.1,
                n_estimators=200,
                subsample=0.8,
                colsample_bytree=0.8,
                objective='multi:softprob',
                random_state=42
            )
            
            model.fit(X_train, y_train)
            score = model.score(X_val, y_val)
            scores.append(score)
            
            print(f"Fold {fold} accuracy: {score:.3f}")
        
        print(f"\nCross-validation results:")
        print(f"Average accuracy: {np.mean(scores):.3f} (+/- {np.std(scores) * 2:.3f})")
        
        return scores

def train_football_predictor(data: pd.DataFrame) -> FootballMatchPredictor:
    """
    Complete pipeline for training and evaluating a football match predictor.
    """
    # Initialize predictor
    predictor = FootballMatchPredictor(data)
    
    # Train the model
    print("Training model...")
    X_test, y_test = predictor.train_model()
    
    # Evaluate the model
    print("\nEvaluating model performance...")
    predictor.evaluate_model(X_test, y_test)
    
    # Perform cross-validation
    print("\nPerforming cross-validation...")
    cv_scores = predictor.cross_validate()
    
    return predictor

# Example usage:
# predictor = train_football_predictor(normalized_data)

In [7]:
data = pd.read_csv(f"{project_root}/data/all_leagues_deskewed_normalized.csv")
predictor = train_football_predictor(data)

Selected 50 features for prediction

Feature categories included:
Recent Performance: 8 features
Average Statistics: 24 features
Performance Ratios: 14 features
Venue Statistics: 8 features
Points Information: 4 features
Training model...


TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'