In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [23]:
df = pd.read_csv('results.csv', parse_dates=['date'])

In [24]:
wc = df[df['tournament'].str.contains('World Cup', case=False)]


In [25]:
wc = wc[['date','home_team','away_team','home_score','away_score','neutral']]


In [8]:
wc.head(), wc.info()


<class 'pandas.core.frame.DataFrame'>
Index: 9443 entries, 1486 to 48331
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   date        9443 non-null   datetime64[ns]
 1   home_team   9443 non-null   object        
 2   away_team   9443 non-null   object        
 3   home_score  9443 non-null   int64         
 4   away_score  9443 non-null   int64         
 5   neutral     9443 non-null   bool          
dtypes: bool(1), datetime64[ns](1), int64(2), object(2)
memory usage: 451.9+ KB


(           date  home_team      away_team  home_score  away_score  neutral
 1486 1930-07-13    Belgium  United States           0           3     True
 1487 1930-07-13     France         Mexico           4           1     True
 1488 1930-07-14     Brazil     Yugoslavia           1           2     True
 1489 1930-07-14       Peru        Romania           1           3     True
 1490 1930-07-15  Argentina         France           1           0     True,
 None)

In [26]:
# Let's explore the data distribution
print("Dataset shape:", wc.shape)
print("\nResult distribution:")
print(wc.apply(lambda r: 'H' if r.home_score > r.away_score else ('A' if r.home_score < r.away_score else 'D'), axis=1).value_counts())
print("\nUnique teams:", wc['home_team'].nunique() + wc['away_team'].nunique())
print("\nDate range:", wc['date'].min(), "to", wc['date'].max())

Dataset shape: (9443, 6)

Result distribution:
H    4770
A    2670
D    2003
Name: count, dtype: int64

Unique teams: 464

Date range: 1930-07-13 00:00:00 to 2025-06-10 00:00:00


In [9]:
wc['home_win_rate'] = np.nan
wc['away_win_rate'] = np.nan


In [27]:
# Fix data leakage: compute win rates only from matches BEFORE current match
wc = wc.sort_values('date').reset_index(drop=True)
wc['home_win_rate'] = 0.0
wc['away_win_rate'] = 0.0
wc['home_recent_form'] = 0.0  # Win rate in last 5 matches
wc['away_recent_form'] = 0.0
wc['head_to_head_home'] = 0.0  # Historical H2H advantage for home team

def compute_historical_features(df):
    for i in range(len(df)):
        current_date = df.iloc[i]['date']
        home_team = df.iloc[i]['home_team']
        away_team = df.iloc[i]['away_team']
        
        # Get all matches before current date
        past_matches = df[df['date'] < current_date]
        
        # Home team historical performance
        home_past = past_matches[
            (past_matches['home_team'] == home_team) | 
            (past_matches['away_team'] == home_team)
        ]
        
        if len(home_past) > 0:
            home_wins = 0
            for _, match in home_past.iterrows():
                if match['home_team'] == home_team and match['home_score'] > match['away_score']:
                    home_wins += 1
                elif match['away_team'] == home_team and match['away_score'] > match['home_score']:
                    home_wins += 1
            df.at[i, 'home_win_rate'] = home_wins / len(home_past)
            
            # Recent form (last 5 matches)
            recent_home = home_past.tail(5)
            if len(recent_home) > 0:
                recent_wins = 0
                for _, match in recent_home.iterrows():
                    if match['home_team'] == home_team and match['home_score'] > match['away_score']:
                        recent_wins += 1
                    elif match['away_team'] == home_team and match['away_score'] > match['home_score']:
                        recent_wins += 1
                df.at[i, 'home_recent_form'] = recent_wins / len(recent_home)
        
        # Away team historical performance
        away_past = past_matches[
            (past_matches['home_team'] == away_team) | 
            (past_matches['away_team'] == away_team)
        ]
        
        if len(away_past) > 0:
            away_wins = 0
            for _, match in away_past.iterrows():
                if match['home_team'] == away_team and match['home_score'] > match['away_score']:
                    away_wins += 1
                elif match['away_team'] == away_team and match['away_score'] > match['home_score']:
                    away_wins += 1
            df.at[i, 'away_win_rate'] = away_wins / len(away_past)
            
            # Recent form (last 5 matches)
            recent_away = away_past.tail(5)
            if len(recent_away) > 0:
                recent_wins = 0
                for _, match in recent_away.iterrows():
                    if match['home_team'] == away_team and match['home_score'] > match['away_score']:
                        recent_wins += 1
                    elif match['away_team'] == away_team and match['away_score'] > match['home_score']:
                        recent_wins += 1
                df.at[i, 'away_recent_form'] = recent_wins / len(recent_away)
        
        # Head-to-head record
        h2h = past_matches[
            ((past_matches['home_team'] == home_team) & (past_matches['away_team'] == away_team)) |
            ((past_matches['home_team'] == away_team) & (past_matches['away_team'] == home_team))
        ]
        
        if len(h2h) > 0:
            home_h2h_wins = 0
            for _, match in h2h.iterrows():
                if match['home_team'] == home_team and match['home_score'] > match['away_score']:
                    home_h2h_wins += 1
                elif match['away_team'] == home_team and match['away_score'] > match['home_score']:
                    home_h2h_wins += 1
            df.at[i, 'head_to_head_home'] = home_h2h_wins / len(h2h)
    
    return df

print("Computing historical features (this may take a moment)...")
wc = compute_historical_features(wc)
print("Done!")

# Convert neutral to int
wc['neutral'] = wc['neutral'].astype(int)

Computing historical features (this may take a moment)...
Done!
Done!


In [28]:
# Add more sophisticated features
wc['goal_difference'] = wc['home_score'] - wc['away_score']

# Calculate historical goal averages (excluding current match)
wc['home_avg_goals_scored'] = 0.0
wc['home_avg_goals_conceded'] = 0.0
wc['away_avg_goals_scored'] = 0.0
wc['away_avg_goals_conceded'] = 0.0

for i in range(len(wc)):
    current_date = wc.iloc[i]['date']
    home_team = wc.iloc[i]['home_team']
    away_team = wc.iloc[i]['away_team']
    
    # Get past matches for goal averages
    past_matches = wc[wc['date'] < current_date]
    
    # Home team goal statistics
    home_matches = past_matches[
        (past_matches['home_team'] == home_team) | 
        (past_matches['away_team'] == home_team)
    ]
    
    if len(home_matches) > 0:
        home_goals_scored = 0
        home_goals_conceded = 0
        for _, match in home_matches.iterrows():
            if match['home_team'] == home_team:
                home_goals_scored += match['home_score']
                home_goals_conceded += match['away_score']
            else:
                home_goals_scored += match['away_score']
                home_goals_conceded += match['home_score']
        
        wc.at[i, 'home_avg_goals_scored'] = home_goals_scored / len(home_matches)
        wc.at[i, 'home_avg_goals_conceded'] = home_goals_conceded / len(home_matches)
    
    # Away team goal statistics
    away_matches = past_matches[
        (past_matches['home_team'] == away_team) | 
        (past_matches['away_team'] == away_team)
    ]
    
    if len(away_matches) > 0:
        away_goals_scored = 0
        away_goals_conceded = 0
        for _, match in away_matches.iterrows():
            if match['home_team'] == away_team:
                away_goals_scored += match['home_score']
                away_goals_conceded += match['away_score']
            else:
                away_goals_scored += match['away_score']
                away_goals_conceded += match['home_score']
        
        wc.at[i, 'away_avg_goals_scored'] = away_goals_scored / len(away_matches)
        wc.at[i, 'away_avg_goals_conceded'] = away_goals_conceded / len(away_matches)

# Add year feature (tournament era effect)
wc['year'] = wc['date'].dt.year
wc['modern_era'] = (wc['year'] >= 1990).astype(int)  # Modern football era

print("Enhanced features computed!")

Enhanced features computed!


In [29]:
def label(r):
    if r.home_score > r.away_score: return 'H'
    if r.home_score < r.away_score: return 'A'
    return 'D'
wc['result'] = wc.apply(label, axis=1)


In [30]:
le = LabelEncoder()
wc['y'] = le.fit_transform(wc['result'])  # H→?, A→?, D→?


In [31]:
# Use enhanced feature set
features = [
    'home_win_rate', 'away_win_rate', 'neutral',
    'home_recent_form', 'away_recent_form', 'head_to_head_home',
    'home_avg_goals_scored', 'home_avg_goals_conceded',
    'away_avg_goals_scored', 'away_avg_goals_conceded',
    'modern_era'
]

# Remove rows with insufficient historical data
wc_filtered = wc.dropna(subset=features)
print(f"Dataset size after filtering: {len(wc_filtered)} matches")

X = wc_filtered[features].fillna(0)
y = wc_filtered['y']

print("Feature correlation with target:")
for feature in features:
    corr = X[feature].corr(y)
    print(f"{feature}: {corr:.3f}")

Dataset size after filtering: 9443 matches
Feature correlation with target:
home_win_rate: 0.316
away_win_rate: -0.298
neutral: -0.089
home_recent_form: 0.222
away_recent_form: -0.239
head_to_head_home: 0.299
home_avg_goals_scored: 0.263
home_avg_goals_conceded: -0.286
away_avg_goals_scored: -0.262
away_avg_goals_conceded: 0.245
modern_era: -0.033


In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
# Improved model with better hyperparameters
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import StandardScaler

# Scale features for better performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create ensemble model
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    class_weight='balanced'  # Handle class imbalance
)

gb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

lr_model = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    max_iter=1000
)

# Ensemble model
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', rf_model),
        ('gb', gb_model),
        ('lr', lr_model)
    ],
    voting='soft'  # Use probabilities
)

ensemble_model.fit(X_scaled, y)
print("Ensemble model trained successfully!")

Ensemble model trained successfully!


In [33]:
# Update train-test split with scaled data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Make predictions
y_pred = ensemble_model.predict(X_test)
y_pred_proba = ensemble_model.predict_proba(X_test)

print("Ensemble model predictions completed!")

Ensemble model predictions completed!


In [34]:
from sklearn.metrics import confusion_matrix, f1_score
import seaborn as sns

# Comprehensive evaluation
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.3f}")
print(f"F1-Score (weighted): {f1:.3f}")

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Class distribution
print("\nActual class distribution in test set:")
print(pd.Series(y_test).value_counts().sort_index())
print("\nPredicted class distribution:")
print(pd.Series(y_pred).value_counts().sort_index())

ModuleNotFoundError: No module named 'seaborn'

In [21]:
# Feature importance analysis
feature_importance = rf_model.feature_importances_
feature_names = features

# Create DataFrame for better visualization
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature')
plt.title('Feature Importance (Random Forest)')
plt.xlabel('Importance')
plt.tight_layout()
plt.show()

print("Top 5 most important features:")
print(importance_df.head())

# Classification report
print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

              precision    recall  f1-score   support

           A       0.54      0.53      0.54       533
           D       0.25      0.19      0.21       414
           H       0.67      0.74      0.70       942

    accuracy                           0.56      1889
   macro avg       0.49      0.49      0.48      1889
weighted avg       0.54      0.56      0.55      1889



In [None]:
# Save the model for deployment
import joblib

# Save model components
joblib.dump(ensemble_model, 'world_cup_model.pkl')
joblib.dump(scaler, 'feature_scaler.pkl')
joblib.dump(le, 'label_encoder.pkl')

print("Model saved successfully!")

# Create prediction function for deployment
def predict_match(home_team_stats, away_team_stats, is_neutral=0):
    """
    Predict match outcome given team statistics
    
    Parameters:
    home_team_stats: dict with keys:
        - win_rate, recent_form, avg_goals_scored, avg_goals_conceded
    away_team_stats: dict with keys:
        - win_rate, recent_form, avg_goals_scored, avg_goals_conceded
    is_neutral: 1 if neutral venue, 0 otherwise
    
    Returns:
    - prediction: 'H', 'A', or 'D'
    - probabilities: dict with probabilities for each outcome
    """
    
    # Create feature vector
    features_vector = [
        home_team_stats['win_rate'],
        away_team_stats['win_rate'],
        is_neutral,
        home_team_stats['recent_form'],
        away_team_stats['recent_form'],
        0.5,  # head_to_head_home (default)
        home_team_stats['avg_goals_scored'],
        home_team_stats['avg_goals_conceded'],
        away_team_stats['avg_goals_scored'],
        away_team_stats['avg_goals_conceded'],
        1  # modern_era (assume modern)
    ]
    
    # Scale features
    features_scaled = scaler.transform([features_vector])
    
    # Make prediction
    prediction = ensemble_model.predict(features_scaled)[0]
    probabilities = ensemble_model.predict_proba(features_scaled)[0]
    
    # Convert to readable format
    result = le.inverse_transform([prediction])[0]
    prob_dict = {
        le.classes_[i]: prob for i, prob in enumerate(probabilities)
    }
    
    return result, prob_dict

# Example usage
example_home = {
    'win_rate': 0.65,
    'recent_form': 0.8,
    'avg_goals_scored': 2.1,
    'avg_goals_conceded': 0.9
}

example_away = {
    'win_rate': 0.55,
    'recent_form': 0.6,
    'avg_goals_scored': 1.8,
    'avg_goals_conceded': 1.2
}

prediction, probabilities = predict_match(example_home, example_away, is_neutral=1)
print(f"\nExample prediction: {prediction}")
print("Probabilities:")
for outcome, prob in probabilities.items():
    print(f"  {outcome}: {prob:.3f}")

print(f"\nModel accuracy improved to: {accuracy:.1%}")