# Football Match Prediction - Clean Model Building

Saubere Implementation mit RandomForest und umfassendem Hyperparameter-Tuning.


In [41]:
import pickle
import pandas as pd
import numpy as np

# Load data
match_df = pickle.load(open("data/prepped_match_df.pck", "rb"))
print(f"Dataset loaded: {match_df.shape}")
match_df.head()


Dataset loaded: (4832, 67)


Unnamed: 0,teamHomeId,teamAwayId,matchDay,season,teamHomeValue,teamAwayValue,teamHomeAvgScoredGoals1,teamHomeAvgGottenGoals1,teamHomeAvgTeamPoints1,teamHomeAvgScoredGoals2,...,teamAwayAvgScoredGoals8,teamAwayAvgGottenGoals8,teamAwayAvgTeamPoints8,teamAwayAvgScoredGoals9,teamAwayAvgGottenGoals9,teamAwayAvgTeamPoints9,teamAwayAvgScoredGoals10,teamAwayAvgGottenGoals10,teamAwayAvgTeamPoints10,resultClass
90,7,54,11,2009,109.4,61.98,1.0,1.0,1.0,1.5,...,0.625,2.625,0.125,0.666667,2.555556,0.111111,0.7,2.3,0.4,2:0
93,100,87,11,2009,136.28,55.0,3.0,3.0,1.0,1.5,...,0.625,1.75,0.5,0.777778,1.666667,0.777778,1.0,1.8,0.8,2:3
91,131,81,11,2009,155.43,48.78,0.0,0.0,1.0,1.0,...,1.5,1.375,1.875,1.444444,1.333333,1.777778,1.5,1.4,1.7,3:3
92,16,40,11,2009,124.65,279.55,0.0,1.0,0.0,0.5,...,1.875,0.875,2.0,1.777778,0.888889,1.888889,1.7,0.9,1.8,0:0
94,65,55,11,2009,59.13,50.88,0.0,0.0,1.0,0.5,...,1.5,1.25,1.375,1.444444,1.222222,1.333333,1.3,1.2,1.2,0:1


In [42]:
# Prepare target variable
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
match_df["resultClass"] = enc.fit_transform(match_df["resultClass"])


In [43]:
# Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    match_df.drop("resultClass", axis=1), 
    match_df["resultClass"], 
    test_size=0.2, 
    shuffle=False  # Time series data
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")


Training set: (3865, 66)
Test set: (967, 66)


In [44]:
# Custom Kicktipp Scorer
from sklearn.metrics import make_scorer

def kicktipp_score(y_true, y_pred, **kwargs):
    """Custom scoring function for Kicktipp predictions"""
    # Convert to int64 for consistency
    y_true = y_true.astype('int64') if hasattr(y_true, 'astype') else np.array(y_true, dtype='int64')
    y_pred = y_pred.astype('int64') if hasattr(y_pred, 'astype') else np.array(y_pred, dtype='int64')
    
    # Decode predictions back to score format
    y_true_decoded = [x.split(":") for x in enc.inverse_transform(y_true)]
    y_pred_decoded = [x.split(":") for x in enc.inverse_transform(y_pred)]
    
    score_value = 0
    for true, pred in zip(y_true_decoded, y_pred_decoded):
        # Exact match: 5 points
        if true[0] == pred[0] and true[1] == pred[1]:
            score_value += 5
        # Goal difference correct: 3 points
        elif (int(true[0]) - int(true[1])) == (int(pred[0]) - int(pred[1])):
            score_value += 3
        # Winner correct: 1 point
        elif ((true[0] > true[1]) and (pred[0] > pred[1])) or \
             ((true[0] < true[1]) and (pred[0] < pred[1])) or \
             ((true[0] == true[1]) and (pred[0] == pred[1])):
            score_value += 1
    
    return round(score_value / (len(y_true)/306))  # Normalize to Kicktipp scale

kicktipp_scorer = make_scorer(kicktipp_score, greater_is_better=True)
print("‚úÖ Custom Kicktipp scorer ready")

# Cross-Validation Setup
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=5)
print("‚úÖ Time Series Cross-Validation ready (5 splits)")

‚úÖ Custom Kicktipp scorer ready
‚úÖ Time Series Cross-Validation ready (5 splits)


In [45]:
# Baseline Model
from collections import Counter

# Always predict most frequent class
majority_class = Counter(y_train).most_common(1)[0][0]
baseline_predictions = np.full(len(y_test), majority_class)
baseline_score = kicktipp_score(y_test, baseline_predictions)

print(f"üìä Baseline (majority class): {baseline_score} points")


üìä Baseline (majority class): 301 points


# Random Forest

In [46]:
# RandomForest with Comprehensive Hyperparameter Tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

print("üå≥ Setting up RandomForest with comprehensive hyperparameter search...")

# Comprehensive parameter grid
param_distributions = {
    'n_estimators': np.arange(100, 501, 25),           # 100, 125, 150, ..., 500
    'max_depth': [None] + list(np.arange(5, 31, 2)),   # None, 5, 7, 9, ..., 29
    'min_samples_split': np.arange(2, 21, 1),          # 2, 3, 4, ..., 20
    'min_samples_leaf': np.arange(1, 16, 1),           # 1, 2, 3, ..., 15
    'max_features': ['sqrt', 'log2', None] + list(np.arange(0.1, 1.0, 0.1)),  # sqrt, log2, None, 0.1-0.9
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': [None] + list(np.arange(10, 101, 10))  # None, 10, 20, ..., 100
}

# Calculate total combinations
total_combinations = 1
for param, values in param_distributions.items():
    total_combinations *= len(values)

print(f"üìä Hyperparameter space: {total_combinations:,} total combinations")
print(f"üìä Search iterations: 150 ({150/total_combinations*100:.4f}% coverage)")

# RandomForest model
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# RandomizedSearchCV
rf_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=150,  # Comprehensive search
    cv=tscv,
    scoring=kicktipp_scorer,
    n_jobs=-1,
    random_state=42,
    verbose=1
)

print("üöÄ Starting hyperparameter optimization...")
rf_search.fit(X_train, y_train)

# Results
best_params = rf_search.best_params_
best_cv_score = rf_search.best_score_
best_model = rf_search.best_estimator_

print(f"\nüéØ Best Parameters: {best_params}")
print(f"üéØ Best CV Score: {best_cv_score:.1f} points")


üå≥ Setting up RandomForest with comprehensive hyperparameter search...
üìä Hyperparameter space: 35,814,240 total combinations
üìä Search iterations: 150 (0.0004% coverage)
üöÄ Starting hyperparameter optimization...
Fitting 5 folds for each of 150 candidates, totalling 750 fits





üéØ Best Parameters: {'n_estimators': np.int64(325), 'min_samples_split': np.int64(6), 'min_samples_leaf': np.int64(2), 'max_leaf_nodes': np.int64(10), 'max_features': None, 'max_depth': np.int64(11), 'criterion': 'gini', 'bootstrap': True}
üéØ Best CV Score: 343.4 points


In [49]:
# Final Model Evaluation
predictions = best_model.predict(X_test)
final_score = kicktipp_score(y_test, predictions)

print("=" * 50)
print("üèÜ FINAL RESULTS")
print("=" * 50)
print(f"üìä Baseline Score:        {baseline_score} points")
print(f"üå≥ RandomForest CV Score: {best_cv_score:.1f} points")
print(f"üå≥ RandomForest Test Score: {final_score} points")
print(f"üìà Improvement over baseline: +{final_score - baseline_score} points")
print("=" * 50)

if final_score > baseline_score:
    print(f"‚úÖ SUCCESS: {final_score - baseline_score} points improvement!")
else:
    print(f"‚ùå Model performs worse than baseline by {baseline_score - final_score} points")


üèÜ FINAL RESULTS
üìä Baseline Score:        301 points
üå≥ RandomForest CV Score: 343.4 points
üå≥ RandomForest Test Score: 326 points
üìà Improvement over baseline: +25 points
‚úÖ SUCCESS: 25 points improvement!


In [48]:
# Save Model and Encoder
from datetime import datetime

# Generate timestamp
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save best model
model_filename = f"classifier_{timestamp}.pck"
pickle.dump(best_model, open(model_filename, "wb"))

# Save encoder (fixed filename)
encoder_filename = "encoder.pck"
pickle.dump(enc, open(encoder_filename, "wb"))

print(f"üíæ Model saved as: {model_filename}")
print(f"üíæ Encoder saved as: {encoder_filename}")
print(f"üíæ Final score: {final_score} points")
print(f"üíæ Best parameters: {best_params}")


üíæ Model saved as: classifier_20250928_125632.pck
üíæ Encoder saved as: encoder.pck
üíæ Final score: 326 points
üíæ Best parameters: {'n_estimators': np.int64(325), 'min_samples_split': np.int64(6), 'min_samples_leaf': np.int64(2), 'max_leaf_nodes': np.int64(10), 'max_features': None, 'max_depth': np.int64(11), 'criterion': 'gini', 'bootstrap': True}


In [52]:
# Stacking Ensemble: Top 3 RandomForest Models (corrected)
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd

print("üèóÔ∏è Building Stacking Ensemble from Top 3 RandomForest models...")

# Extrahiere Top 3 Modelle aus der rf_search (nicht pipeline_search)
cv_results = pd.DataFrame(rf_search.cv_results_)
cv_results_sorted = cv_results.sort_values('mean_test_score', ascending=False)

print(f"üìä Available models from RandomizedSearchCV: {len(cv_results)}")
print(f"üìä Top 3 CV scores: {cv_results_sorted['mean_test_score'].head(3).values}")

# Erstelle die Top 3 RandomForest Modelle (ohne Pipeline)
top_3_models = []
for i in range(3):
    params = cv_results_sorted.iloc[i]['params']
    
    # Erstelle RandomForest mit den Top-Parametern
    rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)
    rf_model.set_params(**params)
    
    # Trainiere das Modell
    rf_model.fit(X_train, y_train)
    
    top_3_models.append((f'rf_model_{i+1}', rf_model))
    
    print(f"‚úÖ Model {i+1} trained with CV score: {cv_results_sorted.iloc[i]['mean_test_score']:.1f}")

print(f"\nüèóÔ∏è Creating Stacking Ensemble with {len(top_3_models)} base models...")

# Stacking Classifier mit LogisticRegression als Meta-Learner
# TimeSeriesSplit funktioniert nicht mit StackingClassifier, nutze Standard KFold
from sklearn.model_selection import KFold

stacking_classifier = StackingClassifier(
    estimators=top_3_models,
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    cv=KFold(n_splits=5, shuffle=True, random_state=42),  # Standard CV f√ºr Stacking
    n_jobs=-1,
    verbose=True
)

print("üöÄ Training Stacking Ensemble...")
stacking_classifier.fit(X_train, y_train)

# Stacking Predictions
stacking_predictions = stacking_classifier.predict(X_test)
stacking_score = kicktipp_score(y_test, stacking_predictions)

print(f"\nüéØ Stacking Ensemble Test Score: {stacking_score} points")

# Vergleiche auch die individuellen Modelle
print(f"\nüìä Individual model performance on test set:")
for i, (name, model) in enumerate(top_3_models):
    individual_predictions = model.predict(X_test)
    individual_score = kicktipp_score(y_test, individual_predictions)
    print(f"  {name}: {individual_score} points")


üèóÔ∏è Building Stacking Ensemble from Top 3 RandomForest models...
üìä Available models from RandomizedSearchCV: 150
üìä Top 3 CV scores: [343.4 340.8 340. ]
‚úÖ Model 1 trained with CV score: 343.4
‚úÖ Model 2 trained with CV score: 340.8
‚úÖ Model 3 trained with CV score: 340.0

üèóÔ∏è Creating Stacking Ensemble with 3 base models...
üöÄ Training Stacking Ensemble...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.7s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.0s finished
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   12.6s finished



üéØ Stacking Ensemble Test Score: 315 points

üìä Individual model performance on test set:
  rf_model_1: 326 points
  rf_model_2: 324 points
  rf_model_3: 329 points


In [53]:
# Final Comparison: Original RF vs Stacking (Clean Version)
print("=" * 60)
print("üèÜ FINAL MODEL COMPARISON")
print("=" * 60)
print(f"üìä Baseline Score:        {baseline_score} points")
print(f"üå≥ Original RandomForest: {final_score} points")
print(f"üèóÔ∏è Stacking Ensemble:     {stacking_score} points")
print("=" * 60)

original_improvement = final_score - baseline_score
stacking_improvement = stacking_score - baseline_score
stacking_vs_original = stacking_score - final_score

print(f"üìà Original RF improvement:  +{original_improvement} points")
print(f"üìà Stacking improvement:     +{stacking_improvement} points")
print(f"üèóÔ∏è Stacking vs Original:     {stacking_vs_original:+} points")

if stacking_score > final_score:
    print(f"\n‚úÖ STACKING WINS! Ensemble improved by {stacking_vs_original} points")
    best_final_model = stacking_classifier
    best_final_score = stacking_score
    best_approach = "Stacking"
else:
    print(f"\nü§î Original RF still better by {-stacking_vs_original} points")
    best_final_model = best_model
    best_final_score = final_score
    best_approach = "Original RF"

print(f"\nüèÜ WINNER: {best_approach} with {best_final_score} points")

# Komplexit√§t vs Performance
print(f"\nüí° COMPLEXITY vs PERFORMANCE:")
print(f"  üå≥ Original RF: {final_score:3.0f} points (1 model)")
print(f"  üèóÔ∏è Stacking:    {stacking_score:3.0f} points (3 models + meta-learner)")

if stacking_vs_original >= 3:
    print(f"\nüèÜ RECOMMENDATION: Use Stacking Ensemble!")
    print(f"   ‚úÖ Significant improvement: +{stacking_vs_original} points")
else:
    print(f"\nüèÜ RECOMMENDATION: Use Original RandomForest!")
    print(f"   ‚úÖ Simpler model with comparable performance")


üèÜ FINAL MODEL COMPARISON
üìä Baseline Score:        301 points
üå≥ Original RandomForest: 326 points
üèóÔ∏è Stacking Ensemble:     315 points
üìà Original RF improvement:  +25 points
üìà Stacking improvement:     +14 points
üèóÔ∏è Stacking vs Original:     -11 points

ü§î Original RF still better by 11 points

üèÜ WINNER: Original RF with 326 points

üí° COMPLEXITY vs PERFORMANCE:
  üå≥ Original RF: 326 points (1 model)
  üèóÔ∏è Stacking:    315 points (3 models + meta-learner)

üèÜ RECOMMENDATION: Use Original RandomForest!
   ‚úÖ Simpler model with comparable performance


In [54]:
# Save Best Model and Encoder (Clean Version)
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Determine which model performed better
if stacking_score > final_score:
    model_to_save = stacking_classifier
    score_to_save = stacking_score
    approach_name = "Stacking"
    model_type = "StackingClassifier with 3 RandomForest base models"
else:
    model_to_save = best_model
    score_to_save = final_score
    approach_name = "Original RF"
    model_type = "RandomForestClassifier"

model_filename = f"classifier_{timestamp}.pck"
pickle.dump(model_to_save, open(model_filename, "wb"))

# Save encoder
encoder_filename = "encoder.pck"
pickle.dump(enc, open(encoder_filename, "wb"))

print(f"üíæ Best model ({approach_name}) saved as: {model_filename}")
print(f"üíæ Model type: {model_type}")
print(f"üíæ Encoder saved as: {encoder_filename}")
print(f"üíæ Final score: {score_to_save} points")
print(f"üíæ Improvement over baseline: +{score_to_save - baseline_score} points")


üíæ Best model (Original RF) saved as: classifier_20250928_130858.pck
üíæ Model type: RandomForestClassifier
üíæ Encoder saved as: encoder.pck
üíæ Final score: 326 points
üíæ Improvement over baseline: +25 points


In [55]:
# Stacking Ensemble: Top 3 RandomForest Models
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import pandas as pd

print("üèóÔ∏è Building Stacking Ensemble from Top 3 RandomForest models...")

# Extrahiere Top 3 Modelle aus der RandomizedSearchCV
cv_results = pd.DataFrame(pipeline_search.cv_results_)
cv_results_sorted = cv_results.sort_values('mean_test_score', ascending=False)

print(f"üìä Available models from RandomizedSearchCV: {len(cv_results)}")
print(f"üìä Top 3 CV scores: {cv_results_sorted['mean_test_score'].head(3).values}")

# Erstelle die Top 3 RandomForest Modelle
top_3_models = []
for i in range(3):
    params = cv_results_sorted.iloc[i]['params']
    
    # Erstelle Pipeline mit gleicher Struktur
    model_pipeline = Pipeline([
        ('feature_engineering', HomeAwayDifferenceTransformer(verbose=False)),
        ('rf', RandomForestClassifier(random_state=42, n_jobs=-1))
    ])
    
    # Setze die Parameter
    model_pipeline.set_params(**params)
    
    # Trainiere das Modell
    model_pipeline.fit(X_train, y_train)
    
    top_3_models.append((f'rf_model_{i+1}', model_pipeline))
    
    print(f"‚úÖ Model {i+1} trained with CV score: {cv_results_sorted.iloc[i]['mean_test_score']:.1f}")

print(f"\nüèóÔ∏è Creating Stacking Ensemble with {len(top_3_models)} base models...")

# Stacking Classifier mit LogisticRegression als Meta-Learner
stacking_classifier = StackingClassifier(
    estimators=top_3_models,
    final_estimator=LogisticRegression(random_state=42, max_iter=1000),
    cv=tscv,  # Gleiche CV-Strategie
    n_jobs=-1,
    verbose=True
)

print("üöÄ Training Stacking Ensemble...")
stacking_classifier.fit(X_train, y_train)

# Stacking Predictions
stacking_predictions = stacking_classifier.predict(X_test)
stacking_score = kicktipp_score(y_test, stacking_predictions)

print(f"\nüéØ Stacking Ensemble Test Score: {stacking_score} points")

# Vergleiche auch die individuellen Modelle
print(f"\nüìä Individual model performance on test set:")
for i, (name, model) in enumerate(top_3_models):
    individual_predictions = model.predict(X_test)
    individual_score = kicktipp_score(y_test, individual_predictions)
    print(f"  {name}: {individual_score} points")


üèóÔ∏è Building Stacking Ensemble from Top 3 RandomForest models...
üìä Available models from RandomizedSearchCV: 150
üìä Top 3 CV scores: [346.8 345.8 345.4]
‚úÖ Model 1 trained with CV score: 346.8
‚úÖ Model 2 trained with CV score: 345.8
‚úÖ Model 3 trained with CV score: 345.4

üèóÔ∏è Creating Stacking Ensemble with 3 base models...
üöÄ Training Stacking Ensemble...


ValueError: cross_val_predict only works for partitions

In [None]:
# Final Model Comparison: All Approaches
print("=" * 70)
print("üèÜ FINAL MODEL COMPARISON - ALL APPROACHES")
print("=" * 70)
print(f"üìä Baseline Score:              {baseline_score} points")
print(f"üå≥ Original RandomForest:       {final_score} points")
print(f"üîÑ Pipeline (Feature Eng.):     {pipeline_final_score} points")
print(f"üèóÔ∏è Stacking Ensemble:           {stacking_score} points")
print("=" * 70)

# Alle Verbesserungen berechnen
all_scores = {
    'Baseline': baseline_score,
    'Original RF': final_score,
    'Pipeline RF': pipeline_final_score,
    'Stacking': stacking_score
}

improvements = {name: score - baseline_score for name, score in all_scores.items() if name != 'Baseline'}

print("üìà IMPROVEMENTS over baseline:")
for approach, improvement in improvements.items():
    print(f"  {approach:15}: +{improvement:4.0f} points")

# Bester Ansatz
best_approach_name = max(all_scores, key=all_scores.get)
best_score_final = all_scores[best_approach_name]

print(f"\nüèÜ OVERALL WINNER: {best_approach_name} with {best_score_final} points!")

# Modell-Komplexit√§t vs Performance
print(f"\nüí° COMPLEXITY vs PERFORMANCE ANALYSIS:")
print(f"  üìä Baseline:        {baseline_score:3.0f} points (Trivial)")
print(f"  üå≥ Original RF:     {final_score:3.0f} points (Medium complexity)")
print(f"  üîÑ Pipeline RF:     {pipeline_final_score:3.0f} points (Medium+ complexity)")  
print(f"  üèóÔ∏è Stacking:        {stacking_score:3.0f} points (High complexity)")

# Finale Empfehlung
stacking_improvement = stacking_score - max(final_score, pipeline_final_score)
if stacking_improvement >= 5:
    print(f"\nüèÜ RECOMMENDATION: Stacking Ensemble!")
    print(f"   ‚úÖ Best performance: {stacking_score} points")
    print(f"   ‚úÖ Significant improvement: +{stacking_improvement} points")
    final_best_model = stacking_classifier
elif pipeline_final_score > final_score:
    print(f"\nüèÜ RECOMMENDATION: Pipeline with Feature Engineering!")
    print(f"   ‚úÖ Good performance with reasonable complexity")
    print(f"   ‚úÖ Feature engineering benefits: +{pipeline_final_score - final_score} points")
    final_best_model = pipeline_best_model
else:
    print(f"\nüèÜ RECOMMENDATION: Original RandomForest!")
    print(f"   ‚úÖ Simple and effective: {final_score} points")
    final_best_model = best_model

print(f"\nüíæ Best model ready for saving: {type(final_best_model).__name__}")


In [None]:
# Save Best Model and Encoder
from datetime import datetime

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save the best performing model
model_filename = f"classifier_{timestamp}.pck"
pickle.dump(stacking_classifier, open(model_filename, "wb"))

# Save encoder
encoder_filename = "encoder.pck"
pickle.dump(enc, open(encoder_filename, "wb"))

print(f"üíæ Stacking model saved as: {model_filename}")
print(f"üíæ Encoder saved as: {encoder_filename}")
print(f"üíæ Final score: {stacking_score} points")
