In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the scaled feature matrices and the matching match-result labels.
# Only the 'FTR' column is needed from the engineered match tables, so
# `usecols` avoids parsing every other column in those files.
X_train = pd.read_csv('Features/TrainingSet/features_scaled.csv')
y_train = pd.read_csv(
    'Features/TrainingSet/matches_engineered.csv', usecols=['FTR']
)['FTR']  # Home (H), Draw (D), Away (A)
X_val = pd.read_csv('Features/ValidationSet/features_scaled.csv')
y_val = pd.read_csv(
    'Features/ValidationSet/matches_engineered.csv', usecols=['FTR']
)['FTR']

In [2]:
# Baseline forest with hand-picked hyperparameters; random_state pins the seed
# so repeated runs build the same trees.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

# Train
rf.fit(X_train, y_train)

# Validate
val_pred = rf.predict(X_val)
print(f"Validation Accuracy: {accuracy_score(y_val, val_pred):.3f}")

# Bind display names to the fitted class codes explicitly. Without `labels=`,
# classification_report infers the label order from the data, so a hard-coded
# target_names list is only right while that inferred order happens to match.
class_names = {'A': 'Away', 'D': 'Draw', 'H': 'Home'}
print(classification_report(
    y_val,
    val_pred,
    labels=rf.classes_,
    # Fall back to the raw code for any label not in the mapping.
    target_names=[class_names.get(code, str(code)) for code in rf.classes_],
))

Validation Accuracy: 0.482
              precision    recall  f1-score   support

        Away       0.47      0.28      0.35       111
        Draw       0.29      0.04      0.07        96
        Home       0.49      0.86      0.63       173

    accuracy                           0.48       380
   macro avg       0.42      0.39      0.35       380
weighted avg       0.43      0.48      0.41       380



In [3]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 3 x 3 x 3 = 27 combinations in total.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training data, ranking candidates by accuracy. `fit` returns the search
# object itself, so it can be chained onto the constructor.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Estimator refit with the best-scoring parameters (GridSearchCV's default
# refit behaviour).
best_rf = grid_search.best_estimator_
print(f"Best Params: {grid_search.best_params_}")

Best Params: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}


In [4]:
# Final check on the held-out test split, using the estimator selected by
# the grid search.
X_test = pd.read_csv('Features/TestSet/features_scaled.csv')
test_matches = pd.read_csv('Features/TestSet/matches_engineered.csv')
y_test = test_matches.loc[:, 'FTR']

test_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_pred)
test_report = classification_report(y_true=y_test, y_pred=test_pred)

print("\nTest Set Performance:")
print(f"Accuracy: {test_accuracy:.3f}")
print(test_report)


Test Set Performance:
Accuracy: 0.437
              precision    recall  f1-score   support

           A       0.44      0.25      0.32       106
           D       0.31      0.05      0.08       110
           H       0.44      0.82      0.58       164

    accuracy                           0.44       380
   macro avg       0.40      0.37      0.32       380
weighted avg       0.40      0.44      0.36       380



In [None]:
import matplotlib.pyplot as plt

# Pair each training column with the tuned forest's importance score,
# ordered from most to least important.
importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': best_rf.feature_importances_
}).sort_values('Importance', ascending=False)

# Plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(importances['Feature'], importances['Importance'])
# barh draws the first row at the bottom; flip the axis so the chart reads
# top-down in the same descending order as the sorted table.
ax.invert_yaxis()
ax.set_xlabel('Importance')
ax.set_title('Random Forest Feature Importance')
fig.tight_layout()
fig.savefig('feature_importance.png', dpi=300)
plt.show()