In [None]:
# To train my model, I need the data in a wide format so the model can accurately consider both teams involved in a matchup

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.preprocessing import StandardScaler

In [7]:
matchup_df = pd.read_json('../processed_data/pl_data_dt.json')

matchup_df['actual_result'] = np.where(matchup_df['home_goals'] > matchup_df['away_goals'], 1, 0)

model = RandomForestClassifier(n_estimators=50, min_samples_split=2, min_samples_leaf=1,)

In [10]:
# Define features for prediction
feature_columns = [
    # Context Features
    
    # Home Team Form (Rolling Averages)
    'home_goals_rolling_avg', 'home_conceded_goals_rolling_avg',
    'home_shots_rolling_avg', 
    # 'home_conceded_shots_rolling_avg',
    'home_shots_on_goal_rolling_avg', 
    # 'home_conceded_shots_on_goal_rolling_avg',
    # 'home_goalkeeper_saves_rolling_avg', 
    # 'home_blocked_shots_rolling_avg',
    'home_conversion_rate_rolling_avg', 
    'home_target_ratio_rolling_avg',
    
    # Away Team Form (Rolling Averages)
    'away_goals_rolling_avg', 'away_conceded_goals_rolling_avg',
    'away_shots_rolling_avg', 
    # 'away_conceded_shots_rolling_avg',
    'away_shots_on_goal_rolling_avg', 
    # 'away_conceded_shots_on_goal_rolling_avg',
    # 'away_goalkeeper_saves_rolling_avg', 'away_blocked_shots_rolling_avg',
    'away_conversion_rate_rolling_avg', 'away_target_ratio_rolling_avg',
    
    # # Current Game Attack Strength
    # 'home_shot_creation_ratio', 'home_target_ratio', 'home_conversion_rate',
    # 'away_shot_creation_ratio', 'away_target_ratio', 'away_conversion_rate',
    
    # # Possession and Control
    # 'home_poss', 'away_poss',
    # 'home_chances', 'away_chances',
    # 'home_dangerous_attacks', 'away_dangerous_attacks',
    
    # Efficiency Metrics (Derived)
    f'home_danger_ratio', # home_dangerous_attacks / home_attacks
    f'away_danger_ratio', # away_dangerous_attacks / away_attacks
    f'home_shot_efficiency', # home_shots_on_goal / home_shots
    f'away_shot_efficiency', # away_shots_on_goal / away_shots
]

# Create derived features
matchup_df['home_danger_ratio'] = matchup_df['home_dangerous_attacks'] / matchup_df['home_attacks'].replace(0, 1)
matchup_df['away_danger_ratio'] = matchup_df['away_dangerous_attacks'] / matchup_df['away_attacks'].replace(0, 1)
matchup_df['home_shot_efficiency'] = matchup_df['home_shots_on_goal'] / matchup_df['home_shots'].replace(0, 1)
matchup_df['away_shot_efficiency'] = matchup_df['away_shots_on_goal'] / matchup_df['away_shots'].replace(0, 1)

# Prepare features and targets
X = matchup_df[feature_columns]
y_result = matchup_df['actual_result']  #1: Home Win, 2: Away Win

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_result_train, y_result_test = train_test_split(
    X_scaled, y_result, test_size=0.2, random_state=42
)

# Initialize models
# 1. Match Result Predictor (Classification)
result_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=30,
    min_samples_split=40,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1  # Use all available cores
    ,criterion='entropy',
    ccp_alpha=0.05
)

# Train model
result_model.fit(X_train, y_result_train)

# Make predictions
y_pred = result_model.predict(X_test)
y_pred_proba = result_model.predict_proba(X_test)
print("This is the probability: ", y_pred_proba)
print("Maximum probability", np.max(y_pred_proba[:,0]))
# Evaluate
print("Model Performance:")
print("\nAccuracy:", accuracy_score(y_result_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_result_test, y_pred, 
                          target_names=['Home Win', 'Away Win/Draw']))

# Feature importance
importance_df = pd.DataFrame({
    'feature': feature_columns,
    'importance': result_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))


This is the probability:  [[0.53188015 0.46811985]
 [0.54673693 0.45326307]
 [0.53792494 0.46207506]
 ...
 [0.49301435 0.50698565]
 [0.54673693 0.45326307]
 [0.58922514 0.41077486]]
Maximum probability 0.6002146370249782
Model Performance:

Accuracy: 0.6406480117820325

Classification Report:
               precision    recall  f1-score   support

     Home Win       0.61      0.92      0.73       367
Away Win/Draw       0.76      0.32      0.45       312

     accuracy                           0.64       679
    macro avg       0.69      0.62      0.59       679
 weighted avg       0.68      0.64      0.60       679


Top 10 Most Important Features:
                             feature  importance
14              home_shot_efficiency    0.355237
3     home_shots_on_goal_rolling_avg    0.317073
2             home_shots_rolling_avg    0.146341
0             home_goals_rolling_avg    0.121951
9     away_shots_on_goal_rolling_avg    0.059397
1    home_conceded_goals_rolling_avg    0.0000

In [58]:
y_pred = result_model.predict(X_train)
y_pred_proba = result_model.predict_proba(X_train)

# Evaluate
print("Model Performance:")
print("\nAccuracy:", accuracy_score(y_result_train, y_pred))
print("\nClassification Report:")
print(classification_report(y_result_train, y_pred, 
                          target_names=['Home Win', 'Away Win/Draw']))

# Feature importance
importance_df = pd.DataFrame({
    'feature': feature_columns,
    'importance': result_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))


Model Performance:

Accuracy: 0.6504424778761062

Classification Report:
               precision    recall  f1-score   support

     Home Win       0.62      0.92      0.74      1473
Away Win/Draw       0.78      0.33      0.46      1239

     accuracy                           0.65      2712
    macro avg       0.70      0.62      0.60      2712
 weighted avg       0.69      0.65      0.61      2712


Top 10 Most Important Features:
                             feature  importance
14              home_shot_efficiency    0.355237
3     home_shots_on_goal_rolling_avg    0.317073
2             home_shots_rolling_avg    0.146341
0             home_goals_rolling_avg    0.121951
9     away_shots_on_goal_rolling_avg    0.059397
1    home_conceded_goals_rolling_avg    0.000000
4   home_conversion_rate_rolling_avg    0.000000
5      home_target_ratio_rolling_avg    0.000000
6             away_goals_rolling_avg    0.000000
7    away_conceded_goals_rolling_avg    0.000000


In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load your data
matchup_df = pd.read_json('../processed_data/pl_data.json')

# Prepare your features and target variable
X = matchup_df[['home_goals', 'away_goals']]  # Example features
y = np.where(matchup_df['home_goals'] > matchup_df['away_goals'], 1, 
             np.where(matchup_df['home_goals'] < matchup_df['away_goals'], 2, 0))  # Target variable

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the DummyClassifier
dummy_clf = DummyClassifier()

# Train the DummyClassifier
dummy_clf.fit(X_train, y_train)

# Make predictions
y_pred = dummy_clf.predict(X_test)

# Evaluate the DummyClassifier
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.45949926362297494
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       169
           1       0.46      1.00      0.63       312
           2       0.00      0.00      0.00       198

    accuracy                           0.46       679
   macro avg       0.15      0.33      0.21       679
weighted avg       0.21      0.46      0.29       679



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [61]:
y_pred = dummy_clf.predict(X_train)

# Evaluate the DummyClassifier
accuracy = accuracy_score(y_train, y_pred)
report = classification_report(y_train, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(report)

Accuracy: 0.45685840707964603
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       614
           1       0.46      1.00      0.63      1239
           2       0.00      0.00      0.00       859

    accuracy                           0.46      2712
   macro avg       0.15      0.33      0.21      2712
weighted avg       0.21      0.46      0.29      2712



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
import pickle
filename = 'trained_model.sav'
pickle.dump(result_model, open(filename, 'wb'))

# min_sample_split = 15, min_sample_leaf = 15 => 81% (train), 68%(test)
# min_sample_split = 20, min_sample_leaf = 10 => 84% (train), 70%(test)

# min_sample_split = 40, min_sample_leaf = 5 => 83% (train), 70%(test)
# min_sample_split = 40, min_sample_leaf = 15 => 80% (train), 68%(test)

# criterion='entropy':
# min_sample_split = 40, min_sample_leaf = 15 => 80% (train), 69%(test)

In [None]:
matchup_df.tail(10)