In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from quantile_forest import RandomForestQuantileRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
df = pd.read_json('../processed_data/pl_data_dt.json')
df.head()


Unnamed: 0,season,round,date,time,home_team,away_team,home_goals,away_goals,home_poss,away_poss,...,away_shots_on_goal_rolling_avg,away_conceded_shots_on_goal_rolling_avg,away_goalkeeper_saves_rolling_avg,away_blocked_shots_rolling_avg,away_shots_off_goal_rolling_avg,away_chances_rolling_avg,away_shot_creation_ratio_rolling_avg,away_target_ratio_rolling_avg,away_conversion_rate_rolling_avg,away_target_to_goal_ratio_rolling_avg
0,20232024,1,2023-11-08,15:00,Burnley,Manchester City,0,3,0.35,0.65,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20232024,1,2023-12-08,10:00,Sheffield Utd,Crystal Palace,0,1,0.33,0.67,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20232024,1,2023-12-08,10:00,Everton,Fulham,0,1,0.41,0.59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,20232024,1,2023-12-08,10:00,Brighton,Luton,4,1,0.71,0.29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20232024,1,2023-12-08,10:00,Bournemouth,West Ham,1,1,0.63,0.37,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df.columns

Index(['season', 'round', 'date', 'time', 'home_team', 'away_team',
       'home_goals', 'away_goals', 'home_poss', 'away_poss', 'home_shots',
       'away_shots', 'home_shots_on_goal', 'away_shots_on_goal',
       'home_corner_kicks', 'away_corner_kicks', 'home_goalkeeper_saves',
       'away_goalkeeper_saves', 'home_attacks', 'away_attacks',
       'home_dangerous_attacks', 'away_dangerous_attacks',
       'home_blocked_shots', 'away_blocked_shots', 'home_team_code',
       'away_team_code', 'home_shots_off_goal', 'away_shots_off_goal',
       'home_chances', 'away_chances', 'home_shot_creation_ratio',
       'away_shot_creation_ratio', 'home_target_ratio', 'away_target_ratio',
       'home_conversion_rate', 'away_conversion_rate',
       'home_target_to_goal_ratio', 'away_target_to_goal_ratio',
       'home_goals_rolling_avg', 'home_conceded_goals_rolling_avg',
       'home_shots_rolling_avg', 'home_conceded_shots_rolling_avg',
       'home_shots_on_goal_rolling_avg',
       'home_c

In [9]:
df.shape

(3391, 66)

In [10]:
#use random forest regression to find the number of goals
selected_features = ['home_shots_rolling_avg','home_shots_on_goal_rolling_avg', 'home_target_to_goal_ratio_rolling_avg',
                     'home_conversion_rate_rolling_avg','home_shot_creation_ratio_rolling_avg', 'home_shots_off_goal_rolling_avg']
                     

In [6]:
X = df[selected_features]
y = df['home_goals'] 

In [11]:
# Split data by date
cutoff_date = '2023-01-01'
df_past = df[df['date'] < cutoff_date]
df_future = df[df['date'] >= cutoff_date]

X_train = df_past[selected_features]
y_train = df_past['home_goals']
X_test = df_future[selected_features]
y_test = df_future['home_goals']


In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)
print('Length of our Training data:', X_train.shape, '\nLength of our Testing data:', y_test.shape)

Length of our Training data: (2712, 6) 
Length of our Testing data: (679,)


In [None]:
model = RandomForestRegressor(max_depth=2, n_estimators=100, random_state=45)
model2 = RandomForestQuantileRegressor(max_depth=2, n_estimators=100, random_state=45)

#fitting the model
model.fit(X_train, y_train)
model2.fit(X_train, y_train)

In [13]:
# Get predictions at 95% prediction intervals and median.
y_pred = model2.predict(X_test, quantiles=[0.16, 0.5, 0.84])

In [13]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': 2,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 45,
 'verbose': 0,
 'warm_start': False}

In [45]:
# Make predictions and round down to integers
y_pred_raw = model.predict(X_test)
y_pred = np.floor(y_pred_raw).astype(int)  # Round down to nearest integer

In [46]:
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print("Mean Squared Error (MSE): %f" % mse)
print("Mean Absolute Error (MAE): %f" % mae)
print("R-squared (R2 Score): %f" % r2) 

Mean Squared Error (MSE): 1.662739
Mean Absolute Error (MAE): 0.935199
R-squared (R2 Score): -0.008227


In [47]:
# Create results dataframe
results_df = pd.DataFrame({
    'Actual Goals': y_test.values,
    'Predicted Goals': y_pred
}, index=y_test.index)

if 'home_team' in df.columns:
    results_df['Home Team'] = df_future['home_team']

# Calculate training and test R² scores
train_r2 = r2_score(y_train, model.predict(X_train))
test_r2 = r2_score(y_test, y_pred)

print("\nTraining R² Score: %.4f" % train_r2)
print("Test R² Score: %.4f" % test_r2)

# Display sample predictions
print("\nSample Predictions:")
print(results_df.head(20))


Training R² Score: 0.0997
Test R² Score: -0.0082

Sample Predictions:
      Actual Goals  Predicted Goals Home Team
1968             0                1       NaN
2516             1                1       NaN
170              2                1    Wolves
2833             2                1       NaN
770              4                2       NaN
861              1                1       NaN
2806             1                1       NaN
1672             1                1       NaN
2815             0                1       NaN
1244             3                1       NaN
1429             1                2       NaN
12               3                1  West Ham
1512             4                1       NaN
2576             3                1       NaN
1446             2                1       NaN
2525             1                1       NaN
2609             0                1       NaN
3388             0                1       NaN
218              4                1     Luton
934      

In [15]:
# Predicting goals scored for each sample in the test set
predictions = model2.predict(X_test)

# Create results DataFrame with rounded predictions
results_df = pd.DataFrame({
    'Actual Goals': y_test.values,
    'Predicted Left Goals': np.floor(y_pred[:, 0]).astype(int),  
    'Predicted Median Goals': np.floor(y_pred[:, 1]).astype(int),
    'Predicted High Goals': np.floor(y_pred[:, 2]).astype(int)
}, index=y_test.index)

# Calculate range indicators
results_df['in_range'] = np.where(
    (results_df['Actual Goals'] >= results_df['Predicted Left Goals']) & 
    (results_df['Actual Goals'] <= results_df['Predicted High Goals']), 
    1, 0
)

results_df['low_range'] = np.where(
    (results_df['Actual Goals'] >= results_df['Predicted Left Goals']) & 
    (results_df['Actual Goals'] <= results_df['Predicted Median Goals']), 
    1, 0
)

results_df['high_range'] = np.where(
    (results_df['Actual Goals'] >= results_df['Predicted Median Goals']) & 
    (results_df['Actual Goals'] <= results_df['Predicted High Goals']), 
    1, 0
)

# Calculate below and above range
results_df['below_range'] = np.where(
    results_df['Actual Goals'] < results_df['Predicted Left Goals'],
    1, 0
)

results_df['above_range'] = np.where(
    results_df['Actual Goals'] > results_df['Predicted High Goals'],
    1, 0
)

# Calculate and print comprehensive statistics
print("Prediction Range Statistics:")
print(f"Total Predictions: {len(results_df)}")
print(f"\nWithin Full Range (16th to 84th percentile): {results_df['in_range'].mean():.2%}")
print(f"Within Lower Range (16th to 50th percentile): {results_df['low_range'].mean():.2%}")
print(f"Within Upper Range (50th to 84th percentile): {results_df['high_range'].mean():.2%}")
print(f"Below Range (<16th percentile): {results_df['below_range'].mean():.2%}")
print(f"Above Range (>84th percentile): {results_df['above_range'].mean():.2%}")

# Calculate average interval sizes
results_df['full_interval'] = results_df['Predicted High Goals'] - results_df['Predicted Left Goals']
results_df['lower_interval'] = results_df['Predicted Median Goals'] - results_df['Predicted Left Goals']
results_df['upper_interval'] = results_df['Predicted High Goals'] - results_df['Predicted Median Goals']

print("\nInterval Sizes:")
print(f"Average Full Interval Size: {results_df['full_interval'].mean():.2f} goals")
print(f"Average Lower Interval Size: {results_df['lower_interval'].mean():.2f} goals")
print(f"Average Upper Interval Size: {results_df['upper_interval'].mean():.2f} goals")

# Display sample of results
print("\nSample Predictions:")
results_df.head(20)

Prediction Range Statistics:
Total Predictions: 596

Within Full Range (16th to 84th percentile): 85.57%
Within Lower Range (16th to 50th percentile): 54.87%
Within Upper Range (50th to 84th percentile): 62.42%
Below Range (<16th percentile): 3.86%
Above Range (>84th percentile): 10.57%

Interval Sizes:
Average Full Interval Size: 2.58 goals
Average Lower Interval Size: 1.08 goals
Average Upper Interval Size: 1.50 goals

Sample Predictions:


Unnamed: 0,Actual Goals,Predicted Left Goals,Predicted Median Goals,Predicted High Goals,in_range,low_range,high_range,below_range,above_range,full_interval,lower_interval,upper_interval
0,0,0,1,2,1,1,0,0,0,2,1,1
1,0,0,1,2,1,1,0,0,0,2,1,1
2,0,0,1,2,1,1,0,0,0,2,1,1
3,4,0,1,2,0,0,0,0,1,2,1,1
4,1,0,1,2,1,1,1,0,0,2,1,1
5,5,0,1,2,0,0,0,0,1,2,1,1
6,2,0,1,2,1,0,1,0,0,2,1,1
7,1,0,1,2,1,1,1,0,0,2,1,1
8,1,0,1,2,1,1,1,0,0,2,1,1
9,2,0,1,2,1,0,1,0,0,2,1,1


In [27]:
print('Score: ', r2_score(y_train, model2.predict(X_train)))
print('Score: ', r2_score(y_test, y_pred[:, 1]))
print('MSE: ', mean_squared_error(y_test, y_pred[:, 1]))

Score:  0.6769401150044914
Score:  0.6785105532290987
MSE:  0.5301914580265096
