In [144]:
import pandas as pd

# Only rows whose start_type is one of these values are kept.
start_types_wants = ['pass_interception', 'recovery']

# Columns treated as irrelevant for modelling and removed from the frame.
cols_to_drop = [
    'Unique ID', 'match_id', 'event_index', 'frame_anchor',
    'rec_player_id', 'rec_team_short', 'start_type', 'end_type',
    'event_row_index', 'source_file', 'error', 'third_end',
    'game_state',
]

# Read the features file, filter on start_type, then drop the unused columns.
df = (
    pd.read_csv('../Our Datasets/features_sample_match.csv')
    .loc[lambda frame: frame['start_type'].isin(start_types_wants)]
    .drop(columns=cols_to_drop)
)

# Collapse the high/medium/low block labels into a single 'block' label.
phase_col = 'team_out_of_possession_phase_type'
for block_label in ('high_block', 'medium_block', 'low_block'):
    df[phase_col] = df[phase_col].str.replace(block_label, 'block')

In [145]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor  # pip install xgboost if needed

In [146]:
# Ordinal encoding of the starting third: defensive=0, middle=1, attacking=2.
third_order = ["defensive", "middle", "attacking"]
third_codes = {label: code for code, label in enumerate(third_order)}
# Encode only while the column still holds labels. Mapping the already-encoded
# integers through the label dict would turn every value into NaN on a re-run.
if not pd.api.types.is_numeric_dtype(df["third_start"]):
    df["third_start"] = df["third_start"].map(third_codes)

game_state_map = {
    "losing": -1,
    "drawing": 0,
    "winning": 1
}
# NOTE: game_state_map is not applied here; "game_state" is one of the columns
# dropped when the data is loaded, so there is nothing left to encode.

# One-hot encode the out-of-possession phase. The guard keeps the cell
# re-runnable: get_dummies consumes the source column, so a second call would
# raise KeyError.
oop_phase_col = "team_out_of_possession_phase_type"
if oop_phase_col in df.columns:
    df = pd.get_dummies(df, columns=[oop_phase_col], prefix="oop_phase")

# Rows without a target value cannot be used for fitting or scoring.
target_col = "max_player_targeted_xthreat"
df = df[df[target_col].notna()]

In [147]:
# Show the frame as it stands after encoding and target filtering
# (the rich display truncates the middle rows of a long frame).
df

Unnamed: 0,dist_to_near_goal,dist_to_far_goal,d_nearest_opp,n_opp_within5,d_nearest_team,mean_team_dist,n_forward_options,third_start,max_player_targeted_xthreat,oop_phase_block,oop_phase_chaotic,oop_phase_defending_direct,oop_phase_defending_quick_break,oop_phase_defending_set_play,oop_phase_defending_transition,oop_phase_disruption
0,10.301189,99.575170,7.026308,0.0,8.593788,31.734731,10.0,0,0.0024,True,False,False,False,False,False,False
3,34.427403,75.416484,0.000000,1.0,7.359348,27.331335,9.0,2,0.0091,False,True,False,False,False,False,False
9,30.598440,82.713146,4.389533,1.0,2.603114,13.502075,0.0,2,0.0091,False,True,False,False,False,False,False
14,51.188596,63.659032,2.489598,2.0,6.110491,23.968649,7.0,1,0.0359,False,True,False,False,False,False,False
19,47.054798,57.972010,0.982700,1.0,11.608070,23.278395,8.0,1,0.0359,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5844,39.664700,84.239471,8.829813,0.0,2.687452,15.000577,5.0,0,0.0006,False,False,True,False,False,False,False
5845,38.240528,71.880025,12.178974,0.0,4.323344,20.330756,2.0,0,0.0064,False,False,False,False,False,False,True
5847,56.429475,60.018211,3.381035,1.0,6.104629,27.814378,4.0,1,0.0014,False,False,False,False,False,False,True
5848,28.501218,78.036013,4.867700,1.0,4.610531,12.621078,2.0,0,0.0026,False,True,False,False,False,False,False


In [148]:
# Work on a copy so the encoded frame stays available for inspection.
target_col = "max_player_targeted_xthreat"
df_model = df.copy()
df_model = df_model.loc[df_model[target_col].notna()]

# Separate the target from the feature matrix.
y = df_model[target_col]
X = df_model.drop(columns=[target_col])

# Hold out 15% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [149]:
# Random forest with 25 trees, a fixed seed, and all available cores.
rf_params = {"n_estimators": 25, "random_state": 42, "n_jobs": -1}
rf = RandomForestRegressor(**rf_params)
rf.fit(X_train, y_train)

In [150]:
# Gradient-boosted trees with 25 rounds, squared-error objective, fixed seed.
xgb_params = {
    "n_estimators": 25,
    "random_state": 42,
    "n_jobs": -1,
    "objective": "reg:squarederror",
}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

In [151]:
def evaluate(model, X_train, y_train, X_test, y_test, name):
    """Print train and test RMSE and R2 for an already-fitted regressor.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_train, y_train
        Features and target the model was fitted on.
    X_test, y_test
        Held-out features and target.
    name : str
        Label printed in the report header.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # Predict on both splits before printing anything, so a failing predict
    # does not leave a half-written report.
    splits = (("Train", X_train, y_train), ("Test ", X_test, y_test))
    scored = [(label, y_true, model.predict(X)) for label, X, y_true in splits]

    # The pushpin is written as a named escape so the header survives files
    # saved or read with the wrong encoding.
    print(f"\n\N{PUSHPIN} {name} Performance")
    for label, y_true, y_pred in scored:
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        # RMSE is rounded like R2 so the two columns read consistently.
        print(f"{label} RMSE: {rmse:.4f} | "
              f"R2: {r2_score(y_true, y_pred):.3f}")

# Report both tree ensembles on the same train/test split.
for fitted_model, label in ((rf, "Random Forest"), (xgb, "XGBoost")):
    evaluate(fitted_model, X_train, y_train, X_test, y_test, label)


📌 Random Forest Performance
Train RMSE: 0.015607540011880547 | R2: 0.825
Test  RMSE: 0.05012000078815738 | R2: 0.098

📌 XGBoost Performance
Train RMSE: 0.011621492477787656 | R2: 0.903
Test  RMSE: 0.05098342339389901 | R2: 0.067


In [152]:
# Naive reference: predict the training-set mean for every test row.
train_mean = y_train.mean()
y_mean = np.full(y_test.shape, train_mean, dtype=float)

baseline_r2 = r2_score(y_test, y_mean)
baseline_rmse = np.sqrt(mean_squared_error(y_test, y_mean))

print("Baseline RMSE:", baseline_rmse)
print("Baseline R2:", baseline_r2)

Baseline RMSE: 0.052834685465648804
Baseline R2: -0.00221018682994667


In [153]:
from sklearn.linear_model import Ridge

# L2-regularised linear model (alpha=0.1), fitted and scored on the shared split.
ridge = Ridge(alpha=0.1).fit(X_train, y_train)
evaluate(ridge, X_train, y_train, X_test, y_test, "Ridge")


📌 Ridge Performance
Train RMSE: 0.035295443675820826 | R2: 0.104
Test  RMSE: 0.050892842032621635 | R2: 0.070


In [154]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares, fitted and scored on the shared split.
linreg = LinearRegression().fit(X_train, y_train)

evaluate(linreg, X_train, y_train, X_test, y_test, "Linear Regression")


📌 Linear Regression Performance
Train RMSE: 0.03529543540949013 | R2: 0.104
Test  RMSE: 0.05089291699223131 | R2: 0.070
