# Predictive modelling
Author: Dr. Marco Zanin

In [1]:
# imports
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [3]:
# functions

def data_preparation(dataset, numeric_features, binary_features,
                     categorical_features, target, outliers_ids):
    """
    Build model-ready features and targets, with and without outlier rows.

    The caller's ``dataset`` is never modified; all work is done on a copy.

    Processing steps:
      1. If a ``date`` column is present it is parsed with ``pd.to_datetime``
         and replaced by ``date_numeric``, an integer code assigned in order
         of first appearance in the column (not chronological order).
      2. Each feature list is reduced to the columns that exist in the frame
         (``date_numeric`` may be requested as a feature).
      3. Numeric features are standardised with ``StandardScaler``, binary
         features are passed through unchanged, and categorical features are
         one-hot encoded with ``pd.get_dummies`` (all levels kept).
      4. Rows whose index label appears in ``outliers_ids`` are removed to
         produce the "no outliers" variants.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input data; must contain the ``target`` column.
    numeric_features, binary_features, categorical_features : list of str
        Candidate column names for each feature type. Names that are not
        columns of the frame are silently ignored.
    target : str
        Name of the target column.
    outliers_ids : iterable
        Index labels of the rows to exclude from the "no outliers" outputs.

    Returns
    -------
    X_full, y_full, X_no_outliers, y_no_outliers
        Processed features and target for all rows, followed by the same
        pair with the outlier rows removed.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``dataset``.
    """
    # Work on a private copy: the steps below add and drop columns, and the
    # caller's frame must not change as a side effect of this call.
    dataset = dataset.copy()

    ## Handling date (skipped when the frame has no 'date' column)
    if 'date' in dataset.columns:
        # 1. converts date to datetime
        dataset['date'] = pd.to_datetime(dataset['date'])
        # 2. converts datetime to an integer code; codes follow the order of
        #    first appearance in the column
        date_codes = {date: idx for idx, date in enumerate(dataset['date'].unique())}
        dataset['date_numeric'] = dataset['date'].map(date_codes)
        # 3. drops the original date column
        dataset = dataset.drop(columns=['date'])

    ## Keep only the requested features that actually exist in the dataset
    numeric_features = [col for col in numeric_features if col in dataset.columns]
    binary_features = [col for col in binary_features if col in dataset.columns]
    categorical_features = [col for col in categorical_features if col in dataset.columns]

    # Shared placeholder shape for any feature group that turns out empty
    def _empty_block():
        return pd.DataFrame(index=dataset.index)

    ## Process numeric features (standardize them)
    # NOTE: the scaler is fitted on every row passed in, including the rows
    # that are later dropped as outliers.
    if numeric_features:
        scaler = StandardScaler()
        numeric_data_scaled = pd.DataFrame(
            scaler.fit_transform(dataset[numeric_features]),
            columns=numeric_features,
            index=dataset.index,
        )
    else:
        numeric_data_scaled = _empty_block()

    ## Process binary features (leave them unchanged)
    binary_data = dataset[binary_features] if binary_features else _empty_block()

    ## Process categorical features (cast to 'category', then one-hot encode)
    if categorical_features:
        dataset[categorical_features] = dataset[categorical_features].astype('category')
        categorical_data_encoded = pd.get_dummies(dataset[categorical_features], drop_first=False)
    else:
        categorical_data_encoded = _empty_block()

    ## Combine numeric, binary, and categorical data
    dataset_processed = pd.concat(
        [numeric_data_scaled, binary_data, categorical_data_encoded], axis=1
    )

    ## Remove the outliers
    # Every block above carries dataset.index, so one mask serves both frames
    keep_rows = ~dataset.index.isin(outliers_ids)

    ## Create datasets for models
    # with outliers
    X_full = dataset_processed
    y_full = dataset[target]
    # without outliers
    X_no_outliers = dataset_processed.loc[keep_rows]
    y_no_outliers = dataset.loc[keep_rows, target]

    return X_full, y_full, X_no_outliers, y_no_outliers


In [27]:
# data import

# Read the three study tables from the local data directory
backs_vssg = pd.read_csv("./data/backs_vssg_study.csv")
forwards_vssg = pd.read_csv("./data/forwards_vssg_study.csv")
ssg = pd.read_csv("./data/ssg_study.csv")

# Integer flag separating forwards (0) from backs (1): a `players` value of
# 22 or less is coded 0, anything else (missing values included) is coded 1
ssg['forward0_back1'] = (~ssg['players'].le(22)).astype(int)

In [28]:
# Outliers identified for each data frame (index labels of the rows to exclude)
# NOTE(review): 444 is listed twice in outlier_ids_backs; this is harmless for
# an `isin` filter, but confirm the second entry is not a typo for another id.
outlier_ids_backs = [
    97, 230, 248, 40, 264, 304, 430, 28, 178,
    16, 4, 444, 157, 321, 355, 174, 372, 444,
]
outlier_ids_forwards = [179, 238, 265, 299, 319, 339, 502]
outlier_ids_ssg = [
    4, 6, 36, 105, 142, 222, 223, 254, 258, 411,
    420, 447, 463, 499, 553, 587, 621, 636, 655,
]

# inputs: feature columns grouped by how data_preparation treats them
numeric_features = [
    'total_distance_m',
    'tot_hsr_distance',
    'acceleration_density',
    'total_player_load',
    'player_load_slow',
    'get_up',
    'bullet',
]
binary_features = ['forward0_back1']
# 'date_numeric' is not in the CSV; data_preparation derives it from 'date'
categorical_features = ['players', 'date_numeric', 'ssg_bout']
target = 'stagnos_trimp'


# dataset for model training
X_full, y_full, X_no_outliers, y_no_outliers = data_preparation(
    dataset=ssg,
    numeric_features=numeric_features,
    binary_features=binary_features,
    categorical_features=categorical_features,
    target=target,
    outliers_ids=outlier_ids_ssg,
)

In [29]:
# Set random state
RANDOM_STATE = 42

# Define models (default hyper-parameters; seeded where the estimator is stochastic)
models = {
    'Linear Regression': LinearRegression(),
    'KNearestNeighbors': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
}

# Define the 10-fold cross-validation for consistency across models
# NOTE(review): the same splitter is applied to X_full and X_no_outliers; if
# the two frames differ in length, the fold partitions differ between the runs.
kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# Per-model fold scores, collected first and turned into DataFrames once after
# the loop instead of concatenating onto a growing frame on every iteration
fold_scores_full = {}
fold_scores_no_outliers = {}

# Evaluate each model using 10-fold cross-validation and store the individual scores
for name, model in models.items():
    # scikit-learn returns the negated RMSE, so flip the sign to get RMSE
    scores_full = -cross_val_score(
        model, X_full, y_full,
        scoring="neg_root_mean_squared_error", cv=kf,
    )
    scores_no_outliers = -cross_val_score(
        model, X_no_outliers, y_no_outliers,
        scoring="neg_root_mean_squared_error", cv=kf,
    )

    fold_scores_full[name] = scores_full
    fold_scores_no_outliers[name] = scores_no_outliers

# One column per model, one row per fold
scores_df_full = pd.DataFrame(fold_scores_full)
scores_df_no_outliers = pd.DataFrame(fold_scores_no_outliers)

In [43]:
# FULL DATA: per-model mean and standard deviation of the fold scores,
# ordered from the lowest mean score to the highest
mean_std_df_full = (
    scores_df_full
    .agg(['mean', 'std'])
    .T
    .rename(columns={'mean': 'Mean', 'std': 'Standard Deviation'})
    .sort_values('Mean')
)
mean_std_df_full

Unnamed: 0,Mean,Standard Deviation
Gradient Boosting,128.660855,27.079357
Random Forest,134.418796,27.955652
Linear Regression,135.744724,31.149857
KNearestNeighbors,158.297595,28.228378


In [42]:
# NO OUTLIERS: per-model mean and standard deviation of the fold scores,
# ordered from the lowest mean score to the highest
mean_std_df_no_outliers = (
    scores_df_no_outliers
    .agg(['mean', 'std'])
    .T
    .rename(columns={'mean': 'Mean', 'std': 'Standard Deviation'})
    .sort_values('Mean')
)
mean_std_df_no_outliers

Unnamed: 0,Mean,Standard Deviation
Gradient Boosting,100.086468,9.537541
Linear Regression,101.194564,10.294196
Random Forest,107.323823,7.983176
KNearestNeighbors,123.734664,10.865306


In [39]:
## Statistical comparison: paired samples t-test

# model whose fold scores are compared between the two dataset variants
my_model_name = 'Linear Regression'

# per-fold scores of that model, with and without the outlier rows
model_array_scores_full = scores_df_full.loc[:, my_model_name]
model_array_scores_no_outliers = scores_df_no_outliers.loc[:, my_model_name]

# run test
# NOTE(review): ttest_rel treats fold i of the full-data run and fold i of the
# no-outliers run as a matched pair; confirm that pairing is intended, since
# the two runs were cross-validated on frames that may differ in length.
res = stats.ttest_rel(
    model_array_scores_full,
    model_array_scores_no_outliers,
)
print(res)
print(res.confidence_interval(confidence_level=0.95))

TtestResult(statistic=2.976403382745041, pvalue=0.015540035301557426, df=9)
ConfidenceInterval(low=8.290986740599372, high=60.8093327007113)
