In [16]:
# Import Statements
import pandas as pd
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

In [17]:
# Load the Kaggle players dataset (credit: Paola Mazza) into a pandas DataFrame.
# The path is relative to the notebook's working directory.
players_df = pd.read_csv(filepath_or_buffer="players.csv")

In [18]:
# Build one DataFrame per playing position by filtering on the 'position' column.
# Each result holds only the rows whose position code equals the given label.
defenders_df = players_df.loc[players_df['position'].eq('DEF')]
midfielders_df = players_df.loc[players_df['position'].eq('MID')]
forwards_df = players_df.loc[players_df['position'].eq('FWD')]
keepers_df = players_df.loc[players_df['position'].eq('GKP')]

In [19]:
# Preprocess the data within the Pandas Data Frame
def preprocess(position_df):
    """Return a de-duplicated copy of ``position_df``.

    The input frame is copied first, so the caller's DataFrame is never
    modified. Fully identical rows are then collapsed to their first
    occurrence; the surviving rows keep their original index labels.
    """
    return position_df.copy().drop_duplicates()

# Run the same preprocessing over every position frame.
# Order is defenders, midfielders, forwards, keepers; the names on the left
# line up one-to-one with the frames on the right.
(
    processed_defenders_df,
    processed_midfielders_df,
    processed_forwards_df,
    processed_keepers_df,
) = map(preprocess, (defenders_df, midfielders_df, forwards_df, keepers_df))

In [20]:
# Function to perform hyperparameter tuning with cross-validation
# Can't run this locally, need computational power
def tune_hyperparameters(X, y, param_grid=None, n_splits=5, random_state=42, n_jobs=None):
    """Grid-search XGBoost regressor hyperparameters with shuffled K-fold CV.

    Args:
        X: Feature matrix handed to ``GridSearchCV.fit``.
        y: Target values handed to ``GridSearchCV.fit``.
        param_grid: Mapping of parameter name -> list of candidate values.
            When omitted, the notebook's default grid (two candidates for
            each of five parameters, i.e. 32 combinations) is searched.
        n_splits: Number of cross-validation folds.
        random_state: Seed used to shuffle rows into folds.
        n_jobs: Forwarded to ``GridSearchCV``. ``None`` keeps scikit-learn's
            default; pass ``-1`` to use every available core.

    Returns:
        dict: The parameter combination with the best (least negative)
        ``neg_mean_squared_error`` score. The same dict is also printed.
    """
    if param_grid is None:
        # Default search space.
        param_grid = {
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 4],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0],
            'n_estimators': [100, 200],
        }

    # Shuffled folds with a fixed seed so repeated runs score the same splits.
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    grid_search = GridSearchCV(
        estimator=XGBRegressor(),
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=kfold,
        n_jobs=n_jobs,
    )
    grid_search.fit(X, y)

    best_params = grid_search.best_params_
    print("Best Hyperparameters:", best_params)

    return best_params

In [21]:
# Function to train XGBoost model with the best hyperparameters
# Can't run this locally, need computational power
def train_xgboost_model_best(X, y, position):
    """Tune hyperparameters on ``(X, y)``, then fit a fresh regressor with them.

    Args:
        X: Feature matrix used for both the search and the final fit.
        y: Target values used for both the search and the final fit.
        position: Label interpolated into the progress message.

    Returns:
        The ``XGBRegressor`` fitted on all of ``(X, y)`` with the parameters
        returned by ``tune_hyperparameters``.
    """
    tuned_params = tune_hyperparameters(X, y)

    # Build the regressor from the winning parameters and fit it in one step.
    regressor = XGBRegressor(**tuned_params).fit(X, y)

    print(f"XGBoost Model trained for {position} position.")
    return regressor

In [22]:
# Manually selected XGBoost hyperparameters, shared by every position model
hyperparams = dict(
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    n_estimators=100,
)

In [23]:
# Function to train XGBoost model with the predefined hyperparameters
def train_xgboost_model(X, y, position):
    """Fit an ``XGBRegressor`` configured from the module-level ``hyperparams``.

    Args:
        X: Feature matrix to fit on.
        y: Target values to fit on.
        position: Label interpolated into the progress message.

    Returns:
        The fitted ``XGBRegressor``.
    """
    regressor = XGBRegressor(**hyperparams)
    regressor.fit(X, y)

    print(f"XGBoost Model trained for {position} position.")
    return regressor

In [24]:
# Train XGBoost models for each position
xgb_model_gkp = train_xgboost_model(processed_keepers_df[processed_keepers_df.select_dtypes(include=['int']).columns], processed_keepers_df['total_points'], "GKP")
xgb_model_def = train_xgboost_model(processed_defenders_df[processed_defenders_df.select_dtypes(include=['int']).columns], processed_defenders_df['total_points'], "DEF")
xgb_model_mid = train_xgboost_model(processed_midfielders_df[processed_midfielders_df.select_dtypes(include=['int']).columns], processed_midfielders_df['total_points'], "MID")
xgb_model_fwd = train_xgboost_model(processed_forwards_df[processed_forwards_df.select_dtypes(include=['int']).columns], processed_forwards_df['total_points'], "FWD")

XGBoost Model trained for GKP position.
XGBoost Model trained for DEF position.
XGBoost Model trained for MID position.
XGBoost Model trained for FWD position.


In [25]:
# Function to evaluate XGBoost model and print MSE
def evaluate_model(model, X_test, y_test, position):
    # Predictions
    y_pred = model.predict(X_test)
    # Calculate MSE
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error for {position}: {mse}")

In [26]:
# Evaluate models for each position
evaluate_model(xgb_model_gkp, processed_keepers_df[processed_keepers_df.select_dtypes(include=['int']).columns], processed_keepers_df['total_points'], "GKP")
evaluate_model(xgb_model_def, processed_defenders_df[processed_defenders_df.select_dtypes(include=['int']).columns], processed_defenders_df['total_points'], "DEF")
evaluate_model(xgb_model_mid, processed_midfielders_df[processed_midfielders_df.select_dtypes(include=['int']).columns], processed_midfielders_df['total_points'], "MID")
evaluate_model(xgb_model_fwd, processed_forwards_df[processed_forwards_df.select_dtypes(include=['int']).columns], processed_forwards_df['total_points'], "FWD")

Mean Squared Error for GKP: 0.0041144134929303215
Mean Squared Error for DEF: 0.0812413861896636
Mean Squared Error for MID: 0.20155126849218444
Mean Squared Error for FWD: 0.03313075796556716
