# Combine individual player files into a combined file

In [14]:
import os
import pandas as pd

def combine_player_data(preprocessed_folder):
    """
    Combines all preprocessed player data files into a single DataFrame and sorts them by game date.

    Parameters:
    - preprocessed_folder (str): The folder containing all the preprocessed player files.

    Returns:
    - DataFrame: Combined and sorted player data.
    """
    all_data = []

    # Loop through all files in the preprocessed folder
    for file_name in os.listdir(preprocessed_folder):
        if file_name.endswith('.csv'):
            file_path = os.path.join(preprocessed_folder, file_name)
            player_df = pd.read_csv(file_path)
            
            # Append the player's data to the list
            all_data.append(player_df)
    
    # Combine all the player data into a single DataFrame
    combined_df = pd.concat(all_data, ignore_index=True)
    
    # Ensure the data is sorted by 'GAME_DATE'
    combined_df['GAME_DATE'] = pd.to_datetime(combined_df['GAME_DATE'])
    combined_df = combined_df.sort_values(by='GAME_DATE', ascending=True).reset_index(drop=True)
    
    return combined_df

# Example usage:
preprocessed_folder = 'preprocessed_players_stats'
combined_player_data = combine_player_data(preprocessed_folder)

# Optionally, save the combined data for further inspection
combined_player_data.to_csv('combined_player_data.csv', index=False)

# Data Prep for Training

In [15]:
import numpy as np
import pandas as pd

# Function to preprocess player data including date encoding
def preprocess_player_data(player_df):
    # Step 1: Convert WL to 1 for Win and 0 for Loss
    player_df['WL'] = player_df['WL'].apply(lambda x: 1 if x == 'W' else 0)
    
    # # Step 2: Drop irrelevant columns (customize this based on your needs)
    # drop_columns = ['VIDEO_AVAILABLE', 'PLUS_MINUS', 'SEASON_ID', 'Player_ID']  # Modify the list as per your data
    # player_df = player_df.drop(columns=drop_columns, errors='ignore')

    # Step 3: Encode the GAME_DATE column
    # You can choose one or more of the following methods:

    # Method 1: Extract date components (year, month, day)
    player_df['GAME_YEAR'] = player_df['GAME_DATE'].dt.year
    player_df['GAME_MONTH'] = player_df['GAME_DATE'].dt.month
    player_df['GAME_DAY'] = player_df['GAME_DATE'].dt.day

    # Method 2: Calculate days since a reference date
    reference_date = pd.to_datetime('2024-10-22')
    player_df['DAYS_SINCE_REF'] = (player_df['GAME_DATE'] - reference_date).dt.days

    # Method 3: Cyclic encoding of month and day
    player_df['MONTH_SIN'] = np.sin(2 * np.pi * player_df['GAME_MONTH'] / 12)
    player_df['MONTH_COS'] = np.cos(2 * np.pi * player_df['GAME_MONTH'] / 12)
    player_df['DAY_SIN'] = np.sin(2 * np.pi * player_df['GAME_DAY'] / 31)
    player_df['DAY_COS'] = np.cos(2 * np.pi * player_df['GAME_DAY'] / 31)

    # Step 4: Drop the original GAME_DATE column (now encoded)
    player_df.drop(columns=['GAME_DATE'], inplace=True)

    # Return the preprocessed DataFrame
    return player_df

combined_player_data = preprocess_player_data(combined_player_data)

# Optionally, save the combined data for further inspection
combined_player_data.to_csv('combined_player_data.csv', index=False)

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def prepare_data_for_training(combined_df):
    """
    Prepares the combined player data for training an XGBoost model by selecting relevant features,
    encoding categorical data, and splitting the data into train/test sets.

    Parameters:
    - combined_df (DataFrame): The combined player game log data.

    Returns:
    - X_train, X_test, y_train, y_test: Training and test sets for the model.
    """
    # Feature selection: choose relevant features (adjust these as needed)
    features = [
        "SEASON_ID", "WL", "OPP_TEAM", # Game Context
        "GAME_YEAR", "GAME_MONTH", "GAME_DAY", "DAYS_SINCE_REF", "MONTH_SIN", "MONTH_COS", "DAY_SIN", "DAY_COS", # Game Date Context
        "MIN", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM" , "FTA", "FT_PCT", "PTS", # Core Stats
        "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", # Extra Non-point Stats
        "PLUS_MINUS", # Advanced Stats
        # Rolling Statistics
        "ROLLING_PTS_20_GAMES", "ROLLING_MIN_20_GAMES", "ROLLING_FGA_20_GAMES", "ROLLING_FGM_20_GAMES", "ROLLING_AST_20_GAMES", "ROLLING_REB_20_GAMES", "ROLLING_STL_20_GAMES", "ROLLING_BLK_20_GAMES", "ROLLING_TOV_20_GAMES", "ROLLING_PF_20_GAMES",
        "ROLLING_PTS_10_GAMES", "ROLLING_MIN_10_GAMES", "ROLLING_FGA_10_GAMES", "ROLLING_FGM_10_GAMES", "ROLLING_AST_10_GAMES", "ROLLING_REB_10_GAMES", "ROLLING_STL_10_GAMES", "ROLLING_BLK_10_GAMES", "ROLLING_TOV_10_GAMES", "ROLLING_PF_10_GAMES",
        "ROLLING_PTS_5_GAMES", "ROLLING_MIN_5_GAMES", "ROLLING_FGA_5_GAMES", "ROLLING_FGM_5_GAMES", "ROLLING_AST_5_GAMES", "ROLLING_REB_5_GAMES", "ROLLING_STL_5_GAMES", "ROLLING_BLK_5_GAMES", "ROLLING_TOV_5_GAMES", "ROLLING_PF_5_GAMES",
        "ROLLING_PTS_3_GAMES", "ROLLING_MIN_3_GAMES", "ROLLING_FGA_3_GAMES", "ROLLING_FGM_3_GAMES", "ROLLING_AST_3_GAMES", "ROLLING_REB_3_GAMES", "ROLLING_STL_3_GAMES", "ROLLING_BLK_3_GAMES", "ROLLING_TOV_3_GAMES", "ROLLING_PF_3_GAMES",
        "CUM_PTS_SEASON", "CUM_MIN_SEASON", "CUM_FGM_SEASON", "CUM_FGA_SEASON", "CUM_AST_SEASON", "CUM_REB_SEASON" # Cumulative Season Stats
    ]
    
    # Ensure no NaN values in features, fill with 0 or mean as appropriate
    combined_df = combined_df.fillna(0)
    
    # Encode categorical variables (e.g., OPP_TEAM, WL)
    label_encoder = LabelEncoder()
    combined_df['OPP_TEAM'] = label_encoder.fit_transform(combined_df['OPP_TEAM'])
    combined_df['WL'] = combined_df['WL'].apply(lambda x: 1 if x == 'W' else 0)
    
    # Input features (X) and target variable (y)
    X = combined_df[features]
    y = combined_df['PTS']  # Target is 'PTS' i.e. the points scored in the next game
    
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    return X_train, X_test, y_train, y_test

# Example usage:
X_train, X_test, y_train, y_test = prepare_data_for_training(combined_player_data)

# Model Training

In [19]:
import xgboost as xgb

# Convert training and test data to DMatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Set up the parameters for training
params = {
    'objective': 'reg:squarederror',  # Regression task
    'learning_rate': 0.1,
    'max_depth': 6,
    'alpha': 10,  # L1 regularization term
    'n_estimators': 100,  # Number of trees
    'eval_metric': 'rmse'  # Root Mean Squared Error
}

# Specify evaluation set (test set)
evallist = [(dtrain, 'train'), (dtest, 'eval')]

# Train the model
xgb_model = xgb.train(params, dtrain, num_boost_round=100, evals=evallist, early_stopping_rounds=10)

# Save the model if needed
xgb_model.save_model('xgboost_NBA_model0.json')


[0]	train-rmse:8.11708	eval-rmse:8.09823
[1]	train-rmse:7.30786	eval-rmse:7.29256
[2]	train-rmse:6.57944	eval-rmse:6.56743
[3]	train-rmse:5.92376	eval-rmse:5.91476
[4]	train-rmse:5.33362	eval-rmse:5.32782
[5]	train-rmse:4.80227	eval-rmse:4.79831
[6]	train-rmse:4.32403	eval-rmse:4.32265
[7]	train-rmse:3.89346	eval-rmse:3.89384
[8]	train-rmse:3.50598	eval-rmse:3.50877
[9]	train-rmse:3.15704	eval-rmse:3.16056
[10]	train-rmse:2.84293	eval-rmse:2.84749
[11]	train-rmse:2.56032	eval-rmse:2.56613


Parameters: { "n_estimators" } are not used.



[12]	train-rmse:2.30578	eval-rmse:2.31205
[13]	train-rmse:2.07666	eval-rmse:2.08359
[14]	train-rmse:1.87041	eval-rmse:1.87756
[15]	train-rmse:1.68479	eval-rmse:1.69307
[16]	train-rmse:1.51768	eval-rmse:1.52624
[17]	train-rmse:1.36732	eval-rmse:1.37690
[18]	train-rmse:1.23195	eval-rmse:1.24202
[19]	train-rmse:1.11009	eval-rmse:1.12079
[20]	train-rmse:1.00058	eval-rmse:1.01280
[21]	train-rmse:0.90192	eval-rmse:0.91504
[22]	train-rmse:0.81319	eval-rmse:0.82782
[23]	train-rmse:0.73326	eval-rmse:0.74836
[24]	train-rmse:0.66142	eval-rmse:0.67803
[25]	train-rmse:0.59677	eval-rmse:0.61496
[26]	train-rmse:0.53849	eval-rmse:0.55734
[27]	train-rmse:0.48607	eval-rmse:0.50505
[28]	train-rmse:0.43900	eval-rmse:0.45945
[29]	train-rmse:0.39651	eval-rmse:0.41725
[30]	train-rmse:0.35833	eval-rmse:0.37955
[31]	train-rmse:0.32402	eval-rmse:0.34610
[32]	train-rmse:0.29334	eval-rmse:0.31710
[33]	train-rmse:0.26555	eval-rmse:0.28989
[34]	train-rmse:0.24055	eval-rmse:0.26568
[35]	train-rmse:0.21833	eval-rmse:

# Evaluate Model Preformance

In [20]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Make predictions on the test set
y_pred = xgb_model.predict(dtest)

# Calculate evaluation metrics
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")


RMSE: 0.0756963811127046
MAE: 0.0016925755204790986


