# Player Statistic to Score Model
- The goal of this model is to determine if it is possible to predict the 4 scores of a players tournament based on their performance that week and the tournament they are playing in. 
- Stats:
    - GIR_PERCENTAGE
    - TOTAL_DRIVING_DISTANCE
    - FIR_PERCENTAGE
    - SCRAMBLING_PERCENTAGE
    - PUTTS_PER_FOUND
- Tournament:
    - This will be a value that determines what tournament (i.e. Masters or U.S. Open) we are trying to predict.
    - The hope is that this flag will be able to help the model distinguish between scoring in different tournaments
- Multi-Output Regression Approach:
    - Linear Regression
    - Random Forrest
    - XGBoost
    - Neural Network

- Notes:
    - Unable to get much better than 2 MAE on the current approach of trying to predict just based on what the player statistics would be for that week. Need to try and pivot to create features unique to each PLAYER_ID to quantify past performance.
    - This is a time series on time series problem (Each round most likely affects the previous and the players performance coming into the tournament is relevant as well)
    - Each round has a "weather score". Some sort of combination of temperature, precipitation, and wind speed.
    - Try to give more paramters to this model. Don't be affraid to give it all the parameters.
    - Seems like got decent performance if just try to predict the total score. Makes me think need to have some sort of round level data or should just try for tournament level performance.
    - Assume that you can predict perfectly what the 5 player statistics are gonna be. Can you predict what the 4 scores will be.
    - Then treat each of those 5 statistics as their own modeling problem.

In [32]:
import sqlite3
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)

## Pull Data from Database and Process

In [33]:
conn = sqlite3.connect('/Users/nickospelt/Documents/App_Projects/PGA_Score_Predictor/Data/PGA_SQL_DB/PGA.db')

player_tournament_query = """SELECT *,
 (CASE 
     WHEN UPPER(TOURNAMENT_NAME) LIKE "%MASTERS%" THEN 1.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE "%U.S. OPEN%" THEN 2.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE "%THE OPEN%" THEN 3.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE "%PGA%" THEN 4.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE "%PLAYERS%" THEN 5.0
  END) AS TOURNAMENT_F,
  (R1_SCORE + R2_SCORE + R3_SCORE + R4_SCORE) AS TOTAL_SCORE,
  (R1_TEMP + R2_TEMP + R3_TEMP + R4_TEMP) / 4 AS AVG_TEMP,
  (R1_PRECIP + R2_PRECIP + R3_PRECIP + R4_PRECIP) / 4 AS AVG_PRECIP,
  (R1_WIND_SPEED + R2_WIND_SPEED + R3_WIND_SPEED + R4_WIND_SPEED) / 4 AS AVG_WIND_SPEED
FROM PLAYER_TOURNAMENT_RESULTS"""

raw_player_tournament_df = pd.read_sql_query(player_tournament_query, conn)
raw_player_tournament_df

Unnamed: 0,TOURNAMENT_NAME,R1_TEMP,R1_PRECIP,R1_WIND_SPEED,R1_WIND_DIRECT,R2_TEMP,R2_PRECIP,R2_WIND_SPEED,R2_WIND_DIRECT,R3_TEMP,...,PUTTS_PER_ROUND,R1_SCORE,R2_SCORE,R3_SCORE,R4_SCORE,TOURNAMENT_F,TOTAL_SCORE,AVG_TEMP,AVG_PRECIP,AVG_WIND_SPEED
0,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,26.00,69,66,67,71,1.0,273,58.725,0.0925,12.750
1,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,28.25,70,72,65,67,1.0,274,58.725,0.0925,12.750
2,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,29.25,66,74,71,64,1.0,275,58.725,0.0925,12.750
3,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,28.00,75,68,65,69,1.0,277,58.725,0.0925,12.750
4,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,30.50,73,69,68,69,1.0,279,58.725,0.0925,12.750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,29.75,68,73,77,74,2.0,292,66.125,0.0040,10.925
1725,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,31.75,70,72,75,76,2.0,293,66.125,0.0040,10.925
1726,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,30.75,69,73,78,74,2.0,294,66.125,0.0040,10.925
1727,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,31.00,70,72,74,79,2.0,295,66.125,0.0040,10.925


In [34]:
# Create weather feature (Weighted average of temp, precipiation, and wind speed)
def weather_weighted_average(temp, precip, wind_speed):
    weights = [0.3, 0.3, 0.4]
    return (temp * weights[0] + precip * weights[1] + wind_speed * weights[2]) / (weights[0] + weights[1] + weights[2])

scaler = StandardScaler()
raw_player_tournament_df[['R1_TEMP', 'R1_PRECIP', 'R1_WIND_SPEED', 
'R2_TEMP', 'R2_PRECIP', 'R2_WIND_SPEED', 
'R3_TEMP', 'R3_PRECIP', 'R3_WIND_SPEED', 
'R4_TEMP', 'R4_PRECIP', 'R4_WIND_SPEED']] = scaler.fit_transform(raw_player_tournament_df[['R1_TEMP', 'R1_PRECIP', 'R1_WIND_SPEED', 
'R2_TEMP', 'R2_PRECIP', 'R2_WIND_SPEED', 
'R3_TEMP', 'R3_PRECIP', 'R3_WIND_SPEED', 
'R4_TEMP', 'R4_PRECIP', 'R4_WIND_SPEED']])

raw_player_tournament_df['R1_WEATHER_FACTOR'] = raw_player_tournament_df.apply(lambda row:
    weather_weighted_average(row['R1_TEMP'], row['R1_PRECIP'], row['R1_WIND_SPEED']), axis=1)
raw_player_tournament_df['R2_WEATHER_FACTOR'] = raw_player_tournament_df.apply(lambda row:
    weather_weighted_average(row['R2_TEMP'], row['R2_PRECIP'], row['R2_WIND_SPEED']), axis=1)
raw_player_tournament_df['R3_WEATHER_FACTOR'] = raw_player_tournament_df.apply(lambda row:
    weather_weighted_average(row['R3_TEMP'], row['R3_PRECIP'], row['R3_WIND_SPEED']), axis=1)
raw_player_tournament_df['R4_WEATHER_FACTOR'] = raw_player_tournament_df.apply(lambda row:
    weather_weighted_average(row['R4_TEMP'], row['R4_PRECIP'], row['R4_WIND_SPEED']), axis=1)

raw_player_tournament_df

Unnamed: 0,TOURNAMENT_NAME,R1_TEMP,R1_PRECIP,R1_WIND_SPEED,R1_WIND_DIRECT,R2_TEMP,R2_PRECIP,R2_WIND_SPEED,R2_WIND_DIRECT,R3_TEMP,...,R4_SCORE,TOURNAMENT_F,TOTAL_SCORE,AVG_TEMP,AVG_PRECIP,AVG_WIND_SPEED,R1_WEATHER_FACTOR,R2_WEATHER_FACTOR,R3_WEATHER_FACTOR,R4_WEATHER_FACTOR
0,2018 Masters Tournament,-1.238719,-0.330632,-0.551762,0,-0.698112,-0.627097,0.125666,185,0.187288,...,71,1.0,273,58.725,0.0925,12.750,-0.691510,-0.347296,0.174633,-0.288896
1,2018 Masters Tournament,-1.238719,-0.330632,-0.551762,0,-0.698112,-0.627097,0.125666,185,0.187288,...,67,1.0,274,58.725,0.0925,12.750,-0.691510,-0.347296,0.174633,-0.288896
2,2018 Masters Tournament,-1.238719,-0.330632,-0.551762,0,-0.698112,-0.627097,0.125666,185,0.187288,...,64,1.0,275,58.725,0.0925,12.750,-0.691510,-0.347296,0.174633,-0.288896
3,2018 Masters Tournament,-1.238719,-0.330632,-0.551762,0,-0.698112,-0.627097,0.125666,185,0.187288,...,69,1.0,277,58.725,0.0925,12.750,-0.691510,-0.347296,0.174633,-0.288896
4,2018 Masters Tournament,-1.238719,-0.330632,-0.551762,0,-0.698112,-0.627097,0.125666,185,0.187288,...,69,1.0,279,58.725,0.0925,12.750,-0.691510,-0.347296,0.174633,-0.288896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,2023 U.S. Open,-0.161016,-0.215934,-0.955460,197,0.075143,-0.627097,-0.116200,238,0.414486,...,74,2.0,292,66.125,0.0040,10.925,-0.495269,-0.212066,-0.133898,-0.192212
1725,2023 U.S. Open,-0.161016,-0.215934,-0.955460,197,0.075143,-0.627097,-0.116200,238,0.414486,...,76,2.0,293,66.125,0.0040,10.925,-0.495269,-0.212066,-0.133898,-0.192212
1726,2023 U.S. Open,-0.161016,-0.215934,-0.955460,197,0.075143,-0.627097,-0.116200,238,0.414486,...,74,2.0,294,66.125,0.0040,10.925,-0.495269,-0.212066,-0.133898,-0.192212
1727,2023 U.S. Open,-0.161016,-0.215934,-0.955460,197,0.075143,-0.627097,-0.116200,238,0.414486,...,79,2.0,295,66.125,0.0040,10.925,-0.495269,-0.212066,-0.133898,-0.192212


In [35]:
processed_player_tournament_df = raw_player_tournament_df[['TOURNAMENT_F', 
    'PAR', 'LENGTH', 'ELEVATION', 
    'R1_WEATHER_FACTOR', 'R2_WEATHER_FACTOR', 'R3_WEATHER_FACTOR', 'R4_WEATHER_FACTOR',
    'GIR_PERCENTAGE', 'TOTAL_DRIVING_DISTANCE', 'FIR_PERCENTAGE', 'SCRAMBLING_PERCENTAGE', 'PUTTS_PER_ROUND', 'R1_SCORE', 'R2_SCORE', 'R3_SCORE', 'R4_SCORE', 'TOTAL_SCORE']]
processed_player_tournament_df

Unnamed: 0,TOURNAMENT_F,PAR,LENGTH,ELEVATION,R1_WEATHER_FACTOR,R2_WEATHER_FACTOR,R3_WEATHER_FACTOR,R4_WEATHER_FACTOR,GIR_PERCENTAGE,TOTAL_DRIVING_DISTANCE,FIR_PERCENTAGE,SCRAMBLING_PERCENTAGE,PUTTS_PER_ROUND,R1_SCORE,R2_SCORE,R3_SCORE,R4_SCORE,TOTAL_SCORE
0,1.0,72,7435,44.0,-0.691510,-0.347296,0.174633,-0.288896,66.67,299.3,73.21,62.50,26.00,69,66,67,71,273
1,1.0,72,7435,44.0,-0.691510,-0.347296,0.174633,-0.288896,70.83,290.6,71.43,76.19,28.25,70,72,65,67,274
2,1.0,72,7435,44.0,-0.691510,-0.347296,0.174633,-0.288896,72.22,287.1,67.86,70.00,29.25,66,74,71,64,275
3,1.0,72,7435,44.0,-0.691510,-0.347296,0.174633,-0.288896,68.06,295.4,73.21,60.87,28.00,75,68,65,69,277
4,1.0,72,7435,44.0,-0.691510,-0.347296,0.174633,-0.288896,77.78,304.0,83.93,62.50,30.50,73,69,68,69,279
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,2.0,70,7421,96.0,-0.495269,-0.212066,-0.133898,-0.192212,55.56,294.0,55.77,43.75,29.75,68,73,77,74,292
1725,2.0,70,7421,96.0,-0.495269,-0.212066,-0.133898,-0.192212,63.89,307.1,82.69,38.46,31.75,70,72,75,76,293
1726,2.0,70,7421,96.0,-0.495269,-0.212066,-0.133898,-0.192212,61.11,295.6,63.46,42.86,30.75,69,73,78,74,294
1727,2.0,70,7421,96.0,-0.495269,-0.212066,-0.133898,-0.192212,59.72,336.3,48.08,37.93,31.00,70,72,74,79,295


## Prepare processed data for training and testing

In [47]:
X_weather_factor = processed_player_tournament_df[['TOURNAMENT_F', 
    'PAR', 'LENGTH', 'ELEVATION',
    'R1_WEATHER_FACTOR', 'R2_WEATHER_FACTOR', 'R3_WEATHER_FACTOR', 'R4_WEATHER_FACTOR',
    'GIR_PERCENTAGE', 'TOTAL_DRIVING_DISTANCE', 'FIR_PERCENTAGE', 'SCRAMBLING_PERCENTAGE', 'PUTTS_PER_ROUND']]
X = processed_player_tournament_df[['TOURNAMENT_F', 
    'PAR', 'LENGTH', 'ELEVATION',
    'GIR_PERCENTAGE', 'TOTAL_DRIVING_DISTANCE', 'FIR_PERCENTAGE', 'SCRAMBLING_PERCENTAGE', 'PUTTS_PER_ROUND']]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_weather_factor_scaled = scaler.fit_transform(X_weather_factor)
y_round = processed_player_tournament_df[['R1_SCORE', 'R2_SCORE', 'R3_SCORE', 'R4_SCORE']]
y_total = processed_player_tournament_df['TOTAL_SCORE']

X_train, X_test, y_train, y_test = train_test_split(pd.DataFrame(X_weather_factor_scaled, columns=X_weather_factor.columns), y_round, test_size=0.2)

## Linear Regression

In [48]:
linear_regression_model = LinearRegression()

# Train the model
linear_regression_model.fit(X_train, y_train)

# Training Stats
y_train_predict = linear_regression_model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_predict)
mae_train = mean_absolute_error(y_train, y_train_predict)
r2_train = r2_score(y_train, y_train_predict, multioutput='uniform_average')

# Testing Stats
y_test_predict = linear_regression_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_predict)
mae_test = mean_absolute_error(y_test, y_test_predict)
r2_test = r2_score(y_test, y_test_predict, multioutput='uniform_average')

# Training and Test Performance
print(f"Training: MSE: {mse_train}, MAE: {mae_train}, R^2: {r2_train}")
print(f"Testing: MSE: {mse_test}, MAE: {mae_test}, R^2: {r2_test}")

# Feature Importance
coefficients = linear_regression_model.coef_
round_1 = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficients': coefficients[0],
}).sort_values(by=['Coefficients'], ascending=False)
round_2 = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficients': coefficients[1]
}).sort_values(by=['Coefficients'], ascending=False)
round_3 = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficients': coefficients[2]
}).sort_values(by=['Coefficients'], ascending=False)
round_4 = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficients': coefficients[3]
}).sort_values(by=['Coefficients'], ascending=False)
print("ROUND 1 FEATURES:")
print(round_1)
print("ROUND 2 FEATURES:")
print(round_2)
print("ROUND 3 FEATURES:")
print(round_3)
print("ROUND 4 FEATURES:")
print(round_4)

Training: MSE: 5.664441456778285, MAE: 1.8920326647857655, R^2: 0.31270559855707913
Testing: MSE: 5.597693296713422, MAE: 1.8869140370765054, R^2: 0.2917079081746724
ROUND 1 FEATURES:
                   Feature  Coefficients
12         PUTTS_PER_ROUND      0.695991
1                      PAR      0.587480
2                   LENGTH      0.319483
10          FIR_PERCENTAGE      0.102698
0             TOURNAMENT_F      0.057972
6        R3_WEATHER_FACTOR      0.043843
7        R4_WEATHER_FACTOR      0.036194
3                ELEVATION      0.029831
4        R1_WEATHER_FACTOR     -0.020693
9   TOTAL_DRIVING_DISTANCE     -0.182685
5        R2_WEATHER_FACTOR     -0.247363
11   SCRAMBLING_PERCENTAGE     -0.364255
8           GIR_PERCENTAGE     -1.037122
ROUND 2 FEATURES:
                   Feature  Coefficients
1                      PAR      0.809729
12         PUTTS_PER_ROUND      0.542236
2                   LENGTH      0.416899
6        R3_WEATHER_FACTOR      0.257151
5        R2_WEATHER

## Random Forest

In [None]:
param_grid = {
    'n_estimators': [150, 175, 200, 225, 250],  # Number of trees
    'max_depth': [5, 10, 15, 20],  # Depth of trees
    'min_samples_split': [3, 4, 5, 7],  # Min samples to split node
    'max_features': ['sqrt'], # Max features to consider
    'bootstrap': [True]        # Bootstrap sampling
}

random_forrest_model = RandomForestRegressor()
grid_search = GridSearchCV(estimator=random_forrest_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Cross-Validation Score: {best_score}")

optimal_random_forrest_model = grid_search.best_estimator_

# Training Stats
y_train_predict = optimal_random_forrest_model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_predict)
mae_train = mean_absolute_error(y_train, y_train_predict)
r2_train = r2_score(y_train, y_train_predict, multioutput='uniform_average')

# Testing Stats
y_test_predict = optimal_random_forrest_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_predict)
mae_test = mean_absolute_error(y_test, y_test_predict)
r2_test = r2_score(y_test, y_test_predict, multioutput='uniform_average')

print(f"Training: MSE: {mse_train}, MAE: {mae_train}, R^2: {r2_train}")
print(f"Testing: MSE: {mse_test}, MAE: {mae_test}, R^2: {r2_test}")

Best Parameters: {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 7, 'n_estimators': 150}
Best Cross-Validation Score: -5.984769562217241
Training: MSE: 3.5272204712401516, MAE: 1.5118005064436977, R^2: 0.5629448101511667
Testing: MSE: 6.615567058295582, MAE: 2.032774020108934, R^2: 0.24208643247627326


## XG Boost


In [None]:
# Define XGBoost model
xgb_model = XGBRegressor(
    n_estimators = 225,
    learning_rate = 0.01,
    max_depth = 5,
    min_child_weight = 5,
    reg_lambda = 0.8,
    subsample = 0.6,
    colsample_bytree = 1
)

# Paramters to use to tune
param_grid = {
    'n_estimators': [50, 100, 125, 150, 175, 200],
    'learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 3, 5],
    'reg_lambda': [0.2, 0.4, 0.6, 0.8],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

xgb_model.fit(X_train, y_train)

# Training Stats
y_train_predict = xgb_model.predict(X_train)
mse_train = mean_squared_error(y_train, y_train_predict)
mae_train = mean_absolute_error(y_train, y_train_predict)
r2_train = r2_score(y_train, y_train_predict, multioutput='uniform_average')

# Testing Stats
y_test_predict = xgb_model.predict(X_test)
mse_test = mean_squared_error(y_test, y_test_predict)
mae_test = mean_absolute_error(y_test, y_test_predict)
r2_test = r2_score(y_test, y_test_predict, multioutput='uniform_average')

print(f"Training: MSE: {mse_train}, MAE: {mae_train}, R^2: {r2_train}")
print(f"Testing: MSE: {mse_test}, MAE: {mae_test}, R^2: {r2_test}")

Training: MSE: 4.169998645782471, MAE: 1.6263196468353271, R^2: 0.48528990149497986
Testing: MSE: 6.3489251136779785, MAE: 1.9853789806365967, R^2: 0.2704722285270691


## Support Vector Machine

In [None]:
svm_model = MultiOutputRegressor(SVR())