# Player Statistic to Score Model
- The goal of this model is to determine whether a player's four round scores in a tournament can be predicted from their performance statistics that week and the tournament they are playing in.
- Stats:
    - GIR_PERCENTAGE
    - TOTAL_DRIVING_DISTANCE
    - FIR_PERCENTAGE
    - SCRAMBLING_PERCENTAGE
    - PUTTS_PER_ROUND
- Tournament:
    - This will be a value that identifies which tournament (e.g., Masters or U.S. Open) we are trying to predict.
    - The hope is that this flag will help the model distinguish between scoring patterns in different tournaments.
- Multi-Output Regression Approach:
    - Linear Regression
    - Random Forest
    - XGBoost
    - Neural Network

In [5]:
import sqlite3
import pandas as pd

## Pull Data from Database and Process

In [6]:
# Open the local SQLite database that holds the tournament results.
# NOTE(review): the connection is left open, as in the original cell; call
# conn.close() once no later cell needs it -- TODO confirm.
conn = sqlite3.connect('/Users/nickospelt/Documents/App_Projects/PGA_Score_Predictor/Data/PGA_SQL_DB/PGA.db')

# Select every column and add TOURNAMENT_F, a numeric tournament flag: the
# tournament code (1-5) divided by 5, giving 0.2, 0.4, 0.6, 0.8 or 1.0.
# Tournament names matching none of the patterns yield NULL (the CASE has no ELSE).
# String literals are single-quoted: in SQL, double quotes denote identifiers,
# and SQLite only treats them as strings through a legacy fallback that can be
# disabled at build time.
player_tournament_query = """SELECT *,
 (CASE 
     WHEN UPPER(TOURNAMENT_NAME) LIKE '%MASTERS%' THEN 1.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE '%U.S. OPEN%' THEN 2.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE '%THE OPEN%' THEN 3.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE '%PGA%' THEN 4.0
     WHEN UPPER(TOURNAMENT_NAME) LIKE '%PLAYERS%' THEN 5.0
  END) / 5 AS TOURNAMENT_F
FROM PLAYER_TOURNAMENT_RESULTS"""

# Load the full result set into a DataFrame and display it.
raw_player_tournament_df = pd.read_sql_query(player_tournament_query, conn)
raw_player_tournament_df

Unnamed: 0,TOURNAMENT_NAME,R1_TEMP,R1_PRECIP,R1_WIND_SPEED,R1_WIND_DIRECT,R2_TEMP,R2_PRECIP,R2_WIND_SPEED,R2_WIND_DIRECT,R3_TEMP,...,GIR_PERCENTAGE,TOTAL_DRIVING_DISTANCE,FIR_PERCENTAGE,SCRAMBLING_PERCENTAGE,PUTTS_PER_ROUND,R1_SCORE,R2_SCORE,R3_SCORE,R4_SCORE,TOURNAMENT_F
0,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,66.67,299.3,73.21,62.50,26.00,69,66,67,71,0.2
1,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,70.83,290.6,71.43,76.19,28.25,70,72,65,67,0.2
2,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,72.22,287.1,67.86,70.00,29.25,66,74,71,64,0.2
3,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,68.06,295.4,73.21,60.87,28.00,75,68,65,69,0.2
4,2018 Masters Tournament,56.5,0.000,10.0,0,61.3,0.0,12.8,185,65.6,...,77.78,304.0,83.93,62.50,30.50,73,69,68,69,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1724,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,55.56,294.0,55.77,43.75,29.75,68,73,77,74,0.4
1725,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,63.89,307.1,82.69,38.46,31.75,70,72,75,76,0.4
1726,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,61.11,295.6,63.46,42.86,30.75,69,73,78,74,0.4
1727,2023 U.S. Open,64.2,0.016,8.6,197,66.5,0.0,11.9,238,67.0,...,59.72,336.3,48.08,37.93,31.00,70,72,74,79,0.4


In [7]:
# Keep only the columns the model uses: the tournament flag, the five
# player statistics, and the four per-round scores.
model_columns = [
    'TOURNAMENT_F',
    'GIR_PERCENTAGE',
    'TOTAL_DRIVING_DISTANCE',
    'FIR_PERCENTAGE',
    'SCRAMBLING_PERCENTAGE',
    'PUTTS_PER_ROUND',
    'R1_SCORE',
    'R2_SCORE',
    'R3_SCORE',
    'R4_SCORE',
]
processed_player_tournament_df = raw_player_tournament_df[model_columns]
processed_player_tournament_df

Unnamed: 0,TOURNAMENT_F,GIR_PERCENTAGE,TOTAL_DRIVING_DISTANCE,FIR_PERCENTAGE,SCRAMBLING_PERCENTAGE,PUTTS_PER_ROUND,R1_SCORE,R2_SCORE,R3_SCORE,R4_SCORE
0,0.2,66.67,299.3,73.21,62.50,26.00,69,66,67,71
1,0.2,70.83,290.6,71.43,76.19,28.25,70,72,65,67
2,0.2,72.22,287.1,67.86,70.00,29.25,66,74,71,64
3,0.2,68.06,295.4,73.21,60.87,28.00,75,68,65,69
4,0.2,77.78,304.0,83.93,62.50,30.50,73,69,68,69
...,...,...,...,...,...,...,...,...,...,...
1724,0.4,55.56,294.0,55.77,43.75,29.75,68,73,77,74
1725,0.4,63.89,307.1,82.69,38.46,31.75,70,72,75,76
1726,0.4,61.11,295.6,63.46,42.86,30.75,69,73,78,74
1727,0.4,59.72,336.3,48.08,37.93,31.00,70,72,74,79
