## Historically Weighted Metrics vs. Actual Adjusted Metrics from the Tournament

In [1]:
import sqlite3
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr

import os

# Location of the SQLite database. Defaults to the original local path so existing
# runs keep working; set the PGA_DB_PATH environment variable to point the notebook
# at a copy of the database on another machine. An empty value falls back to the default.
DB_PATH = os.environ.get('PGA_DB_PATH') or '/Users/nickospelt/Documents/App_Projects/PGA_Tournament_Winner/Data/PGA_SQL_DB/PGA.db'

# Connection used by the data-loading cell.
conn = sqlite3.connect(DB_PATH)

### Pull Data

In [2]:
# Read the whole METRIC_ANALYSIS table, then discard every row that has a
# missing value in any column. The trailing expression displays the frame.
df = (
    pd.read_sql_query("SELECT * FROM METRIC_ANALYSIS", conn)
    .dropna()
)
df

Unnamed: 0,TOURNAMENT_NAME,PLAYER_NAME,TOURNAMENT_DATE,POSITION,FINISH,HL_50_SG_P,HL_100_SG_P,HL_200_SG_P,ADJ_SG_P,HL_50_SG_OTT,...,HL_200_R4_SCR,ADJ_R4_SCR,T12_EARNINGS,T12_FED_EX_PTS,T12_WINS,T12_TOP_5,T12_TOP_10,T12_TOP_20,T12_MADE_CUTS,T12_APPERANCES
0,2024 ZOZO CHAMPIONSHIP,Nico Echavarria,2024-10-24 00:00:00,1.0,WIN,-0.089891,-0.097941,-0.102004,1.064873,-0.209515,...,-0.357448,-1.428571,13258.800000,3.400000,0.0,0.000000,0.000000,0.000000,0.400000,5
1,2024 ZOZO CHAMPIONSHIP,Justin Thomas,2024-10-24 00:00:00,2.0,TOP 5,-0.699328,-0.664840,-0.551481,-0.195627,0.559205,...,-0.006809,-2.428571,119472.142857,44.285714,0.0,0.000000,0.142857,0.142857,0.428571,7
2,2024 ZOZO CHAMPIONSHIP,Max Greyserman,2024-10-24 00:00:00,2.0,TOP 5,0.049363,0.111252,0.142483,2.592623,0.941564,...,-1.284700,-3.428571,66951.750000,29.250000,0.0,0.000000,0.000000,0.000000,0.500000,4
3,2024 ZOZO CHAMPIONSHIP,Rickie Fowler,2024-10-24 00:00:00,4.0,TOP 5,-0.111900,0.005633,0.102397,1.905873,-0.191549,...,0.197957,-4.428571,51912.750000,12.625000,0.0,0.000000,0.000000,0.000000,0.750000,8
4,2024 ZOZO CHAMPIONSHIP,Kurt Kitayama,2024-10-24 00:00:00,5.0,TOP 5,-0.627745,-0.516313,-0.380011,0.123123,0.294138,...,-1.556688,-3.428571,108649.714286,32.714286,0.0,0.000000,0.000000,0.142857,0.857143,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11673,2017 PGA Championship,Lee Westwood,2017-08-10 00:00:00,67.0,MADE CUT,-1.000013,-0.876345,-0.812916,-0.859883,-0.258473,...,0.381093,3.173333,65091.500000,0.000000,0.0,0.000000,0.000000,0.250000,1.000000,4
11674,2017 PGA Championship,Daniel Summerhays,2017-08-10 00:00:00,71.0,MADE CUT,-0.189880,-0.281847,-0.316116,-0.069133,-0.624798,...,2.784587,4.173333,53680.666667,17.166667,0.0,0.000000,0.166667,0.166667,0.833333,6
11675,2017 PGA Championship,Russell Henley,2017-08-10 00:00:00,71.0,MADE CUT,0.386100,0.348888,0.341597,-1.461133,0.342063,...,-0.592993,1.173333,143113.250000,53.500000,0.0,0.166667,0.333333,0.500000,1.000000,6
11676,2017 PGA Championship,Charles Howell III,2017-08-10 00:00:00,73.0,MADE CUT,0.482900,0.280695,0.148889,-0.608383,-0.463370,...,1.136779,0.173333,12661.600000,6.000000,0.0,0.000000,0.000000,0.000000,0.666667,3


### Assess Correlation between Historical Metrics and Actual Metrics for the Tournament

In [4]:
# Metric suffixes; each one has an ADJ_<metric> column and HL_50/100/200_<metric> columns in df.
features = ['SG_P', 'SG_OTT', 'SG_APR', 'SG_ATG', 'R1_SCR', 'R2_SCR', 'R3_SCR', 'R4_SCR']

for feature in features:
    adj_col = f"ADJ_{feature}"
    hl_cols = [f"HL_{window}_{feature}" for window in (50, 100, 200)]

    # Run every Pearson test before printing anything, so a failure in any
    # test leaves no partial report for this feature.
    # Each entry is (column name, correlation, p-value).
    vs_adjusted = [(hl_col, *pearsonr(df[hl_col], df[adj_col])) for hl_col in hl_cols]
    vs_position = [
        (col, *pearsonr(df[col], df['POSITION']))
        for col in [adj_col, *hl_cols]
    ]

    print(feature)

    # Historical half-life columns against the adjusted column for the tournament.
    for hl_col, corr, p_value in vs_adjusted:
        print(f"{hl_col} vs {adj_col}; Correlation: [{corr}], p-value: [{p_value}]")
    print()

    # Adjusted column and historical columns against POSITION.
    for col, corr, p_value in vs_position:
        print(f"{col} vs POSITION; Correlation: [{corr}], p-value: [{p_value}]")
    print("\n")


SG_P
HL_50_SG_P vs ADJ_SG_P; Correlation: [0.1146843074047269], p-value: [1.6216863886937704e-15]
HL_100_SG_P vs ADJ_SG_P; Correlation: [0.12983426707187978], p-value: [1.7441727810127896e-19]
HL_200_SG_P vs ADJ_SG_P; Correlation: [0.14070209076649184], p-value: [1.2272767421160192e-22]

ADJ_SG_P vs POSITION; Correlation: [-0.4433877585947892], p-value: [3.146889475110575e-230]
HL_50_SG_P vs POSITION; Correlation: [-0.010008353098144787], p-value: [0.4882976116037048]
HL_100_SG_P vs POSITION; Correlation: [-0.012867008128795326], p-value: [0.37294071485767805]
HL_200_SG_P vs POSITION; Correlation: [-0.015382418331324905], p-value: [0.2867956245293225]


SG_OTT
HL_50_SG_OTT vs ADJ_SG_OTT; Correlation: [0.3290564172938355], p-value: [1.6540367828909302e-121]
HL_100_SG_OTT vs ADJ_SG_OTT; Correlation: [0.35382115562301303], p-value: [1.7823781665556674e-141]
HL_200_SG_OTT vs ADJ_SG_OTT; Correlation: [0.3748256982260952], p-value: [7.055724187190828e-160]

ADJ_SG_OTT vs POSITION; Correlatio

- __First 3 Rows__: How the exponentially weighted average metrics relate to the actual adjusted values observed in the tournament
- __Last 4 Rows__: How the adjusted value and historical values relate to the final position in the tournament
<br><br>
- All adjusted metrics strongly correlate to a player's finish in the tournament
- However, as suspected, simply taking the exponentially weighted average of the adjusted metrics doesn't prove to be a great estimator of what the strokes gained or score values will be (the correlations with the actual values are only weak to moderate, even where they are statistically significant)
- Need to improve the quality of these historical estimations