# Tournament Winner Classification
Goal: Develop a model that predicts the probability that a given player wins a PGA tournament against the rest of the field.

In [4]:
import os
import sqlite3
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil.relativedelta import relativedelta
from scipy.stats import pearsonr
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

# Location of the SQLite database queried by this notebook. Set the PGA_DB_PATH
# environment variable to point at a different copy; when it is unset the
# original local path is used, so existing setups keep working unchanged.
DB_PATH = os.environ.get(
    'PGA_DB_PATH',
    '/Users/nickospelt/Documents/App_Projects/PGA_Score_Predictor/Data/PGA_SQL_DB/PGA.db',
)

# sqlite3.connect() silently creates a new, empty database when the file is
# missing, which only surfaces later as a "no such table" error. Fail here
# instead, with a message that names the path that was tried.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(
        f"SQLite database not found at {DB_PATH!r}; set PGA_DB_PATH to its location."
    )

conn = sqlite3.connect(DB_PATH)

## Pull Data from Database and Prepare for Training
- This raw data was scraped and aggregated by the data pipeline in the Data folder and uploaded to a SQL database.
- Tournament adjustments, historical weighting, and field standardization were implemented in SQL and Python functions found in the SQL database.

### Pull Data

In [13]:
# Read the whole FINAL_MODEL_DATA table, then add an integer PLAYER_ID derived
# from PLAYER_NAME (one code per distinct name) as the fourth column.
model_data_query = "SELECT * FROM FINAL_MODEL_DATA"
df = pd.read_sql_query(model_data_query, conn)

player_codes, player_names = pd.factorize(df['PLAYER_NAME'])
df.insert(3, 'PLAYER_ID', player_codes)

df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_ID,PLAYER_NAME,PLAYER_ID,TOURNAMENT_DATE,FINISH,LENGTH,ELEVATION,WIND,HL_50_SG_P,...,HL_100_R4_SCR,HL_200_R4_SCR,T12_EARNINGS,T12_FED_EX_PTS,T12_WINS,T12_TOP_5,T12_TOP_10,T12_TOP_20,T12_MADE_CUTS,T12_APPERANCES
0,2024 ZOZO CHAMPIONSHIP,1,Adam Schenk,0,2024-10-24 00:00:00,NOT WIN,7079,82.0,8.875,0.858157,...,0.392838,-0.070481,0.040106,0.099459,-0.153705,0.706063,0.204243,0.813545,-0.157123,1.621514
1,2024 ZOZO CHAMPIONSHIP,1,Adam Svensson,1,2024-10-24 00:00:00,NOT WIN,7079,82.0,8.875,-2.546122,...,0.580869,0.446211,-0.112340,-0.089145,-0.153705,-0.353204,0.292781,-0.209746,1.143727,1.244304
2,2024 ZOZO CHAMPIONSHIP,1,Andrew Novak,2,2024-10-24 00:00:00,NOT WIN,7079,82.0,8.875,0.942614,...,-0.432815,-0.271889,-0.023209,0.024017,-0.153705,-0.353204,1.089615,1.632178,0.766060,-0.264537
3,2024 ZOZO CHAMPIONSHIP,1,Andrew Putnam,3,2024-10-24 00:00:00,NOT WIN,7079,82.0,8.875,0.360869,...,-0.263804,-0.286968,-0.182216,-0.190473,-0.153705,-0.353204,-0.504054,-0.005088,1.017838,0.489883
4,2024 ZOZO CHAMPIONSHIP,1,Beau Hossler,4,2024-10-24 00:00:00,NOT WIN,7079,82.0,8.875,0.508467,...,1.507123,0.946162,-0.265548,-0.265703,-0.153705,-0.353204,-0.504054,-0.122036,0.226537,0.867093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11767,2017 Masters Tournament,10,Webb Simpson,119,2017-04-06 00:00:00,NOT WIN,7435,44.0,15.250,-1.542139,...,0.987376,0.987376,-0.025991,-0.006533,-0.103695,-0.103695,-0.210857,-0.261197,2.030237,1.441326
11768,2017 Masters Tournament,10,William McGirt,551,2017-04-06 00:00:00,NOT WIN,7435,44.0,15.250,0.000000,...,0.000000,0.000000,-0.200392,-0.209058,-0.103695,-0.103695,-0.210857,-0.261197,-0.487257,-0.686346
11769,2017 Masters Tournament,10,Yuta Ikeda,947,2017-04-06 00:00:00,NOT WIN,7435,44.0,15.250,0.000000,...,0.000000,0.000000,-0.200392,-0.209058,-0.103695,-0.103695,-0.210857,-0.261197,-0.487257,-0.686346
11770,2017 Masters Tournament,10,Zach Johnson,205,2017-04-06 00:00:00,NOT WIN,7435,44.0,15.250,0.000000,...,0.000000,0.000000,-0.200392,-0.209058,-0.103695,-0.103695,-0.210857,-0.261197,-0.487257,-0.686346


### Prepare for model
- Don't train on the earliest tournaments (2017 and, per the date filter in the cell below, 2018) because they don't have enough historical data
- Use the 2 most recent tournaments as validators

In [22]:
# Model inputs, in the order they are fed to the model: identifiers and course
# conditions first, then the HL_* columns grouped by statistic, then the T12_*
# columns.
features = [
    'TOURNAMENT_ID', 'PLAYER_ID', 'LENGTH', 'ELEVATION', 'WIND',
    'HL_50_SG_P', 'HL_100_SG_P', 'HL_200_SG_P',
    'HL_50_SG_OTT', 'HL_100_SG_OTT', 'HL_200_SG_OTT',
    'HL_50_SG_APR', 'HL_100_SG_APR', 'HL_200_SG_APR',
    'HL_50_SG_ATG', 'HL_100_SG_ATG', 'HL_200_SG_ATG',
    'HL_50_R1_SCR', 'HL_100_R1_SCR', 'HL_200_R1_SCR',
    'HL_50_R2_SCR', 'HL_100_R2_SCR', 'HL_200_R2_SCR',
    'HL_50_R3_SCR', 'HL_100_R3_SCR', 'HL_200_R3_SCR',
    'HL_50_R4_SCR', 'HL_100_R4_SCR', 'HL_200_R4_SCR',
    'T12_EARNINGS', 'T12_FED_EX_PTS', 'T12_WINS',
    'T12_TOP_5', 'T12_TOP_10', 'T12_TOP_20',
    'T12_MADE_CUTS', 'T12_APPERANCES',
]
target = 'FINISH'

# Rows belonging to the two held-out tournaments.
holdout_tournaments = ["2024 ZOZO CHAMPIONSHIP", "2024 The Open"]
is_holdout = df['TOURNAMENT_NAME'].isin(holdout_tournaments)

# TOURNAMENT_DATE is compared against the string "2019", exactly as before.
is_2019_or_later = df['TOURNAMENT_DATE'] >= "2019"

# The held-out rows become the validation frame; `df` is narrowed to the
# remaining rows dated 2019 or later.
validation_df = df.loc[is_holdout]
df = df.loc[is_2019_or_later & ~is_holdout]
df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_ID,PLAYER_NAME,PLAYER_ID,TOURNAMENT_DATE,FINISH,LENGTH,ELEVATION,WIND,HL_50_SG_P,...,HL_100_R4_SCR,HL_200_R4_SCR,T12_EARNINGS,T12_FED_EX_PTS,T12_WINS,T12_TOP_5,T12_TOP_10,T12_TOP_20,T12_MADE_CUTS,T12_APPERANCES
77,2024 Wells Fargo Championship,2,Adam Hadwin,77,2024-05-09 00:00:00,NOT WIN,7558,254.0,13.850,0.775099,...,-1.300569,-1.120239,0.685127,0.604039,-0.304377,2.694293,1.202814,0.446332,0.550004,0.681132
78,2024 Wells Fargo Championship,2,Adam Schenk,0,2024-05-09 00:00:00,NOT WIN,7558,254.0,13.850,0.908203,...,-0.610286,-0.841568,-0.011292,0.117781,-0.304377,0.113843,0.763113,1.016321,-0.610465,1.839056
79,2024 Wells Fargo Championship,2,Adam Scott,78,2024-05-09 00:00:00,NOT WIN,7558,254.0,13.850,1.614309,...,-0.146314,-0.259371,-0.166136,-0.057679,-0.304377,-0.824502,-0.262856,0.446332,1.130238,0.681132
80,2024 Wells Fargo Championship,2,Adam Svensson,1,2024-05-09 00:00:00,NOT WIN,7558,254.0,13.850,-1.024532,...,-0.634204,-0.352949,-0.566232,-0.551722,-0.304377,-0.824502,-0.262856,-0.123657,0.550004,0.681132
81,2024 Wells Fargo Championship,2,Akshay Bhatia,79,2024-05-09 00:00:00,NOT WIN,7558,254.0,13.850,0.431169,...,-0.662078,-0.776613,0.494063,0.971327,4.086026,0.739407,-0.018577,-0.503650,-0.610465,-0.476792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8825,2019 3M Open,26,Wes Roach,861,2019-07-04 00:00:00,NOT WIN,7468,275.0,7.675,1.372470,...,-1.717432,-1.619962,0.804162,1.027931,-0.140816,-0.251624,-0.440435,2.352352,1.752600,-1.078802
8826,2019 3M Open,26,Whee Kim,965,2019-07-04 00:00:00,NOT WIN,7468,275.0,7.675,1.748819,...,-0.329739,0.124569,-0.437510,-0.476936,-0.140816,-0.251624,-0.440435,-0.666093,-0.742767,-0.286557
8827,2019 3M Open,26,Will Claxton,966,2019-07-04 00:00:00,NOT WIN,7468,275.0,7.675,0.000000,...,0.000000,0.000000,-0.513067,-0.594337,-0.140816,-0.251624,-0.440435,-0.666093,-1.574556,-1.078802
8828,2019 3M Open,26,Wyndham Clark,120,2019-07-04 00:00:00,NOT WIN,7468,275.0,7.675,-0.261523,...,-0.120412,-0.237537,-0.361599,-0.359535,-0.140816,-0.251624,-0.440435,-0.666093,0.089022,-0.286557


### Observe Class Breakdown in Training/Test set

In [23]:
# Count the rows per outcome label to show how imbalanced the classes are.
finish_labels = df[target]
finish_labels.value_counts()

FINISH
NOT WIN    8535
WIN          64
Name: count, dtype: int64

## Build model
- To handle the severe class imbalance, we need to weight by class and cross-validate with measures like Precision, Recall, F1-score, and Precision-Recall AUC
- Sklearn's GridSearchCV supports the scorers `precision_weighted`, `recall_weighted`, `f1_weighted`, and `average_precision` (Precision-Recall AUC)