# Tournament Champion Prediction

- Goal: Identify pga golfers that have the best chance of winning (eventuall top 5, 10, or 20 finishes) a particular tournamnt based on past performance, the couse, field, and weather data.

In [2]:
import sqlite3
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import (mean_squared_error, mean_absolute_error, r2_score)
from xgboost import XGBRegressor
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np

conn = sqlite3.connect('/Users/nickospelt/Documents/App_Projects/PGA_Score_Predictor/Data/PGA_SQL_DB/PGA.db')

## Historical Data

In [54]:
tournament_query = """SELECT *,
    ((SG_PUTT * 4) + (SG_AROUND_THE_GREEN * 4) + (SG_APPROACH * 4) + (SG_OFF_THE_TEE * 4)) / 4 AS SG_TOTAL,
    (CASE
        WHEN POSITION = 1 THEN "WIN"
        WHEN POSITION <= 5 THEN "TOP 5"
        WHEN POSITION <= 10 THEN "TOP 10"
        WHEN POSITION <= 20 THEN "TOP 20"
        WHEN POSITION IS NOT NULL THEN "MADE CUT"
        ELSE "CUT"
    END) AS FINISH,
    R1_SCORE - PAR AS R1_SCORE_TO_PAR,
    R2_SCORE - PAR AS R2_SCORE_TO_PAR,
    R3_SCORE - PAR AS R3_SCORE_TO_PAR,
    R4_SCORE - PAR AS R4_SCORE_TO_PAR
FROM RAW_TOURNAMENT_ROUNDS_V5
ORDER BY TOURNAMENT_NAME DESC, TOTAL_SCORE"""

tournament_df = pd.read_sql_query(tournament_query, conn)
tournament_df = tournament_df[tournament_df['R2_SCORE'] != '--']
tournament_df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_DATE,ELEVATION,R1_TEMPERATURE,R1_PRECIPITATION,R1_WIND_SPEED,R1_WIND_DIRECTION,R2_TEMPERATURE,R2_PRECIPITATION,R2_WIND_SPEED,...,TOTAL_SCORE,POSITION,EARNINGS,FEDEX_PTS,SG_TOTAL,FINISH,R1_SCORE_TO_PAR,R2_SCORE_TO_PAR,R3_SCORE_TO_PAR,R4_SCORE_TO_PAR
0,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,260,1.0,1530000.0,,3.63700,WIN,-6,-6,-5.0,-3.0
1,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,261,2.0,748000.0,,3.38650,TOP 5,-4,-6,-5.0,-4.0
2,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,261,2.0,748000.0,,3.38575,TOP 5,-6,-2,-6.0,-5.0
3,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,263,4.0,408000.0,,2.88600,TOP 5,-2,-6,-3.0,-6.0
4,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,265,5.0,340000.0,,2.38525,TOP 5,-1,-2,-7.0,-5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11767,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,298,48.0,33000.0,10.0,,MADE CUT,3,3,3.0,1.0
11768,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,299,50.0,30140.0,9.0,,MADE CUT,6,0,1.0,4.0
11769,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,300,51.0,28600.0,9.0,,MADE CUT,7,-2,1.0,6.0
11770,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,305,52.0,27720.0,8.0,,MADE CUT,2,4,7.0,4.0


## Distribution of classes

In [55]:
total_rows = tournament_df.shape[0]
individual_counts = tournament_df['FINISH'].value_counts()
class_distribution_df = pd.DataFrame(individual_counts)
class_distribution_df['Percent (%)'] = (class_distribution_df['count'] / total_rows) * 100
class_distribution_df

Unnamed: 0_level_0,count,Percent (%)
FINISH,Unnamed: 1_level_1,Unnamed: 2_level_1
CUT,5451,46.304791
MADE CUT,4332,36.799185
TOP 20,958,8.137954
TOP 10,517,4.391777
TOP 5,426,3.618756
WIN,88,0.747537


- Classes are extremly in-balanced as expected

## Feature Engineering

- Historical Performance: 
    - Strokes Gained & Round Scores
    - Exponentially weighted moving average (Same as from regression feature engineering)
    - Should find a way to normalize these values by the field (subtract the average for the tournament)
- Finishes
    - EARNINGS/FEDEX_PTS:
        - Amateurs will have null earnings and 0 fedex_pts
        - Winnings over a specific time period
        - Winnings per tournament
    - Number of cuts made
        - Only have strokes gained data and round 3/4 scores if make cut
        - Include other classes as well?
- Field:
    - Average performance of the field
    - Only want to take into account the field performance for the 8 skill metric (Must choose desired half life for each first)
    - All golfers input into the field?
- Weather:
    - Use same round scores as previously. (May need to generalize to the tournament level by averaging if correlation a problem)
    - Not sure that this data is all that helpful because all parties playing under the same conditions

#### Historical Performance
- Which half life for each metric gives the best predictive power for predicting the winner.

##### Calculate Tournament Averages

In [56]:
tourn_avg_query = """
SELECT TOURNAMENT_NAME,
    AVG(SG_PUTT) AS AVG_SG_PUTT,
    AVG(SG_OFF_THE_TEE) AS AVG_SG_OFF_THE_TEE,
    AVG(SG_APPROACH) AS AVG_SG_APPROACH,
    AVG(SG_AROUND_THE_GREEN) AS AVG_SG_AROUND_THE_GREEN,
    AVG(R1_SCORE - PAR) AS AVG_R1_SCORE_TO_PAR,
    AVG(R2_SCORE - PAR) AS AVG_R2_SCORE_TO_PAR,
    AVG(R3_SCORE - PAR) AS AVG_R3_SCORE_TO_PAR,
    AVG(R4_SCORE - PAR) AS AVG_R4_SCORE_TO_PAR
FROM RAW_TOURNAMENT_ROUNDS_V5
GROUP BY TOURNAMENT_NAME
ORDER BY TOURNAMENT_DATE DESC"""

tourn_avg_df = pd.read_sql_query(tourn_avg_query, conn)
tourn_avg_df

Unnamed: 0,TOURNAMENT_NAME,AVG_SG_PUTT,AVG_SG_OFF_THE_TEE,AVG_SG_APPROACH,AVG_SG_AROUND_THE_GREEN,AVG_R1_SCORE_TO_PAR,AVG_R2_SCORE_TO_PAR,AVG_R3_SCORE_TO_PAR,AVG_R4_SCORE_TO_PAR
0,2024 ZOZO CHAMPIONSHIP,0.003127,0.001519,0.000844,0.000571,-0.896104,-1.116883,-1.896104,-1.571429
1,2024 The Open,,,,,3.337662,3.357143,1.437500,2.150000
2,2024 John Deere Classic,0.283341,0.170143,0.372403,0.169156,-2.967742,-1.083871,-2.259740,-3.038961
3,2024 U.S. Open,0.290564,0.127041,0.305122,0.322365,3.262821,2.903846,3.189189,1.783784
4,2024 RBC Canadian Open,0.465529,0.136638,0.299377,0.143449,0.766234,0.792208,-0.913043,-0.811594
...,...,...,...,...,...,...,...,...,...
83,2017 the Memorial Tournament presented by Nati...,0.164141,0.103105,0.269592,0.162197,1.296610,0.330508,-0.447368,1.973684
84,2017 THE PLAYERS Championship,0.314958,0.193352,0.389915,0.111268,0.833333,1.242424,1.323944,1.605634
85,2017 Wells Fargo Championship,0.364314,0.155230,0.285392,0.130865,0.721088,1.462585,-0.094595,0.378378
86,2017 Masters Tournament,,,,,2.180851,1.946809,0.490566,-0.132075


In [57]:
tournament_df[['PLAYER_NAME', 'R1_SCORE_TO_PAR', 'R2_SCORE_TO_PAR', 'R3_SCORE_TO_PAR', 'R4_SCORE_TO_PAR']].loc[tournament_df['R1_SCORE_TO_PAR'] == "--"]

Unnamed: 0,PLAYER_NAME,R1_SCORE_TO_PAR,R2_SCORE_TO_PAR,R3_SCORE_TO_PAR,R4_SCORE_TO_PAR


##### Compute Adjusted Performance Metrics

In [58]:
def compute_adjusted_metric(actual_metric, average_metric, par):
    if (actual_metric == "NaN"):
        return None
    else:
        return actual_metric - average_metric

performance_metrics = ['SG_PUTT', 'SG_OFF_THE_TEE', 'SG_APPROACH', 'SG_AROUND_THE_GREEN', 'R1_SCORE_TO_PAR', 'R2_SCORE_TO_PAR', 'R3_SCORE_TO_PAR', 'R4_SCORE_TO_PAR']

# calculate an adjusted performance metric based on the field performance in the tournament
tournament_df = pd.merge(tournament_df, tourn_avg_df, on=['TOURNAMENT_NAME'], how='left').sort_values(by=['TOURNAMENT_NAME', 'TOTAL_SCORE', 'PLAYER_NAME'], ascending=[False, True, True]).reset_index(drop=True)

for metric in performance_metrics:
    adj_metric = 'ADJ_' + metric
    avg_metric = 'AVG_' + metric
    
    tournament_df[adj_metric] = tournament_df.apply(lambda row: compute_adjusted_metric(row[metric], row[avg_metric], row['PAR']), axis=1)

tournament_df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_DATE,ELEVATION,R1_TEMPERATURE,R1_PRECIPITATION,R1_WIND_SPEED,R1_WIND_DIRECTION,R2_TEMPERATURE,R2_PRECIPITATION,R2_WIND_SPEED,...,AVG_R3_SCORE_TO_PAR,AVG_R4_SCORE_TO_PAR,ADJ_SG_PUTT,ADJ_SG_OFF_THE_TEE,ADJ_SG_APPROACH,ADJ_SG_AROUND_THE_GREEN,ADJ_R1_SCORE_TO_PAR,ADJ_R2_SCORE_TO_PAR,ADJ_R3_SCORE_TO_PAR,ADJ_R4_SCORE_TO_PAR
0,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-1.896104,-1.571429,1.064873,0.341481,1.999156,0.225429,-5.103896,-4.883117,-3.103896,-1.428571
1,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-1.896104,-1.571429,-0.195627,0.736481,2.005156,0.834429,-3.103896,-4.883117,-3.103896,-2.428571
2,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-1.896104,-1.571429,2.592623,0.458481,0.716156,-0.387571,-5.103896,-0.883117,-4.103896,-3.428571
3,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-1.896104,-1.571429,1.905873,0.019481,0.964156,-0.009571,-1.103896,-4.883117,-1.103896,-4.428571
4,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-1.896104,-1.571429,0.123123,0.739481,1.154156,0.362429,-0.103896,-0.883117,-5.103896,-3.428571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11767,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,0.490566,-0.132075,,,,,0.819149,1.053191,2.509434,1.132075
11768,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,0.490566,-0.132075,,,,,3.819149,-1.946809,0.509434,4.132075
11769,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,0.490566,-0.132075,,,,,4.819149,-3.946809,0.509434,6.132075
11770,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,0.490566,-0.132075,,,,,-0.180851,2.053191,6.509434,4.132075


##### Exponentially Weight Performance Metrics

In [59]:
def convert_string_to_date(date_string):
    return datetime.strptime(date_string, "%Y-%m-%d")

def compute_days_between(date_one, date_two):
    duration = date_two - date_one
    return duration.days

def compute_half_life_weight(half_life, days):
    return np.power(0.5, (days / half_life))

# for each row, need to look at all previous tournaments for that player. and calculate weights based on the different between that rows tournament date and the other rows tournament weight.
def calc_weighted_avgs(df, player_name, tournament_date, features):
    prev_rounds_df = df.copy()
    tournament_date = convert_string_to_date(tournament_date)
    prev_rounds_df.loc[:, 'TOURNAMENT_DATE'] = prev_rounds_df.apply(lambda row: convert_string_to_date(row['TOURNAMENT_DATE']), axis=1)
    prev_rounds_df = prev_rounds_df.loc[(prev_rounds_df['PLAYER_NAME'] == player_name) & (prev_rounds_df['TOURNAMENT_DATE'] < tournament_date)].sort_values(by='TOURNAMENT_DATE', ascending=False)

    if prev_rounds_df.shape[0] == 0:
        return None

    # Weighting by how recent data is
    prev_rounds_df['DAYS_SINCE'] = prev_rounds_df.apply(lambda row: compute_days_between(row['TOURNAMENT_DATE'], tournament_date), axis=1)
    prev_rounds_df['HL_50_WEIGHT'] = prev_rounds_df.apply(lambda row: compute_half_life_weight(50, row['DAYS_SINCE']), axis=1)
    prev_rounds_df['HL_100_WEIGHT'] = prev_rounds_df.apply(lambda row: compute_half_life_weight(100, row['DAYS_SINCE']), axis=1)
    prev_rounds_df['HL_200_WEIGHT'] = prev_rounds_df.apply(lambda row: compute_half_life_weight(200, row['DAYS_SINCE']), axis=1)
    prev_rounds_df['HL_400_WEIGHT'] = prev_rounds_df.apply(lambda row: compute_half_life_weight(400, row['DAYS_SINCE']), axis=1)

    weight_sum_HL_50 = prev_rounds_df['HL_50_WEIGHT'].sum()
    weight_sum_HL_100 = prev_rounds_df['HL_100_WEIGHT'].sum()
    weight_sum_HL_200 = prev_rounds_df['HL_200_WEIGHT'].sum()
    weight_sum_HL_400 = prev_rounds_df['HL_400_WEIGHT'].sum()
    
    # generate half life values based on previous rounds
    hl_values = []
    for feature in features:
        fifty = "HL_50_" + feature
        hundred = "HL_100_" + feature
        two_hundred = "HL_200_" + feature
        four_hundred = "HL_400_" + feature

        # compute weighted feature
        prev_rounds_df[fifty] = prev_rounds_df['HL_50_WEIGHT'] * prev_rounds_df[feature]
        prev_rounds_df[hundred] = prev_rounds_df['HL_100_WEIGHT'] * prev_rounds_df[feature]
        prev_rounds_df[two_hundred] = prev_rounds_df['HL_200_WEIGHT'] * prev_rounds_df[feature]
        prev_rounds_df[four_hundred] = prev_rounds_df['HL_400_WEIGHT'] * prev_rounds_df[feature]

        fifty = prev_rounds_df[fifty].sum() / weight_sum_HL_50
        hundred = prev_rounds_df[hundred].sum() / weight_sum_HL_100
        two_hundred = prev_rounds_df[two_hundred].sum() / weight_sum_HL_200
        four_hundred = prev_rounds_df[four_hundred].sum() / weight_sum_HL_400

        hl_values.append(fifty)
        hl_values.append(hundred)
        hl_values.append(two_hundred)
        hl_values.append(four_hundred)

    #SG_P
    # hl_values[0], hl_values[1], hl_values[2], hl_values[3]
    #SG_OTT
    # hl_values[4], hl_values[5], hl_values[6], hl_values[7]
    #SG_APR
    # hl_values[8], hl_values[9], hl_values[10], hl_values[11]
    #SG_ATG
    # hl_values[12], hl_values[13], hl_values[14], hl_values[15]
    # R1_SCR
    # hl_values[16], hl_values[17], hl_values[18], hl_values[19]
    # R2_SCR
    # hl_values[20], hl_values[21], hl_values[22], hl_values[23]
    # R3_SCR
    # hl_values[24], hl_values[25], hl_values[26], hl_values[27]
    # R4_SCR
    # hl_values[28], hl_values[29], hl_values[30], hl_values[31]
    return hl_values[0], hl_values[1], hl_values[2], hl_values[3], hl_values[4], hl_values[5], hl_values[6], hl_values[7], hl_values[8], hl_values[9], hl_values[10], hl_values[11], hl_values[12], hl_values[13], hl_values[14], hl_values[15], hl_values[16], hl_values[17], hl_values[18], hl_values[19],  hl_values[20], hl_values[21], hl_values[22], hl_values[23], hl_values[24], hl_values[25], hl_values[26], hl_values[27], hl_values[28], hl_values[29], hl_values[30], hl_values[31]

# Apply the relevant half life values for each of the factors based on the half-life values from the research paper
tournament_df[['HL_50_SG_P', 'HL_100_SG_P', 'HL_200_SG_P', 'HL_400_SG_P',
    'HL_50_SG_OTT', 'HL_100_SG_OTT', 'HL_200_SG_OTT', 'HL_400_SG_OTT',
    'HL_50_SG_APR', 'HL_100_SG_APR', 'HL_200_SG_APR', 'HL_400_SG_APR',
    'HL_50_SG_ATG', 'HL_100_SG_ATG', 'HL_200_SG_ATG', 'HL_400_SG_ATG',
    'HL_50_R1_SCR', 'HL_100_R1_SCR', 'HL_200_R1_SCR', 'HL_400_R1_SCR',
    'HL_50_R2_SCR', 'HL_100_R2_SCR', 'HL_200_R2_SCR', 'HL_400_R2_SCR',
    'HL_50_R3_SCR', 'HL_100_R3_SCR', 'HL_200_R3_SCR', 'HL_400_R3_SCR',
    'HL_50_R4_SCR', 'HL_100_R4_SCR', 'HL_200_R4_SCR', 'HL_400_R4_SCR']] = tournament_df.apply(
        lambda row: pd.Series(calc_weighted_avgs(tournament_df, row['PLAYER_NAME'], row['TOURNAMENT_DATE'], ["ADJ_SG_PUTT",	"ADJ_SG_OFF_THE_TEE", "ADJ_SG_APPROACH", "ADJ_SG_AROUND_THE_GREEN", "ADJ_R1_SCORE_TO_PAR", "ADJ_R2_SCORE_TO_PAR", "ADJ_R3_SCORE_TO_PAR", "ADJ_R4_SCORE_TO_PAR"])), axis=1)
tournament_df

### Finishes
- Positions refer to earnings, fed ex pts, and finishes (i.e. winning, top-x, make cut)

In [None]:
def convert_string_to_date(date_string):
    return datetime.strptime(date_string, "%Y-%m-%d")

def compute_days_between(date_one, date_two):
    duration = date_two - date_one
    return duration.days

def calc_yearly_finish_totals(df, player_name, tournament_date):
    features = ['TOURNAMENT_DATE', 'EARNINGS', 'FEDEX_PTS', 'FINISH']
    prev_year_rounds_df = df[features].loc[df['PLAYER_NAME'] == player_name].copy()
    tournament_date = convert_string_to_date(tournament_date)
    prev_year_date = tournament_date - relativedelta(days=369)
    prev_year_rounds_df.loc[:, 'TOURNAMENT_DATE'] = prev_year_rounds_df.apply(lambda row: convert_string_to_date(row['TOURNAMENT_DATE']), axis=1)
    prev_year_rounds_df = prev_year_rounds_df.loc[(prev_year_rounds_df['TOURNAMENT_DATE'] < tournament_date) & (prev_year_rounds_df['TOURNAMENT_DATE'] >= prev_year_date)].sort_values(by='TOURNAMENT_DATE', ascending=False)

    if prev_year_rounds_df.shape[0] == 0:
        return None

    apperances = prev_year_rounds_df.shape[0]
    earnings = prev_year_rounds_df['EARNINGS'].sum() / apperances
    fed_ex_pts = prev_year_rounds_df['FEDEX_PTS'].sum() / apperances
    wins = (prev_year_rounds_df['FINISH'] == 'WIN').sum() / apperances
    top_five = ((prev_year_rounds_df['FINISH'] == 'WIN') | (prev_year_rounds_df['FINISH'] == 'TOP 5')).sum() / apperances
    top_ten = ((prev_year_rounds_df['FINISH'] == 'WIN') | (prev_year_rounds_df['FINISH'] == 'TOP 5') | (prev_year_rounds_df['FINISH'] == 'TOP 10')).sum() / apperances
    top_twenty = ((prev_year_rounds_df['FINISH'] == 'WIN') | (prev_year_rounds_df['FINISH'] == 'TOP 5') | (prev_year_rounds_df['FINISH'] == 'TOP 10') | (prev_year_rounds_df['FINISH'] == 'TOP 20')).sum() / apperances
    made_cuts = ((prev_year_rounds_df['FINISH'] == 'WIN') | (prev_year_rounds_df['FINISH'] == 'TOP 5') | (prev_year_rounds_df['FINISH'] == 'TOP 10') | (prev_year_rounds_df['FINISH'] == 'TOP 20') | (prev_year_rounds_df['FINISH'] == 'MADE CUT')).sum() / apperances

    return earnings, fed_ex_pts, wins, top_five, top_ten, top_twenty, made_cuts, apperances

tournament_df[['T12_EARNINGS', 'T12_FED_EX_PTS', 'T12_WINS', 'T12_TOP_5', 'T12_TOP_10', 'T12_TOP_20', 'T12_MADE_CUTS', 'T12_APPERANCES']] = tournament_df.apply(
        lambda row: pd.Series(calc_yearly_finish_totals(tournament_df, row['PLAYER_NAME'], row['TOURNAMENT_DATE'])), axis=1)
tournament_df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_DATE,ELEVATION,R1_TEMPERATURE,R1_PRECIPITATION,R1_WIND_SPEED,R1_WIND_DIRECTION,R2_TEMPERATURE,R2_PRECIPITATION,R2_WIND_SPEED,...,HL_400_R4_SCR,T12_EARNINGS,T12_FED_EX_PTS,T12_WINS,T12_TOP_5,T12_TOP_10,T12_TOP_20,T12_MADE_CUTS,T12_apperances,T12_APPERANCES
0,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-0.080304,13258.800000,3.400000,0.0,0.0,0.000000,0.000000,0.400000,5.0,5.0
1,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-0.443574,119472.142857,44.285714,0.0,0.0,0.142857,0.142857,0.428571,7.0,7.0
2,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-0.692011,66951.750000,29.250000,0.0,0.0,0.000000,0.000000,0.500000,4.0,4.0
3,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-0.033921,51912.750000,12.625000,0.0,0.0,0.000000,0.000000,0.750000,8.0,8.0
4,2024 ZOZO CHAMPIONSHIP,2024-10-24,82.0,71.2,0.035,9.8,47,65.4,0.012,9.7,...,-0.876433,108649.714286,32.714286,0.0,0.0,0.000000,0.142857,0.857143,7.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11767,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,-0.942029,21451.500000,11.000000,0.0,0.0,0.000000,0.000000,1.000000,1.0,1.0
11768,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,,,,,,,,,,
11769,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,,,,,,,,,,
11770,2017 Masters Tournament,2017-04-06,44.0,61.6,0.272,24.5,253,55.7,0.000,18.7,...,,,,,,,,,,


### Field
- Must decide optimal half life values first

### Weather
- Only helpful if a player finishes different in different conditions
- Would look at some sort of position factor based on weather

In [None]:
# Create weather feature (Weighted average of temp, precipiation, and wind speed)
def weather_weighted_average(temp, precip, wind_speed, elevation):
    weights = [0.1, 0.4, 0.2, 0.3]
    return (temp * -1 * weights[0] + elevation * -1 *weights[1] + precip * weights[2] + wind_speed * weights[3]) / (weights[0] + weights[1] + weights[2] + weights[3])

scaler = StandardScaler()
tournament_df[['R1_TEMPERATURE', 'R1_PRECIPITATION', 'R1_WIND_SPEED', 'R1_WIND_DIRECTION',
'R2_TEMPERATURE', 'R2_PRECIPITATION', 'R2_WIND_SPEED', 'R2_WIND_DIRECTION',
'R3_TEMPERATURE', 'R3_PRECIPITATION', 'R3_WIND_SPEED', 'R3_WIND_DIRECTION', 
'R4_TEMPERATURE', 'R4_PRECIPITATION', 'R4_WIND_SPEED', 'R4_WIND_DIRECTION', 'ELEVATION']] = scaler.fit_transform(tournament_df[['R1_TEMPERATURE', 'R1_PRECIPITATION', 'R1_WIND_SPEED', 'R1_WIND_DIRECTION',
'R2_TEMPERATURE', 'R2_PRECIPITATION', 'R2_WIND_SPEED', 'R2_WIND_DIRECTION', 
'R3_TEMPERATURE', 'R3_PRECIPITATION', 'R3_WIND_SPEED', 'R3_WIND_DIRECTION', 
'R4_TEMPERATURE', 'R4_PRECIPITATION', 'R4_WIND_SPEED', 'R4_WIND_DIRECTION', 'ELEVATION']])

tournament_df['R1_WEATHER_FACTOR'] = tournament_df.apply(lambda row:
    weather_weighted_average(row['R1_TEMPERATURE'], row['R1_PRECIPITATION'], row['R1_WIND_SPEED'], row['ELEVATION']), axis=1)
tournament_df['R2_WEATHER_FACTOR'] = tournament_df.apply(lambda row:
    weather_weighted_average(row['R2_TEMPERATURE'], row['R2_PRECIPITATION'], row['R2_WIND_SPEED'], row['ELEVATION']), axis=1)
tournament_df['R3_WEATHER_FACTOR'] = tournament_df.apply(lambda row:
    weather_weighted_average(row['R3_TEMPERATURE'], row['R3_PRECIPITATION'], row['R3_WIND_SPEED'], row['ELEVATION']), axis=1)
tournament_df['R4_WEATHER_FACTOR'] = tournament_df.apply(lambda row:
    weather_weighted_average(row['R4_TEMPERATURE'], row['R4_PRECIPITATION'], row['R4_WIND_SPEED'], row['ELEVATION']), axis=1)
tournament_df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_DATE,ELEVATION,R1_TEMPERATURE,R1_PRECIPITATION,R1_WIND_SPEED,R1_WIND_DIRECTION,R2_TEMPERATURE,R2_PRECIPITATION,R2_WIND_SPEED,...,T12_WINS,T12_TOP_5,T12_TOP_10,T12_TOP_20,T12_MADE_CUTS,T12_APPERANCES,R1_WEATHER_FACTOR,R2_WEATHER_FACTOR,R3_WEATHER_FACTOR,R4_WEATHER_FACTOR
0,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.000000,0.400000,5.0,-0.173957,-0.161788,-0.376235,0.547781
1,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.142857,0.142857,0.428571,7.0,-0.173957,-0.161788,-0.376235,0.547781
2,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.000000,0.500000,4.0,-0.173957,-0.161788,-0.376235,0.547781
3,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.000000,0.750000,8.0,-0.173957,-0.161788,-0.376235,0.547781
4,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.142857,0.857143,7.0,-0.173957,-0.161788,-0.376235,0.547781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11767,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,1.000000,1.0,1.166005,0.627880,0.064189,-0.194864
11768,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,1.166005,0.627880,0.064189,-0.194864
11769,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,1.166005,0.627880,0.064189,-0.194864
11770,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,1.166005,0.627880,0.064189,-0.194864


## Data Cleaning
- Fill empty values for historical strokes gained, round, and finish data

### Finish Data

In [None]:
apperance_null = tournament_df['T12_APPERANCES'].isna().sum()
print(f"PRE-CLEAN: T12 APPERANCES NULL: {apperance_null}")

tournament_df[['T12_EARNINGS', 'T12_FED_EX_PTS', 
    'T12_WINS', 'T12_TOP_5', 'T12_TOP_10', 'T12_TOP_20', 'T12_MADE_CUTS', 
    'T12_APPERANCES']] = tournament_df[['T12_EARNINGS', 'T12_FED_EX_PTS', 
    'T12_WINS', 'T12_TOP_5', 'T12_TOP_10', 'T12_TOP_20', 'T12_MADE_CUTS', 
    'T12_APPERANCES']].fillna(0.0)

apperance_null = tournament_df['T12_APPERANCES'].isna().sum()
print(f"POST-CLEAN: T12 APPERANCES NULL: {apperance_null}")

PRE-CLEAN: T12 APPERANCES NULL: 0
POST-CLEAN: T12 APPERANCES NULL: 0


### Strokes Gained
- Going to save this for after determine which half lifes are the best for each metric

In [None]:
strokes_gained_null = tournament_df['HL_50_SG_P'].isna().sum()
r1_score_null = tournament_df['HL_50_R1_SCR'].isna().sum()
r3_score_null = tournament_df['HL_50_R3_SCR'].isna().sum()
print(f"PRE-CLEAN:\nHISTORICAL SG NULL: {strokes_gained_null}\nHISTORICAL R1 SCORE NULL: {r1_score_null}\nHISTORICAL R3 SCORE NULL: {r3_score_null}")

# retrieve the field averages for all of the player statistic

PRE-CLEAN:
HISTORICAL SG NULL: 1182
HISTORICAL R1 SCORE NULL: 1182
HISTORICAL R3 SCORE NULL: 1182


## Determine best half life weighting for each performance metric
- Whichever half life for each metric is closer on average for winning players to the actual metric for that tournament will be used

In [None]:
hist_metrics = ['SG_P', 'SG_OTT', 'SG_APR', 'SG_ATG', 'R1_SCR', 'R2_SCR', 'R3_SCR', 'R4_SCR']
act_metrics = ['ADJ_SG_PUTT', 'ADJ_SG_OFF_THE_TEE', 'ADJ_SG_APPROACH', 'ADJ_SG_AROUND_THE_GREEN', 'ADJ_R1_SCORE_TO_PAR', 'ADJ_R2_SCORE_TO_PAR', 'ADJ_R3_SCORE_TO_PAR', 'ADJ_R4_SCORE_TO_PAR']

winning_df = tournament_df.loc[tournament_df['FINISH'] == "WIN"]
best_hl_metrics = []
for index in range(len(hist_metrics)):
    print(hist_metrics[index])
    test_df = pd.DataFrame()
    fifty_name = 'HL_50_' + hist_metrics[index]
    one_hundred_name = 'HL_100_' + hist_metrics[index]
    two_hundred_name = 'HL_200_' + hist_metrics[index]
    four_hundred_name = 'HL_400_' + hist_metrics[index]
    test_df[fifty_name] = np.abs(winning_df[fifty_name] - winning_df[act_metrics[index]])
    test_df[one_hundred_name] = np.abs(winning_df[one_hundred_name] - winning_df[act_metrics[index]])
    test_df[two_hundred_name] = np.abs(winning_df[two_hundred_name] - winning_df[act_metrics[index]])
    test_df[four_hundred_name] = np.abs(winning_df[four_hundred_name] - winning_df[act_metrics[index]])

    best_metric = test_df.mean().idxmin()
    print(test_df.mean())
    print(best_metric)
    print("\n")

    best_hl_metrics.append(best_metric)

print(best_hl_metrics)


SG_P
HL_50_SG_P     0.937353
HL_100_SG_P    0.922847
HL_200_SG_P    0.926036
HL_400_SG_P    0.934189
dtype: float64
HL_100_SG_P


SG_OTT
HL_50_SG_OTT     0.484192
HL_100_SG_OTT    0.476801
HL_200_SG_OTT    0.487056
HL_400_SG_OTT    0.494436
dtype: float64
HL_100_SG_OTT


SG_APR
HL_50_SG_APR     1.114851
HL_100_SG_APR    1.119754
HL_200_SG_APR    1.118382
HL_400_SG_APR    1.117424
dtype: float64
HL_50_SG_APR


SG_ATG
HL_50_SG_ATG     0.516161
HL_100_SG_ATG    0.488943
HL_200_SG_ATG    0.478817
HL_400_SG_ATG    0.485333
dtype: float64
HL_200_SG_ATG


R1_SCR
HL_50_R1_SCR     3.320486
HL_100_R1_SCR    3.298244
HL_200_R1_SCR    3.308203
HL_400_R1_SCR    3.351164
dtype: float64
HL_100_R1_SCR


R2_SCR
HL_50_R2_SCR     3.596590
HL_100_R2_SCR    3.518822
HL_200_R2_SCR    3.555827
HL_400_R2_SCR    3.593483
dtype: float64
HL_100_R2_SCR


R3_SCR
HL_50_R3_SCR     3.151237
HL_100_R3_SCR    3.121699
HL_200_R3_SCR    3.106759
HL_400_R3_SCR    3.118832
dtype: float64
HL_200_R3_SCR


R4_SCR
HL_50_R4_SCR

## Build Classifier
- Features:
    - Historical performance metrics listed in "best_hl_metrics"
    - Finish metrics: T12_EARNINGS, T12_WINS, T12_TOP_5, T12_TOP_10, T12_TOP_20, T12_MADE_CUTS
    - Course metrics: Par, Length
    - Weather metrics: 4 Round Weather Factors
- Target:
    - Finish: Win or Not

### Specify Features and Target

In [4]:
tournament_df = pd.read_csv('/Users/nickospelt/Documents/App_Projects/PGA_Score_Predictor/Model Development/classification_data_v3.csv')
tournament_df.drop(labels=["Unnamed: 0"], axis=1, inplace=True)
tournament_df

Unnamed: 0,TOURNAMENT_NAME,TOURNAMENT_DATE,ELEVATION,R1_TEMPERATURE,R1_PRECIPITATION,R1_WIND_SPEED,R1_WIND_DIRECTION,R2_TEMPERATURE,R2_PRECIPITATION,R2_WIND_SPEED,...,T12_WINS,T12_TOP_5,T12_TOP_10,T12_TOP_20,T12_MADE_CUTS,T12_APPERANCES,R1_WEATHER_FACTOR,R2_WEATHER_FACTOR,R3_WEATHER_FACTOR,R4_WEATHER_FACTOR
0,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.000000,0.400000,5.0,-0.173957,-0.161788,-0.376235,0.547781
1,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.142857,0.142857,0.428571,7.0,-0.173957,-0.161788,-0.376235,0.547781
2,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.000000,0.500000,4.0,-0.173957,-0.161788,-0.376235,0.547781
3,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.000000,0.750000,8.0,-0.173957,-0.161788,-0.376235,0.547781
4,2024 ZOZO CHAMPIONSHIP,2024-10-24,-0.233664,0.553504,-0.230798,-0.553043,-1.441494,-0.262313,-0.425203,-0.654814,...,0.0,0.0,0.000000,0.142857,0.857143,7.0,-0.173957,-0.161788,-0.376235,0.547781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11767,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,1.000000,1.0,1.166005,0.627880,0.064189,-0.194864
11768,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,1.166005,0.627880,0.064189,-0.194864
11769,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,1.166005,0.627880,0.064189,-0.194864
11770,2017 Masters Tournament,2017-04-06,-0.348613,-0.693521,0.675474,2.740376,0.670444,-1.578792,-0.486984,1.426506,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,1.166005,0.627880,0.064189,-0.194864


In [5]:
tournament_df['FINISH_V2'] = np.where(tournament_df['FINISH'] == 'WIN', 'WIN', 'LOSS')
tournament_df['FINISH_V2'].value_counts()

FINISH_V2
LOSS    11684
WIN        88
Name: count, dtype: int64

In [7]:
features = ['PLAYER_NAME', 'HL_100_SG_P', 'HL_100_SG_OTT', 'HL_50_SG_APR', 'HL_200_SG_ATG', 'HL_100_R1_SCR', 'HL_100_R2_SCR', 'HL_200_R3_SCR', 'HL_100_R4_SCR',
            'T12_EARNINGS', 'T12_WINS', 'T12_TOP_5', 'T12_TOP_10', 'T12_TOP_20', 'T12_MADE_CUTS',
            'PAR', 'LENGTH',
            'R1_WEATHER_FACTOR', 'R2_WEATHER_FACTOR', 'R3_WEATHER_FACTOR', 'R4_WEATHER_FACTOR']
target = 'FINISH_V2'
features_and_target = features.copy()
features_and_target.append(target)
features_and_target

['PLAYER_NAME',
 'HL_100_SG_P',
 'HL_100_SG_OTT',
 'HL_50_SG_APR',
 'HL_200_SG_ATG',
 'HL_100_R1_SCR',
 'HL_100_R2_SCR',
 'HL_200_R3_SCR',
 'HL_100_R4_SCR',
 'T12_EARNINGS',
 'T12_WINS',
 'T12_TOP_5',
 'T12_TOP_10',
 'T12_TOP_20',
 'T12_MADE_CUTS',
 'PAR',
 'LENGTH',
 'R1_WEATHER_FACTOR',
 'R2_WEATHER_FACTOR',
 'R3_WEATHER_FACTOR',
 'R4_WEATHER_FACTOR',
 'FINISH_V2']

### Sepearte Validation Set
- Use the two most recent tournaments

In [8]:
validation_df = tournament_df[features_and_target].loc[(tournament_df['TOURNAMENT_NAME'] == '2024 ZOZO CHAMPIONSHIP') | (tournament_df['TOURNAMENT_NAME'] == '2024 The Open')]
validation_df

Unnamed: 0,PLAYER_NAME,HL_100_SG_P,HL_100_SG_OTT,HL_50_SG_APR,HL_200_SG_ATG,HL_100_R1_SCR,HL_100_R2_SCR,HL_200_R3_SCR,HL_100_R4_SCR,T12_EARNINGS,...,T12_TOP_10,T12_TOP_20,T12_MADE_CUTS,PAR,LENGTH,R1_WEATHER_FACTOR,R2_WEATHER_FACTOR,R3_WEATHER_FACTOR,R4_WEATHER_FACTOR,FINISH_V2
0,Nico Echavarria,-0.043504,-0.087887,-1.624501e-01,-0.130415,1.969448,1.076815,0.900512,-0.152142,13258.800000,...,0.000000,0.000000,0.400000,70,7079,-0.173957,-0.161788,-0.376235,0.547781,WIN
1,Justin Thomas,-0.201779,0.146792,1.334040e-01,0.188201,-1.274995,1.206528,-0.895975,0.526803,119472.142857,...,0.142857,0.142857,0.428571,70,7079,-0.173957,-0.161788,-0.376235,0.547781,LOSS
2,Max Greyserman,0.065204,0.524542,-2.345658e-01,-0.071669,-0.727655,0.405319,-0.520694,-0.701681,66951.750000,...,0.000000,0.000000,0.500000,70,7079,-0.173957,-0.161788,-0.376235,0.547781,LOSS
3,Rickie Fowler,0.002496,-0.098260,-3.068023e-01,0.090682,1.143482,-0.981749,-0.003647,0.370427,51912.750000,...,0.000000,0.000000,0.750000,70,7079,-0.173957,-0.161788,-0.376235,0.547781,LOSS
4,Kurt Kitayama,-0.229835,0.105048,2.550274e-01,-0.136549,-0.412208,-0.952341,0.766903,-1.339164,108649.714286,...,0.000000,0.142857,0.857143,70,7079,-0.173957,-0.161788,-0.376235,0.547781,LOSS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603,Alex Cejka,-0.000816,-0.000116,1.230792e-07,0.011451,-0.823643,6.358285,0.161684,-0.000470,0.000000,...,0.000000,0.000000,0.000000,71,7385,0.430575,0.467829,0.359559,0.566617,LOSS
604,Darren Clarke,0.000000,0.000000,0.000000e+00,0.000000,0.213988,3.537821,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,71,7385,0.430575,0.467829,0.359559,0.566617,LOSS
605,Luis Masaveu,,,,,,,,,0.000000,...,0.000000,0.000000,0.000000,71,7385,0.430575,0.467829,0.359559,0.566617,LOSS
606,Andy Ogletree,0.000000,0.000000,0.000000e+00,0.000000,1.823424,3.947360,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,71,7385,0.430575,0.467829,0.359559,0.566617,LOSS


### General Model Building

In [None]:
# df: the data to train/test the model on
# features: the feature labels in df
# target: the target label in df
# model: the model object want to build
# param_grid: the paramters to use in cross validation
# scoring: the metric to optimize the model on
def build_eval_model(df, features, target, model, param_grid, scoring):
    kf = KFold(n_splits=5, shuffle=True)
    #xgb_model = XGBRegressor(objective='reg:squarederror')

    """param_grid = {
        'learning_rate': [0.025, 0.05, 0.1, 0.15],
        'max_depth': [2, 3, 4, 5, 6],
        'n_estimators': [50, 70, 100, 120],
        'subsample': [0.7, 0.8, 0.9],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9]
    }"""

    cv = GridSearchCV(estimator=model, param_grid=param_grid, 
        scoring=scoring, cv=kf, n_jobs=-1, return_train_score=True)
    cv.fit(df[features], df[target])

    # Convert results into a DataFrame
    cv_results = pd.DataFrame(cv.cv_results_)

    # Get best model index
    best_index = cv.best_index_
    best_model = cv.best_estimator_

    # Retrieve Training and Testing MSE
    train_mse = -cv_results.loc[best_index, 'mean_train_score']
    test_mse = -cv_results.loc[best_index, 'mean_test_score']
    params = cv_results.loc[best_index, 'params']

    # Retrieve Feature Importance
    feature_importance = best_model.feature_importances_
    importance_df = pd.DataFrame({'Feature': features, 'Importance': feature_importance})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    print("Best Hyperparameters:", params)
    print("Feature Importance", importance_df)
    print("Training MSE:", train_mse)
    print("Testing MSE:", test_mse)