In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the PGA data set from a CSV file (path is relative to the working directory).
pga_df = pd.read_csv(filepath_or_buffer='PGA_Data.csv')

In [3]:
# Remove the fantasy-related and filler columns; drop() returns a new frame,
# which is bound back to the same name.
pga_df = pga_df.drop(
    labels=[
        'hole_DKP', 'hole_FDP', 'hole_SDP',
        'streak_DKP', 'streak_FDP', 'streak_SDP',
        'n_rounds', 'made_cut', 'pos',
        'finish_DKP', 'finish_FDP', 'finish_SDP',
        'total_DKP', 'total_FDP', 'total_SDP',
        'player', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
    ],
    axis='columns',
)

In [4]:
# make a numeric finish column: strip the tie prefix ('T3' -> 3) and turn 'CUT' (and the other non-finish statuses) into 99.

# 99 could be changed to 0, NA, or whatever the cut line is (count in tournament / 2)

# note: the 'Finish' column is an 'object' dtype, so it holds both strings and integers.
# this method turns everything into a string, makes its adjustments, then converts to float so we can do statistical calculations.

def adjust_finish(x, no_finish='99'):
    """Normalise one raw leaderboard 'Finish' value to a numeric-looking string.

    Parameters
    ----------
    x : any
        Raw finish value. It is converted with ``str()`` and stripped of
        surrounding whitespace first, so integers and padded strings are
        accepted as well as clean strings.
    no_finish : str, default '99'
        Value returned for players who did not finish (CUT, DQ, WD, MDF, W/D).
        Lower is better, so the default places them behind every finisher.

    Returns
    -------
    str
        ``no_finish`` for a non-finish status, the value without its leading
        'T' for ties ('T21' -> '21'), otherwise the cleaned text unchanged.
    """
    # todo: determine if we want to treat all non-finish statuses the same
    # todo: because lower is better, non-finishes default to 99, but this can be honed in
    text = str(x).strip()
    if text in ('CUT', 'DQ', 'WD', 'MDF', 'W/D'):
        return no_finish
    if text.startswith('T'):
        # tied position: drop the 'T' prefix and keep the number
        return text[1:]
    return text
# Cast every raw 'Finish' value to text, clean it with adjust_finish, then
# convert the cleaned values to float and store them as 'finish_adj'.
pga_df['finish_adj'] = (
    pga_df['Finish']
    .astype(str)
    .apply(adjust_finish)
    .astype(float)
)


In [5]:
# Spot-check one player: raw 'Finish' next to the derived 'finish_adj'.
pga_df.loc[
    pga_df['Player_initial_last'] == 'S. Scheffler',
    ['Finish', 'finish_adj'],
]

Unnamed: 0,Finish,finish_adj
52,T3,3.0
159,CUT,99.0
648,T21,21.0
842,CUT,99.0
1114,T13,13.0
...,...,...
19977,T45,45.0
21884,T43,43.0
23699,CUT,99.0
26555,,


In [6]:
# Look at one tournament and examine the scoring stats next to the ranked leaderboard.

# Initial findings after research: sg_total is highly correlated with victory, but the best putter doesn't always win, for example.
# Determining a model that weighs each of these stats to predict finish_adj is the early stage of a prediction model, for each tournament.
pga_df.loc[
    pga_df['tournament id'] == 401353275,
    ['Player_initial_last', 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 'finish_adj'],
].sort_values(by='sg_putt', ascending=False)

Unnamed: 0,Player_initial_last,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,finish_adj
9,C. Bezuidenhout,2.36,-0.11,-0.25,-0.76,-1.11,1.25,12.0
17,D. McCarthy,1.65,0.25,-0.90,-0.76,-1.40,0.25,28.0
2,A. Putnam,1.30,0.49,-0.27,-1.01,-0.80,0.50,23.0
60,T. Moore,1.28,0.02,-1.64,-0.17,-1.78,-0.50,44.0
40,M. Kuchar,1.27,-0.39,-0.61,-0.52,-1.52,-0.25,35.0
...,...,...,...,...,...,...,...,...
59,T. Merritt,-1.71,-0.29,0.59,-0.09,0.21,-1.50,58.0
63,W. Clark,-1.85,-0.61,-0.16,0.37,-0.40,-2.25,64.0
36,L. List,-2.14,-0.20,0.12,0.47,0.39,-1.75,61.0
49,S. Kim,-3.07,0.10,0.29,-0.58,-0.18,-3.25,67.0


In [7]:
# Starting to build a model that predicts the finish; the plan was to begin with one
# tournament, the most recent BMW Championship.
# Linear regression will be used.

# Let's try it on the whole data set instead. Rows with a NaN in the target or in any
# of the four feature columns must be dropped first.
pga_df = pga_df.dropna(
    axis='index',
    how='any',
    subset=['finish_adj', 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott'],
)

In [8]:
# y = the target the model learns to predict (finish_adj)
# X = the feature columns fed to the model
y = pga_df['finish_adj']
X = pga_df[['sg_putt', 'sg_arg', 'sg_app', 'sg_ott']]

# Hold out 20% of the rows as a test set: the training set fits the model,
# the test set evaluates it. random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [9]:
# Build the linear regression and fit it on the training split in one step;
# fit() returns the fitted estimator itself, so it can be bound directly.
lm = LinearRegression().fit(X_train, y_train)

# Keep the estimator as the cell's last expression so its repr is still rendered.
lm

LinearRegression()

In [10]:
# Predict finishes for the held-out rows.
y_pred = lm.predict(X_test)

# Score the predictions against the true finishes (the two metrics are independent).
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Report both metrics, MSE first.
print('Mean squared error:', mse)
print('R-squared:', r2)

Mean squared error: 558.2654064071799
R-squared: 0.5631600465812399


In [21]:
# The model has been fit on the training split of the PGA data. Let's see how its
# predictions compare to the actual results of the BMW Championship.
feature_cols = ['sg_putt', 'sg_arg', 'sg_app', 'sg_ott']  # must match the columns the model was fit on

# .copy() makes bmw an independent frame, so adding a column below cannot
# trigger pandas' SettingWithCopyWarning.
bmw = (
    pga_df.loc[
        pga_df['tournament id'] == 401353275,
        ['tournament id', 'player id', 'Player_initial_last', 'sg_putt', 'sg_arg',
         'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 'finish_adj'],
    ]
    .dropna(subset=['finish_adj'])
    .copy()
)

# attach a 'predicted' column: the finish the model predicts from the 4 features
bmw['predicted'] = lm.predict(bmw[feature_cols])

# show player id, actual finish, and predicted finish, sorted by predicted to compare.
# display() is required here: a notebook only renders a cell's LAST expression on its
# own, so without it this table would be computed and silently discarded.
display(bmw[['player id', 'finish_adj', 'predicted']].sort_values('predicted').head(50))

# it works pretty well, identifies the winner but struggles with places 8 thru 15. not sure how valuable this is but it is a start.

# trying this on the entire dataset
# note: pga_df includes the rows the model was trained on, so this is not an out-of-sample check
pga_df['predicted'] = lm.predict(pga_df[feature_cols])
pga_df[['tournament id', 'player id', 'finish_adj', 'predicted']].sort_values('predicted').head(50)

# interesting results, tournament id 401243416 is the memorial tournament where jon rahm, who was leading, WD because of a covid test.


Unnamed: 0,tournament id,player id,finish_adj,predicted
35600,2245,1112,1.0,-61.880975
6655,401243416,9780,99.0,-38.809714
35570,2245,446,2.0,-34.092026
18821,401056514,7082,99.0,-26.723521
10847,401155474,3448,1.0,-23.376473
3522,401353235,6086,1.0,-22.428177
37032,2233,1067,10.0,-22.33805
3727,401353233,5504,1.0,-22.014024
13347,401155419,9261,2.0,-21.649814
35556,2245,5025,55.0,-20.818091


In [None]:
# looking at some results, we will need to come up with a better way to handle CUTs and WDs. 

In [27]:
# Show every row for tournament 2233, ordered by adjusted finish, so the actual
# result can be read next to the 'predicted' column.
pga_df.loc[pga_df['tournament id'] == 2233].sort_values(by='finish_adj')

Unnamed: 0,Player_initial_last,tournament id,player id,hole_par,strokes,tournament name,course,date,purse,season,no_cut,Finish,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,finish_adj,predicted
36990,B. Snedeker,2233,1222,287,265,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,1,1.53,0.78,0.02,0.81,1.61,3.13,1.0,14.630793
37055,N. Watney,2233,1042,287,268,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,2,3.96,-0.18,-0.84,0.19,-0.83,3.13,2.0,11.501854
36994,C. Beljan,2233,3777,287,269,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,3,-1.18,0.21,2.83,0.35,3.39,2.21,3.0,28.710989
37027,J. Day,2233,1680,287,270,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,T4,0.18,0.45,0.36,-0.28,0.53,0.71,4.0,48.084184
37057,P. Perez,2233,707,287,270,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,T4,1.52,1.70,0.07,-0.09,1.69,3.21,4.0,13.640206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37052,M. Thompson,2233,3688,215,214,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,CUT,2.46,1.29,0.60,-2.29,-0.40,2.06,99.0,26.213578
37053,M. Weir,2233,453,215,225,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,CUT,0.88,-0.63,-0.87,-3.33,-4.83,-3.94,99.0,108.384050
37054,N. Taylor,2233,3792,215,215,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,CUT,1.62,0.47,-2.16,0.68,-1.01,0.61,99.0,49.358852
37060,R. Barnes,2233,801,215,214,AT&T Pebble Beach Pro-Am,"Pebble Beach Resort - Pebble Beach, CA",2015-02-15,6.8,2015,0,CUT,2.04,1.26,-0.67,-2.01,-1.42,0.61,99.0,46.817728
