In [18]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read the PGA data CSV into a DataFrame (relative path, resolved against the working directory).
pga_df = pd.read_csv(filepath_or_buffer='PGA_Data.csv')

In [4]:
# Drop the fantasy-related and filler columns listed below.
# drop() returns a new frame, which is rebound to pga_df.
pga_df = pga_df.drop(
    [
        'hole_DKP', 'hole_FDP', 'hole_SDP',
        'streak_DKP', 'streak_FDP', 'streak_SDP',
        'n_rounds', 'made_cut', 'pos',
        'finish_DKP', 'finish_FDP', 'finish_SDP',
        'total_DKP', 'total_FDP', 'total_SDP',
        'player',
        'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
    ],
    axis='columns',
)

In [5]:
# make a column that is the finish, removing ties ('T3' -> '3') and turning 'CUT' (and the other non-finish codes) to 99.

# 99 could be changed to NA, or whatever the cut line is (count in tournament / 2)

# note, the data column is an 'Object' type so it has strings and integers.
# this method turns everything to a string, makes its calculations, then turns to a float so we can do statistical calculations.

def adjust_finish(x):
    """Normalise one raw ``Finish`` value to a numeric-looking string.

    Parameters
    ----------
    x : object
        Raw finish value. It is passed through ``str()`` and stripped of
        surrounding whitespace first, so ints, NaN and padded strings are
        accepted as well as clean strings.

    Returns
    -------
    str
        * ``'99'`` when the value is one of CUT, DQ, WD, MDF or W/D;
        * the text after the leading ``'T'`` for tied finishes
          (``'T21'`` -> ``'21'``);
        * otherwise the stripped text unchanged (``'nan'`` stays ``'nan'``).
    """
    # this treats cut, dq, wd, etc. all as the same and assigns a value of 99
    # todo: determine if we want to treat these all the same
    # todo: because lower is better, NFs are treated as 99, but we can hone this in,
    text = str(x).strip()
    if text in ('CUT', 'DQ', 'WD', 'MDF', 'W/D'):
        return '99'
    if text.startswith('T'):
        # Tied finish: keep only the position that follows the 'T' marker.
        return text[1:]
    return text
# Cast every raw finish to str, normalise it with adjust_finish, then convert the result to float.
pga_df['finish_adj'] = (
    pga_df['Finish']
    .astype(str)
    .map(adjust_finish)
    .astype(float)
)


In [6]:
# Spot-check one player: the raw Finish next to the derived finish_adj.
pga_df.loc[pga_df['Player_initial_last'] == 'S. Scheffler', ['Finish', 'finish_adj']]

Unnamed: 0,Finish,finish_adj
52,T3,3.0
159,CUT,99.0
648,T21,21.0
842,CUT,99.0
1114,T13,13.0
...,...,...
19977,T45,45.0
21884,T43,43.0
23699,CUT,99.0
26555,,


In [7]:
# Look at a single tournament and examine the scoring stats (sg_* columns) alongside the adjusted finish.

# Initial findings after research: sg_total is highly correlated with victory, but the best putter doesn't always win, for example.
# Finding a model that weighs each of these stats to predict finish_adj, per tournament, is the early stage of the prediction model.
pga_df.loc[
    pga_df['tournament id'] == 401353275,
    ['tournament id', 'Player_initial_last', 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'sg_t2g', 'sg_total', 'finish_adj'],
].sort_values(by='sg_putt', ascending=False)

Unnamed: 0,tournament id,Player_initial_last,sg_putt,sg_arg,sg_app,sg_ott,sg_t2g,sg_total,finish_adj
9,401353275,C. Bezuidenhout,2.36,-0.11,-0.25,-0.76,-1.11,1.25,12.0
17,401353275,D. McCarthy,1.65,0.25,-0.90,-0.76,-1.40,0.25,28.0
2,401353275,A. Putnam,1.30,0.49,-0.27,-1.01,-0.80,0.50,23.0
60,401353275,T. Moore,1.28,0.02,-1.64,-0.17,-1.78,-0.50,44.0
40,401353275,M. Kuchar,1.27,-0.39,-0.61,-0.52,-1.52,-0.25,35.0
...,...,...,...,...,...,...,...,...,...
59,401353275,T. Merritt,-1.71,-0.29,0.59,-0.09,0.21,-1.50,58.0
63,401353275,W. Clark,-1.85,-0.61,-0.16,0.37,-0.40,-2.25,64.0
36,401353275,L. List,-2.14,-0.20,0.12,0.47,0.39,-1.75,61.0
49,401353275,S. Kim,-3.07,0.10,0.29,-0.58,-0.18,-3.25,67.0


In [42]:
# Start building a model that predicts the finish, using linear regression.
# The first attempt targeted one tournament only (the most recent BMW Championship):

# bmw_input = pga_df[pga_df['tournament id'] ==  401353275][['tournament id','Player_initial_last', 'sg_putt','sg_arg','sg_app',	'sg_ott','sg_t2g','sg_total','finish_adj']].dropna(subset=['finish_adj'])

# Try the whole data set instead; rows with a NaN in the target or in any feature must be dropped first.
pga_df = pga_df.dropna(
    subset=['finish_adj', 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott'],
    how='any',
)

In [58]:
# X holds the model inputs (features); y holds the value to predict (target).
y = pga_df.loc[:, 'finish_adj']
X = pga_df.loc[:, ['sg_putt', 'sg_arg', 'sg_app', 'sg_ott']]

# Hold out 20% of the rows as a test set for evaluating the model; the rest is used for training.
# The fixed random_state keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [59]:
# Build a LinearRegression estimator and train it on the training split.
# fit() stays the last expression so the cell still echoes the fitted estimator.
lm = LinearRegression()
lm.fit(X=X_train, y=y_train)

LinearRegression()

In [60]:
# Score the fitted model on the held-out test split.
y_pred = lm.predict(X=X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print('Mean squared error:', mse)
print('R-squared:', r2)

Mean squared error: 558.2654064071799
R-squared: 0.5631600465812399


In [61]:
# After fitting on the full data set, pull out the BMW tournament rows to try the model on.
# The chained selection is materialised with .copy() so that `bmw` is an independent frame:
# a 'predicted' column is assigned to it later, and assigning into a frame derived from a
# slice of pga_df can raise pandas' SettingWithCopyWarning.
bmw = (
    pga_df[pga_df['tournament id'] == 401353275][
        ['player id', 'sg_putt', 'sg_arg', 'sg_app', 'sg_ott', 'finish_adj']
    ]
    .dropna(subset=['sg_putt', 'sg_arg', 'sg_app', 'sg_ott'])
    .copy()
)


In [65]:
# The model was fitted on the training split of the full PGA data; compare its output with the BMW Championship results.
# NOTE(review): bmw is taken from the same pga_df that was split for training, so most of these
# rows were probably seen during fitting - this is not an out-of-sample check.

# Add a 'predicted' column: the model's finish estimate from the same 4 features it was trained on.
bmw['predicted'] = lm.predict(bmw.loc[:, ['sg_putt', 'sg_arg', 'sg_app', 'sg_ott']])

# Show player id, actual finish and predicted finish side by side,
# ordered by the prediction so the two rankings can be compared.
bmw.loc[:, ['player id', 'finish_adj', 'predicted']].sort_values(by='predicted').head(50)

# it works pretty well, identifies the winner but struggles with places 8 thru 15. not sure how valuable this is but it is a start.

Unnamed: 0,player id,finish_adj,predicted
44,6007,1.0,20.437595
53,3378,2.0,23.319246
52,9478,3.0,30.421784
32,7083,5.0,33.154065
3,388,5.0,33.875545
10,9126,5.0,34.204795
25,9780,8.0,37.277782
9,9243,12.0,38.10421
62,9658,8.0,38.14587
23,11099,8.0,38.358179
