## Make predictions on upcoming Tournament

In [1]:
import pandas as pd
import numpy as np

from config import majors, metrics, feat_cols, target_col
from utils import create_rolling_agg_features_by_golfer

import pickle

In [5]:
# Build the feature matrix for the upcoming tournament and score it with the
# saved random forest model. The result, `final_df`, is the pre-tourney
# snapshot with an added `sg_preds` column.

# Reading in the historical event and pre tournament data
event_df = pd.read_csv('../data/historical_event_data.csv')
year = 2021
event_name = 'fortinet_championship'
pre_tourney = pd.read_csv(f'../data/{year}_{event_name}_pre_tourney_snapshot.csv')
# Reassign rather than rename in place so the step reads as a plain transform.
pre_tourney = pre_tourney.rename(columns={'bet365': 'close_odds'})

# Appending the new event onto the historical data.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and keeps the original row labels just like append did.
new_event_df = pd.concat(
    [event_df, pre_tourney[['dg_id', 'player_name', 'event_name', 'close_odds']]],
    sort=False,
)
new_event_df['major'] = np.where(new_event_df.event_name.isin(majors), 1, 0)

# Creating the lagging aggregate features: one column per (field, period)
# pair listed in `metrics`, named "<field>_in_prev_<period>_events".
for metric in metrics:
    field = metric.get('field')
    agg = metric.get('agg')
    for period in metric.get('periods'):
        col = f"{field}_in_prev_{period}_events"
        new_event_df[col] = create_rolling_agg_features_by_golfer(new_event_df, field, 1, period, agg)

# Filtering for only the new event: keep the rows with no finishing position.
# NOTE(review): this assumes every historical row has `fin_num` populated - confirm.
# .copy() makes the subset an independent frame so adding `sg_preds` below
# does not raise SettingWithCopyWarning.
new_event_df = new_event_df.loc[pd.isnull(new_event_df.fin_num)].copy()

# Subsetting feature columns.
# NOTE(review): 75 is a magic fill value for missing features; presumably it
# matches what was used when the model was trained - confirm.
X_test = new_event_df[feat_cols].fillna(75)

# Loading in the random forest model.
# The context manager closes the file handle once the model is loaded.
# pickle can execute arbitrary code on load, so only use trusted model files.
with open("../models/rf_model.pkl", "rb") as model_file:
    rf = pickle.load(model_file)

# Making Predictions
new_event_df['sg_preds'] = rf.predict(X_test)

# Merging in the prediction data with the pre-tourney data
final_df = pre_tourney.merge(new_event_df[['dg_id', 'sg_preds']], how='left', on='dg_id')

### Strokes gained predictions relative to DraftKings Salary
The `rnk_diff` field (salary rank minus predicted strokes gained rank) indicates whether we should consider betting on a golfer or staying away from him.
- If `rnk_diff` is extremely **positive**, it means his predicted strokes gained ranks much better than his salary, thus he may be a good bet
- If `rnk_diff` is extremely **negative**, it means his predicted strokes gained ranks much lower than his salary, thus we should not bet on him

In [8]:
# Surfacing Potential Golfers to bet on or stay away from.
# Built as one chain that returns a new frame, so no column is ever assigned
# onto the filtered result of .query() (which raises SettingWithCopyWarning).
final_df = (
    final_df
    .assign(dk_salary=lambda d: d['dk_salary'].astype(int))
    # Keep only golfers with a positive DraftKings salary.
    .query("dk_salary > 0")
    # Rank 1 = highest salary / highest predicted strokes gained.
    .assign(
        dk_salary_rnk=lambda d: d['dk_salary'].rank(ascending=False),
        sg_preds_rnk=lambda d: d['sg_preds'].rank(ascending=False),
    )
    # Positive values: the prediction ranks the golfer better than his salary does.
    .assign(rnk_diff=lambda d: d['dk_salary_rnk'] - d['sg_preds_rnk'])
)

print_df = final_df[['dg_id', 'player_name', 'dk_salary', 'sg_preds',
                     'dk_salary_rnk', 'sg_preds_rnk', 'rnk_diff']]

print('Top 10 Golfers with positive Rnk_Diff')
# Show 10 rows to match the printed label and the bottom-10 cell (was head(20)).
print_df.sort_values('rnk_diff', ascending=False).reset_index(drop=True).head(10)

Top 10 Golfers with positive Rnk_Diff


Unnamed: 0,dg_id,player_name,dk_salary,sg_preds,dk_salary_rnk,sg_preds_rnk,rnk_diff
0,17543,"Kim, Michael",6000,1.717671,150.0,26.0,124.0
1,8769,"Potter Jr, Ted",6100,1.38094,137.0,30.0,107.0
2,7626,"O'Hair, Sean",6200,1.56458,124.5,28.0,96.5
3,22050,"Suh, Justin",6400,1.976423,104.0,20.0,84.0
4,15651,"Ventura, Kristoffer",6100,0.433497,137.0,62.0,75.0
5,19865,"Hardy, Nick",6200,0.806715,124.5,52.0,72.5
6,23505,"McGreevy, Max",6100,0.156018,137.0,78.5,58.5
7,22306,"McCarthy, Josh",6000,-0.186108,150.0,96.0,54.0
8,17881,"Chun-an, Yu",6800,1.928729,71.0,22.0,49.0
9,7340,"Hearn, David",6000,-0.711397,150.0,103.0,47.0


In [9]:
# Whole field, ordered from the highest to the lowest predicted strokes gained
print_df.sort_values(by='sg_preds', ascending=False)

Unnamed: 0,dg_id,player_name,dk_salary,sg_preds,dk_salary_rnk,sg_preds_rnk,rnk_diff
155,19428,"Zalatoris, Will",10500,4.992311,4.0,1.0,3.0
114,11049,"Simpson, Webb",10800,4.924137,3.0,2.0,1.0
102,19195,"Rahm, Jon",12100,4.818204,1.0,3.0,-2.0
74,13562,"Matsuyama, Hideki",11000,4.206463,2.0,4.0,-2.0
140,14013,"Tringale, Cameron",9700,3.903939,6.0,5.0,1.0
...,...,...,...,...,...,...,...
118,22985,"Smotherman, Austin",6300,-3.645803,114.0,152.0,-38.0
148,28558,"Werbylo, Trevor",6200,-3.906288,124.5,153.0,-28.5
121,7602,"Stadler, Kevin",6000,-4.077850,150.0,154.0,-4.0
139,15634,"Trainer, Martin",6000,-4.215740,150.0,155.0,-5.0


In [7]:
# Golfers whose predicted strokes gained rank falls furthest below their salary rank
print('Bottom 10 Golfers with negative Rnk_Diff')
lowest_rnk_diff = print_df.sort_values(by='rnk_diff').reset_index(drop=True)
lowest_rnk_diff.head(10)

Bottom 10 Golfers with negative Rnk_Diff


Unnamed: 0,dg_id,player_name,dk_salary,sg_preds,dk_salary_rnk,sg_preds_rnk,rnk_diff
0,1547,"Mickelson, Phil",8100,0.055884,20.0,89.0,-69.0
1,18761,"Mullinax, Trey",6600,-1.843987,85.5,136.0,-50.5
2,17723,"Lower, Justin",6500,-2.138656,94.0,142.0,-48.0
3,26651,"Young, Cameron",6500,-2.060372,94.0,141.0,-47.0
4,26211,"Augenstein, John",6400,-3.138284,104.0,150.0,-46.0
5,16395,"Kohles, Ben",6400,-2.641082,104.0,148.0,-44.0
6,25002,"Wu, Dylan",6400,-2.625017,104.0,147.0,-43.0
7,22985,"Smotherman, Austin",6300,-3.645803,114.0,152.0,-38.0
8,6169,"Kuchar, Matt",7000,-0.150932,58.5,95.0,-36.5
9,18579,"Cook, Austin",6300,-2.687608,114.0,149.0,-35.0
