## Make predictions on upcoming Tournament

In [1]:
import pandas as pd
import numpy as np

from config import majors, metrics, feat_cols, target_col
from utils import create_rolling_agg_features_by_golfer

import pickle

In [7]:
# Read the historical event data and the pre-tournament snapshot for the upcoming event.
event_df = pd.read_csv('../data/historical_event_data.csv')
year = 2021
event_name = 'tour_championship'
# The bet365 odds column is renamed to 'close_odds' so it lines up with the
# column of that name selected for the combined frame below.
pre_tourney = (
    pd.read_csv(f'../data/{year}_{event_name}_pre_tourney_snapshot.csv')
    .rename(columns={'bet365': 'close_odds'})
)

# Append the new event onto the historical data.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. The index is deliberately NOT reset so the resulting
# frame is identical to what append() used to produce.
new_event_df = pd.concat(
    [event_df, pre_tourney[['dg_id', 'player_name', 'event_name', 'close_odds']]],
    sort=False,
)
new_event_df['major'] = np.where(new_event_df.event_name.isin(majors), 1, 0)

# Create the lagging aggregate features: one column per (field, period) pair
# described in `metrics`, named "<field>_in_prev_<period>_events".
for metric in metrics:
    field = metric.get('field')
    agg = metric.get('agg')
    for period in metric.get('periods'):
        col = f"{field}_in_prev_{period}_events"
        new_event_df[col] = create_rolling_agg_features_by_golfer(new_event_df, field, 1, period, agg)

# Keep only the rows of the new event, identified by a missing `fin_num`.
# .copy() gives an independent frame so adding `sg_preds` below does not raise
# pandas' SettingWithCopyWarning.
# NOTE(review): any historical row with a null fin_num would also pass this
# filter -- confirm the historical data never has missing finishes.
new_event_df = new_event_df.loc[new_event_df['fin_num'].isnull()].copy()

# Subset the feature columns and fill missing feature values.
# NOTE(review): 75 is the fill value for missing features; presumably it has to
# match whatever was used when the model was trained -- confirm.
MISSING_FEATURE_FILL = 75
X_test = new_event_df[feat_cols].fillna(MISSING_FEATURE_FILL)

# Load the random forest model. The context manager closes the file handle
# (the previous bare open() call leaked it).
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
with open("../models/rf_model.pkl", "rb") as model_file:
    rf = pickle.load(model_file)

# Make predictions for the new event.
new_event_df['sg_preds'] = rf.predict(X_test)

# Merge the predictions back onto the pre-tournament data (left join on dg_id,
# so every row of the snapshot is kept).
final_df = pre_tourney.merge(new_event_df[['dg_id', 'sg_preds']], how='left', on='dg_id')

### Strokes gained predictions relative to DraftKings Salary
The `rnk_diff` field indicates whether, based on the predictions, we should consider betting on a golfer or staying away from him.
- If `rnk_diff` is extremely **positive**, it means his predicted strokes gained ranks much better than his salary, thus he may be a good bet
- If `rnk_diff` is extremely **negative**, it means his predicted strokes gained ranks much lower than his salary, thus we should not bet on him

In [10]:
# Surface golfers to consider betting on or staying away from.
# Everything is built in one chain: .assign() returns a new, independent frame,
# so the rank columns are no longer written onto the result of .query()
# (which raised pandas' SettingWithCopyWarning).
final_df = (
    final_df
    .assign(dk_salary=lambda df: df['dk_salary'].astype(int))
    # Keep only golfers that have a DraftKings salary.
    .query("dk_salary > 0")
    .assign(
        # Rank 1 = highest salary / highest predicted strokes gained.
        dk_salary_rnk=lambda df: df['dk_salary'].rank(ascending=False),
        sg_preds_rnk=lambda df: df['sg_preds'].rank(ascending=False),
        # Positive: the golfer's prediction rank is better than his salary rank.
        rnk_diff=lambda df: df['dk_salary_rnk'] - df['sg_preds_rnk'],
    )
)

# Columns shown in the summary tables.
print_df = final_df[['dg_id', 'player_name', 'dk_salary', 'sg_preds',
                     'dk_salary_rnk', 'sg_preds_rnk', 'rnk_diff']]

print('Top 10 Golfers with positive Rnk_Diff')
print_df.sort_values('rnk_diff', ascending=False).reset_index(drop=True).head(10)

Top 10 Golfers with positive Rnk_Diff


Unnamed: 0,dg_id,player_name,dk_salary,sg_preds,dk_salary_rnk,sg_preds_rnk,rnk_diff
0,7672,"Oosthuizen, Louis",7600,4.960929,17.0,3.0,14.0
1,17550,"Van Rooyen, Erik",5000,1.978188,29.0,20.0,9.0
2,17488,"Im, Sungjae",7300,3.771367,18.0,12.0,6.0
3,16243,"Koepka, Brooks",7800,4.03946,16.0,10.0,6.0
4,19483,"Burns, Sam",8500,4.203351,13.0,7.0,6.0
5,18238,"Ancer, Abraham",9600,4.510238,9.0,5.0,4.0
6,17576,"Conners, Corey",6200,2.561568,22.0,18.0,4.0
7,10091,"McIlroy, Rory",9300,4.409452,10.0,6.0,4.0
8,14139,"Thomas, Justin",11000,5.125974,5.0,2.0,3.0
9,14577,"English, Harris",8700,4.075792,12.0,9.0,3.0


In [12]:
# Show the ten golfers with the most negative rnk_diff, i.e. whose predicted
# strokes-gained rank trails their salary rank by the widest margin.
print('Bottom 10 Golfers with negative Rnk_Diff')
(
    print_df
    .sort_values(by='rnk_diff')
    .reset_index(drop=True)
    .iloc[:10]
)

Bottom 10 Golfers with negative Rnk_Diff


Unnamed: 0,dg_id,player_name,dk_salary,sg_preds,dk_salary_rnk,sg_preds_rnk,rnk_diff
0,15466,"Cantlay, Patrick",13400,3.418857,1.0,15.0,-14.0
1,19841,"DeChambeau, Bryson",12300,3.254097,3.0,16.0,-13.0
2,19195,"Rahm, Jon",13000,3.573888,2.0,14.0,-12.0
3,15856,"Smith, Cameron",10100,3.10011,8.0,17.0,-9.0
4,22085,"Morikawa, Collin",8300,2.336286,14.0,19.0,-5.0
5,13562,"Matsuyama, Hideki",6400,0.961295,21.0,24.0,-3.0
6,12337,"Kokrak, Jason",6600,1.290809,20.0,23.0,-3.0
7,17606,"Berger, Daniel",5800,0.655094,24.0,26.0,-2.0
8,5665,"Cink, Stewart",5200,-1.049379,27.0,29.0,-2.0
9,18417,"Scheffler, Scottie",6800,1.882162,19.0,21.0,-2.0


**Note:** The Tour Championship is an unusual tournament because the field does not start on even terms,
which is why many of the best golfers are in the bottom 10.
These predictions are not usable for this event.
Hold off until the next event with standard scoring.