In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np

In [2]:
scores = pd.read_csv('data/historical_round_scores.csv')
pre_tourney = pd.read_csv('pre_tourney_snapshot_2021_08_24.csv')

In [3]:
scores['year_event_golfer'] = (scores['year'].astype(str) +
                                      '_' + scores['event_id'].astype(str) +
                                      '_' + scores['dg_id'].astype(str)
                              )

early_outs = ['CUT','DQ','WD', 'W/D', 'MDF',]
early_out_num = 75

scores['early_out'] = np.where(scores.fin_text.isin(early_outs),1,0)

scores['fin_num'] = (np.where(scores.fin_text.isin(early_outs),
                             early_out_num,
                             scores.fin_text.str.replace('T',''))
                    ).astype(int)

# Aggregating round data to an event level
event_df = (scores.groupby(['dg_id','player_name','event_name',
                            'event_id','fin_text','fin_num','early_out',
                            'year','year_event_golfer','event_completed',]).
            agg(rounds=('round','count'),
                mean_score = ('score', 'mean'),
                mean_sg = ('sg_total', 'mean')
               ).reset_index().sort_values('event_completed')
           )

majors = ['Masters Tournament','The Masters','The Masters #2','The Open Championship',
          'U.S. Open','U.S. Open #2','PGA Championship',
         ]

event_df['major'] = np.where(event_df['event_name'].isin(majors), 1, 0)

In [4]:
def create_rolling_agg_features_by_golfer(df, field, n_shift, n_rolling, agg_func):
    """
    """
    df = df.copy()
    
    df['shifted_field'] = (df.groupby('dg_id')[field]
                           .shift(n_shift)
                           .fillna(0)
                          )
    
    return (df.groupby('dg_id')['shifted_field']
            .transform(lambda x: x.rolling(n_rolling, min_periods=1)
                       .agg(agg_func))
           )

fields = 'early_out'

    

In [5]:
metric_lst = [
    {'field': 'early_out','periods': [1, 2, 5], 'agg': 'sum'},
    {'field': 'fin_num', 'periods': [1, 2, 5], 'agg': 'mean'},
    {'field': 'mean_sg', 'periods': [1, 2, 5], 'agg':'mean'},
    ]


In [6]:
for metric in metric_lst:
    field = metric.get('field')
    agg = metric.get('agg')
    for period in metric.get('periods'):
        col = f"{field}_in_prev_{period}_events"
        event_df[col] = create_rolling_agg_features_by_golfer(event_df, field, 1, period, agg)

In [7]:
# fin_by_course = (scores[['course_num', 'course_name', 'dg_id', 'player_name','event_completed','fin_num']]
#                  .drop_duplicates()
#                  .sort_values('event_completed')
#                 )

# fin_by_course['prev_fin_on_course'] = fin_by_course.groupby(['dg_id','course_num'])['fin_num'].shift(1).fillna(40)

### Training

In [8]:
X = event_df[['major', 'early_out_in_prev_1_events',
       'early_out_in_prev_2_events', 'early_out_in_prev_5_events',
       'fin_num_in_prev_1_events', 'fin_num_in_prev_2_events',
       'fin_num_in_prev_5_events', 'mean_sg_in_prev_1_events',
       'mean_sg_in_prev_2_events', 'mean_sg_in_prev_5_events']]

y = event_df['fin_num']

In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [10]:
rf = RandomForestRegressor( n_estimators=100)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 42)

In [12]:
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [13]:
preds = rf.predict(X_test)

In [14]:
mean_absolute_error(y_test, preds)

21.272203286714863

## Make predictions

In [15]:
player_field = pre_tourney['dg_id'].unique()

In [16]:
pre_tourney['major'] = np.where(pre_tourney.event_name.isin(majors), 1, 0)

In [18]:
new_event_df = event_df.append(pre_tourney[['dg_id', 'player_name','event_name']], sort=False)

In [21]:
for metric in metric_lst:
    field = metric.get('field')
    agg = metric.get('agg')
    for period in metric.get('periods'):
        col = f"{field}_in_prev_{period}_events"
        new_event_df[col] = create_rolling_agg_features_by_golfer(new_event_df, field, 1, period, agg)

In [24]:
new_event_df.query("dg_id == 17550").tail(10)

Unnamed: 0,dg_id,player_name,event_name,event_id,fin_text,fin_num,early_out,year,year_event_golfer,event_completed,rounds,mean_score,mean_sg,major,early_out_in_prev_1_events,early_out_in_prev_2_events,early_out_in_prev_5_events,fin_num_in_prev_1_events,fin_num_in_prev_2_events,fin_num_in_prev_5_events,mean_sg_in_prev_1_events,mean_sg_in_prev_2_events,mean_sg_in_prev_5_events
22227,17550,"van Rooyen, Erik",AT&T Byron Nelson,19.0,CUT,75.0,1.0,2021.0,2021_19_17550,2021-05-16,2.0,71.5,-1.941,0.0,1.0,2.0,2.0,75.0,75.0,56.2,-0.6065,-0.79525,-0.0458
22232,17550,"van Rooyen, Erik",PGA Championship,33.0,CUT,75.0,1.0,2021.0,2021_33_17550,2021-05-23,2.0,76.5,-1.3255,1.0,1.0,2.0,3.0,75.0,75.0,59.8,-1.941,-1.27375,-0.4016
22235,17550,"van Rooyen, Erik",Palmetto Championship at Congaree,538.0,T10,10.0,0.0,2021.0,2021_538_17550,2021-06-13,4.0,69.0,2.32825,0.0,1.0,2.0,4.0,75.0,75.0,62.8,-1.3255,-1.63325,-0.63935
22252,17550,"van Rooyen, Erik",U.S. Open #2,535.0,CUT,75.0,1.0,2021.0,2021_535_17550,2021-06-20,2.0,73.5,0.2525,1.0,0.0,1.0,4.0,10.0,42.5,62.0,2.32825,0.501375,-0.50575
22247,17550,"van Rooyen, Erik",The Open Championship,100.0,CUT,75.0,1.0,2021.0,2021_100_17550,2021-07-18,2.0,71.5,-0.829,1.0,1.0,1.0,4.0,75.0,42.5,62.0,0.2525,1.290375,-0.25845
22226,17550,"van Rooyen, Erik",3M Open,525.0,T58,58.0,0.0,2021.0,2021_525_17550,2021-07-25,4.0,70.5,-0.095,0.0,1.0,2.0,4.0,75.0,75.0,62.0,-0.829,-0.28825,-0.30295
22229,17550,"van Rooyen, Erik",Barracuda Championship,472.0,1,1.0,0.0,2021.0,2021_472_17550,2021-08-08,4.0,65.75,3.9745,0.0,0.0,1.0,3.0,58.0,66.5,58.6,-0.095,-0.462,0.06625
22263,17550,"van Rooyen, Erik",Wyndham Championship,13.0,T37,37.0,0.0,2021.0,2021_13_17550,2021-08-15,4.0,68.0,0.81275,0.0,0.0,0.0,2.0,1.0,29.5,43.8,3.9745,1.93975,1.12625
22241,17550,"van Rooyen, Erik",THE NORTHERN TRUST,27.0,7,7.0,0.0,2021.0,2021_27_17550,2021-08-22,4.0,67.5,2.1985,0.0,0.0,0.0,2.0,37.0,19.0,49.2,0.81275,2.393625,0.82315
64,17550,"Van Rooyen, Erik",BMW Championship,,,,,,,,,,,,0.0,0.0,1.0,7.0,22.0,35.6,2.1985,1.505625,1.21235
