### Notes:

Old testing notebooks are messy, but located in the 'Old Testing and Notes' folder, which contain previous attempts at feature transformation and model tuning. RandomForestRegressor parameters were determined from a GridSearchCV over a portion of the dataset.

In [1]:
import pandas as pd
import numpy as np
import cloudpickle
import sklearn
df=pd.read_csv("../../assets/assignment/df_train.csv.gz")
events=df['event.id'].unique()

train_set=events[0:100]
test_set=events[100:200]
holdout_set=events[200:300]

train=df.query("`event.id` in @train_set")
test=df.query("`event.id` in @test_set")
holdout=df.query("`event.id` in @holdout_set")

# You notice how I just hard coded some slices in and made the sets the same
# size. You don't have to do this, and in the end, you want to submit your models
# to the autograder using *all* of the data in this dataset. But this is real
# world data so there will be weird format errors, and having a clear holdout
# set will give you a chance to "fail fast" and see those errors crop up without
# having to submit to the autograder
    
# Speaking about the holdout dataset, it will not have a bunch of data in it, like
# race times. That would of course leak the results, and wouldn't be available in
# practice. The addendum to the assignment has this description, and I just copy
# and past it here and create the same thing. This ensures when I am playing with
# my models and then want to evaluate them I won't make a mistake and use a
# column incorrectly.

holdout=holdout.drop(
    columns=['time.end',
             'body.results_certificate',
             'event.results_posted',
             'event.results_posted',
             'event.results_certificate',
             'event.photos_available',
             'event.photos_faces',
             'event.photos_social_sharing',
             'event.results_searchable',
             'corral.id',
             'corral.name',
             'corral.wave',
             'corral.time.close',
             'corral.time.start',
             'result.duration.chip',
             'result.duration.pace',
             'result.rankings',
             'result.splits',
             'result.videos',
             'result.finished',
             'result.disqualified',
             'result.duration'])

# Also, I will garuntee in the holdout set there is data. At least 6 rows per race
holdout=df.groupby(["event.id","clean_categories.name"]).filter(lambda z: len(z)>5)

y=pd.to_timedelta(df['result.duration.chip']).astype(int)


### Evaluate Function

In [2]:
import pandas as pd
import numpy as np
import cloudpickle
import sklearn

# This code simulates the autograder. It is not the full autograder implementation
# but shares an API with the autograder. It expects that your fitted pipeline is
# submitted with the name pipeline.cloudpickle as demonstrated above. This object
# must implement the predict() function. This is done automatically by the sklearn
# Pipeline object if the last element of your pipeline is a classifier which has
# a predict() function. If you are not submitting a Pipeline, and want to do something
# different, you *must* have a predict() function of the same method signature, e.g.:
#
#   predict(self, X, **predict_params)->np.ndarray

# Load holdout data, in this case I'll simulate it by loading the training data
#df=pd.read_csv("../../assets/assignment/df_train.csv.gz")

# And evaluate on all 5k races that we didn't consider for training
#holdout_data=df.query("`event.id`!='583f013a-1e54-4906-87f7-2b625206f5f9' and `clean_categories.name`=='5k'")
holdout_data=holdout

# This is the scoring function to determine model fitness
def score(left: pd.DataFrame, right: pd.DataFrame):
    '''
    Calculates the difference between the left and the right when considering rank of items. 
    This scoring function requires that the two DataFrames have identical indicies, and that
    they each contain only one column of values and no missing values. Props to Blake Atkinson
    for providing MWE indicating issues with autograder version #1.
    '''
    assert(type(left)==pd.DataFrame)
    assert(type(right)==pd.DataFrame)
    assert(len(left)==len(right))
    assert(not np.any(np.isnan(left)))
    assert(not np.any(np.isnan(right)))
    assert(left.index.equals(right.index))
    # convert to ndarrays
    left=left.squeeze()
    right=right.squeeze()
    return np.sum(np.abs(left-right))/(len(left)*(len(left)-1))

# This function runs the prediction model agains a given event/category pair. It
# intentionally loads the student model each time to avoid accidental leakage of data
# between events.
def evaluate(data, pipeline_file='pipeline.cloudpickle'):
    # Load student pipeline
    fitted_pipe = cloudpickle.load(open(pipeline_file,'rb'))
    
    # Separate out the X and y
    X=list(set(data.columns)-{'overall_ranking'})
    y=['overall_ranking']
    
    # Drop any missing results (DNFs)
    data=data.dropna(subset=['overall_ranking'])
    
    # Ensure there is data to actually predict on
    if len(data)==0:
        return np.nan

    # Predict on unseen data
    from IPython.utils import io
    with io.capture_output() as captured:
        predictions=pd.DataFrame(fitted_pipe.predict(data[X]),data.index)
    observed=data[y]
    
    # Generate rankings within this bracket
    observed=pd.DataFrame(data[y].rank(),data.index)
    
    # Return the ratio of the student score
    return pd.Series({"score":score(observed,predictions)})

### Generate Pipeline and Test Against Holdout

In [3]:
import pandas as pd
import numpy as np
import cloudpickle
import sklearn
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import OneHotEncoder

pd.options.display.max_seq_items = 2000
pd.set_option('display.max_columns', 1000)  # or 1000

#y=pd.to_timedelta(train['result.duration.chip']).astype(int) #Uncomment to test on training set

y=pd.to_timedelta(df['result.duration.chip']).astype(int) #Uncomment to test on full set

class CustomTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def fit(self, X, y = None):
        return self
    def transform(self, X, y=None):
        # Just select the features we want
        df=X[["age","sex","category.completed.distance.quantity",
              "category.completed.distance.unit","price","fundraising.goal",
              "clean_category.completed.name","location.state","bib"]]                   # EDIT FEATURES HERE <-------
        
        
        #category.completed.distance.quantity
        df.loc[df.query("`clean_category.completed.name` == '1 mile fun run/walk'").index, 'category.completed.distance.quantity']=1.609
        df.loc[df.query("`clean_category.completed.name` == '1 mile run'").index, 'category.completed.distance.quantity']=1.609
        df.loc[df.query("`clean_category.completed.name` == '10k scenic challenge'").index, 'category.completed.distance.quantity']=10.0
        df.loc[df.query("`clean_category.completed.name` == '5k'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == '5 km run'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == '5k run'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == '5k walk/run'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == '5k run/walk'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == '8k'").index, 'category.completed.distance.quantity']=8.0
        df.loc[df.query("`clean_category.completed.name` == '10k'").index, 'category.completed.distance.quantity']=10.0
        df.loc[df.query("`clean_category.completed.name` == '10k run'").index, 'category.completed.distance.quantity']=10.0
        df.loc[df.query("`clean_category.completed.name` == 'commitment day 5k - master'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == 'life time commitment day 5k'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == 'olympic duathlon'").index, 'category.completed.distance.quantity']=10.0
        df.loc[df.query("`clean_category.completed.name` == 'olympic triathlon'").index, 'category.completed.distance.quantity']=10.0
        df.loc[df.query("`clean_category.completed.name` == 'one mile fun run'").index, 'category.completed.distance.quantity']=1.609
        df.loc[df.query("`clean_category.completed.name` == 'quarter marathon'").index, 'category.completed.distance.quantity']=10.55
        df.loc[df.query("`clean_category.completed.name` == 'half marathon'").index, 'category.completed.distance.quantity']=21.1
        df.loc[df.query("`clean_category.completed.name` == 'sprint duathlon'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == 'sprint triathlon'").index, 'category.completed.distance.quantity']=5.0
        df.loc[df.query("`clean_category.completed.name` == 'midnight streak'").index, 'category.completed.distance.quantity']=5.0
        
        #Clean sex category
        df.loc[df.query("`sex` == 'Male'").index, 'sex']=1
        df.loc[df.query("`sex` == 'M'").index, 'sex']=1
        df.loc[df.query("`sex` == 'Female'").index, 'sex']=0
        df.loc[df.query("`sex` == 'F'").index, 'sex']=0
        df.loc[df.query("`sex` not in [0,1]").index, 'sex']=np.nan

        #Change all completed distances to KM
        df.loc[df['category.completed.distance.unit'] == 'mi', 'category.completed.distance.quantity'] *= 1.609
        df = df.drop(['category.completed.distance.unit'], axis=1)

        #Upper limit on ages
        df.loc[df.query("`age` > 99").index, 'age'] = -1

        #fundraising.goal
        df['fundraising.goal'] = df['fundraising.goal'].fillna(0)
        df['fundraising.goal'] = df['fundraising.goal'].astype(str)
        df['fundraising.goal'] = df['fundraising.goal'].str.replace('$','')
        df['fundraising.goal'] = df['fundraising.goal'].str.replace(',','')
        df['fundraising.goal'] = df['fundraising.goal'].astype(float)
        
        df=df.fillna(-1)
        
        
        return df

def evaluation_function(x):
    #display(pd.Series(x.squeeze()).rank())
    return pd.Series(x.squeeze()).rank().values

from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer

reg=TransformedTargetRegressor(regressor=RandomForestRegressor(max_depth=3, min_samples_split=3, n_estimators=1000, n_jobs=-1), inverse_func=evaluation_function)
#reg.original_predict=reg.predict

transformer = ColumnTransformer(transformers=[
    ('category_name', OneHotEncoder(sparse=False, drop=None, handle_unknown='ignore'), ['clean_category.completed.name']),
    ('state', OneHotEncoder(sparse=False, drop=None, handle_unknown='ignore'), ['location.state'])])

# Build the pipeline
pipe = make_pipeline(CustomTransformer(), transformer, SimpleImputer(missing_values=-1), QuantileTransformer(), reg, verbose=True) #Removed to reduce filesize: KNNImputer(missing_values=-1)

from sklearn import set_config
set_config(display="diagram")
display(pipe)

fitted_pipe=pipe.fit(df,y)


cloudpickle.dump(fitted_pipe, open('pipeline.cloudpickle','wb'))

evaluate(holdout)

[Pipeline] . (step 1 of 5) Processing customtransformer, total=   0.7s
[Pipeline] . (step 2 of 5) Processing columntransformer, total=   0.4s
[Pipeline] ..... (step 3 of 5) Processing simpleimputer, total=   0.5s
[Pipeline]  (step 4 of 5) Processing quantiletransformer, total=   2.2s
[Pipeline]  (step 5 of 5) Processing transformedtargetregressor, total= 1.2min


score    0.267341
dtype: float64