## Recommeder Model

In [1]:
#Use conda to install scikit-surprise with the below command
#conda install -c conda-forge scikit-surprise

#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#surprise imports
import surprise
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import accuracy

## Load Data

In [2]:
#load in csv data
raw_df = pd.read_csv('../../raw/input_runners_2020.csv')
raw_df.head()

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency
0,0,47.48,6.185043,3.4,18 - 34,F,United States,October,4.0,0.225806
1,0,47.48,6.185043,3.4,18 - 34,F,United States,October,6.0,0.225806
2,0,47.48,6.185043,3.4,18 - 34,F,United States,October,8.0,0.225806
3,0,47.48,6.185043,3.4,18 - 34,F,United States,October,9.0,0.225806
4,0,47.48,6.185043,3.4,18 - 34,F,United States,October,10.0,0.225806


In [3]:
#set "users" to athlete id + month

#convert months to number
def mtn(x):
    months = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr':4,
         'may':5,
         'jun':6,
         'jul':7,
         'aug':8,
         'sep':9,
         'oct':10,
         'nov':11,
         'dec':12
        }
    a = x.strip()[:3].lower()
    try:
        ez = months[a]
        return ez
    except:
        raise ValueError('Not a month')
        
raw_df["current_month"] =  raw_df["current_month"].apply(lambda x:mtn(x)) 

#create user ID
raw_df["user_id"] = raw_df["athlete"].astype(str) + "." + raw_df["current_month"].astype(str)

In [4]:
raw_df.head()

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency,user_id
0,0,47.48,6.185043,3.4,18 - 34,F,United States,10,4.0,0.225806,0.1
1,0,47.48,6.185043,3.4,18 - 34,F,United States,10,6.0,0.225806,0.1
2,0,47.48,6.185043,3.4,18 - 34,F,United States,10,8.0,0.225806,0.1
3,0,47.48,6.185043,3.4,18 - 34,F,United States,10,9.0,0.225806,0.1
4,0,47.48,6.185043,3.4,18 - 34,F,United States,10,10.0,0.225806,0.1


## Input New User Info

### Key next steps:
 - Turn into callable functions
 - Build method to add/update data for existing user

In [5]:
#filter dataset based on targets
gender = "F"
weekly_target = 40
age_bucket = "18 - 34"
month = mtn("October")
number_of_days = 5
new_id = '000.0'

#cold start for new user. Assumes long run 2x distance other runs.
new_user_data = { 
    'user_id' : [new_id] * 2,
    'run_distance_rounded' : [round(weekly_target/6), round(weekly_target/3)],
    'weekly_frequency' : [number_of_days -1, 1]
                }
new_user_df = pd.DataFrame(new_user_data)
new_user_df

Unnamed: 0,user_id,run_distance_rounded,weekly_frequency
0,0.0,7,4
1,0.0,13,1


## Train Model Based on User Filters

### Key next steps:
 - Turn into callable functions

In [6]:
#define objects for Surprise. Must be in user, item, rating order
filtered_df = raw_df.loc[
    (raw_df['gender'] == gender) &
    (raw_df['age_bucket'] == age_bucket) &
    (raw_df['current_month'] == month) &
    (raw_df['prev_month_weekly_days_run'] > (number_of_days-1)) & 
    (raw_df['prev_month_weekly_days_run'] < (number_of_days+1)) &
    (raw_df['prev_month_weekly_km'] > weekly_target-5) &
    (raw_df['prev_month_weekly_km'] < weekly_target+5)
]

#append new user data

filtered_df = pd.concat([filtered_df,new_user_df])
    
data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

In [7]:
#train model with 3 fold cross validation
svd = SVD(verbose=True, n_epochs=10)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.4811  0.3168  0.3542  0.3840  0.0703  
MAE (testset)     0.2689  0.2418  0.2690  0.2599  0.0128  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.4810824 , 0.31677534, 0.35419119]),
 'test_mae': array([0.26891425, 0.24183649, 0.26904938]),
 'fit_time': (0.0009505748748779297,
  0.0008702278137207031,
  0.0008618831634521484),
 'test_time': (0.0005669593811035156,
  0.0003905296325683594,
  0.0004947185516357422)}

In [8]:
#fit the model using the full dataset
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9aba90f2b0>

In [9]:
#generate recommendations by generating a list of run distances based on the average expected run lenghy

def generate_run_ratings(user_id, weekly_target, number_of_days, long_run_multiple = 3):
    
    """
    Generates a list of run distances with ratings based on weekly target, 
    number of days run each week, with an optional param that sets the upper 
    bound long run distance
    """
    
    #create list of run lengths
    run_list = range(round(weekly_target/(number_of_days+(long_run_multiple-1))), round((weekly_target/number_of_days) * long_run_multiple))
    results_dict = {'run_distance': [],
                   'run_rating': []}
    
    
    for run in run_list:
        rating_prediction = svd.predict(uid = user_id, iid = run)[3]
        results_dict['run_distance'].append(run)
        results_dict['run_rating'].append(rating_prediction)
        
    return pd.DataFrame(results_dict)


predictions = generate_run_ratings(new_id, weekly_target, number_of_days)

In [10]:
#sorted recommendations
predictions.sort_values(by=['run_rating'], ascending = False)

Unnamed: 0,run_distance,run_rating
1,7,1.056411
7,13,0.827507
5,11,0.711374
4,10,0.694204
9,15,0.664022
11,17,0.659958
8,14,0.651959
12,18,0.643589
17,23,0.640863
0,6,0.639559


## Model Evaluation

In [11]:
import time
import seaborn as sns

In [12]:
#build function to run simulations

def model_RMSE(gender, age_bucket, start_km, end_km, min_num_athletes=10):
    """
    Function to evaluate SDV model, incrementing by 1km through km range for specific age group and gender
    """
    
    #Create empty dict for results
    results_dict = {
        'age_bucket': [],
        'gender': [],
        'weekly_target': [],
        'RMSE': [],
        'athlete_count': []
    }

    start = time.time()
    
    for target_kms in range(start_km, end_km+1): 

        #define objects for Surprise. Must be in user, item, rating order
        filtered_df = raw_df.loc[
            (raw_df['gender'] == gender) &
            (raw_df['age_bucket'] == age_bucket) &
            (raw_df['prev_month_weekly_days_run'] >= (4)) & 
            (raw_df['prev_month_weekly_days_run'] <= (7)) &
            (raw_df['prev_month_weekly_km'] >= target_kms-1) &
            (raw_df['prev_month_weekly_km'] <= target_kms+1)
        ]
        
        athlete_count = filtered_df.size
        
        if filtered_df.size <= min_num_athletes:
            print("insufficient data for",gender,age_bucket,target_kms)
            continue 

        #convert datates    
        data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

        #build model
        svd = SVD(verbose=False, n_epochs=10)
        validation = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)

        #capture outputs for dict from final fold
        results_dict['gender'].append(gender)
        results_dict['age_bucket'].append(age_bucket)
        results_dict['weekly_target'].append(target_kms)
        results_dict['RMSE'].append(validation['test_rmse'][2])
        results_dict['athlete_count'].append(athlete_count)
        
        #clean up memory
        del filtered_df

    end = time.time()
        
    print("Finished in:", end - start)
    
    return pd.DataFrame(results_dict)

df1 = model_RMSE("F","18 - 34",10,100)
df2 = model_RMSE("M","18 - 34",10,100)
df3 = model_RMSE("F","35 - 54",10,100)
df4 = model_RMSE("M","35 - 54",10,100)
df5 = model_RMSE("F","55 +",10,100)
df6 = model_RMSE("M","55 +",10,100)

ValueError: Incorrect value for n_splits=1. Must be >=2 and less than the number of ratings

In [None]:
#combine datasets and graph
graph_df = pd.concat([df1,df2,df3,df4,df5,df6])

sns.set_theme(style="white")

sns.relplot(
    x="weekly_target", 
    y="RMSE", 
    hue="age_bucket", 
    size="athlete_count",
    sizes=(40, 400), 
    alpha=.5, 
    palette="muted",
    height=6, 
    data=graph_df.loc[graph_df['gender'] == "F"]
).set(title="RMSE for Female Runners, in km per week")

In [None]:
sns.relplot(
    x="weekly_target", 
    y="RMSE", 
    hue="age_bucket", 
    size="athlete_count",
    sizes=(40, 400), 
    alpha=.5, 
    palette="muted",
    height=6, 
    data=graph_df.loc[graph_df['gender'] == "M"]
).set(title="RMSE for Male Runners, in km per week")

### Weighted RMSE

For use to compare overall model performance

In [None]:
#calc weighted average RMSE for each set
graph_df = pd.concat([df1,df2,df3,df4,df5,df6])
graph_df["RMSE_weight"] = graph_df["RMSE"]*graph_df["athlete_count"] 
df_SVD = graph_df.groupby(['age_bucket','gender']).agg({'RMSE_weight':'sum','athlete_count':'sum'})
df_SVD["Weighted_RMSE"] = df_SVD["RMSE_weight"]/df_SVD["athlete_count"]

In [None]:
df_SVD

### Baseline KNN

In [None]:
#build function to run simulations

def model_RMSE(gender, age_bucket, start_km, end_km, min_num_athletes=10):
    """
    Function to evaluate SDV model, incrementing by 1km through km range for specific age group and gender
    """
    
    #Create empty dict for results
    results_dict = {
        'age_bucket': [],
        'gender': [],
        'weekly_target': [],
        'RMSE': [],
        'athlete_count': []
    }

    start = time.time()
    
    for target_kms in range(start_km, end_km+1): 

        #define objects for Surprise. Must be in user, item, rating order
        filtered_df = raw_df.loc[
            (raw_df['gender'] == gender) &
            (raw_df['age_bucket'] == age_bucket) &
            (raw_df['prev_month_weekly_days_run'] >= (4)) & 
            (raw_df['prev_month_weekly_days_run'] <= (7)) &
            (raw_df['prev_month_weekly_km'] >= target_kms-1) &
            (raw_df['prev_month_weekly_km'] <= target_kms+1)
        ]
        
        athlete_count = filtered_df.size
        
        if filtered_df.size <= min_num_athletes:
            print("insufficient data for",gender,age_bucket,target_kms)
            continue 

        #convert datates    
        data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

        #build model
        knn = KNNBasic(verbose=False)
        validation = cross_validate(knn, data, measures=['RMSE'], cv=3, verbose=False)

        #capture outputs for dict from final fold
        results_dict['gender'].append(gender)
        results_dict['age_bucket'].append(age_bucket)
        results_dict['weekly_target'].append(target_kms)
        results_dict['RMSE'].append(validation['test_rmse'][2])
        results_dict['athlete_count'].append(athlete_count)
        
        #clean up memory
        del filtered_df

    end = time.time()
        
    print("Finished in:", end - start)
    
    return pd.DataFrame(results_dict)

df1 = model_RMSE("F","18 - 34",10,100)
df2 = model_RMSE("M","18 - 34",10,100)
df3 = model_RMSE("F","35 - 54",10,100)
df4 = model_RMSE("M","35 - 54",10,100)
df5 = model_RMSE("F","55 +",10,100)
df6 = model_RMSE("M","55 +",10,100)

In [None]:
#combine datasets and graph
graph_df = pd.concat([df1,df2,df3,df4,df5,df6])

sns.set_theme(style="white")

sns.relplot(
    x="weekly_target", 
    y="RMSE", 
    hue="age_bucket", 
    size="athlete_count",
    sizes=(40, 400), 
    alpha=.5, 
    palette="muted",
    height=6, 
    data=graph_df.loc[graph_df['gender'] == "F"]
).set(title="RMSE for Female Runners, in km per week")

In [None]:
sns.relplot(
    x="weekly_target", 
    y="RMSE", 
    hue="age_bucket", 
    size="athlete_count",
    sizes=(40, 400), 
    alpha=.5, 
    palette="muted",
    height=6, 
    data=graph_df.loc[graph_df['gender'] == "M"]
).set(title="RMSE for Male Runners, in km per week")

In [None]:
#calc weighted average RMSE for each set
graph_df["RMSE_weight"] = graph_df["RMSE"]*graph_df["athlete_count"] 
df_KNN = graph_df.groupby(['age_bucket','gender']).agg({'RMSE_weight':'sum','athlete_count':'sum'})
df_KNN["Weighted_RMSE"] = df_KNN["RMSE_weight"]/df_KNN["athlete_count"]
df_KNN

In [None]:
df_SVD

### Grid Search Tuning

For tuning, we drop runners with minimal data. Those with 35km or less in weekly training

In [None]:
#restrict data to 35km and greater
tuning_df = raw_df.loc[(raw_df['prev_month_weekly_km'] >= 35)]

#### SVD Tuning

In [None]:
#SVD paramaters
SVDparam = {'n_factors': [20, 50, 80],
            'reg_all': [0.04, 0.06],
            'n_epochs': [10, 20, 30],
            'lr_all': [.022, .005, .01]}
gridSVD = GridSearchCV(SVD, param_grid=SVDparam, measures=["rmse"], cv=3, joblib_verbose = 1, n_jobs = 16)

data = Dataset.load_from_df(tuning_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

In [None]:
gridSVD.fit(data)

In [None]:
print('Best SVD results')
print(gridSVD.best_score['rmse'])
print(gridSVD.best_params['rmse'])

In [None]:
#confirm SVD results

def model_RMSE(gender, age_bucket, start_km, end_km, min_num_athletes=10):
    """
    Function to evaluate SDV model, incrementing by 1km through km range for specific age group and gender
    """
    
    #Create empty dict for results
    results_dict = {
        'age_bucket': [],
        'gender': [],
        'weekly_target': [],
        'RMSE': [],
        'athlete_count': []
    }

    start = time.time()
    
    for target_kms in range(start_km, end_km+1): 

        #define objects for Surprise. Must be in user, item, rating order
        filtered_df = raw_df.loc[
            (raw_df['gender'] == gender) &
            (raw_df['age_bucket'] == age_bucket) &
            (raw_df['prev_month_weekly_days_run'] >= (4)) & 
            (raw_df['prev_month_weekly_days_run'] <= (7)) &
            (raw_df['prev_month_weekly_km'] >= target_kms-1) &
            (raw_df['prev_month_weekly_km'] <= target_kms+1)
        ]
        
        athlete_count = filtered_df.size
        
        if filtered_df.size <= min_num_athletes:
            print("insufficient data for",gender,age_bucket,target_kms)
            continue 

        #convert datates    
        data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

        #build model
        knn = KNNBasic(n_factors = 20, reg_all = 0.06, n_epochs = 10, lr_all = 0.05, verbose=False)
        validation = cross_validate(knn, data, measures=['RMSE'], cv=3, verbose=False)

        #capture outputs for dict from final fold
        results_dict['gender'].append(gender)
        results_dict['age_bucket'].append(age_bucket)
        results_dict['weekly_target'].append(target_kms)
        results_dict['RMSE'].append(validation['test_rmse'][2])
        results_dict['athlete_count'].append(athlete_count)
        
        #clean up memory
        del filtered_df

    end = time.time()
        
    print("Finished in:", end - start)
    
    return pd.DataFrame(results_dict)

df1 = model_RMSE("F","18 - 34",10,100)
df2 = model_RMSE("M","18 - 34",10,100)
df3 = model_RMSE("F","35 - 54",10,100)
df4 = model_RMSE("M","35 - 54",10,100)
df5 = model_RMSE("F","55 +",10,100)
df6 = model_RMSE("M","55 +",10,100)

#calc weighted average RMSE for each set
graph_df["RMSE_weight"] = graph_df["RMSE"]*graph_df["athlete_count"] 
df_SVD = graph_df.groupby(['age_bucket','gender']).agg({'RMSE_weight':'sum','athlete_count':'sum'})
df_SVD["Weighted_RMSE"] = df_SVD["RMSE_weight"]/df_SVD["athlete_count"]
df_SVD

#### KNN Tuning

#### grid search freezes for KNN

In [None]:
'''
KNNparam = { 'k': [30, 40],
             'min_k': [1, 2]
           }

gridKNN = GridSearchCV(KNNBasic, param_grid=KNNparam, measures=["rmse"], joblib_verbose = 1)

data = Dataset.load_from_df(tuning_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))
'''

In [None]:
#gridKNN.fit(data)

In [None]:
'''
print('Best SVD results')
print(gridKNN.best_score['rmse'])
print(gridKNN.best_params['rmse'])
'''

#### manual search

In [None]:
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline

In [None]:
data = Dataset.load_from_df(tuning_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))
knn = KNNWithMeans(verbose=True)
cross_validate(knn, data, measures=['RMSE'], cv=3, verbose=True)

In [None]:
#build function to run simulations

def model_RMSE(gender, age_bucket, start_km, end_km, min_num_athletes=10, function_name = KNNBasic):
    """
    Function to evaluate SDV model, incrementing by 1km through km range for specific age group and gender
    """
    
    #Create empty dict for results
    results_dict = {
        'age_bucket': [],
        'gender': [],
        'weekly_target': [],
        'RMSE': [],
        'athlete_count': []
    }

    start = time.time()
    
    for target_kms in range(start_km, end_km+1): 

        #define objects for Surprise. Must be in user, item, rating order
        filtered_df = raw_df.loc[
            (raw_df['gender'] == gender) &
            (raw_df['age_bucket'] == age_bucket) &
            (raw_df['prev_month_weekly_days_run'] >= (4)) & 
            (raw_df['prev_month_weekly_days_run'] <= (7)) &
            (raw_df['prev_month_weekly_km'] >= target_kms-1) &
            (raw_df['prev_month_weekly_km'] <= target_kms+1)
        ]
        
        athlete_count = filtered_df.size
        
        if filtered_df.size <= min_num_athletes:
            print("insufficient data for",gender,age_bucket,target_kms)
            continue 

        #convert datates    
        data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

        #build model
        knn = function_name(verbose=False)
        validation = cross_validate(knn, data, measures=['RMSE'], cv=3, verbose=False)

        #capture outputs for dict from final fold
        results_dict['gender'].append(gender)
        results_dict['age_bucket'].append(age_bucket)
        results_dict['weekly_target'].append(target_kms)
        results_dict['RMSE'].append(validation['test_rmse'][2])
        results_dict['athlete_count'].append(athlete_count)
        
        #clean up memory
        del filtered_df

    end = time.time()
        
    print("Finished in:", end - start)
    
    return pd.DataFrame(results_dict)

graph_df = pd.concat([df1,df2,df3,df4,df5,df6])

df1 = model_RMSE("F","18 - 34",10,100, function_name=KNNWithMeans)
df2 = model_RMSE("M","18 - 34",10,100, function_name=KNNWithMeans)
df3 = model_RMSE("F","35 - 54",10,100, function_name=KNNWithMeans)
df4 = model_RMSE("M","35 - 54",10,100, function_name=KNNWithMeans)
df5 = model_RMSE("F","55 +",10,100, function_name=KNNWithMeans)
df6 = model_RMSE("M","55 +",10,100, function_name=KNNWithMeans)

In [None]:
def weighted_results(model_name):   
    graph_df = pd.concat([df1,df2,df3,df4,df5,df6])
    graph_df["RMSE_weight"] = graph_df["RMSE"]*graph_df["athlete_count"] 
    df = graph_df.groupby(['age_bucket','gender']).agg({'RMSE_weight':'sum','athlete_count':'sum'})
    df["Weighted_RMSE"] = df["RMSE_weight"]/df["athlete_count"]
    print(model_name)
    return df

weighted_results('KNN with Means')

In [None]:
df1 = model_RMSE("F","18 - 34",10,100, function_name=KNNWithZScore)
df2 = model_RMSE("M","18 - 34",10,100, function_name=KNNWithZScore)
df3 = model_RMSE("F","35 - 54",10,100, function_name=KNNWithZScore)
df4 = model_RMSE("M","35 - 54",10,100, function_name=KNNWithZScore)
df5 = model_RMSE("F","55 +",10,100, function_name=KNNWithZScore)
df6 = model_RMSE("M","55 +",10,100, function_name=KNNWithZScore)

weighted_results('KNN with Z score')

In [None]:
df1 = model_RMSE("F","18 - 34",10,100, function_name=KNNBaseline)
df2 = model_RMSE("M","18 - 34",10,100, function_name=KNNBaseline)
df3 = model_RMSE("F","35 - 54",10,100, function_name=KNNBaseline)
df4 = model_RMSE("M","35 - 54",10,100, function_name=KNNBaseline)
df5 = model_RMSE("F","55 +",10,100, function_name=KNNBaseline)
df6 = model_RMSE("M","55 +",10,100, function_name=KNNBaseline)

weighted_results('KNN with Baseline')

### Over sample the less than 35 results and update KNN

In [None]:
#over sample low distances
low_dist_df = raw_df.loc[(raw_df['prev_month_weekly_km'] < 35)]
oversample_df = low_dist_df.sample(n=round(len(low_dist_df)/2),replace=True, random_state = 1)

#append to raw data
raw_with_oversample = pd.concat([raw_df,oversample_df])

In [None]:
def model_RMSE(gender, age_bucket, start_km, end_km, min_num_athletes=10):
    """
    Function to evaluate SDV model, incrementing by 1km through km range for specific age group and gender
    """
    
    #Create empty dict for results
    results_dict = {
        'age_bucket': [],
        'gender': [],
        'weekly_target': [],
        'RMSE': [],
        'athlete_count': []
    }

    start = time.time()
    
    for target_kms in range(start_km, end_km+1): 

        #define objects for Surprise. Must be in user, item, rating order
        filtered_df = raw_with_oversample.loc[
            (raw_with_oversample['gender'] == gender) &
            (raw_with_oversample['age_bucket'] == age_bucket) &
            (raw_with_oversample['prev_month_weekly_days_run'] >= (4)) & 
            (raw_with_oversample['prev_month_weekly_days_run'] <= (7)) &
            (raw_with_oversample['prev_month_weekly_km'] >= target_kms-1) &
            (raw_with_oversample['prev_month_weekly_km'] <= target_kms+1)
        ]
        
        athlete_count = filtered_df.size
        
        if filtered_df.size <= min_num_athletes:
            print("insufficient data for",gender,age_bucket,target_kms)
            continue 

        #convert datates    
        data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

        #build model
        knn = KNNBasic(verbose=False)
        validation = cross_validate(knn, data, measures=['RMSE'], cv=3, verbose=False)

        #capture outputs for dict from final fold
        results_dict['gender'].append(gender)
        results_dict['age_bucket'].append(age_bucket)
        results_dict['weekly_target'].append(target_kms)
        results_dict['RMSE'].append(validation['test_rmse'][2])
        results_dict['athlete_count'].append(athlete_count)
        
        #clean up memory
        del filtered_df

    end = time.time()
        
    print("Finished in:", end - start)
    
    return pd.DataFrame(results_dict)

df1 = model_RMSE("F","18 - 34",10,100)
df2 = model_RMSE("M","18 - 34",10,100)
df3 = model_RMSE("F","35 - 54",10,100)
df4 = model_RMSE("M","35 - 54",10,100)
df5 = model_RMSE("F","55 +",10,100)
df6 = model_RMSE("M","55 +",10,100)

weighted_results('KNN Basic with oversampling')

In [None]:
#combine datasets and graph
graph_df = pd.concat([df1,df2,df3,df4,df5,df6])

sns.set_theme(style="white")

sns.relplot(
    x="weekly_target", 
    y="RMSE", 
    hue="age_bucket", 
    size="athlete_count",
    sizes=(40, 400), 
    alpha=.5, 
    palette="muted",
    height=6, 
    data=graph_df.loc[graph_df['gender'] == "F"]
).set(title="RMSE for Female Runners, in km per week")

In [None]:
#combine datasets and graph
graph_df = pd.concat([df1,df2,df3,df4,df5,df6])

sns.set_theme(style="white")

sns.relplot(
    x="weekly_target", 
    y="RMSE", 
    hue="age_bucket", 
    size="athlete_count",
    sizes=(40, 400), 
    alpha=.5, 
    palette="muted",
    height=6, 
    data=graph_df.loc[graph_df['gender'] == "M"]
).set(title="RMSE for Male Runners, in km per week")

In [None]:
#EOF