## Recommeder Model

In [36]:
#Use conda to install scikit-surprise with the below command
#conda install -c conda-forge scikit-surprise

#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#surprise imports
import surprise
import hyperopt
import json
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import accuracy
from auto_surprise.engine import Engine

## Load Data

In [5]:
#load in csv data
raw_df = pd.read_csv('../../raw/input_runners_2020.csv')
raw_df.head()

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency
0,0,47.48,6.185043,3.4,18 - 34,F,United States,October,4.0,0.225806
1,0,47.48,6.185043,3.4,18 - 34,F,United States,October,6.0,0.225806
2,0,47.48,6.185043,3.4,18 - 34,F,United States,October,8.0,0.225806
3,0,47.48,6.185043,3.4,18 - 34,F,United States,October,9.0,0.225806
4,0,47.48,6.185043,3.4,18 - 34,F,United States,October,10.0,0.225806


In [6]:
raw_df.size

27156850

In [7]:
#set "users" to athlete id + month

#convert months to number
def mtn(x):
    months = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr':4,
         'may':5,
         'jun':6,
         'jul':7,
         'aug':8,
         'sep':9,
         'oct':10,
         'nov':11,
         'dec':12
        }
    a = x.strip()[:3].lower()
    try:
        ez = months[a]
        return ez
    except:
        raise ValueError('Not a month')
        
raw_df["current_month"] =  raw_df["current_month"].apply(lambda x:mtn(x)) 

#create user ID
raw_df["user_id"] = raw_df["athlete"].astype(str) + "." + raw_df["current_month"].astype(str)

In [8]:
#remove any missings from the data
raw_df= raw_df[raw_df["prev_month_weekly_km"].notna()]
raw_df.size

27373005

In [9]:
#remove 0s
raw_df= raw_df[raw_df["prev_month_weekly_km"]!=0]
raw_df.size

26912776

In [23]:
sampled_df = raw_df.sample(frac = 0.05, replace = True)

#define objects for Surprise. Must be in user, item, rating order    
data = Dataset.load_from_df(sampled_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))
sampled_df.size

1345641

In [None]:
#setup parameters for tuning
#engine = Engine(verbose=True, algorithms=['svd', 'svdpp', 'knn_basic', 'knn_with_means', 'knn_baseline'])
engine = Engine(verbose=True, algorithms=['svdpp'])

best_algo, best_params, best_score, tasks = engine.train(
    data=data,
    target_metric='test_rmse',
    cpu_time_limit=60*60*8,
    max_evals=20,
    hpo_algo=hyperopt.tpe.suggest
)

In [None]:
#write out best results
results = best_params
results['algo'] = best_algo
results['rmse'] = best_score

with open('auto_tune_results.json', 'w') as file:
    file.write(json.dumps(results))

In [16]:
#write out best results
algo = "svd"
path = algo + 'auto_tune_results.json'
results = {}
results['algo'] = 'test2'

with open(path, 'w') as file:
    file.write(json.dumps(results))

In [None]:
for model in ['svd', 'svdpp', 'knn_basic', 'knn_with_means', 'knn_baseline']:
    
    engine = Engine(verbose=True, algorithms=[model])

    best_algo, best_params, best_score, tasks = engine.train(
        data=data,
        target_metric='test_rmse',
        cpu_time_limit=60*60*2,
        max_evals=20,
        hpo_algo=hyperopt.tpe.suggest
    )
        
    path = model + 'auto_tune_results.json'
    results = best_params
    results['algo'] = best_algo
    results['rmse'] = best_score

    with open(path, 'w') as file:
        file.write(json.dumps(results))
        

Evaluating RMSE, MAE, MSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4209  0.4240  0.4142  0.4209  0.4154  0.4191  0.0037  
MAE (testset)     0.3086  0.3125  0.3082  0.3097  0.3076  0.3093  0.0017  
MSE (testset)     0.1772  0.1797  0.1716  0.1771  0.1725  0.1756  0.0031  
Fit time          0.10    0.10    0.11    0.10    0.12    0.11    0.01    
Test time         0.07    0.07    0.07    0.07    0.22    0.10    0.06    


Starting process with svd algorithm
Evaluating RMSE, MAE, MSE of algorithm SVD on 5 split(s).                                                 
                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std                                     
RMSE (testset)    0.3182  0.3264  0.3331  0.3300  0.3243  0.3264  0.0051  
MAE (testset)     0.2059  0.2073  0.2095  0.2097  0.2087  0.2082  0.0014  
MSE (testset)     0.1012  0.1065  0.1110  0.1089  0.1052  0.1066  0.0033  
Fit time          12.57   9.32    12.61   10.03   12.58   11.42   1.44    
Test time         0.25    0.09    0.09    0.20    0.09    0.14    0.07    
Evaluating RMSE, MAE, MSE of algorithm SVD on 5 split(s).                                                 
                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std                                     
RMSE (testset)    0.3230  0.3221  0.3179  0.3184  0.3050  0.3173  0.0064  
MAE (testset)     0.2041  0.2006  0.2004  0.2001  0.1969  0.2004  0.0023  
MSE (testse

Evaluating RMSE, MAE, MSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4140  0.4191  0.4194  0.4191  0.4167  0.4177  0.0021  
MAE (testset)     0.3088  0.3105  0.3087  0.3099  0.3095  0.3095  0.0007  
MSE (testset)     0.1714  0.1757  0.1759  0.1757  0.1737  0.1744  0.0017  
Fit time          0.09    0.10    0.12    0.11    0.12    0.11    0.01    
Test time         0.22    0.07    0.07    0.19    0.07    0.13    0.07    


Starting process with svdpp algorithm
Evaluating RMSE, MAE, MSE of algorithm SVDpp on 5 split(s).                                               
                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std                                     
RMSE (testset)    0.2932  0.2968  0.3030  0.3089  0.2973  0.2998  0.0055  
MAE (testset)     0.1961  0.1984  0.1983  0.2001  0.1988  0.1983  0.0013  
MSE (testset)     0.0860  0.0881  0.0918  0.0954  0.0884  0.0899  0.0033  
Fit time          27.36   25.52   27.63   25.55   27.27   26.67   0.93    
Test time         0.15    0.14    0.14    0.14    0.14    0.15    0.00    
Evaluating RMSE, MAE, MSE of algorithm SVDpp on 5 split(s).                                               
                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std                                     
RMSE (testset)    0.3145  0.3131  0.3090  0.3142  0.3071  0.3116  0.0030  
MAE (testset)     0.2067  0.2041  0.2055  0.2081  0.2106  0.2070  0.0022  
MSE (test

In [3]:
#read results back in 
with open('svdauto_tune_results.json') as json_file:
    data = json.load(json_file)
data

{'lr_bi': 0.0026091015953433926,
 'lr_bu': 0.0005280929964239043,
 'lr_pu': 0.01314852295352534,
 'lr_qi': 0.027761447092886575,
 'n_epochs': 110,
 'n_factors': 79,
 'reg_bi': 0.003163318480771626,
 'reg_bu': 0.09706485078334093,
 'reg_pu': 0.001501289759229521,
 'reg_qi': 0.03204258644534104,
 'algo': 'svd',
 'rmse': 0.29545557249412063}

In [4]:
#SVD PP
with open('svdppauto_tune_results.json') as json_file:
    data = json.load(json_file)
data

{'lr_bi': 0.0005151212831720828,
 'lr_bu': 0.0009288888065908822,
 'lr_pu': 0.009868712726952077,
 'lr_qi': 0.023835406402101852,
 'lr_yj': 0.00015168489156722396,
 'n_epochs': 181,
 'n_factors': 87,
 'reg_bi': 0.0003976840617776604,
 'reg_bu': 0.005246827975729171,
 'reg_pu': 0.00956582556158568,
 'reg_qi': 0.0312118125180339,
 'reg_yj': 0.08246206940085145,
 'algo': 'svdpp',
 'rmse': 0.29431712433251966}

In [5]:
#KNN Baseline
with open('2_5pctrun/knn_baselineauto_tune_results.json') as json_file:
    data = json.load(json_file)
data

{'bsl_options': {'method': 'als',
  'n_epochs': 22,
  'reg_i': 48.35241195444295,
  'reg_u': 12.044236277774175},
 'k': 438,
 'min_k': 41,
 'sim_options': {'name': 'pearson_baseline',
  'user_based': False,
  'min_support': 64,
  'shrinkage': 230},
 'algo': 'knn_baseline',
 'rmse': 0.2952774453430375}

## Evaluate on group level model

In [37]:
import time
import seaborn as sns
from surprise import SVD, SVDpp
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNBaseline

In [38]:
#load in csv data
raw_df_2020 = pd.read_csv('../../raw/input_runners_2020.csv')
raw_df_2019 = pd.read_csv('../../raw/input_runners_2019.csv')

raw_df = pd.concat([raw_df_2020,raw_df_2019])
raw_df.size

54313700

In [39]:
#convert months to number
def mtn(x):
    months = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr':4,
         'may':5,
         'jun':6,
         'jul':7,
         'aug':8,
         'sep':9,
         'oct':10,
         'nov':11,
         'dec':12
        }
    a = x.strip()[:3].lower()
    try:
        ez = months[a]
        return ez
    except:
        raise ValueError('Not a month')
        
raw_df["current_month"] =  raw_df["current_month"].apply(lambda x:mtn(x)) 

#create user ID
raw_df["user_id"] = raw_df["athlete"].astype(str) + "." + raw_df["current_month"].astype(str)

#remove any missings from the data
raw_df= raw_df[raw_df["prev_month_weekly_km"].notna()]

#remove 0s
raw_df= raw_df[raw_df["prev_month_weekly_km"]!=0]
raw_df.size

53825552

In [40]:
#function to weight rmse by pop numbers
def weighted_results(model_name):   
    graph_df = pd.concat([df1,df2,df3,df4,df5,df6])
    graph_df["RMSE_weight"] = graph_df["RMSE"]*graph_df["athlete_count"] 
    df = graph_df.groupby(['age_bucket','gender']).agg({'RMSE_weight':'sum','athlete_count':'sum'})
    df["Weighted_RMSE"] = df["RMSE_weight"]/df["athlete_count"]
    print(model_name)
    return df

In [47]:
def filter_df(gender, age_bucket, start_km, min_num_athletes=10, function_name = KNNBasic, **kwargs):
    """
    Function to evaluate collaborative filtering models, incrementing by 1km through km range for specific age group and gender
    """
    
    #Create empty dict for results
    results_dict = {
        'age_bucket': [],
        'gender': [],
        'weekly_target': [],
        'RMSE': [],
        'athlete_count': []
    }

    
    #define objects for Surprise. Must be in user, item, rating order
    filtered_df = raw_df.loc[
        (raw_df['gender'] == gender) &
        (raw_df['age_bucket'] == age_bucket) &
        (raw_df['prev_month_weekly_days_run'] >= (4)) & 
        (raw_df['prev_month_weekly_days_run'] <= (7)) &
        (raw_df['prev_month_weekly_km'] >= start_km-1) &
        (raw_df['prev_month_weekly_km'] <= start_km+1)
    ]
        
    return  filtered_df  

df1 = filter_df("F","18 - 34",10,100, function_name=KNNBaseline)
df1['athlete'].size

6

In [64]:
#build function to run simulations

def model_RMSE(gender, age_bucket, start_km, end_km, min_num_athletes=10, function_name = KNNBasic, **kwargs):
    """
    Function to evaluate collaborative filtering models, incrementing by 1km through km range for specific age group and gender
    """
    
    #Create empty dict for results
    results_dict = {
        'age_bucket': [],
        'gender': [],
        'weekly_target': [],
        'RMSE': [],
        'athlete_count': []
    }

    start = time.time()
    
    for target_kms in range(start_km, end_km+1): 

        #define objects for Surprise. Must be in user, item, rating order
        filtered_df = raw_df.loc[
            (raw_df['gender'] == gender) &
            (raw_df['age_bucket'] == age_bucket) &
            (raw_df['prev_month_weekly_days_run'] >= (4)) & 
            (raw_df['prev_month_weekly_days_run'] <= (7)) &
            (raw_df['prev_month_weekly_km'] >= target_kms-1) &
            (raw_df['prev_month_weekly_km'] <= target_kms+1)
        ]
        
        athlete_count = filtered_df['athlete'].size
        
        if filtered_df.size <= min_num_athletes:
            print("insufficient data for",gender,age_bucket,target_kms)
            continue 

        #convert datates    
        data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))
        
        #build model
        model = function_name(verbose=False, **kwargs)
        validation = cross_validate(model, data, measures=['RMSE'], cv=5, verbose=False)

        #capture outputs for dict from final fold
        results_dict['gender'].append(gender)
        results_dict['age_bucket'].append(age_bucket)
        results_dict['weekly_target'].append(target_kms)
        results_dict['RMSE'].append(validation['test_rmse'][2])
        results_dict['athlete_count'].append(athlete_count)
        
        #clean up memory
        del filtered_df

    end = time.time()
        
    print("Finished in:", end - start)
    
    return pd.DataFrame(results_dict)

#read in results
with open('2_5pctrun/knn_baselineauto_tune_results.json') as json_file:
    data = json.load(json_file)

df1 = model_RMSE("F","18 - 34",10,100, min_num_athletes=50, function_name=KNNBaseline, 
                 bsl_options = data['bsl_options'], sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df2 = model_RMSE("M","18 - 34",10,100, min_num_athletes=50, function_name=KNNBaseline, 
                 bsl_options = data['bsl_options'], sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df3 = model_RMSE("F","35 - 54",10,100, min_num_athletes=50, function_name=KNNBaseline, 
                 bsl_options = data['bsl_options'], sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df4 = model_RMSE("M","35 - 54",10,100, min_num_athletes=50, function_name=KNNBaseline, 
                 bsl_options = data['bsl_options'], sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df5 = model_RMSE("F","55 +",10,100, min_num_athletes=50, function_name=KNNBaseline, 
                 bsl_options = data['bsl_options'], sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df6 = model_RMSE("M","55 +",10,100, min_num_athletes=50, function_name=KNNBaseline, 
                 bsl_options = data['bsl_options'], sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])

weighted_results('KNN with Baseline')

insufficient data for F 18 - 34 11
Finished in: 38.79423189163208
insufficient data for M 18 - 34 11
Finished in: 45.713014364242554
Finished in: 39.80094790458679
Finished in: 58.912473917007446
insufficient data for F 55 + 10
insufficient data for F 55 + 11
insufficient data for F 55 + 12
insufficient data for F 55 + 13
insufficient data for F 55 + 14
insufficient data for F 55 + 15
insufficient data for F 55 + 18
insufficient data for F 55 + 93
insufficient data for F 55 + 94
insufficient data for F 55 + 97
insufficient data for F 55 + 98
insufficient data for F 55 + 99
insufficient data for F 55 + 100
Finished in: 33.75628995895386
insufficient data for M 55 + 10
insufficient data for M 55 + 13
insufficient data for M 55 + 14
Finished in: 34.18601369857788
KNN with Baseline


Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE_weight,athlete_count,Weighted_RMSE
age_bucket,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18 - 34,F,79630.641648,244318,0.32593
18 - 34,M,199693.186663,611362,0.326637
35 - 54,F,97342.287343,283974,0.342786
35 - 54,M,393665.005302,1163970,0.338209
55 +,F,10631.380452,27784,0.382644
55 +,M,51340.862761,142990,0.359052


In [68]:
#Basic KNN
with open('2_5pctrun/knn_basicauto_tune_results.json') as json_file:
    data = json.load(json_file)

df1 = model_RMSE("F","18 - 34",10,100, min_num_athletes=50, function_name=KNNBasic, 
                 sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df2 = model_RMSE("M","18 - 34",10,100, min_num_athletes=50, function_name=KNNBasic, 
                 sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df3 = model_RMSE("F","35 - 54",10,100, min_num_athletes=50, function_name=KNNBasic, 
                 sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df4 = model_RMSE("M","35 - 54",10,100, min_num_athletes=50, function_name=KNNBasic, 
                 sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df5 = model_RMSE("F","55 +",10,100, min_num_athletes=50, function_name=KNNBasic, 
                 sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])
df6 = model_RMSE("M","55 +",10,100, min_num_athletes=50, function_name=KNNBasic, 
                 sim_options = data['sim_options'], k = data['k'], min_k = data['min_k'])

weighted_results('KNN Basic')

insufficient data for F 18 - 34 11
Finished in: 38.00745224952698
insufficient data for M 18 - 34 11
Finished in: 43.21994614601135
Finished in: 38.76235103607178
Finished in: 54.12440776824951
insufficient data for F 55 + 10
insufficient data for F 55 + 11
insufficient data for F 55 + 12
insufficient data for F 55 + 13
insufficient data for F 55 + 14
insufficient data for F 55 + 15
insufficient data for F 55 + 18
insufficient data for F 55 + 93
insufficient data for F 55 + 94
insufficient data for F 55 + 97
insufficient data for F 55 + 98
insufficient data for F 55 + 99
insufficient data for F 55 + 100
Finished in: 33.78467679023743
insufficient data for M 55 + 10
insufficient data for M 55 + 13
insufficient data for M 55 + 14
Finished in: 33.833582639694214
KNN Basic


Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE_weight,athlete_count,Weighted_RMSE
age_bucket,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18 - 34,F,88516.041335,244318,0.362298
18 - 34,M,216502.779743,611362,0.354132
35 - 54,F,109621.876211,283974,0.386028
35 - 54,M,432970.339507,1163970,0.371977
55 +,F,11618.229166,27784,0.418163
55 +,M,56343.088244,142990,0.394035


In [69]:
#KNN with means
with open('2_5pctrun/knn_with_meansauto_tune_results.json') as json_file:
    data = json.load(json_file)

df1 = model_RMSE("F","18 - 34",10,100, min_num_athletes=50, function_name=KNNWithMeans, 
                 sim_options = data['sim_options'], min_k = data['min_k'])
df2 = model_RMSE("M","18 - 34",10,100, min_num_athletes=50, function_name=KNNWithMeans, 
                 sim_options = data['sim_options'], min_k = data['min_k'])
df3 = model_RMSE("F","35 - 54",10,100, min_num_athletes=50, function_name=KNNWithMeans, 
                 sim_options = data['sim_options'], min_k = data['min_k'])
df4 = model_RMSE("M","35 - 54",10,100, min_num_athletes=50, function_name=KNNWithMeans, 
                 sim_options = data['sim_options'], min_k = data['min_k'])
df5 = model_RMSE("F","55 +",10,100, min_num_athletes=50, function_name=KNNWithMeans, 
                 sim_options = data['sim_options'], min_k = data['min_k'])
df6 = model_RMSE("M","55 +",10,100, min_num_athletes=50, function_name=KNNWithMeans, 
                 sim_options = data['sim_options'], min_k = data['min_k'])

weighted_results('KNN with means')


insufficient data for F 18 - 34 11
Finished in: 39.10376024246216
insufficient data for M 18 - 34 11
Finished in: 45.33127212524414
Finished in: 39.53412222862244
Finished in: 57.33166289329529
insufficient data for F 55 + 10
insufficient data for F 55 + 11
insufficient data for F 55 + 12
insufficient data for F 55 + 13
insufficient data for F 55 + 14
insufficient data for F 55 + 15
insufficient data for F 55 + 18
insufficient data for F 55 + 93
insufficient data for F 55 + 94
insufficient data for F 55 + 97
insufficient data for F 55 + 98
insufficient data for F 55 + 99
insufficient data for F 55 + 100
Finished in: 33.832945346832275
insufficient data for M 55 + 10
insufficient data for M 55 + 13
insufficient data for M 55 + 14
Finished in: 34.400203704833984
KNN with means


Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE_weight,athlete_count,Weighted_RMSE
age_bucket,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18 - 34,F,78189.417235,244318,0.320031
18 - 34,M,201652.892714,611362,0.329842
35 - 54,F,99739.186968,283974,0.351226
35 - 54,M,394854.748029,1163970,0.339231
55 +,F,10766.853789,27784,0.38752
55 +,M,53046.319267,142990,0.370979


In [81]:
#SVD
with open('svdauto_tune_results.json') as json_file:
    data = json.load(json_file)
data.pop('algo')
data.pop('rmse')

df1 = model_RMSE("F","18 - 34",10,100, min_num_athletes=50, function_name=SVD, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'])
df2 = model_RMSE("M","18 - 34",10,100, min_num_athletes=50, function_name=SVD, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'])
df3 = model_RMSE("F","35 - 54",10,100, min_num_athletes=50, function_name=SVD, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'])
df4 = model_RMSE("M","35 - 54",10,100, min_num_athletes=50, function_name=SVD, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'])
df5 = model_RMSE("F","55 +",10,100, min_num_athletes=50, function_name=SVD, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'])
df6 = model_RMSE("M","55 +",10,100, min_num_athletes=50, function_name=SVD, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'])

weighted_results('SVD')


insufficient data for F 18 - 34 11
Finished in: 59.70398736000061
insufficient data for M 18 - 34 11
Finished in: 102.51276111602783
Finished in: 64.46778297424316
Finished in: 171.27458453178406
insufficient data for F 55 + 10
insufficient data for F 55 + 11
insufficient data for F 55 + 12
insufficient data for F 55 + 13
insufficient data for F 55 + 14
insufficient data for F 55 + 15
insufficient data for F 55 + 18
insufficient data for F 55 + 93
insufficient data for F 55 + 94
insufficient data for F 55 + 97
insufficient data for F 55 + 98
insufficient data for F 55 + 99
insufficient data for F 55 + 100
Finished in: 36.174323081970215
insufficient data for M 55 + 10
insufficient data for M 55 + 13
insufficient data for M 55 + 14
Finished in: 46.21928286552429
SVD


Unnamed: 0_level_0,Unnamed: 1_level_0,RMSE_weight,athlete_count,Weighted_RMSE
age_bucket,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18 - 34,F,43998.035703,244318,0.180085
18 - 34,M,112303.727597,611362,0.183694
35 - 54,F,54782.891022,283974,0.192915
35 - 54,M,220560.71194,1163970,0.18949
55 +,F,5221.505402,27784,0.187932
55 +,M,26557.809623,142990,0.185732


In [None]:
#SVD PP
with open('svdppauto_tune_results.json') as json_file:
    data = json.load(json_file)
data.pop('algo')
data.pop('rmse')

df1 = model_RMSE("F","18 - 34",10,100, min_num_athletes=50, function_name=SVDpp, lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], 
                 lr_qi = data['lr_qi'], lr_yj = data['lr_yj'], n_epochs = data['n_epochs'], n_factors = data['n_factors'], reg_bi= data['reg_bi'], 
                 reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'], reg_yj = data['reg_yj'])
df2 = model_RMSE("M","18 - 34",10,100, min_num_athletes=50, function_name=SVDpp, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'],
                 lr_yj = data['lr_yj'], reg_yj = data['reg_yj'])
df3 = model_RMSE("F","35 - 54",10,100, min_num_athletes=50, function_name=SVDpp, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'],
                 lr_yj = data['lr_yj'], reg_yj = data['reg_yj'])
df4 = model_RMSE("M","35 - 54",10,100, min_num_athletes=50, function_name=SVDpp, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'],
                 lr_yj = data['lr_yj'], reg_yj = data['reg_yj'])
df5 = model_RMSE("F","55 +",10,100, min_num_athletes=50, function_name=SVDpp, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'],
                 lr_yj = data['lr_yj'], reg_yj = data['reg_yj'])
df6 = model_RMSE("M","55 +",10,100, min_num_athletes=50, function_name=SVDpp, 
                 lr_bi = data['lr_bi'], lr_bu = data['lr_bu'], lr_pu = data['lr_pu'], lr_qi = data['lr_qi'], n_epochs = data['n_epochs'], 
                 n_factors = data['n_factors'], reg_bi= data['reg_bi'], reg_bu = data['reg_bu'], reg_pu = data['reg_pu'], reg_qi = data['reg_qi'],
                 lr_yj = data['lr_yj'], reg_yj = data['reg_yj'])

weighted_results('SVD')

insufficient data for F 18 - 34 11
Finished in: 826.069821357727
insufficient data for M 18 - 34 11
Finished in: 2103.8918023109436
Finished in: 932.873875617981


In [67]:
def test1(**kwargs):
    print('in 1')
    test2(**kwargs)
    
def test2(**kwargs):
    print(kwargs)
    
test1(a='1', b='2', c='3')

in 1
{'a': '1', 'b': '2', 'c': '3'}


In [None]:
#EOF