## Recommender Model

In [17]:
#Use conda to install scikit-surprise with the below command
#conda install -c conda-forge scikit-surprise

#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#surprise imports
import surprise
import hyperopt
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise import accuracy
from auto_surprise.engine import Engine

## Load Data

In [6]:
#read the raw runners CSV into a DataFrame and preview its first five rows
raw_df = pd.read_csv(filepath_or_buffer='../../raw/input_runners_2020.csv')
raw_df.head(n=5)

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency
0,0,47.48,6.185043,3.4,18 - 34,F,United States,October,4.0,0.225806
1,0,47.48,6.185043,3.4,18 - 34,F,United States,October,6.0,0.225806
2,0,47.48,6.185043,3.4,18 - 34,F,United States,October,8.0,0.225806
3,0,47.48,6.185043,3.4,18 - 34,F,United States,October,9.0,0.225806
4,0,47.48,6.185043,3.4,18 - 34,F,United States,October,10.0,0.225806


In [8]:
#DataFrame.size is rows x columns (total number of cells), not the number of rows
raw_df.size

27156850

In [9]:
#set "users" to athlete id + month

#convert months to number
#lookup from the lower-cased first three letters of a month name to its number;
#built once here rather than on every call of mtn
_MONTH_NUMBERS = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12,
}


def mtn(x):
    """Convert a month name or abbreviation to its month number (1-12).

    Surrounding whitespace and letter case are ignored, and only the first
    three letters are looked at, so 'October', ' oct ' and 'OCT' all give 10.

    Parameters
    ----------
    x : str
        Month name or abbreviation, e.g. 'October' or 'oct'.

    Returns
    -------
    int
        The month number, 1 for January through 12 for December.

    Raises
    ------
    ValueError
        If x is not a string (e.g. NaN or None) or its first three letters
        do not match a month. The message includes the offending value.
    """
    #non-strings (NaN, None, numbers) have no .strip(); report them the same
    #way as any other invalid month instead of leaking an AttributeError
    if not isinstance(x, str):
        raise ValueError(f'Not a month: {x!r}')

    key = x.strip()[:3].lower()
    try:
        return _MONTH_NUMBERS[key]
    except KeyError:
        #only a failed lookup means "not a month"; anything else should surface
        raise ValueError(f'Not a month: {x!r}') from None
        
#replace each month name in current_month with its month number
#NOTE: this cell cannot be re-run on its own output, because mtn only accepts month names
raw_df["current_month"] = raw_df["current_month"].apply(mtn)

#create user ID
#one "user" per athlete per month, built as the string "<athlete>.<month number>"
raw_df["user_id"] = (
    raw_df["athlete"].astype(str)
    + "."
    + raw_df["current_month"].astype(str)
)

In [10]:
#drop every row whose prev_month_weekly_km is missing
raw_df = raw_df.dropna(subset=["prev_month_weekly_km"])
#.size is rows x columns (cell count), not the number of rows
raw_df.size

27373005

In [11]:
#drop every row whose prev_month_weekly_km is exactly zero
km_is_nonzero = raw_df["prev_month_weekly_km"] != 0
raw_df = raw_df.loc[km_is_nonzero]
#.size is rows x columns (cell count), not the number of rows
raw_df.size

26912776

In [4]:
#preview the first five rows of raw_df
raw_df.head()

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency,user_id
0,0,47.48,6.185043,3.4,18 - 34,F,United States,10,4.0,0.225806,0.1
1,0,47.48,6.185043,3.4,18 - 34,F,United States,10,6.0,0.225806,0.1
2,0,47.48,6.185043,3.4,18 - 34,F,United States,10,8.0,0.225806,0.1
3,0,47.48,6.185043,3.4,18 - 34,F,United States,10,9.0,0.225806,0.1
4,0,47.48,6.185043,3.4,18 - 34,F,United States,10,10.0,0.225806,0.1


In [24]:
#work on a 1% random sample of the rows
#sample WITHOUT replacement so no row is drawn twice (a duplicated rating could
#end up in both the train and the test fold), and fix the seed so the sample,
#and every score computed from it, is reproducible across kernel restarts
sampled_df = raw_df.sample(frac=0.01, replace=False, random_state=20230715)

#define objects for Surprise. Must be in user, item, rating order
data = Dataset.load_from_df(
    sampled_df[["user_id", "run_distance_rounded", "weekly_frequency"]],
    Reader(rating_scale=(0, 7)),
)
#.size is rows x columns (cell count), not the number of sampled rows
sampled_df.size

269126

In [25]:
#setup parameters for tuning
#alternative configuration kept for reference (searches several algorithms):
#engine = Engine(verbose=True, algorithms=['svd', 'svdpp', 'knn_basic', 'knn_with_means', 'knn_baseline'], random_state=20230715)
engine = Engine(algorithms=['knn_with_means'], verbose=True)

#run the search; train hands back the best algorithm, its parameters,
#its score on the target metric, and the tasks object
best_algo, best_params, best_score, tasks = engine.train(
    data=data,
    target_metric='test_rmse',
    hpo_algo=hyperopt.tpe.suggest,
    max_evals=100,
    cpu_time_limit=3 * 60 * 60,  #presumably seconds, i.e. a 3 hour budget -- TODO confirm
)

Evaluating RMSE, MAE, MSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4300  0.4264  0.4225  0.4218  0.4274  0.4256  0.0031  
MAE (testset)     0.3135  0.3090  0.3186  0.3117  0.3103  0.3126  0.0033  
MSE (testset)     0.1849  0.1818  0.1785  0.1779  0.1827  0.1812  0.0026  
Fit time          0.01    0.02    0.02    0.02    0.02    0.02    0.00    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


Starting process with knn_with_means algorithm
Computing the pearson similarity matrix...                                      
Done computing similarity matrix.                                               
Computing the pearson similarity matrix...                                      
Done computing similarity matrix.                                               
Computing the pearson similarity matrix...                                      
Done computing similarity matrix.                                               
Computing the pearson similarity matrix...                                      
Done computing similarity matrix.                                               
Computing the pearson similarity matrix...                                      
Done computing similarity matrix.                                               
Evaluating RMSE, MAE, MSE of algorithm KNNWithMeans on 5 split(s).              
                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean

In [38]:
#write out best results
import json

#copy best_params first: assigning it directly would alias the dict, and the
#summary keys added below would then also appear in best_params itself
results = dict(best_params)
results['algo'] = best_algo
results['rmse'] = best_score

#serialise straight into the file
with open('auto_tune_results.json', 'w') as file:
    json.dump(results, file)

In [40]:
#read results back in
#stored under its own name so the Surprise dataset held in `data` is not
#overwritten by a plain dict (which would break re-running the tuning cell)
with open('auto_tune_results.json') as json_file:
    tuned_results = json.load(json_file)

In [23]:
#EOF