In [2]:
#Use conda to install scikit-surprise with the below command
#conda install -c conda-forge scikit-surprise

#libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

#surprise imports
import surprise
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

In [3]:
#load in csv data
# Path is relative to the notebook's working directory.
input_csv = '../../raw/input_runners_2020.csv'
raw_df = pd.read_csv(input_csv)
raw_df.head()

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency
0,0,5.165,4.973678,1.0,18 - 34,F,United States,October,4.0,0.225806
1,0,5.165,4.973678,1.0,18 - 34,F,United States,October,6.0,0.677419
2,0,3.703333,3.42443,0.666667,18 - 34,F,United States,November,5.0,0.7
3,0,3.703333,3.42443,0.666667,18 - 34,F,United States,November,6.0,0.466667
4,0,3.703333,3.42443,0.666667,18 - 34,F,United States,November,8.0,0.233333


In [4]:
#convert pace to minutes
# The column is overwritten with 60 / value, so running this cell a second time
# turns the values back into the originals.
# NOTE(review): presumably the raw value is a speed in km/h, which would make the
# result minutes per km -- confirm against the data dictionary.
pace_col = 'prev_month_weekly_pace'
raw_df[pace_col] = 60 / raw_df[pace_col]
raw_df.head()

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency
0,0,5.165,12.063507,1.0,18 - 34,F,United States,October,4.0,0.225806
1,0,5.165,12.063507,1.0,18 - 34,F,United States,October,6.0,0.677419
2,0,3.703333,17.521165,0.666667,18 - 34,F,United States,November,5.0,0.7
3,0,3.703333,17.521165,0.666667,18 - 34,F,United States,November,6.0,0.466667
4,0,3.703333,17.521165,0.666667,18 - 34,F,United States,November,8.0,0.233333


In [5]:
#set "users" to athlete id + month

#convert months to number
# Lower-case three-letter month prefix -> calendar month number.
# Built once here instead of inside mtn, because mtn is applied row by row.
_MONTH_NUMBERS = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}


def mtn(x):
    """Convert a month name to its month number (1-12).

    Only the first three characters are inspected, after stripping surrounding
    whitespace and lower-casing, so "October", " oct " and "OCT" all map to 10.
    Any string that merely starts with a month prefix is accepted as well.

    Parameters
    ----------
    x : str
        Month name or abbreviation.

    Returns
    -------
    int
        Month number, 1 for January through 12 for December.

    Raises
    ------
    ValueError
        If x is not a string or its prefix is not a known month.
    """
    # Non-string values (e.g. NaN from a missing cell) get the same error as
    # unknown names instead of an AttributeError from .strip().
    if not isinstance(x, str):
        raise ValueError(f'Not a month: {x!r}')
    prefix = x.strip()[:3].lower()
    try:
        return _MONTH_NUMBERS[prefix]
    except KeyError:
        # Catch only the failed lookup; a bare except would also swallow
        # KeyboardInterrupt and unrelated errors.
        raise ValueError(f'Not a month: {x!r}') from None
        
# Replace month names with month numbers. The dtype guard keeps this cell
# re-runnable: once the column is numeric, mtn (which expects strings) is skipped
# instead of failing on integer input.
if not pd.api.types.is_numeric_dtype(raw_df["current_month"]):
    raw_df["current_month"] = raw_df["current_month"].apply(mtn)

#create user ID
# One "user" per athlete per month, built as the string "<athlete>.<month>"
# (athlete 9900 in month 2 becomes "9900.2"). Keep it a string: parsed as a
# number, month 1 ("0.1") and month 10 ("0.10") of the same athlete would collide.
raw_df["user_id"] = raw_df["athlete"].astype(str) + "." + raw_df["current_month"].astype(str)

In [6]:
# Preview the frame to check the numeric current_month values and the user_id column.
raw_df.head()

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency,user_id
0,0,5.165,12.063507,1.0,18 - 34,F,United States,10,4.0,0.225806,0.1
1,0,5.165,12.063507,1.0,18 - 34,F,United States,10,6.0,0.677419,0.1
2,0,3.703333,17.521165,0.666667,18 - 34,F,United States,11,5.0,0.7,0.11
3,0,3.703333,17.521165,0.666667,18 - 34,F,United States,11,6.0,0.466667,0.11
4,0,3.703333,17.521165,0.666667,18 - 34,F,United States,11,8.0,0.233333,0.11


In [47]:
#filter dataset based on targets
# Profile of the new user; the filtering cell compares these against raw_df columns.
gender = "F"
weekly_target = 40        # matched against prev_month_weekly_km (+/- 5)
age_bucket = "18 - 34"
month = mtn("October")    # month number, matched against current_month
number_days = 5           # matched against prev_month_weekly_days_run (+/- 1)
new_id = '000.0'

#cold start for new user
# The new user's id repeated once per target running day. The original literal
# carried a stray '' element with no value, which is a SyntaxError in a dict.
new_user_data = {'user_id': [new_id] * number_days}
new_user_data

{'user_id': ['000.0', '000.0', '000.0', '000.0', '000.0']}

In [40]:
#define objects for Surprise. Must be in user, item, rating order
filtered_df = raw_df.loc[
    (raw_df['gender'] == gender) &
    (raw_df['age_bucket'] == age_bucket) &
    (raw_df['current_month'] == month) &
    (raw_df['prev_month_weekly_days_run'] > (number_days-1)) & 
    (raw_df['prev_month_weekly_days_run'] < (number_days+1)) &
    (raw_df['prev_month_weekly_km'] > weekly_target-5) &
    (raw_df['prev_month_weekly_km'] < weekly_target+5)
]
filtered_df.size
    
data = Dataset.load_from_df(filtered_df[["user_id","run_distance_rounded","weekly_frequency"]], Reader(rating_scale=(0,7)))

In [41]:
#train model with 3 fold cross validation
svd = SVD(verbose=True, n_epochs=10)
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.5019  0.6029  0.4865  0.5304  0.0516  
MAE (testset)     0.3315  0.3830  0.3399  0.3515  0.0226  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_rmse': array([0.50186349, 0.60287584, 0.4864619 ]),
 'test_mae': array([0.33149899, 0.38299108, 0.33988777]),
 'fit_time': (0.0012331008911132812,
  0.0010302066802978516,
  0.0010387897491455078),
 'test_time': (0.0008015632629394531,
  0.0005552768707275391,
  0.0004699230194091797)}

In [42]:
#fit the model using the full dataset
trainset = data.build_full_trainset()
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fdceff06d40>

In [60]:
#generate recommendations
svd.predict(uid="300.11",iid = 5)

Prediction(uid='300.11', iid=5, r_ui=None, est=0.5449523037281014, details={'was_impossible': False})

In [63]:
# Inspect rows where the previous month's weekly distance exceeds 100.
high_volume_mask = raw_df['prev_month_weekly_km'] > 100
raw_df[high_volume_mask]

Unnamed: 0,athlete,prev_month_weekly_km,prev_month_weekly_pace,prev_month_weekly_days_run,age_bucket,gender,country,current_month,run_distance_rounded,weekly_frequency,user_id
1696,10025,118.331667,5.759395,4.500000,18 - 34,M,Sweden,4,8.0,0.233333,10025.4
1697,10025,118.331667,5.759395,4.500000,18 - 34,M,Sweden,4,12.0,0.466667,10025.4
1698,10025,118.331667,5.759395,4.500000,18 - 34,M,Sweden,4,13.0,0.233333,10025.4
1699,10025,118.331667,5.759395,4.500000,18 - 34,M,Sweden,4,15.0,0.233333,10025.4
1700,10025,118.331667,5.759395,4.500000,18 - 34,M,Sweden,4,18.0,0.466667,10025.4
...,...,...,...,...,...,...,...,...,...,...,...
2398011,9900,109.105000,5.902005,5.166667,18 - 34,M,United Kingdom,2,22.0,0.482759,9900.2
2398012,9900,109.105000,5.902005,5.166667,18 - 34,M,United Kingdom,2,23.0,0.241379,9900.2
2398013,9900,109.105000,5.902005,5.166667,18 - 34,M,United Kingdom,2,28.0,0.482759,9900.2
2398014,9900,109.105000,5.902005,5.166667,18 - 34,M,United Kingdom,2,31.0,0.241379,9900.2


In [69]:
# Estimate the rating (weekly_frequency) of user "9900.2" for item 6 (run_distance_rounded).
# NOTE(review): the displayed rows for athlete 9900 are gender M in month 2, while
# the model was trained on rows filtered to gender F in October, so this uid looks
# absent from the trainset -- confirm the estimate is meaningful.
svd.predict(uid="9900.2",iid = 6)

Prediction(uid='9900.2', iid=6, r_ui=None, est=0.5730369502344005, details={'was_impossible': False})