# Model Training

In [30]:
import pandas as pd
from surprise import SVD, NMF, Dataset, Reader, dump
from surprise.model_selection import cross_validate

In [2]:
# Load the precomputed user-pair table (columns: user_num1, user_num2, plays, songs).
# NOTE: read_pickle can execute arbitrary code while unpickling; only load files
# that you produced yourself or otherwise trust.
df_user_pairs = pd.read_pickle('data/user_pairs.pkl')
df_user_pairs.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7698683 entries, 117720890 to 87989261
Data columns (total 4 columns):
 #   Column     Dtype
---  ------     -----
 0   user_num1  int32
 1   user_num2  int32
 2   plays      int32
 3   songs      int32
dtypes: int32(4)
memory usage: 176.2 MB


## Calculate score
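- `score = songs / plays`: the number of unique songs divided by the number of plays for a user pair. It can be read as a measure of diversity or variety in the pair's listening behavior: values near 1 mean almost every play was a different song, while values near 0 mean a few songs were played many times.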

In [3]:
# Ratio of unique songs to plays for each user pair (element-wise true division).
df_user_pairs['score'] = df_user_pairs['songs'].div(df_user_pairs['plays'])

In [4]:
# Show floats with four decimals from here on, then summarize the score distribution.
pd.set_option('display.float_format', '{:.4f}'.format)
df_user_pairs['score'].to_frame().describe()

Unnamed: 0,score
count,7698683.0
mean,0.6161
std,0.3613
min,0.0385
25%,0.25
50%,0.5
75%,1.0
max,1.0


In [5]:
# Keep a reproducible 40% random subset of the pairs (fixed seed) for modelling.
# NOTE(review): presumably done to keep training time and memory manageable — confirm.
df_user_pairs_sample = df_user_pairs.sample(frac=0.4, random_state=42)
df_user_pairs_sample.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3079473 entries, 654307979 to 361701112
Data columns (total 5 columns):
 #   Column     Dtype  
---  ------     -----  
 0   user_num1  int32  
 1   user_num2  int32  
 2   plays      int32  
 3   songs      int32  
 4   score      float64
dtypes: float64(1), int32(4)
memory usage: 94.0 MB


In [6]:
# Drop the reference to the full table so its memory can be reclaimed; only
# df_user_pairs_sample is used from here on. Re-run the load and score cells
# before re-running any cell that reads df_user_pairs.
df_user_pairs = None

## Create a Dataset

In [7]:
# (lowest, highest) rating handed to Surprise's Reader, taken from the observed scores.
rating_scale = df_user_pairs_sample['score'].min(), df_user_pairs_sample['score'].max()
rating_scale

(0.038461538461538464, 1.0)

In [8]:
# Surprise reads the three columns positionally as (user id, item id, rating):
# user_num1 plays the "user" role, user_num2 the "item" role, and score is the rating.
ds_ratings = Dataset.load_from_df(
    df_user_pairs_sample[['user_num1', 'user_num2', 'score']],
    Reader(rating_scale=rating_scale),
)

## Matrix Factorization Model: SVD Algorithm

In [35]:
# SVD matrix-factorization model; the seed is fixed so runs are repeatable.
user_user_svd = SVD(
    n_factors=100,
    n_epochs=5,
    lr_all=0.005,
    reg_all=0.02,
    random_state=42,
    verbose=False,
)

In [36]:
# 5-fold cross-validation of the SVD model, reporting RMSE and MAE per fold.
cross_validate(
    user_user_svd,
    ds_ratings,
    measures=['RMSE', 'MAE'],
    cv=5,
    n_jobs=4,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3128  0.3128  0.3127  0.3123  0.3126  0.3127  0.0002  
MAE (testset)     0.2677  0.2678  0.2677  0.2673  0.2676  0.2676  0.0002  
Fit time          6.10    6.17    8.60    6.87    4.72    6.49    1.26    
Test time         6.48    6.87    4.32    5.32    3.98    5.40    1.14    


{'test_rmse': array([0.31282125, 0.31280601, 0.31271819, 0.3122903 , 0.31262715]),
 'test_mae': array([0.26769047, 0.26775908, 0.26765656, 0.26728856, 0.26760341]),
 'fit_time': (6.10103178024292,
  6.168542146682739,
  8.599815607070923,
  6.8695268630981445,
  4.71898889541626),
 'test_time': (6.480862379074097,
  6.872457027435303,
  4.322003126144409,
  5.322954893112183,
  3.9813284873962402)}

## Matrix Factorization Model: NMF Algorithm

In [31]:
# NMF model with a fixed seed, evaluated with 5-fold cross-validation
# on RMSE and MAE.
user_user_nmf = NMF(
    n_factors=100,
    n_epochs=5,
    random_state=42,
    verbose=False,
)
cross_validate(
    user_user_nmf,
    ds_ratings,
    measures=['RMSE', 'MAE'],
    cv=5,
    n_jobs=4,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3567  0.3570  0.3563  0.3568  0.3568  0.3567  0.0002  
MAE (testset)     0.2878  0.2879  0.2873  0.2878  0.2877  0.2877  0.0002  
Fit time          27.17   25.87   25.41   25.64   15.47   23.91   4.27    
Test time         5.74    5.45    4.76    4.37    3.65    4.79    0.75    


{'test_rmse': array([0.35672819, 0.35701825, 0.35630742, 0.35682463, 0.3568213 ]),
 'test_mae': array([0.28779308, 0.28785186, 0.28732793, 0.28775741, 0.28767876]),
 'fit_time': (27.165193796157837,
  25.870254039764404,
  25.41075873374939,
  25.64201593399048,
  15.465425491333008),
 'test_time': (5.737767457962036,
  5.449893951416016,
  4.760817766189575,
  4.3743672370910645,
  3.6481359004974365)}

## Save the Models

In [40]:
# cross_validate only scores the algorithms on held-out folds; with n_jobs > 1 the
# fitting happens on copies in worker processes, so user_user_svd / user_user_nmf
# have not been trained on the whole sample at this point. Fit both on the full
# trainset first so the saved files contain usable models (with a `trainset`).
full_trainset = ds_ratings.build_full_trainset()
user_user_svd.fit(full_trainset)
user_user_nmf.fit(full_trainset)

# Persist the fitted models; the models/ directory must already exist.
dump.dump('models/user_user_svd.dump', algo=user_user_svd, verbose=True)
dump.dump('models/user_user_nmf.dump', algo=user_user_nmf, verbose=True)

The dump has been saved as file models/user_user_svd.dump
The dump has been saved as file models/user_user_nmf.dump


In [10]:
# Peek at a few raw user ids that can be passed to get_recommendations.
df_user_pairs_sample['user_num1'].to_numpy()[:5]

array([63525, 41004, 40891, 22373,  5947], dtype=int32)

In [37]:
def get_recommendations(model, user_num, k=10):
    """Return the ``k`` items with the highest estimated rating for one user.

    Parameters
    ----------
    model
        A fitted Surprise algorithm (e.g. ``SVD`` or ``NMF``). It must expose
        ``trainset`` and ``estimate``, which are only available after ``fit``.
    user_num
        Raw user id, as it appeared in the data the model was trained on.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    list of (raw_item_id, estimated_rating)
        Sorted by estimated rating, highest first. Every item in the trainset
        is scored, including ones the user has already rated.

    Raises
    ------
    ValueError
        Raised by ``trainset.to_inner_uid`` when ``user_num`` is not part of
        the trainset.
    """
    trainset = model.trainset
    # Surprise works on contiguous "inner" ids internally; translate once up front.
    inner_uid = trainset.to_inner_uid(user_num)

    scored_items = []
    for inner_iid in trainset.all_items():
        est = model.estimate(inner_uid, inner_iid)
        # Some algorithms return (estimate, details) instead of a bare float.
        if isinstance(est, tuple):
            est = est[0]
        # Report raw item ids so results are in the same id space as user_num.
        scored_items.append((trainset.to_raw_iid(inner_iid), est))

    scored_items.sort(key=lambda pair: pair[1], reverse=True)
    return scored_items[:k]

In [None]:
# Top-10 recommendations from the SVD model for one example user.
# NOTE(review): this needs a fitted model, and the lookup raises ValueError if
# user 28853 is not in the model's trainset — confirm it is part of the 40% sample.
get_recommendations(user_user_svd, 28853, k=10)