In [20]:
import gzip, json, os, pickle
import pandas as pd
from surprise import Dataset, Reader, accuracy
from surprise import prediction_algorithms
from surprise.model_selection import train_test_split

# Project root; the cells below read from <DIR>/data and write to <DIR>/models.
# Set the INSIGHT_PROJECT_DIR environment variable to run the notebook on a
# different machine; when unset, the original hardcoded location is used.
DIR = os.environ.get('INSIGHT_PROJECT_DIR', '/home/mthommes/insight/project/')

# Load Data

In [21]:
# Load the pickled reviews table from <DIR>/data and preview its first rows.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
file_name = 'reviews_trimmed.pkl'
data_df = pd.read_pickle(os.path.join(DIR, 'data', file_name))
display(data_df.head())

# define rating scale
reader = Reader(rating_scale=(1, 5))

# convert data to dataset
# NOTE(review): Surprise's Dataset.load_from_df documents the column order as
# (user id, item id, rating). Here book_id comes first, so books take the
# "user" role and users the "item" role inside Surprise -- confirm this is
# intentional. Left unchanged because anything that consumes the trained
# models may rely on the current orientation.
col_names = ['book_id', 'user_id', 'rating']
data = Dataset.load_from_df(data_df[col_names], reader)

# training (75%) and test (25%) sets
# A fixed seed makes the split (and therefore the RMSE numbers computed from
# it) repeatable across "Restart Kernel -> Run All".
SPLIT_SEED = 42
trainingSet, testSet = train_test_split(data, test_size=.25, random_state=SPLIT_SEED)

# Alternative (disabled): train on everything and predict all unseen pairs.
#trainingSet = data.build_full_trainset()
#testSet = trainingSet.build_anti_testset() # return a new dataset with user-book pairs not present in the training set

Unnamed: 0,book_id,date_added,date_updated,is_read,rating,read_at,review_id,review_text_incomplete,started_at,user_id
0,22557272,Tue Mar 17 11:10:46 -0700 2015,Wed Mar 22 11:32:22 -0700 2017,1.0,4.0,,089ed6d735e2e2357bd8eb2a0dd2ed42,,,8842281e1d1347389f2ab93d60773d4d
0,8127,Tue Jun 07 10:40:48 -0700 2011,Wed Mar 22 11:46:59 -0700 2017,1.0,5.0,,242ab743400f31a5b8ab89baf0c9e876,,,8842281e1d1347389f2ab93d60773d4d
0,157993,Tue Jun 07 10:39:10 -0700 2011,Tue Jun 07 10:39:10 -0700 2011,1.0,5.0,,2f5d8a16d8b39eb909fe3f2f6b870ca9,,,8842281e1d1347389f2ab93d60773d4d
0,2998,Fri Apr 29 13:09:56 -0700 2011,Wed Mar 22 11:46:53 -0700 2017,1.0,5.0,,4cb1341793c6f4e1ae38e5c3365128fc,,,8842281e1d1347389f2ab93d60773d4d
0,7613,Fri Apr 29 13:09:36 -0700 2011,Wed Mar 22 11:32:12 -0700 2017,1.0,5.0,,ce116905969822c8762a82171e726b2f,,,8842281e1d1347389f2ab93d60773d4d


# Train Models

In [22]:
# --- Build the candidate algorithms (nothing is trained yet) ----------------

# NormalPredictor: predicts a random rating based on the distribution of the
# training set, which is assumed to be normal.
# Note: the training set is NOT normal.
model_NormalPredictor = prediction_algorithms.NormalPredictor()

# BaselineOnly: predicts the baseline estimate for a given user and item.
model_BaselineOnly = prediction_algorithms.BaselineOnly(verbose=False)

# k-NN inspired algorithms share one similarity configuration:
# cosine similarity, with user_based=False (similarities between items).
sim_options = dict(name="cosine", user_based=False)
model_KNNBasic = prediction_algorithms.KNNBasic(sim_options=sim_options, verbose=False)
# KNNWithMeans additionally takes the mean ratings of each user into account.
model_KNNWithMeans = prediction_algorithms.KNNWithMeans(sim_options=sim_options, verbose=False)

# --- Fit every algorithm on the training split, in declaration order --------
for _algo in (model_NormalPredictor,
              model_BaselineOnly,
              model_KNNBasic,
              model_KNNWithMeans):
    _algo.fit(trainingSet)

# Disabled: variant that takes the z-score normalization of each user into account.
#model_KNNWithZScore = prediction_algorithms.knns.model_KNNWithZScore(sim_options=sim_options)
#model_KNNWithZScore.fit(trainingSet)

# Test Models

In [23]:
# Run each fitted model over the held-out test set and keep the resulting
# prediction lists under one name per model. The models are evaluated in the
# order NormalPredictor, BaselineOnly, KNNWithMeans.
(predictions_NormalPredictor,
 predictions_BaselineOnly,
 predictions_KNNWithMeans) = [
    fitted.test(testSet)
    for fitted in (model_NormalPredictor, model_BaselineOnly, model_KNNWithMeans)
]

In [24]:
# Summarise test-set RMSE for every evaluated model in a single table.
col_names = ['Model', 'RMSE']

# (display label, predictions) pairs, in the order the rows should appear.
scored_predictions = [
    ('NormalPredictor', predictions_NormalPredictor),
    ('BaselineOnly', predictions_BaselineOnly),
    ('KNNWithMeans', predictions_KNNWithMeans),
]

# Build the frame in one go from a list of records: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row copies it every time.
# accuracy.rmse also prints an "RMSE: ..." line for each model as it is called.
rmse_df = pd.DataFrame(
    [(label, accuracy.rmse(preds)) for label, preds in scored_predictions],
    columns=col_names,
)


RMSE: 1.3292
RMSE: 0.9114
RMSE: 0.9250


In [25]:
# Render the RMSE comparison table as rich notebook output.
display(rmse_df)

Unnamed: 0,Model,RMSE
0,NormalPredictor,1.32916
1,BaselineOnly,0.911404
2,KNNWithMeans,0.924957


# Save Models

In [8]:
# Persist each fitted model to <DIR>/models as 'model_collab_<label>.sav'.
models_to_save = [
    ('NormalPredictor', model_NormalPredictor),
    ('BaselineOnly', model_BaselineOnly),
    ('KNNWithMeans', model_KNNWithMeans),
]

for model_label, fitted_model in models_to_save:
    file_name = 'model_collab_{}.sav'.format(model_label)
    # The context manager closes (and flushes) the file even if pickling
    # fails; the bare open(...) calls used previously never closed it.
    with open(os.path.join(DIR, 'models', file_name), 'wb') as model_file:
        pickle.dump(fitted_model, model_file)