In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Import Data

In [2]:
# Load the user-item interactions table; it is the source of the Surprise dataset built further down.
interactions_df = pd.read_csv("full_interactions_dataset.csv")

In [3]:
# Load the full dataset from CSV into `df`.
df = pd.read_csv("full_dataset.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Preview the first rows of the full dataset.
df.head()

Unnamed: 0.1,Unnamed: 0,timestamp_x,eventType_x,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,...,text,lang,timestamp_y,eventType_y,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,3,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,...,All of this work is still very early. The firs...,en,1459270471,VIEW,2873028073541627603,-3228450896145227905,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,SP,BR,1.0
1,4,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,...,The alarm clock wakes me at 8:00 with stream o...,en,1459539433,VIEW,8414731042150985013,4543899740167763020,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4...,MG,BR,1.0
2,5,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,...,We're excited to share the Google Data Center ...,en,1459476271,VIEW,-7267769888748948232,-1350132153769633851,Android - Native Mobile App,MG,BR,1.0
3,6,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,...,We're excited to share the Google Data Center ...,en,1459476266,VIEW,-7267769888748948232,-1350132153769633851,Android - Native Mobile App,MG,BR,1.0
4,7,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,...,We're excited to share the Google Data Center ...,en,1459476139,LIKE,-7267769888748948232,-1350132153769633851,,,,2.0


# Using Surprise

Notice how there is no X_train or y_train in our values here. Our only features are the ratings of other users and items, so we need to keep everything together. What happens in the train-test split is that Surprise randomly selects certain ratings $r_{ij}$ for users $u_i$ and items $i_j$: 80% of the ratings go into the training set and 20% into the test set. Let's investigate trainset and testset further.

In [5]:
pip install surprise

Note: you may need to restart the kernel to use updated packages.


# Models

Source materials from:
https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b

https://nbviewer.jupyter.org/github/NicolasHug/Surprise/blob/master/examples/notebooks/KNNBasic_analysis.ipynb

In [9]:
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.reader import Reader
from surprise import Dataset

# Train test split

To cross-validate using Surprise, use the Reader class to put the data into the format below. To be used with the Surprise components, the data should be split into three columns (user id, item id, rating).

In [18]:
# The three columns handed to Surprise: user id, item id, rating.
rating_columns = ['personId', 'contentId', 'event_strength']

# The reader declares the ratings to lie on a 0-5 scale.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(interactions_df[rating_columns], reader)

In [21]:
# NOTE: this import rebinds the name `train_test_split`, replacing the
# scikit-learn function of the same name imported in the first cell.
from surprise.model_selection import train_test_split

# The Surprise train_test_split returns the training part as a `Trainset`
# object and the test part as a plain list.
# 20% of the ratings are held out for testing; the seed is fixed so the
# split is repeatable.
train_set, test_set = train_test_split(data, test_size = 0.2, random_state = 42)

In [25]:
# Inspect the type of the training split.
type(train_set)

surprise.trainset.Trainset

In [39]:
# Inspect the type of the test split.
type(test_set)

list

In [45]:
# A Surprise Trainset has no len(); its rating count is exposed as `n_ratings`.
print(f'Len of train set: {train_set.n_ratings}')
# The test set is a plain list, so len() works directly.
print(f'Len of test set: {len(test_set)}')

Len of test set: 7815


# KNN

In [40]:
from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy

In [41]:
# Report how many distinct users and items ended up in the training split.
for label, count in (
    ('Number of users: ', train_set.n_users),
    ('Number of items: ', train_set.n_items),
):
    print(label, count, '\n')


Number of users:  1139 

Number of items:  2904 



# Initial Algorithm Testing

In [33]:
from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, BaselineOnly, CoClustering, KNNWithZScore
from surprise.model_selection import cross_validate


The code below produces a first look at several possible options and the output is given in a table that contains:
- testRMSE
- fit time
- test time

In [34]:
benchmark = []

# Candidate algorithms, each constructed with its default hyperparameters.
candidate_algorithms = [
    SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(),
    KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering(),
]

# Iterate over all algorithms
for algorithm in candidate_algorithms:

    # Perform 3-fold cross validation, scored on RMSE only
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)

    # Average every reported metric (test_rmse, fit_time, test_time) over the folds
    fold_means = pd.DataFrame.from_dict(results).mean(axis=0)

    # Tag the row with the algorithm's class name. Series.append was removed in
    # pandas 2.0, so the label is attached with pd.concat instead, and the name
    # is read from the class rather than parsed out of the object's repr.
    algorithm_label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    benchmark.append(pd.concat([fold_means, algorithm_label]))

# One row per algorithm, lowest test RMSE first
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BaselineOnly,0.671508,0.075826,0.060787
SVDpp,0.672543,39.824948,1.549323
SVD,0.677941,1.690022,0.094554
KNNBaseline,0.702065,0.139999,0.671136
KNNWithMeans,0.711667,0.095373,0.62936
KNNWithZScore,0.712975,0.129521,0.589096
NMF,0.737276,1.820308,0.094625
SlopeOne,0.738245,0.400675,1.024324
CoClustering,0.744264,0.714756,0.102146
KNNBasic,0.747606,0.083608,0.525974


# Baseline Algorithm

Start the modeling process with a baseline model using ALS (alternating least squares). 

**BaselineOnly algorithm predicts the baseline estimate for given user and item.**

In [35]:
from surprise.model_selection import cross_validate

print('Using ALS')

# TODO: these baseline hyperparameters are untested placeholders and still need tuning.
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)

algo = BaselineOnly(bsl_options=bsl_options)

# 3-fold cross-validation on the full dataset, reporting RMSE per fold.
cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)


Using ALS
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.67319505, 0.66816342, 0.67004922]),
 'fit_time': (0.042662858963012695, 0.043261051177978516, 0.04409003257751465),
 'test_time': (0.11980795860290527, 0.05920100212097168, 0.06870818138122559)}

In [36]:
from surprise import BaselineOnly

# Fit the baseline on the training split, then score it on the held-out
# test split; only the test RMSE is computed here.
algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(train_set)
predictions = algo.test(test_set)
accuracy.rmse(predictions)

Estimating biases using als...
RMSE: 0.6591


0.659117769866205

# Evaluate:

In [38]:
def get_Iu(uid, trainset=None):
    """Return the number of items rated by the given user.

    args:
      uid: the raw id of the user
      trainset: optional trainset to look the user up in; it must provide
        ``to_inner_uid`` and ``ur``. Defaults to the notebook-level
        ``train_set`` so existing single-argument calls keep working.
    returns:
      the number of items rated by the user, or 0 when the user is not
      part of the trainset
    """
    if trainset is None:
        # Fall back to the notebook's global training split.
        trainset = train_set
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:  # user was not part of the trainset
        return 0
    
def get_Ui(iid, trainset=None):
    """Return the number of users that have rated the given item.

    args:
      iid: the raw id of the item
      trainset: optional trainset to look the item up in; it must provide
        ``to_inner_iid`` and ``ir``. Defaults to the notebook-level
        ``train_set`` so existing single-argument calls keep working.
    returns:
      the number of users that have rated the item, or 0 when the item is
      not part of the trainset
    """
    if trainset is None:
        # Fall back to the notebook's global training split.
        trainset = train_set
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:  # item was not part of the trainset
        return 0
    
# One row per test-set prediction.
df_eval = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])

# Attach how many training ratings back each prediction's user and item,
# plus the absolute error between the estimate and the true rating.
df_eval['Iu'] = df_eval['uid'].apply(get_Iu)
df_eval['Ui'] = df_eval['iid'].apply(get_Ui)
df_eval['err'] = (df_eval['est'] - df_eval['rui']).abs()

# Sort once by error: the first rows are the best predictions, the last the worst.
ranked_by_err = df_eval.sort_values(by='err')
best_predictions = ranked_by_err.head(10)
worst_predictions = ranked_by_err.tail(10)