In [1]:
pip install google



In [3]:
from google.colab import drive

# Mount Google Drive so the data files under /content/drive can be read and written
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [None]:
!pip install numpy
!pip install scikit-surprise
import os
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.accuracy import rmse
from collections import defaultdict
from surprise import KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import accuracy
from surprise import AlgoBase

In [None]:
# Load the ratings CSV from the mounted Drive into a DataFrame and preview the first rows
ratings_path = '/content/drive/MyDrive/data/ratings.csv'
ratings = pd.read_csv(ratings_path)
ratings.head()

In [None]:
# Re-key the ratings columns to the itemID / userID / rating names used by the later cells
ratings_map = {
    'itemID': list(ratings['movieId']),
    'userID': list(ratings['userId']),
    'rating': list(ratings['rating']),
}

df = pd.DataFrame(ratings_map)
df.shape

In [None]:
# Build the Surprise dataset and a 5-fold splitter
from surprise.model_selection import KFold
rr = Reader(rating_scale=(0.5, 5.0))
rating_map_keys = ['userID', 'itemID', 'rating']
# Select the columns in (user, item, rating) order, which is what load_from_df expects.
# Named `ratings_subset` rather than `filter` so the builtin is not shadowed.
ratings_subset = df[rating_map_keys]
data = Dataset.load_from_df(ratings_subset, rr)

# KFold.split() is a generator: calling it without iterating does nothing, so the
# folds are only produced where the splitter is actually looped over.
kf = KFold(n_splits=5)

In [None]:
# class to predict ratings using a standard stochastic gradient descent algo
class SGDMatixSelf(AlgoBase):
    '''A basic rating prediction algorithm based on matrix factorization.

    Each rating is approximated by the dot product of a user factor vector and
    an item factor vector; both are learned with plain (unregularised) SGD.

    Parameters
    ----------
    l_r : float
        Learning rate of the SGD updates.
    n_e : int
        Number of passes (epochs) over the training ratings.
    n_f : int
        Number of latent factors per user / item.
    '''

    def __init__(self, l_r, n_e, n_f):
        # Let the base class initialise its own state before adding ours.
        AlgoBase.__init__(self)
        self.n_f = n_f
        self.n_e = n_e
        self.lr = l_r

    def fit(self, trainset):
        '''Learn the factor matrices from `trainset` and return `self`.'''
        # The base fit() records the trainset on the instance (read by estimate()).
        AlgoBase.fit(self, trainset)

        p = np.random.normal(0, .1, (trainset.n_users, self.n_f))
        q = np.random.normal(0, .1, (trainset.n_items, self.n_f))

        for _ in range(self.n_e):
            for i, j, k in trainset.all_ratings():
                e = k - np.dot(p[i], q[j])
                # Both updates must use the factors from *before* this step, so
                # keep a copy of the user vector for the item update.
                p_prev = p[i].copy()
                p[i] = p[i] + q[j] * e * self.lr
                q[j] = q[j] + p_prev * e * self.lr

        self.p = p
        self.q = q
        # Returning self allows the usual `algo.fit(trainset).test(testset)` chaining.
        return self

    def estimate(self, i, j):
        '''Predict the rating for inner user id `i` and inner item id `j`.

        Falls back to the training set's global mean when the user or the item
        was not seen during training.
        '''
        if self.trainset.knows_user(i) and self.trainset.knows_item(j):
            return np.dot(self.p[i], self.q[j])
        else:
            return self.trainset.global_mean

In [None]:
import numpy as np

# Select the columns in (user, item, rating) order, which is what load_from_df expects.
rating_map_keys = ['userID', 'itemID', 'rating']
ratings_subset = df[rating_map_keys]
# Build the reader here: the name `reader` was never defined earlier in the notebook,
# so this cell previously failed with a NameError.
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_subset, reader)

# NOTE: `eval` shadows the builtin; the name is kept because the next cell displays it.
eval = []
for candidate in [SGDMatixSelf(.01, 10, 10), SVD(), NMF(), KNNBasic()]:
    cva = cross_validate(candidate, data, measures=['RMSE'], cv=4, verbose=False)
    # Average every reported metric across the 4 folds.
    fold_means = pd.DataFrame.from_dict(cva).mean(axis=0)
    # Series.append returned a new object (the result used to be discarded) and no
    # longer exists in pandas 2.x; pd.concat actually attaches the algorithm label.
    algo_label = pd.Series([type(candidate).__name__], index=['Algorithm'])
    eval.append(pd.concat([fold_means, algo_label]))

                     
                                        

In [None]:
# Display the per-algorithm cross-validation summaries collected in the previous cell
eval

In [None]:
# Hold out 20% of the ratings, fit the hand-picked SVD configuration on the rest,
# and report RMSE on the held-out part
trainset, testset = train_test_split(data, test_size=0.2)
algo = SVD(
    n_factors=30,
    n_epochs=20,
    lr_all=0.008,
    reg_all=0.08,
)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

In [None]:
# 5-fold cross-validated RMSE for the same SVD configuration
cross_validate(
    algo,
    data,
    measures=['RMSE'],
    cv=5,
    verbose=False,
)

In [None]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Compute precision@k and recall@k for every user.

    Parameters
    ----------
    predictions : iterable of 5-tuples
        Each entry unpacks as (user id, item id, true rating, estimated rating,
        extra); the item id and the last field are ignored.
    k : int
        Number of top-estimated items per user treated as "recommended".
    threshold : float
        A rating (true or estimated) at or above this value counts as relevant.

    Returns
    -------
    (precisions, recalls) : tuple of dict
        Both dicts map user id -> metric value. When a metric is undefined
        (no recommended items for precision, no relevant items for recall) it
        is reported as 0 so that it cannot inflate the average.
    '''
    # Group (estimated, true) rating pairs by user.
    userid_to_estimate_map = defaultdict(list)
    for user_id, _, ratings_true, estimate_value, _ in predictions:
        userid_to_estimate_map[user_id].append((estimate_value, ratings_true))

    precisions = dict()
    recalls = dict()
    for user_id, user_ratings in userid_to_estimate_map.items():
        # Highest estimated rating first; the first k entries are the recommendations.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Relevant items across everything this user rated in `predictions`.
        n_rel = sum((ratings_true >= threshold) for (_, ratings_true) in user_ratings)

        # Items in the top k that are actually recommended (estimate clears the bar).
        n_rec_k = sum((estimate_value >= threshold) for (estimate_value, _) in top_k)

        # Items in the top k that are both recommended and relevant.
        n_rel_and_rec_k = sum(((ratings_true >= threshold) and (estimate_value >= threshold))
                              for (estimate_value, ratings_true) in top_k)

        # An undefined ratio is scored 0, not 1: a user with nothing recommended
        # (or nothing relevant) must not count as a perfect result.
        precisions[user_id] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        recalls[user_id] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [None]:
# 4-fold evaluation of the SVD configuration: RMSE plus the per-user average of
# precision@5 and recall@5, with ratings >= 4 counted as relevant
algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)
kf = KFold(n_splits=4)

i = 1
for trainset, testset in kf.split(data):
    print("Split:", i)
    i += 1

    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    mean_precision = sum(precisions.values()) / len(precisions)
    mean_recall = sum(recalls.values()) / len(recalls)
    print("Precision:", mean_precision)
    print("Recall:", mean_recall)

In [None]:
def getpreds(predictions):
    '''Group predictions per user, ordered from highest to lowest estimate.

    `predictions` is an iterable of 5-field records unpacked as
    (user id, item id, true rating, estimated rating, extra). The result is a
    defaultdict(list) mapping each user id to a list of
    (item id, estimated rating) pairs sorted by estimate, descending.
    '''
    fin = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _, estimate_value, _ = prediction
        fin[user_id].append((item_id, estimate_value))

    # Sort each user's list in place, best estimate first.
    for ranked in fin.values():
        ranked.sort(key=lambda pair: pair[1], reverse=True)

    return fin


In [None]:
# Refit the SVD configuration on every available rating, then score all
# (user, item) pairs in the anti-testset built from that trainset
algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)
trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# user id -> [(item id, estimated rating), ...], best estimate first
all_pred = getpreds(predictions)



#### Now that we have all the predicted ratings, we'll keep only the top-n movies for every user (n is set to 10 below)

In [None]:
# Recommendation list size: keep the 10 best-estimated items per user
n = 10

for user_id in all_pred:
    ranked = sorted(all_pred[user_id], key=lambda pair: pair[1], reverse=True)
    all_pred[user_id] = ranked[:n]


In [None]:
# One column per user in `tmp`; transposing gives one row per user
tmp = pd.DataFrame.from_dict(all_pred)
tmp_transpose = tmp.T


In [None]:
# Collect each user's row of recommendations, in the same order as all_pred
res = [tmp_transpose.loc[user_id] for user_id in all_pred]

In [None]:
# Movie ids of the recommended movies: first element of each of the n entries per user
recomml = [
    [row[position][0] for position in range(0, n)]
    for row in res
]

In [None]:
# Show the recommended movie ids of the first entry in the list
recomml[0]

In [None]:
# Translate every list of recommended movie ids into movie titles.
movies = pd.read_csv('/content/drive/MyDrive/data/movies.csv')

# movieId -> title lookup. Resolving ids one by one keeps the titles in the same
# order as the ids in `recomml`; the previous `isin` filter returned them in
# movies.csv row order instead.
title_by_movie_id = dict(zip(movies['movieId'], movies['title']))

# Only new names are bound here, so the ratings DataFrame `df` used by earlier
# cells is no longer overwritten.
finall = []
for recommended_ids in recomml:
    # Ids missing from movies.csv are skipped.
    titles = [title_by_movie_id[movie_id]
              for movie_id in recommended_ids
              if movie_id in title_by_movie_id]
    finall.append(titles)

# Preview the first list of titles. This used to run in a cell placed before
# `finall` was defined, which fails on a fresh kernel.
finall[0]

In [None]:
# One row per entry of `finall`, one column per recommended title
fin = pd.DataFrame(data=finall)

In [None]:
# Write the recommendation table to Drive as CSV, without the row index
recommendations_path = '/content/drive/MyDrive/data/file2.csv'
fin.to_csv(recommendations_path, index=False)

In [None]:
# Read the saved recommendations file back into a DataFrame
saved_recommendations_path = '/content/drive/MyDrive/data/file2.csv'
r = pd.read_csv(saved_recommendations_path)