In [1]:
# Install the required packages
pip install google



In [3]:
# Mount Google Drive at /content/drive so that later cells can read and write
# the CSV files stored under /content/drive/MyDrive/data.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#  Install the required packages for the collab code to run
!pip install numpy
!pip install scikit-surprise
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from surprise import KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import accuracy
from surprise import AlgoBase
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.accuracy import rmse


In [None]:
# Load the ratings from ratings.csv on the mounted Drive and preview the first rows
ratings = pd.read_csv('/content/drive/MyDrive/data/ratings.csv')
ratings.head()

In [None]:
# Map the relevant columns onto the keys used by the later cells.
# The frame loaded from ratings.csv is named `ratings`; `ratings_data` is not
# defined anywhere in this notebook and raised a NameError here.
ratings_map_data = {
    'itemID': list(ratings.movieId),
    'userID': list(ratings.userId),
    'rating': list(ratings.rating),
}

# Construct a data frame for the ratings data and show its (rows, columns)
df = pd.DataFrame(ratings_map_data)
df.shape

In [None]:
# Use 5 fold method
from surprise.model_selection import KFold

# Set the rating scale
rr = Reader(rating_scale=(0.5, 5.0))

# Select the user, item and rating columns, in that order.
# (Named `ratings_subset` rather than `filter` so the builtin is not shadowed.)
rating_map_keys = ['userID', 'itemID', 'rating']
ratings_subset = df[rating_map_keys]

# Build the Surprise dataset from the selected columns
data = Dataset.load_from_df(ratings_subset, rr)

# Create the 5-fold splitter. KFold.split(data) is a generator: calling it
# without iterating over the result does no work, so the folds are only
# produced where a later cell actually loops over kf.split(data).
kf = KFold(n_splits=5)

In [None]:
# Utilizing a standard stochastic gradient descent algorithm for predicting ratings within a class
class SGDMatixAlgorithmSelf(AlgoBase):
    '''An elementary algorithm for predicting ratings grounded on matrix factorization.

    A rating is estimated as the dot product of a user factor vector and an
    item factor vector. Both sets of vectors are learned with plain stochastic
    gradient descent (no regularisation and no bias terms).

    Parameters:
        l_r: learning rate, i.e. the step size of every SGD update.
        n_e: number of epochs (full passes over all training ratings).
        n_f: number of latent factors per user and per item.
    '''

    def __init__(self, l_r, n_e, n_f):
        # Initialise the Surprise base class before adding our own settings.
        AlgoBase.__init__(self)
        self.n_f = n_f
        self.n_e = n_e
        self.l_r = l_r

    # Fit the training dataset
    def fit(self, trainset):
        '''Learn the user and item factors from `trainset` and return self.'''
        # The base implementation stores the trainset on self.trainset.
        AlgoBase.fit(self, trainset)

        # Small random initial factors: one row per user / per item.
        first = np.random.normal(0, .1, (trainset.n_users, self.n_f))
        second = np.random.normal(0, .1, (trainset.n_items, self.n_f))

        for _ in range(self.n_e):
            for i, j, k in trainset.all_ratings():
                # i, j are inner user/item ids and k is the known rating.
                each_val = k - np.dot(first[i], second[j])
                first[i] = first[i] + second[j] * each_val * self.l_r
                # The item update deliberately uses the user vector updated just above.
                second[j] = second[j] + first[i] * each_val * self.l_r

        self.p = first
        self.q = second
        return self

    # Estimate using the train data
    def estimate(self, i, j):
        '''Return the estimated rating of inner user id `i` for inner item id `j`.

        Falls back to the global mean rating of the trainset when either the
        user or the item was not seen during training.
        '''
        if self.trainset.knows_user(i) and self.trainset.knows_item(j):
            # The learned factors live in self.p / self.q (set by fit).
            return np.dot(self.p[i], self.q[j])
        return self.trainset.global_mean

In [None]:
import numpy as np
# Remove timestamp from data by keeping only the user, item and rating columns.
# (Named `ratings_subset` rather than `filter` so the builtin is not shadowed.)
rating_map_keys = ['userID', 'itemID', 'rating']
ratings_subset = df[rating_map_keys]

# Set the rating scale
reader = Reader(rating_scale=(0.5, 5.0))

# Build the Surprise dataset from the selected columns
data = Dataset.load_from_df(ratings_subset, reader)

# One summary Series per algorithm: the mean of every cross-validation metric
# plus the algorithm name. The list keeps the name `eval` (which shadows the
# builtin) because the following cell displays it under that name.
eval = []
for algorithm in [SGDMatixAlgorithmSelf(.01, 10, 10), SVD(), NMF(), KNNBasic()]:
    cva = cross_validate(algorithm, data, measures=['RMSE'], cv=4, verbose=False)
    mean_scores = pd.DataFrame.from_dict(cva).mean(axis=0)
    # Attach the class name as the label. pd.concat returns a new Series, which
    # is the one stored; the label is no longer computed and then thrown away.
    algorithm_label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
    eval.append(pd.concat([mean_scores, algorithm_label]))

In [None]:
# Display the list of per-algorithm cross-validation summaries built in the previous cell
eval

In [None]:
# Hold out 20% of the ratings for testing; the remaining 80% is used for training
trainset, testset = train_test_split(data, test_size=0.2)

# SVD model with explicit hyper-parameters
algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)

# Train on the training split, then predict the held-out ratings
algo.fit(trainset)
predictions = algo.test(testset)

# Report the RMSE of the held-out predictions
accuracy.rmse(predictions)

In [None]:
# Cross-validate `algo` on the full dataset with 5 folds, measuring RMSE only
cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=False)

In [None]:
def calculate_precision_recall_for_k(predictions, k=10, threshold=3.5, undefined_value=1):
    """Compute precision@k and recall@k separately for every user.

    An item counts as *relevant* when its true rating is >= ``threshold`` and
    as *recommended* when it is among the user's ``k`` highest estimates and
    its estimated rating is >= ``threshold``.

    Args:
        predictions: iterable of 5-field records unpacked as
            ``(user_id, item_id, true_rating, estimated_rating, details)``.
        k: number of top-estimated items considered per user.
        threshold: minimum rating for an item to count as relevant/recommended.
        undefined_value: score used when a metric is undefined because its
            denominator is zero (no recommended items for precision, no
            relevant items for recall). The default of 1 keeps the original
            behaviour of this function; pass 0 so that such users are not
            counted as perfect when the scores are averaged.

    Returns:
        ``(precisions, recalls)``: two dicts keyed by user id.
    """
    # Group (estimate, true rating) pairs by user
    user_id_to_estimate_map = defaultdict(list)
    for user_id, _, ratings_true, estimate_value, _ in predictions:
        user_id_to_estimate_map[user_id].append((estimate_value, ratings_true))

    precisions = {}
    recalls = {}

    for user_id, user_ratings in user_id_to_estimate_map.items():
        # Highest estimates first; the first k entries are the candidate recommendations
        user_ratings.sort(key=lambda pair: pair[0], reverse=True)
        top_k = user_ratings[:k]

        # Relevant items among everything the user rated
        n_rel = sum(true_rating >= threshold for _, true_rating in user_ratings)

        # Recommended items within the top k
        n_rec_k = sum(estimate >= threshold for estimate, _ in top_k)

        # Items within the top k that are both relevant and recommended
        n_rel_and_rec_k = sum(
            (true_rating >= threshold) and (estimate >= threshold)
            for estimate, true_rating in top_k
        )

        precisions[user_id] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else undefined_value
        recalls[user_id] = n_rel_and_rec_k / n_rel if n_rel != 0 else undefined_value

    return precisions, recalls

In [None]:
# 4-fold cross-validation: report RMSE, precision@5 and recall@5 for every split
kf = KFold(n_splits=4)

algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)

for split_number, (trainset, testset) in enumerate(kf.split(data), start=1):
    print("Split:", split_number)

    # Train on this split's training part, then predict its held-out part
    algo.fit(trainset)
    predictions = algo.test(testset)

    # RMSE is printed by accuracy.rmse itself because verbose=True
    accuracy.rmse(predictions, verbose=True)

    # Per-user precision/recall at k=5 with relevance threshold 4, averaged over users
    precisions, recalls = calculate_precision_recall_for_k(predictions, k=5, threshold=4)
    print("Precision:", sum(precisions.values()) / len(precisions))
    print("Recall:", sum(recalls.values()) / len(recalls))

In [None]:
# Group the predictions per user
def getpreds(predictions):
    """Group predictions by user and rank each user's items by estimated rating.

    Every prediction is unpacked as a 5-field record; only the user id, the
    item id and the estimated rating are kept.

    Returns a defaultdict(list) mapping user id -> [(item id, estimate), ...],
    ordered from the highest estimate to the lowest.
    """
    ranked_by_user = defaultdict(list)

    for record in predictions:
        user_id, item_id, _true_rating, estimate_value, _details = record
        ranked_by_user[user_id].append((item_id, estimate_value))

    # Replacing the value of an existing key is safe while iterating over the dict
    for user_id in ranked_by_user:
        ranked_by_user[user_id] = sorted(
            ranked_by_user[user_id], key=lambda pair: pair[1], reverse=True
        )

    return ranked_by_user


In [None]:
# Retrain the SVD model, this time on a trainset built from the whole dataset
trainset = data.build_full_trainset()
algo = SVD(n_factors=30, n_epochs=20, lr_all=0.008, reg_all=0.08)
algo.fit(trainset)

# Get the predictions for the anti-testset of the full trainset
# (presumably the user/item pairs that have no rating - see the Surprise docs),
# then group them per user, ranked by estimated rating.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
final_pred = getpreds(predictions)



#### Now that we have all the predicted ratings, we keep only the top-N movies for each user.

In [None]:
# Number of recommendations to keep per user
n = 10

# Keep only each user's n highest-estimated (movie id, estimate) pairs
for user_id in final_pred:
    ranked = sorted(final_pred[user_id], key=lambda pair: pair[1], reverse=True)
    final_pred[user_id] = ranked[:n]


In [None]:
# Build a frame with one column per user, then flip it so every user becomes a row
tmp = pd.DataFrame.from_dict(final_pred)
tmp_transpose = tmp.T


In [None]:
# Collect every user's row of (movie id, estimate) pairs, in final_pred order
response = [tmp_transpose.loc[user_id] for user_id in final_pred]

In [None]:
# For every user row, keep only the movie id (first element) of each of the n pairs
recommendation = [
    [user_row[position][0] for position in range(n)]
    for user_row in response
]

In [None]:
# Inspect the list of recommended movie ids for the first entry (the first user in final_pred)
recommendation[0]

In [None]:
movies = pd.read_csv('/content/drive/MyDrive/data/movies.csv')

# Build a movieId -> title lookup once.
# NOTE(review): assumes one title per movieId; if movies.csv repeats an id,
# only its first row is used - confirm against the data.
title_by_movie_id = movies.drop_duplicates('movieId').set_index('movieId')['title']

# Get final results for the recommendation: one list of titles per user.
# Titles follow the order of the recommended ids (best estimate first) rather
# than the row order of movies.csv; ids missing from movies.csv are skipped.
# A dedicated lookup is used so the ratings frame `df` is not overwritten here.
final_value = []
for movie_ids in recommendation:
    titles = [
        title_by_movie_id.loc[movie_id]
        for movie_id in movie_ids
        if movie_id in title_by_movie_id.index
    ]
    final_value.append(titles)

In [None]:
# Construct a dataframe from the results: one row per entry of final_value,
# i.e. one row of recommended titles per user
final_df = pd.DataFrame(final_value)

In [None]:
# Write the recommended titles to file2.csv on the mounted Drive, without the row index
final_df.to_csv('/content/drive/MyDrive/data/file2.csv',index = False)

In [None]:
# Read the saved recommendations back from file2.csv
result = pd.read_csv('/content/drive/MyDrive/data/file2.csv')