In [1]:
import pandas as pd
import numpy as np
import datetime
import random
import time
import os
import wget
from IPython.display import display, HTML
import zipfile

from surprise import SVD, accuracy
from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV, KFold
from surprise.model_selection import train_test_split

from surprise import (
    BaselineOnly,
    CoClustering,
    Dataset,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    NMF,
    NormalPredictor,
    SlopeOne,
    SVD,
    SVDpp,
)


from collections import defaultdict

from sklearn.metrics import ndcg_score
from scipy import sparse
from tabulate import tabulate

In [3]:
# Movie titles/genres and the external-site id mapping, read from the local
# MovieLens folder and joined on their shared `movieId` column.
movie_df = pd.read_csv('ml-25m/movies.csv')
link_df = pd.read_csv('ml-25m/links.csv')

# Default (inner) join: only movies present in both files are kept.
metadata_df = movie_df.merge(link_df, on='movieId')

In [4]:
# Base ratings table from the local MovieLens folder.
df = pd.read_csv('ml-25m/ratings.csv')

# Extra ratings shared through Google Drive (presumably the profile of the new
# user the recommendations are generated for -- confirm). A "view" share link
# cannot be read by pandas, so it is rewritten into a direct-download URL; the
# file id is the second-to-last path segment of the share link.
new_rating_url = 'https://drive.google.com/file/d/1-GN1jGBOpcEFWF-a1ffpvPI245xbn92W/view?usp=share_link'
new_rating_url='https://drive.google.com/uc?id=' + new_rating_url.split('/')[-2]

new_user_df = pd.read_csv(new_rating_url, index_col=0)
new_ratings = new_user_df[['userId', 'movieId', 'rating', 'timestamp']]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# the appended rows keep their own labels, which collide with labels already
# used by the base table, so label-based lookups would return several rows.
df = pd.concat([df, new_ratings], ignore_index=True)

In [5]:
# When a Reader is handed to Dataset.load_from_df only its `rating_scale` is
# consulted; `line_format` and `sep` describe a text file and are kept here
# only so the object is unchanged for any other use.
# MovieLens ratings go from 0.5 to 5.0 in half-star steps, so the scale is set
# explicitly: the Reader default of (1, 5) would clip estimates at 1.0.
# NOTE(review): assumes the appended extra ratings use the same 0.5-5.0 scale.
reader = Reader(
    line_format="user item rating timestamp",
    sep="\t",
    rating_scale=(0.5, 5.0),
)

data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader=reader)

### User-based train test split utility:

In [2]:
def ubased_tt_split(data, split_ratio, random_state = 42):
    """
    Splits a dataset into train and test sets user by user.

    Each user's ratings are shuffled and the first int(n * split_ratio) of them
    go to the test set; the rest go to the train set. Users with too few
    ratings for that product to reach 1 contribute nothing to the test set.

    Parameters:
    data: object exposing `raw_ratings` (iterable of 4-tuples whose first three
        fields are user id, item id and rating), `construct_trainset` and
        `construct_testset`
    split_ratio (float in [0, 1]): fraction of every user's ratings held out
    random_state (int or None): seed of the private shuffler. The same seed
        always yields the same split, independently of the global `random`
        state. None draws a fresh, non-reproducible seed.

    Returns:
    tuple: (trainset, testset) as built by data.construct_trainset and
        data.construct_testset

    Raises:
    ValueError: if split_ratio is outside [0, 1]
    """
    if not 0 <= split_ratio <= 1:
        raise ValueError(
            "split_ratio must be between 0 and 1, got {!r}".format(split_ratio)
        )

    # Private generator: honours `random_state` and leaves the global RNG alone.
    rng = random.Random(random_state)

    # Group ratings per user; the fourth field is dropped (set to None).
    user_ratings = defaultdict(list)
    for uid, iid, r_ui, _ in data.raw_ratings:
        user_ratings[uid].append((uid, iid, r_ui, None))

    raw_trainset, raw_testset = [], []

    for ratings in user_ratings.values():
        rng.shuffle(ratings)
        usertest_count = int(len(ratings) * split_ratio)

        raw_testset.extend(ratings[:usertest_count])
        raw_trainset.extend(ratings[usertest_count:])

    trainset = data.construct_trainset(raw_trainset)
    testset = data.construct_testset(raw_testset)

    return trainset, testset

### Adding NDCG evaluation to the library:

In [7]:
def get_ndcg(surprise_predictions, k_highest_scores=None, batch_size=256):
    """ 
    Calculates the ndcg (normalized discounted cumulative gain) from surprise predictions, using sklearn.metrics.ndcg_score and scipy.sparse

    Users and items are mapped to contiguous row/column indices first, so ids
    that never occur in the predictions do not create all-zero rows (which
    would be averaged in as an NDCG of 0) and ids need not be integers.
    Scores are computed on blocks of users, so only `batch_size` rows are
    densified at a time.

    Parameters:
    surprise_predictions (List of surprise.prediction_algorithms.predictions.Prediction): list of predictions
    k_highest_scores (positive integer): Only consider the highest k scores in the ranking. If None, use all. 
    batch_size (positive integer): Number of users scored per call to ndcg_score.

    Returns:
    float in [0., 1.]: The averaged NDCG scores over all users that have at least one prediction

    Raises:
    ValueError: if surprise_predictions is empty or batch_size is smaller than 1
    """
    if len(surprise_predictions) == 0:
        raise ValueError("surprise_predictions must contain at least one prediction")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    uids = [p.uid for p in surprise_predictions]
    iids = [p.iid for p in surprise_predictions]
    r_uis = np.asarray([p.r_ui for p in surprise_predictions], dtype=float)
    ests = np.asarray([p.est for p in surprise_predictions], dtype=float)

    # Position of every raw id within the sorted unique ids = dense index.
    _, rows = np.unique(uids, return_inverse=True)
    _, cols = np.unique(iids, return_inverse=True)

    n_users = int(rows.max()) + 1
    # ndcg_score refuses to rank a single document; keep at least two columns.
    n_items = max(int(cols.max()) + 1, 2)
    shape = (n_users, n_items)

    # CSR allows cheap row slicing; like the COO->dense conversion it sums
    # duplicate (user, item) entries.
    sparse_preds = sparse.coo_matrix((ests, (rows, cols)), shape=shape).tocsr()
    sparse_vals = sparse.coo_matrix((r_uis, (rows, cols)), shape=shape).tocsr()

    # ndcg_score returns the mean over its rows, so block means are combined
    # weighted by block size to recover the mean over all users.
    weighted_sum = 0.0
    for start in range(0, n_users, batch_size):
        stop = min(start + batch_size, n_users)
        block_score = ndcg_score(
            y_true=sparse_vals[start:stop].toarray(),
            y_score=sparse_preds[start:stop].toarray(),
            k=k_highest_scores,
        )
        weighted_sum += block_score * (stop - start)

    return weighted_sum / n_users

### Test-train split method:

In [1]:
# Seed the global NumPy and stdlib RNGs so the run is repeatable.
np.random.seed(42)
random.seed(42)

algos = (
    SVD(), # Original SVD implementation
    NMF(), # A CF algorithm based on Non-negative Matrix Factorization.
    KNNBaseline(verbose=False), # Basic CF
    KNNWithMeans(verbose=False), # CF with consideration of user ratings mean
    CoClustering(), # CF based on co-clustering
    NormalPredictor() # Random recs based on training distribution (assumes normal)
)

# One split shared by every algorithm: the rows of the table are then measured
# on the same held-out ratings, and the (loop-invariant) split is built once.
trainset, testset = ubased_tt_split(data, split_ratio=0.2)

table = []
for algo in algos:

    # "Time" covers fitting plus predicting the test set, not the metrics.
    start = time.perf_counter()
    algo.fit(trainset)
    predictions = algo.test(testset)
    cv_time = str(datetime.timedelta(seconds=int(time.perf_counter() - start)))

    mean_rmse = "{:.3f}".format(accuracy.rmse(predictions, verbose=False))
    mean_mae = "{:.3f}".format(accuracy.mae(predictions, verbose=False))
    ndcg10 =  "{:.3f}".format(get_ndcg(predictions, 10))
    new_line = [algo.__class__.__name__, mean_rmse, mean_mae, ndcg10, cv_time]
    table.append(new_line)

header = ["ML25M", "RMSE", "MAE", "NDCG@10", "Time"]
print(tabulate(table, header, tablefmt="pipe"))

### Utility to generate recommendations:

In [8]:
def get_movie_info(movieId):
    """
    Looks up a movie in the module-level `metadata_df`.

    Parameters:
    movieId: value compared against the `movieId` column

    Returns:
    tuple: (title, imdbId) taken from the first matching row

    Raises:
    IndexError: if no row matches movieId
    """
    is_match = metadata_df['movieId'] == movieId
    matched_rows = metadata_df[is_match]

    title = matched_rows['title'].values[0]
    imdb_id = matched_rows['imdbId'].values[0]
    return title, imdb_id

# NOTE(review): not referenced by any cell visible here. It has the same key
# that generate_recommendation reads from a prediction's `details` dict, so it
# looks like a leftover example -- confirm it is unused before removing.
details={'was_impossible': False}

def generate_recommendation(user_id, model, movieIds, thresh=4):
    """
    Collects the movies whose estimated rating for a user reaches a threshold.

    Parameters:
    user_id: user id passed to model.predict as `uid`
    model: fitted object whose predict(uid=..., iid=...) result exposes `est`
        and a `details` dict containing the key 'was_impossible'
    movieIds (iterable): candidate movie ids, each passed as `iid`
    thresh (number): minimum estimated rating for a movie to be kept

    Returns:
    pandas.DataFrame: columns title, movieId, est_rating, imdbId, one row per
        kept movie, in the order the candidates were given
    """
    column_names = ['title', 'movieId', 'est_rating', 'imdbId']
    kept_rows = []

    for movieId in movieIds:
        prediction = model.predict(uid=user_id, iid=movieId)
        score = prediction.est

        # Skip estimates flagged as impossible by the model.
        usable = not prediction.details['was_impossible']
        if usable and score >= thresh:
            title, imdbId = get_movie_info(movieId)
            kept_rows.append([title, movieId, score, imdbId])

    return pd.DataFrame(kept_rows, columns=column_names)

This link may be useful in the future: <https://towardsdatascience.com/a-complete-guide-to-recommender-system-tutorial-with-sklearn-surprise-keras-recommender-5e52e8ceace1>

Also, an implementation of a two-stage recommender system (mainly for item-item collaborative filtering, iiCF): https://www.the-odd-dataguy.com/2022/03/14/surprise/

### Giving the whole profile to the system:

In [10]:
# Re-seed the global RNGs so this cell is repeatable on its own.
np.random.seed(42)
random.seed(42)

# No hold-out here: every available rating is used for training.
trainset = data.build_full_trainset()

algos = (
    SVD(n_factors=200, n_epochs=40, lr_all=0.01, reg_all=0.3),
    NMF(),
    KNNBaseline(
        k=50,
        min_k=5,
        sim_options={'name': 'pearson_baseline', 'user_based': False},
    ),
    CoClustering(), ## no parameter tuning
    NormalPredictor() ## no parameter tuning
)

# Candidate pool: every distinct movie id in the metadata table.
movieIds = metadata_df['movieId'].unique()

for algo in algos:
    algo.fit(trainset)

    # Movies predicted at 4.5 or more for user 0, best first, capped at 1000.
    candidates = generate_recommendation(0, algo, movieIds, 4.5)
    ranked = candidates.sort_values(by=['est_rating'], ascending=False)
    recommedations = ranked.iloc[:1000]

    print(algo.__class__.__name__)
    display(recommedations.head())

SVD


Unnamed: 0,title,movieId,est_rating,imdbId
160,Inside Out (1991),205277,4.954059,104512
153,Civilisation (1969),201821,4.866667,264234
55,All About My Wife (2012),149484,4.855054,2173264
82,Anybody's Son Will Do,159125,4.833827,222730
74,Cien niños esperando un tren (1988),156765,4.77945,144829


NMF


Unnamed: 0,title,movieId,est_rating,imdbId
8907,11 Days 11 Nights: Part 1 - Fantasy Becomes Re...,159558,5.0,91002
7832,Toilet (2010),146311,5.0,1603933
7814,The Adventures of Sherlock Holmes and Dr. Wats...,146028,5.0,83100
2193,"Phantom of Liberty, The (Fantôme de la liberté...",26318,5.0,71487
7810,Elder Sister (1966),146016,5.0,61028


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
KNNBaseline


Unnamed: 0,title,movieId,est_rating,imdbId
3080,Band of Brothers (2001),170705,5.0,185906
2894,Life (2009),159819,5.0,1533395
1802,"3rd Voice, The (1960)",77744,5.0,54380
544,Dodsworth (1936),4999,4.994579,27532
3087,Planet Earth II (2016),171011,4.993903,5491994


CoClustering


Unnamed: 0,title,movieId,est_rating,imdbId
955,The Oogieloves in the Big Balloon Adventure (2...,160195,5.0,1520498
1548,The Baron Against the Demons (2006),181495,5.0,1073098
760,Aschenputtel (2010),149879,5.0,1787655
2169,Prairie Dog (2015),201869,5.0,3159412
763,Rumors of Wars (2014),150612,5.0,2512236


NormalPredictor


Unnamed: 0,title,movieId,est_rating,imdbId
5557,Silent Tongue (1993),137996,5.0,108135
6013,Mamma Ebe (1985),144082,5.0,89542
6033,As the Light Goes Out (2014),144300,5.0,3414954
6031,Nightlight (2015),144266,5.0,2236160
6030,The Con Artists (2014),144264,5.0,4319112
