In [8]:
import pandas as pd
import numpy as np
import datetime
import random
import time
import os
import wget
from IPython.display import display, HTML
import zipfile

from collections import defaultdict

from surprise import SVD, accuracy
from surprise import BaselineOnly, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV, KFold

In [2]:
movie_df = pd.read_csv('ml-latest-small/movies.csv')
link_df = pd.read_csv('ml-latest-small/links.csv')
metadata_df = pd.merge(movie_df, link_df, on='movieId')

In [3]:
df = pd.read_csv('ml-latest-small/ratings.csv')

new_rating_url = 'https://drive.google.com/file/d/1-GN1jGBOpcEFWF-a1ffpvPI245xbn92W/view?usp=share_link'
new_rating_url='https://drive.google.com/uc?id=' + new_rating_url.split('/')[-2]

new_user_df = pd.read_csv(new_rating_url, index_col=0)
new_ratings = new_user_df[['userId', 'movieId', 'rating', 'timestamp']]

df = pd.concat([df, new_ratings])

In [5]:
reader = Reader(line_format="user item rating timestamp", sep="\t")

data = Dataset.load_from_df(df[["userId", "movieId", "rating"]], reader=reader)

trainset = data.build_full_trainset()

### User-based train test split utilitu:

In [48]:
def ubased_tt_split(data, split_ratio, random_state = 42):
    
    user_ratings = defaultdict(list)
    for uid, iid, r_ui, _ in data.raw_ratings:
        user_ratings[uid].append((uid, iid, r_ui, None))

    raw_trainset, raw_testset = [], []
    
    for uid, ratings in user_ratings.items():
        random.shuffle(ratings)
        usertest_count = int(len(ratings) * split_ratio)
        # user_test = random.sample(ratings, usertest_count)
        
        user_test = ratings[0:usertest_count]
        user_train = ratings[usertest_count:]
        
        assert (len(user_test) + len(user_train) == len(ratings))
        
        raw_testset += [rating for (j, rating) in enumerate(user_test)]
        raw_trainset += [rating for (j, rating) in enumerate(user_train)]

    trainset = data.construct_trainset(raw_trainset)
    testset = data.construct_testset(raw_testset)
    
    return trainset, testset



trainset, testset = ubased_tt_split(data, split_ratio=0.2)
algo = SVD(random_state=42)
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions, verbose=False)

0.8675898948452854

In [20]:
from sklearn.metrics import ndcg_score
from scipy import sparse

from surprise.model_selection import cross_validate, KFold
from surprise.model_selection import train_test_split
from tabulate import tabulate


def get_ndcg(surprise_predictions, k_highest_scores=None):
    """ 
    Calculates the ndcg (normalized discounted cumulative gain) from surprise predictions, using sklearn.metrics.ndcg_score and scipy.sparse
  
    Parameters: 
    surprise_predictions (List of surprise.prediction_algorithms.predictions.Prediction): list of predictions
    k_highest_scores (positive integer): Only consider the highest k scores in the ranking. If None, use all. 
  
    Returns:
    float in [0., 1.]: The averaged NDCG scores over all recommendations
    """
    
    uids = [int(p.uid) for p in surprise_predictions]
    iids = [int(p.iid) for p in surprise_predictions]
    r_uis = [p.r_ui for p in surprise_predictions]
    ests = [p.est for p in surprise_predictions]
    
    assert(len(uids) == len(iids) == len(r_uis) == len(ests))    
    
    sparse_preds = sparse.coo_matrix( (ests, (uids , iids )) )
    sparse_vals = sparse.coo_matrix( (r_uis, (uids , iids )) )
    
    dense_preds = sparse_preds.toarray()
    dense_vals = sparse_vals.toarray()
    
    return ndcg_score(y_true= dense_vals , y_score= dense_preds, k=k_highest_scores)



from surprise import (
    BaselineOnly,
    CoClustering,
    Dataset,
    KNNBaseline,
    KNNBasic,
    KNNWithMeans,
    NMF,
    NormalPredictor,
    SlopeOne,
    SVD,
    SVDpp,
)

algos = (
    SVD(random_state=0), # Original SVD implementation
    NMF(random_state=0), # A CF algorithm based on Non-negative Matrix Factorization.
    KNNBaseline(verbose=False), # Basic CF
    KNNWithMeans(verbose=False), # CF with consideration of user ratings mean
    CoClustering(), # CF based on co-clustering
    NormalPredictor() # Random recs based on training distribution (assumes normal)
)

# set RNG
np.random.seed(0)
random.seed(0)

kf = KFold(random_state=0)

table = []
for algo in algos:
    
    # out = cross_validate(algo, data, ["rmse", "mae"], kf)

    trainset, testset = train_test_split(data, test_size=0.2)
      
    start = time.time()
    algo.fit(trainset)
    predictions = algo.test(testset)
    
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    mean_rmse = "{:.3f}".format(accuracy.rmse(predictions, verbose=False))
    mean_mae = "{:.3f}".format(accuracy.mae(predictions, verbose=False))
    ndcg10 =  "{:.3f}".format(get_ndcg(predictions, 10))
    new_line = [algo.__class__.__name__, mean_rmse, mean_mae, ndcg10, cv_time]
    table.append(new_line)

header = ["ML100k", "RMSE", "MAE", "NDCG@10", "Time"]
print(tabulate(table, header, tablefmt="pipe"))

| ML100k          |   RMSE |   MAE |   NDCG@10 | Time    |
|:----------------|-------:|------:|----------:|:--------|
| SVD             |  0.869 | 0.668 |     0.905 | 0:00:00 |
| NMF             |  0.913 | 0.701 |     0.896 | 0:00:01 |
| KNNBaseline     |  0.875 | 0.669 |     0.902 | 0:00:01 |
| KNNWithMeans    |  0.901 | 0.689 |     0.898 | 0:00:00 |
| CoClustering    |  0.936 | 0.725 |     0.897 | 0:00:01 |
| NormalPredictor |  1.427 | 1.14  |     0.844 | 0:00:00 |


In [21]:
def get_movie_info(movieId):
    
    movie_data = metadata_df[metadata_df['movieId'] == movieId]
    return movie_data['title'].values[0], movie_data['imdbId'].values[0]

details={'was_impossible': False}

def generate_recommendation(user_id, model, movieIds, thresh=4):
    
    recs_array = []

    for movieId in movieIds:
        pred_result = model.predict(uid=user_id, iid=movieId)
        est_rating = pred_result.est
        if (pred_result.details['was_impossible'] == False and est_rating >= thresh):
            title, imdbId = get_movie_info(movieId)
            recs_array.append([title, movieId, est_rating, imdbId])
    
    result_df = pd.DataFrame(recs_array, columns = ['title', 'movieId',
                                                    'est_rating', 'imdbId'])
    return result_df

In [22]:
np.random.seed(0)
random.seed(0)

algos = (
    SVD(random_state=0, n_factors=200, n_epochs=40, lr_all=0.01, reg_all=0.3),
    NMF(random_state=0),
    KNNBaseline(k=50, min_k=5, sim_options =  {'name': 'pearson_baseline', 'user_based': False}),
    CoClustering(), ## no parameter tuning
    NormalPredictor() ## no parameter tuning
)

movieIds = metadata_df['movieId'].unique()


for algo in algos:
  algo.fit(trainset)
  recommedations = generate_recommendation(0, algo, movieIds, 4.5).sort_values(
      by=['est_rating'], ascending=False)[:20]
  print(algo.__class__.__name__)
  display(recommedations)

SVD


Unnamed: 0,title,movieId,est_rating,imdbId
396,Come and See (Idi i smotri) (1985),6818,4.997923,91251
578,The Artist (2011),89904,4.980309,1655442
400,Last Tango in Paris (Ultimo tango a Parigi) (1...,7008,4.964516,70849
273,Guess Who's Coming to Dinner (1967),3451,4.948067,61735
433,"Jetée, La (1962)",8477,4.940433,56119
455,Neon Genesis Evangelion: The End of Evangelion...,27156,4.928994,169858
406,Adam's Rib (1949),7121,4.927966,41090
613,"Day of the Doctor, The (2013)",106642,4.923481,2779318
500,Reign Over Me (2007),51931,4.896794,490204
70,Secrets & Lies (1996),1041,4.887853,117589


NMF


Unnamed: 0,title,movieId,est_rating,imdbId
1424,Dragon Ball Z: The History of Trunks (Doragon ...,96004,5.0,142247
579,Battleship Potemkin (1925),3742,5.0,15648
1412,Dragon Ball Z the Movie: The World's Strongest...,95165,5.0,142240
565,Blow-Out (La grande bouffe) (1973),3655,5.0,70130
567,Benji (1974),3672,5.0,71206
571,For a Few Dollars More (Per qualche dollaro in...,3681,5.0,59578
576,"Conversation, The (1974)",3730,5.0,71360
1409,Eva (2011),94810,5.0,1298554
578,Badlands (1973),3741,5.0,69762
581,Duel in the Sun (1946),3792,5.0,38499


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
KNNBaseline


Unnamed: 0,title,movieId,est_rating,imdbId
29,Dr. Strangelove or: How I Learned to Stop Worr...,750,5.0,57012
456,Incredibles 2 (2018),187541,4.955566,3606756
458,Mission: Impossible - Fallout (2018),189333,4.951867,4912910
225,Harry Potter and the Prisoner of Azkaban (2004),8368,4.936892,304141
325,Harry Potter and the Deathly Hallows: Part 1 (...,81834,4.935231,926084
443,The Accountant (2016),162606,4.914696,2140479
287,"Dark Knight, The (2008)",58559,4.908285,468569
208,Lost in Translation (2003),6711,4.906243,335266
407,Unbroken (2014),118702,4.90593,1809398
444,Hacksaw Ridge (2016),163645,4.904432,2119532


CoClustering


Unnamed: 0,title,movieId,est_rating,imdbId
1631,Oscar (1967),142020,5.0,62083
971,Raise Your Voice (2004),8911,5.0,361696
342,Swept Away (Travolti da un insolito destino ne...,2239,5.0,73817
1639,Human (2015),143511,5.0,3327994
1640,L.A. Slasher (2015),143559,5.0,2735292
338,Knock Off (1998),2196,5.0,120724
962,"Woman Is a Woman, A (femme est une femme, Une)...",8738,5.0,55572
605,Triumph of the Will (Triumph des Willens) (1934),4278,5.0,25913
966,"Story of Women (Affaire de femmes, Une) (1988)",8804,5.0,96336
1643,Formula of Love (1984),145994,5.0,216755


NormalPredictor


Unnamed: 0,title,movieId,est_rating,imdbId
0,Sudden Death (1995),9,5.0,114576
896,Johnny Stecchino (1991),26732,5.0,102164
833,"Farmer's Daughter, The (1947)",8611,5.0,39370
837,Catwoman (2004),8666,5.0,327554
839,"Snake Pit, The (1948)",8718,5.0,40806
842,"Funny Thing Happened on the Way to the Forum, ...",8796,5.0,60438
847,Anacondas: The Hunt for the Blood Orchid (2004),8830,5.0,366174
848,Warriors of Heaven and Earth (Tian di ying xio...,8832,5.0,374330
850,Resident Evil: Apocalypse (2004),8861,5.0,318627
856,"Country Girl, The (1954)",8920,5.0,46874
