# Loading the required libraries

In [1]:
import numpy as np
import pandas as pd

from surprise import Dataset
from surprise import Reader, KNNWithMeans
from surprise.model_selection import cross_validate
from surprise import accuracy

# Loading the File

In [2]:
data = pd.read_csv('MovieRatings.csv')

In [3]:
data.head()

Unnamed: 0,UserID,Movie,Rating
0,1,M1,0
1,2,M1,0
2,3,M1,0
3,4,M1,0
4,5,M1,0


In [4]:
data.tail()

Unnamed: 0,UserID,Movie,Rating
23859,1252,M19,3
23860,1253,M19,0
23861,1254,M19,2
23862,1255,M19,3
23863,1256,M19,3


#### Check the unique users and unique jokes that were rated

In [5]:
data.UserID.nunique()

1256

In [6]:
data.Movie.nunique()

19

In [7]:
data = data[data.Rating != 0]

#### Get the summary of the dataset

In [8]:
#Observe the ratings 
data.describe(include = 'all')

Unnamed: 0,UserID,Movie,Rating
count,17038.0,17038,17038.0
unique,,19,
top,,M18,
freq,,947,
mean,660.588508,,2.02964
std,346.963309,,0.8208
min,1.0,,1.0
25%,363.0,,1.0
50%,662.0,,2.0
75%,963.0,,3.0


In [9]:
data.shape

(17038, 3)

In [10]:
data.isnull().sum()

UserID    0
Movie     0
Rating    0
dtype: int64

---
# Model Building

In [11]:
reader = Reader(line_format = 'user item rating', rating_scale=(1, 3))
data_obj = Dataset.load_from_df(data, reader)

In [12]:
data_obj

<surprise.dataset.DatasetAutoFolds at 0x2aa53ec55f8>

#### Simulation Parameters
-  Algorithm Type
-  User-Based vs Item-Based
-  Similarity Metric

In [13]:
sim_parameters = {'name': 'cosine','user_based': True}
algo = KNNWithMeans(k=5,sim_options=sim_parameters)

#### Cross Validation Accuracies

In [16]:
cross_validate(algo, data_obj, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9028  0.9035  0.9107  0.8963  0.9191  0.9065  0.0078  
MAE (testset)     0.7729  0.7661  0.7709  0.7573  0.7812  0.7697  0.0079  
Fit time          4.26    3.79    3.71    3.49    3.51    3.75    0.28    
Test time         2.95    3.31    2.63    3.07    3.08    3.01    0.22    


{'test_rmse': array([0.90278723, 0.90354769, 0.91071703, 0.89634177, 0.91911458]),
 'test_mae': array([0.77286488, 0.76606778, 0.77090482, 0.75727538, 0.78116138]),
 'fit_time': (4.257612943649292,
  3.7888660430908203,
  3.708045482635498,
  3.4926583766937256,
  3.5106098651885986),
 'test_time': (2.9471185207366943,
  3.311145782470703,
  2.6309638023376465,
  3.0697903633117676,
  3.083752155303955)}

#### Training the model on complete data

In [17]:
trainset = data_obj.build_full_trainset()
print(trainset)

<surprise.trainset.Trainset object at 0x000002AA53EC5710>


In [18]:
# Train the algorithm on the trainset
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2aa53ec5dd8>

In [19]:
# Then predict ratings for all pairs (uid, iid) that are NOT in the training set.
testset = trainset.build_anti_testset()
print(testset)

[(7, 'M2', 2.029639629064444), (7, 'M3', 2.029639629064444), (7, 'M4', 2.029639629064444), (7, 'M5', 2.029639629064444), (7, 'M7', 2.029639629064444), (7, 'M8', 2.029639629064444), (7, 'M9', 2.029639629064444), (7, 'M10', 2.029639629064444), (7, 'M12', 2.029639629064444), (7, 'M13', 2.029639629064444), (7, 'M14', 2.029639629064444), (7, 'M15', 2.029639629064444), (7, 'M16', 2.029639629064444), (7, 'M17', 2.029639629064444), (7, 'M18', 2.029639629064444), (10, 'M2', 2.029639629064444), (10, 'M3', 2.029639629064444), (10, 'M4', 2.029639629064444), (10, 'M5', 2.029639629064444), (10, 'M7', 2.029639629064444), (10, 'M8', 2.029639629064444), (10, 'M9', 2.029639629064444), (10, 'M10', 2.029639629064444), (10, 'M12', 2.029639629064444), (10, 'M13', 2.029639629064444), (10, 'M14', 2.029639629064444), (10, 'M15', 2.029639629064444), (10, 'M16', 2.029639629064444), (10, 'M18', 2.029639629064444), (10, 'M19', 2.029639629064444), (11, 'M2', 2.029639629064444), (11, 'M3', 2.029639629064444), (11, '

In [20]:
data.head()

Unnamed: 0,UserID,Movie,Rating
6,7,M1,3
9,10,M1,3
10,11,M1,2
12,13,M1,3
17,18,M1,2


In [21]:
predictions = algo.test(testset)

In [22]:
predictions

[Prediction(uid=7, iid='M2', r_ui=2.029639629064444, est=2.6396825396825396, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M3', r_ui=2.029639629064444, est=2.5388888888888888, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M4', r_ui=2.029639629064444, est=2.7, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M5', r_ui=2.029639629064444, est=3, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M7', r_ui=2.029639629064444, est=2.7333333333333334, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M8', r_ui=2.029639629064444, est=2.7, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M9', r_ui=2.029639629064444, est=2.871269841269841, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M10', r_ui=2.029639629064444, est=2.190952380952381, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='

In [23]:
# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.4519


0.45185142818742613

#### Filtering instances which can be used for predictions

In [24]:
predictions[0:2]

[Prediction(uid=7, iid='M2', r_ui=2.029639629064444, est=2.6396825396825396, details={'actual_k': 5, 'was_impossible': False}),
 Prediction(uid=7, iid='M3', r_ui=2.029639629064444, est=2.5388888888888888, details={'actual_k': 5, 'was_impossible': False})]

#### Function to calculate top 10 predictions for each user

In [25]:
# Fetching top 10 predictions for each user
from collections import defaultdict
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [26]:
top_n = get_top_n(predictions, n=10)
take(5, top_n.items())

[(7,
  [('M5', 3),
   ('M13', 3),
   ('M14', 3),
   ('M9', 2.871269841269841),
   ('M7', 2.7333333333333334),
   ('M4', 2.7),
   ('M8', 2.7),
   ('M2', 2.6396825396825396),
   ('M17', 2.6046031746031746),
   ('M3', 2.5388888888888888)]),
 (10,
  [('M5', 3),
   ('M13', 3),
   ('M14', 3),
   ('M9', 2.8617460317460317),
   ('M4', 2.8),
   ('M12', 2.691818181818182),
   ('M2', 2.6396825396825396),
   ('M19', 2.568888888888889),
   ('M3', 2.555555555555556),
   ('M15', 2.5055555555555555)]),
 (11,
  [('M14', 3),
   ('M7', 2.962051282051282),
   ('M4', 2.8566666666666665),
   ('M19', 2.7866666666666666),
   ('M11', 2.6142857142857143),
   ('M3', 2.55995670995671),
   ('M5', 2.466666666666667),
   ('M12', 2.4266666666666667),
   ('M8', 2.3566666666666665),
   ('M2', 2.3434798534798533)]),
 (13,
  [('M2', 3),
   ('M9', 2.843809523809524),
   ('M7', 2.8312820512820513),
   ('M14', 2.8),
   ('M10', 2.7266666666666666),
   ('M13', 2.6952380952380954),
   ('M5', 2.48),
   ('M11', 2.466666666666667

#### Top Predictions Matrix

In [27]:
# Printing top predictions
for uid, user_ratings in take(10,top_n.items()):
    print(uid, [iid for (iid, _) in user_ratings])

7 ['M5', 'M13', 'M14', 'M9', 'M7', 'M4', 'M8', 'M2', 'M17', 'M3']
10 ['M5', 'M13', 'M14', 'M9', 'M4', 'M12', 'M2', 'M19', 'M3', 'M15']
11 ['M14', 'M7', 'M4', 'M19', 'M11', 'M3', 'M5', 'M12', 'M8', 'M2']
13 ['M2', 'M9', 'M7', 'M14', 'M10', 'M13', 'M5', 'M11', 'M8', 'M3']
18 ['M2', 'M9', 'M15', 'M10', 'M7', 'M13', 'M4', 'M16', 'M5', 'M12']
26 ['M6', 'M14', 'M7', 'M4', 'M3', 'M13', 'M18', 'M9', 'M19', 'M2']
55 ['M5', 'M9', 'M14', 'M17', 'M13', 'M2', 'M3', 'M4', 'M10', 'M7']
66 ['M7', 'M15', 'M12', 'M10', 'M14', 'M8', 'M5', 'M3', 'M4', 'M13']
78 ['M10', 'M15', 'M12', 'M3', 'M4', 'M8', 'M5', 'M11', 'M2', 'M7']
81 ['M4', 'M5', 'M7', 'M8', 'M9', 'M13', 'M14', 'M17', 'M3', 'M18']
