In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict

In [2]:
# Read the ratings CSV into a DataFrame.
ratings = pd.read_csv("../data/input/ml-32m/ratings.csv")

# Work on a random half of the rows to cut training time; the fixed seed
# makes the same rows come back on every run.
sampled_ratings = ratings.sample(frac=0.5, random_state=42)

# Ratings in this dataset are declared to lie between 0.5 and 5.0.
reader = Reader(rating_scale=(0.5, 5.0))

# Build the Surprise dataset from the user / item / rating columns
# (passed in that order).
rating_columns = ['userId', 'movieId', 'rating']
data = Dataset.load_from_df(sampled_ratings[rating_columns], reader)

data.df

Unnamed: 0,userId,movieId,rating
10685861,66954,781,5.0
1552723,9877,574,4.0
6145184,38348,1088,2.0
16268584,101952,2706,1.0
22418634,140400,275079,3.5
...,...,...,...
28225727,176940,60766,4.0
27523878,172675,2605,3.5
5991563,37451,6059,2.0
13419160,83892,106489,3.5


In [3]:
# Hold out 20% of the sampled ratings for evaluation; the split is seeded so
# the same rows land in the test set on every run.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train an SVD model. It is seeded like the sampling and the split above so
# that a fresh "Restart & Run All" reproduces the same fitted model.
algo = SVD(random_state=42)
algo.fit(trainset)

# Predict on the held-out testset
predictions = algo.test(testset)

# accuracy.rmse prints the score itself unless verbose is turned off; silence
# it so the RMSE is reported exactly once by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.8124
RMSE: 0.8124


In [4]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's n best-estimated items.

    Args:
        predictions (iterable): Prediction records, each unpackable into
            exactly five fields: (user id, item id, true rating, estimated
            rating, details). This is the shape of the list returned by the
            ``test`` method of an algorithm.
        n (int): Number of recommendations to keep per user. Defaults to 10.

    Returns:
        defaultdict: Maps each (raw) user id to a list of
        ``(raw item id, estimated rating)`` tuples, ordered from the highest
        estimate to the lowest and truncated to the first ``n`` entries.
        Items with equal estimates keep the order they had in ``predictions``.
    """
    # Bucket every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for prediction in predictions:
        user_id, item_id, _true_rating, estimate, _details = prediction
        per_user[user_id].append((item_id, estimate))

    # Rank each user's bucket by estimate, best first, and keep the top n.
    for user_id in list(per_user):
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

# Keep only the three highest-estimated items for every user.
top_n = get_top_n(predictions, n=3)

# Show each user id followed by the ids of the items recommended to them.
for uid, user_ratings in top_n.items():
    recommended_ids = [item_id for item_id, _estimate in user_ratings]
    print(uid, recommended_ids)

95377 [1270, 1262, 3578]
137819 [81845, 2819, 1358]
120284 [593, 202439, 49272]
240 [318, 527, 2028]
196307 [115569, 64614, 1967]
68382 [4993, 3114, 1291]
168213 [920, 4857, 68358]
82645 [318, 920, 356]
52591 [5618, 608, 1089]
34164 [122914, 5952, 122918]
153161 [1203, 1193, 1213]
88203 [448, 367, 440]
98396 [1212, 2351, 2927]
50875 [122882, 2571, 260]
21680 [3114, 480, 2011]
152540 [296, 4973, 4878]
36683 [4993, 170705, 2959]
86960 [1, 262, 440]
73314 [7361, 1089, 3949]
144130 [8368, 86347, 4896]
8031 [130634, 64716, 54286]
94881 [1247, 922, 1214]
104709 [2067, 593, 1221]
22179 [296, 1193, 74458]
42361 [1292, 2871, 1982]
145645 [50, 47, 39]
136435 [48516, 318, 31364]
112135 [923, 1217, 3503]
77697 [102903, 157699, 92259]
101239 [527, 904, 480]
19925 [2398, 1610, 44555]
192712 [2908, 318, 3556]
83494 [1198, 1230, 1219]
45884 [2918, 2916, 2762]
178180 [2692, 5064, 1416]
152900 [50, 457, 110]
68030 [1682, 4016, 5364]
87798 [318, 54503, 47]
114692 [858, 2028, 4886]
140871 [1, 1198, 899]
1