In [7]:
import pandas as pd
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split
from collections import defaultdict

In [None]:
# Load ratings
ratings = pd.read_csv("../data/input/ml-32m/ratings.csv")

# Sample 50% of the dataset to reduce computational time
sampled_ratings = ratings.sample(frac=0.5, random_state=42)

# Define a reader with the appropriate rating scale
reader = Reader(rating_scale=(0.5, 5.0))

# Convert the DataFrame into a Surprise Dataset
data = Dataset.load_from_df(sampled_ratings[['userId', 'movieId', 'rating']], reader)

data.df

Unnamed: 0,userId,movieId,rating
10685861,66954,781,5.0
1552723,9877,574,4.0
6145184,38348,1088,2.0
16268584,101952,2706,1.0
22418634,140400,275079,3.5
...,...,...,...
3651342,22951,1721,3.5
19347597,121210,69122,1.5
30197936,189346,36,5.0
18122486,113309,54785,2.5


In [10]:
# Train/Test split
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train a SVD Model
algo = SVD()
algo.fit(trainset)

# Predict on testset
predictions = algo.test(testset)

# Calculate RMSE
rmse = accuracy.rmse(predictions)
print(f"RMSE: {rmse:.4f}")

RMSE: 0.8974
RMSE: 0.8974


In [11]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

top_n = get_top_n(predictions, n=3)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

57584 [5989, 160980]
166918 [144]
3906 [1961, 3100, 3499]
106866 [1302, 6, 225]
107541 [1180, 1958, 1960]
26958 [55247, 1222, 218537]
75364 [296, 1913, 158238]
51408 [950, 6377, 6183]
130098 [364, 7438, 266]
167675 [2028, 3949]
89954 [1221, 48780, 1208]
192954 [1580, 6977, 2628]
154766 [318, 47, 608]
32215 [8644, 2701]
135645 [2502, 81834, 66097]
8922 [1431]
54315 [911, 4754, 319]
135502 [858, 373, 1291]
50491 [27773, 8014, 1200]
125398 [79132, 26471, 88235]
79910 [99114]
69683 [3448, 1376, 1101]
160292 [3000, 58559, 2959]
15299 [37729, 5010, 924]
70209 [307, 951, 25940]
131601 [54001, 27773, 168252]
44753 [4226, 3005, 3826]
105354 [8529, 1580, 1270]
98933 [58559, 88163]
14199 [1282, 1270, 3868]
194109 [589, 4011, 1268]
109218 [1300, 2761, 232]
89898 [969, 4537, 608]
68751 [527, 1704, 3160]
45576 [4896, 6567, 4025]
36239 [2542, 62374, 59315]
66050 [81591]
134755 [95, 427]
127151 [1278, 122882, 529]
193295 [2396, 1089, 3224]
153053 [589, 4359, 1500]
147827 [109374, 2395, 140816]
131610 