In [18]:
from surprise import SVDpp
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import KFold

import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv('movies.dat', sep='::', engine='python', names=['movieId', 'title', 'genres'])
ratings = pd.read_csv('ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [4]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [5]:
movies_with_ratings[movies_with_ratings.userId == 2.0].title.unique()

array(['Get Shorty (1995)', 'Broken Arrow (1996)', 'Braveheart (1995)',
       'Desperado (1995)', 'Die Hard: With a Vengeance (1995)',
       'Ed Wood (1994)',
       'Like Water for Chocolate (Como agua para chocolate) (1992)',
       'Outbreak (1995)', 'Shawshank Redemption, The (1994)',
       'Clear and Present Danger (1994)', 'Forrest Gump (1994)',
       'Maverick (1994)', 'True Lies (1994)', 'Cliffhanger (1993)',
       'Demolition Man (1993)', 'Fugitive, The (1993)',
       'Getaway, The (1994)', 'Jurassic Park (1993)', 'Mr. Jones (1993)',
       'Remains of the Day, The (1993)',
       'Terminator 2: Judgment Day (1991)', 'Dances with Wolves (1990)',
       'Silence of the Lambs, The (1991)', 'Courage Under Fire (1996)',
       'Mission: Impossible (1996)', 'Twister (1996)',
       'Independence Day (ID4) (1996)', "Breakfast at Tiffany's (1961)",
       'Gone with the Wind (1939)', 'Picnic (1955)',
       'Bonnie and Clyde (1967)', 'Platoon (1986)',
       "Sophie's Choice (1

In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [7]:
dataset.head()

Unnamed: 0,uid,iid,rating
0,1.0,Toy Story (1995),5.0
1,6.0,Toy Story (1995),4.0
2,8.0,Toy Story (1995),4.0
3,9.0,Toy Story (1995),5.0
4,10.0,Toy Story (1995),5.0


In [8]:
ratings.rating.min()

1

In [9]:
ratings.rating.max()

5

In [10]:
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)

In [11]:
algo = SVDpp()

In [14]:
kf = KFold(n_splits=2, random_state=42, shuffle=True)

In [15]:
algo = SVDpp()
i = 0
acc = []
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    test_pred = algo.test(testset)
    print(f'Fold {i}:')
    a = accuracy.rmse(test_pred, verbose=True)
    acc.append(a)

Fold 0:
RMSE: 0.8858
Fold 0:
RMSE: 0.8847


In [20]:
mean_acc = np.mean(acc)
print(mean_acc)

0.8852870255107852
