In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, accuracy
from surprise import SVD, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline 
from surprise.model_selection import train_test_split, cross_validate
from surprise.model_selection import GridSearchCV

In [2]:
# Load the MovieLens dataset - https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
movies = pd.read_csv('./data/movies.csv')
ratings = pd.read_csv('./data/ratings.csv')
# Data encoding using Surprise library
reader = Reader(rating_scale=(0.5, 5.0))  # Define the rating scale using Reader class from Surprise

# Convert the DataFrame to a Surprise dataset.
# Surprise expects the three columns in the order: user id, item id, rating.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# Split dataset using train_test_split from Surprise: 80% training and 20% testing.
# random_state is fixed so the same split is produced on every run; without it
# each re-execution of the notebook shuffles differently and the metrics drift.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print(type(trainset))
print(type(testset))

<class 'surprise.trainset.Trainset'>
<class 'list'>


In [3]:
from collections import defaultdict
from surprise import Dataset, SVD
from surprise.model_selection import KFold

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    Parameters:
    -----------
    predictions : iterable
        5-element records unpacked as (uid, iid, true rating, estimated
        rating, details).
    k : int, optional (default=10)
        How many of a user's highest-estimated items count as "the top k".
    threshold : float, optional (default=3.5)
        A true rating at or above this value marks an item as relevant; an
        estimated rating at or above it marks an item as recommended.

    Returns:
    --------
    (precisions, recalls) : tuple of dict
        Two dicts keyed by uid. A metric whose denominator is zero is
        reported as 0.
    """
    # Bucket every (estimate, truth) pair under the user it belongs to.
    pairs_by_user = defaultdict(list)
    for record in predictions:
        uid, _, true_r, est, _ = record
        pairs_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in pairs_by_user.items():
        # Rank the user's items from highest to lowest estimate; keep the top k.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over everything the user rated.
        n_rel = sum((truth >= threshold) for (_, truth) in ranked)

        # Recommended items are counted inside the top k only.
        n_rec_k = sum((guess >= threshold) for (guess, _) in top_k)

        # Items inside the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(
            ((truth >= threshold) and (guess >= threshold))
            for (guess, truth) in top_k
        )

        # Precision@K is undefined when nothing was recommended; report 0.
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@K is undefined when nothing is relevant; report 0.
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return precisions, recalls

In [5]:

# Fit an SVD model on the training split and score it on the held-out split.
# random_state pins SVD's random initialisation so that, for a given
# train/test split, re-running this cell reproduces the same predictions.
algo = SVD(random_state=42)
algo.fit(trainset)
predictions = algo.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

# Precision and recall can then be averaged over all users
print(sum(precisions.values()) / len(precisions))
print(sum(recalls.values()) / len(recalls))

0.6021346469622345
0.24324516025267232


In [12]:
# Show the container type, then the first five predictions as a table.
print(type(predictions))
print(pd.DataFrame(predictions).head(n=5))


<class 'list'>
   uid    iid  r_ui       est                    details
0  317  44555   4.0  4.020991  {'was_impossible': False}
1  513    150   4.5  4.174174  {'was_impossible': False}
2  402    719   3.0  3.501272  {'was_impossible': False}
3  599   1917   3.0  2.112975  {'was_impossible': False}
4  307   2515   1.5  2.643389  {'was_impossible': False}


In [10]:
# Raw repr of the first five prediction records, for comparison with the table below.
print(predictions[:5])
def head_predictions(predictions_list, n=5, include_titles=True, movies_df=None):
    """
    Display the first n predictions in a readable format, similar to df.head()

    Parameters:
    -----------
    predictions_list : list
        List of Surprise prediction objects. Each item must expose the
        attributes ``uid``, ``iid``, ``r_ui`` and ``est``.
    n : int, optional (default=5)
        Number of predictions to show. Clamped to the range
        [0, len(predictions_list)].
    include_titles : bool, optional (default=True)
        Whether to include movie titles in the output
    movies_df : pandas.DataFrame, optional (default=None)
        Table with ``movieId`` and ``title`` columns used to look up titles.
        When omitted, the notebook-level ``movies`` DataFrame is used.

    Returns:
    --------
    None. The table is written to standard output.
    """
    # Clamp n to a valid row count. A negative n would otherwise be reported
    # verbatim in the banner and would slice from the end of the list.
    n = max(0, min(n, len(predictions_list)))

    # Resolve the title table only when titles are requested, so the
    # notebook-level `movies` frame is not required for the title-free layout.
    title_table = None
    if include_titles:
        title_table = movies if movies_df is None else movies_df

    # Print header
    print(f"Showing first {n} predictions:")
    print("-" * 100)

    # The 'Predicted' column is 10 wide: the heading itself is 9 characters,
    # so a width of 8 pushed the 'Error' heading out of line with its values.
    if include_titles:
        print(f"{'User ID':<8} {'Movie ID':<8} {'Movie Title':<40} {'Actual':<8} {'Predicted':<10} {'Error':<8}")
    else:
        print(f"{'User ID':<8} {'Movie ID':<8} {'Actual':<8} {'Predicted':<10} {'Error':<8}")

    print("-" * 100)

    # Print each prediction
    for pred in predictions_list[:n]:
        user_id = pred.uid
        movie_id = pred.iid
        predicted = pred.est

        # A prediction may carry no true rating (r_ui is None); in that case
        # neither the actual value nor the error can be computed.
        if pred.r_ui is None:
            actual_text = f"{'N/A':<8}"
            error_text = f"{'N/A':<8}"
        else:
            actual_text = f"{pred.r_ui:<8.2f}"
            error_text = f"{pred.r_ui - predicted:<8.2f}"

        if include_titles:
            # Get movie title if available
            title_row = title_table[title_table['movieId'] == movie_id]
            title = title_row['title'].values[0] if not title_row.empty else "Unknown"

            # Truncate title if too long
            if len(title) > 38:
                title = title[:35] + "..."

            print(f"{user_id:<8} {movie_id:<8} {title:<40} {actual_text} {predicted:<10.2f} {error_text}")
        else:
            print(f"{user_id:<8} {movie_id:<8} {actual_text} {predicted:<10.2f} {error_text}")

# Display the first 10 predictions, movie titles included
head_predictions(predictions, n=10, include_titles=True)

[Prediction(uid=317, iid=44555, r_ui=4.0, est=4.0209912899719535, details={'was_impossible': False}), Prediction(uid=513, iid=150, r_ui=4.5, est=4.174174044680569, details={'was_impossible': False}), Prediction(uid=402, iid=719, r_ui=3.0, est=3.501272124672278, details={'was_impossible': False}), Prediction(uid=599, iid=1917, r_ui=3.0, est=2.1129750181624902, details={'was_impossible': False}), Prediction(uid=307, iid=2515, r_ui=1.5, est=2.6433891401336385, details={'was_impossible': False})]
Showing first 10 predictions:
----------------------------------------------------------------------------------------------------
User ID  Movie ID Movie Title                              Actual   Predicted Error   
----------------------------------------------------------------------------------------------------
317      44555    Lives of Others, The (Das leben der...   4.00     4.02     -0.02   
513      150      Apollo 13 (1995)                         4.50     4.17     0.33    
402      71