In [73]:
import numpy as np
import pandas as pd
import json
import sys

sys.path.insert(0, "../")

from models.recsys import recommend, print_recs
from surprise import Trainset, Dataset, Reader, SVD, accuracy
from tqdm import tqdm
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.predictions import Prediction

In [68]:
df_ratings = pd.read_csv(
    '../data/interim/preprocessed/ratings.csv',
    index_col=0
).reset_index()

print(df_ratings.shape)
df_ratings.head()

(99990, 3)


Unnamed: 0,user_id,movie_id,rating
0,195,242,3
1,185,302,3
2,21,377,1
3,243,51,2
4,165,346,1


In [67]:
df_users_similarity = pd.read_csv(
    '../data/interim/users/users_similarity.csv',
    index_col=0
)

print(df_users_similarity.shape)
df_users_similarity.head()

(943, 1)


Unnamed: 0_level_0,similar_ids
user_id,Unnamed: 1_level_1
0,"[(888, 0.9931382418006969), (310, 0.9894742240..."
1,"[(272, 0.9851447001360526), (459, 0.9849620550..."
2,"[(444, 0.9855481657176935), (832, 0.9721014780..."
3,"[(293, 0.972894605130872), (811, 0.95356116078..."
4,"[(416, 0.931826885347274), (37, 0.931367324520..."


In [48]:
ratings_columns = ['user_id', 'movie_id', 'rating']

In [131]:
def apply_users_similarity(df_predictions, df_ratings, df_users_similarity, similarity_threshold):
    for user_id in tqdm(df_predictions['user_id'].unique()):
        df_predictions_svd_user = df_predictions.query(f'user_id == {user_id}')

        users_similarities = pd.eval(df_users_similarity.loc[user_id, 'similar_ids'])
        # take the similar users with cosine similarity >= similarity_threshold
        similar_users_ids = [tuple[0] for tuple in users_similarities if tuple[1] > similarity_threshold]
        similar_users_ratings = df_ratings[df_ratings['user_id'].isin(similar_users_ids)]

        for movie_id in df_predictions_svd_user['movie_id'].unique():
            # get relevant ratings, i.e., the specified movie rating from users in similar_users_ids
            relevant_ratings = similar_users_ratings.query(f'movie_id == {movie_id}')
            relevant_ratings = relevant_ratings['rating'].tolist()
            num_relevant_ratings = np.count_nonzero(relevant_ratings)

            # get the rating_svd
            rating_svd = \
                df_predictions_svd_user.query(f'user_id == {user_id} and movie_id == {movie_id}')['rating_svd'].iloc[0]

            # compute the hybrid rating
            if relevant_ratings:
                rating_hybrid = (rating_svd + sum(relevant_ratings)) / (num_relevant_ratings + 1)
            else:
                rating_hybrid = rating_svd
            
            df_predictions.loc[
                (df_predictions['user_id'] == user_id) & (df_predictions['movie_id'] == movie_id),
                'rating_hybrid'
            ] = rating_hybrid

    return df_predictions


def predictions_to_df(predictions):
    data = []
    for pred in tqdm(predictions):
        data.append({
            'user_id': pred.uid,
            'movie_id': pred.iid,
            'rating_svd': pred.est,
            'rating_hybrid': -1,
        })
    df = pd.DataFrame(data)
    return df

In [135]:
def evaluate_recs(svd, testset):
    # Make predictions on the test set
    predictions_svd = svd.test(testset)
    
    # apply users_similarity
    predictions_sim = []
    df_predictions_svd = predictions_to_df(predictions_svd)
    df_predictions_sim = apply_users_similarity(
        df_predictions=df_predictions_svd,
        df_ratings=df_ratings,
        df_users_similarity=df_users_similarity,
        similarity_threshold=0.9
    )

    for idx, pred in tqdm(df_predictions_sim.iterrows()):
        pred_sim = Prediction(
            uid     = pred.user_id,
            iid     = pred.movie_id,
            r_ui    = pred.rating_svd,
            est     = pred.rating_hybrid,
            details = None
        )
        predictions_sim.append(pred_sim)

    # evaluate the predictions
    accuracy.rmse(predictions_svd)
    accuracy.rmse(predictions_sim)

    return df_predictions_svd, df_predictions_sim

In [133]:
df_predictions_svd, df_predictions_sim = evaluate_recs(svd, testset)

RMSE: 0.9546


100%|██████████| 20000/20000 [00:00<00:00, 1110603.19it/s]
  df_predictions.loc[
100%|██████████| 459/459 [02:25<00:00,  3.15it/s]
20000it [00:01, 12324.65it/s]

RMSE: 0.9546
RMSE: 0.4676





In [137]:
for i in range(1, 6):
    print(f"Split {i}")
    # read train and test tables
    df_train = pd.read_csv(
        f'data/u{i}.base',
        sep='\t',
        encoding='latin-1',
        index_col=0,
        names=ratings_columns + ['unix_timestamp']
    ).reset_index()
    df_train['user_id'] = df_train['user_id'] - 1

    df_test = pd.read_csv(
        f'data/u{i}.test',
        sep='\t',
        encoding='latin-1',
        index_col=0,
        names=ratings_columns + ['unix_timestamp']
    ).reset_index()
    df_test['user_id'] = df_test['user_id'] - 1

    # load train data
    reader = Reader(rating_scale=(1, 5))
    train_data = Dataset.load_from_df(df_train[ratings_columns], reader)
    trainset = train_data.build_full_trainset()

    # train the SVD model
    svd = SVD()
    svd.fit(trainset)

    # load test data
    test_data = Dataset.load_from_df(df_test[ratings_columns], reader)
    testset = test_data.build_full_trainset().build_testset()

    # Make predictions on the test set
    predictions = svd.test(testset)

    # evaluate the model
    evaluate_recs(svd, testset)
    print("\n\n")

Split 1


100%|██████████| 20000/20000 [00:00<00:00, 766082.92it/s]
  df_predictions.loc[
100%|██████████| 459/459 [02:28<00:00,  3.09it/s]
20000it [00:01, 12400.51it/s]


RMSE: 0.9526
RMSE: 0.4657



Split 2


100%|██████████| 20000/20000 [00:00<00:00, 1338258.86it/s]
  df_predictions.loc[
100%|██████████| 653/653 [02:56<00:00,  3.70it/s]
20000it [00:01, 12273.11it/s]


RMSE: 0.9382
RMSE: 0.4759



Split 3


100%|██████████| 20000/20000 [00:00<00:00, 722545.44it/s]
  df_predictions.loc[
100%|██████████| 869/869 [03:26<00:00,  4.21it/s]
20000it [00:01, 11557.26it/s]


RMSE: 0.9341
RMSE: 0.4705



Split 4


100%|██████████| 20000/20000 [00:00<00:00, 780313.85it/s]
  df_predictions.loc[
100%|██████████| 923/923 [03:31<00:00,  4.36it/s]
20000it [00:01, 13292.19it/s]


RMSE: 0.9318
RMSE: 0.4725



Split 5


100%|██████████| 20000/20000 [00:00<00:00, 1264582.50it/s]
  df_predictions.loc[
100%|██████████| 927/927 [03:32<00:00,  4.36it/s]
20000it [00:01, 12291.00it/s]

RMSE: 0.9345
RMSE: 0.4624








The results are quite impressive. Cross-validated RMSE (5 folds):

- pure SVD: `0.9382`
- SVD + similar users ratings: `0.4694`