# Imports and Data Import

In [18]:
import numpy as np
import pandas as pd
import json
import sys

sys.path.insert(0, "../")

from models.recsys import recommend_svd, recommend_hybrid, print_recs
from surprise import Dataset, Reader, SVD, dump
from surprise.model_selection import cross_validate

In [5]:
df_ratings = pd.read_csv(
    '../data/interim/preprocessed/ratings.csv',
    index_col=0
).reset_index()

print(df_ratings.shape)
df_ratings.head()

(99990, 3)


Unnamed: 0,user_id,movie_id,rating
0,195,242,3
1,185,302,3
2,21,377,1
3,243,51,2
4,165,346,1


In [6]:
df_items = pd.read_csv(
    '../data/interim/preprocessed/items.csv',
    index_col=0
)

print(df_items.shape)
df_items.head()

(1680, 20)


Unnamed: 0_level_0,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


# SVD

Singular Value Decomposition is 

In [7]:
reader = Reader(rating_scale=(1, 5))

columns = ['user_id', 'movie_id', 'rating']
ratings_data = Dataset.load_from_df(df_ratings[columns], reader)

In [8]:
svd = SVD()

cross_validate(svd, ratings_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9392  0.9304  0.9377  0.9420  0.9390  0.9377  0.0039  
MAE (testset)     0.7398  0.7347  0.7396  0.7433  0.7375  0.7390  0.0028  
Fit time          1.18    1.23    1.24    1.19    1.19    1.21    0.02    
Test time         0.13    0.20    0.13    0.21    0.14    0.16    0.04    


{'test_rmse': array([0.93918304, 0.93043807, 0.93766811, 0.9420394 , 0.93897008]),
 'test_mae': array([0.73981596, 0.73471696, 0.73962767, 0.74333316, 0.73753489]),
 'fit_time': (1.1845219135284424,
  1.2342610359191895,
  1.2355749607086182,
  1.1925992965698242,
  1.1930224895477295),
 'test_time': (0.13382983207702637,
  0.20425915718078613,
  0.12867283821105957,
  0.2088005542755127,
  0.13602995872497559)}

# RecSys

A recsys implementation with two options:

- SVD only
- hybrid: SVD + cosine similarity-based recommendations (considering the users data as well)

In [9]:
df_users_similarity = pd.read_csv(
    '../data/interim/users/users_similarity.csv',
    index_col=0
)

print(df_users_similarity.shape)
df_users_similarity.head()

(943, 1)


Unnamed: 0_level_0,similar_ids
user_id,Unnamed: 1_level_1
0,"[(888, 0.9931382418006969), (310, 0.9894742240..."
1,"[(272, 0.9851447001360526), (459, 0.9849620550..."
2,"[(444, 0.9855481657176935), (832, 0.9721014780..."
3,"[(293, 0.972894605130872), (811, 0.95356116078..."
4,"[(416, 0.931826885347274), (37, 0.931367324520..."


In [15]:
movies_ids = df_items.index.tolist()

test_user_id = 2

Recommendations with pure SVD algorithm:

In [16]:
recs_best, recs_all = recommend_svd(
    user_id=test_user_id,
    df_ratings=df_ratings,
    movies_ids=movies_ids,
    svd=svd,
    num_of_recs=5
)
print_recs(df_items, recs_best)

Recommendations for the user:
1. Raging Bull (1980).
Predicted rating: 4.128992935618387

2. Pulp Fiction (1994).
Predicted rating: 4.082865805052177

3. Boot, Das (1981).
Predicted rating: 4.078920811200042

4. Alien (1979).
Predicted rating: 4.073339122395408

5. Close Shave, A (1995).
Predicted rating: 4.051272345407039



Recommendations with hybrid system (SVD + users similarity):

In [17]:
recs_best, recs_all = recommend_hybrid(
    recs=recs_all,
    user_id=test_user_id,
    df_ratings=df_ratings,
    df_users_similarity=df_users_similarity,
    similarity_threshold=0.9,
    num_of_recs=5
)
print_recs(df_items, recs_best)

Recommendations for the user:
1. Psycho (1960).
Predicted rating: 4.695378554616434

2. Raging Bull (1980).
Predicted rating: 4.688165489269731

3. Philadelphia Story, The (1940).
Predicted rating: 4.601613563869823

4. 12 Angry Men (1957).
Predicted rating: 4.591155586023087

5. Citizen Kane (1941).
Predicted rating: 4.474916755234956



Serialize the SVD model:

In [19]:
dump.dump('../models/svd_dump_file', algo=svd)