## Collaborative Filtering using matrix factorization (SVD / NMF)

In [34]:
import pandas as pd
from surprise import Reader, Dataset, SVD, NMF
from surprise.model_selection import cross_validate, train_test_split
from surprise.accuracy import rmse

In [2]:
# Load only the three columns the recommender needs from the ratings file.
ratings = pd.read_csv(
    "../data/ratings_small.csv",
    usecols=['userId', 'movieId', 'rating'],
)

In [3]:
# Preview the loaded ratings; pandas truncates the rendered frame to its first and last rows.
ratings

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0
...,...,...,...
99999,671,6268,2.5
100000,671,6269,4.0
100001,671,6365,4.0
100002,671,6385,2.5


In [4]:
# Distribution of rating values: number of rows per distinct rating.
ratings['rating'].value_counts()

4.0    28750
3.0    20064
5.0    15095
3.5    10538
4.5     7723
2.0     7271
2.5     4449
1.0     3326
1.5     1687
0.5     1101
Name: rating, dtype: int64

In [5]:
# Number of distinct movie ids that appear in the ratings.
ratings.movieId.nunique()

9066

In [6]:
# Surprise's Reader must be told the rating bounds; the ratings loaded above
# range from 0.5 to 5.
reader = Reader(rating_scale=(0.5, 5))

In [7]:
# Wrap the ratings frame as a Surprise dataset (columns passed in user, item, rating order).
data = Dataset.load_from_df(
    ratings[['userId', 'movieId', 'rating']],
    reader,
)

In [8]:
# svd = SVD()
# cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [38]:
# Movie metadata: keep only the id, title and genres columns.
movies = pd.read_csv(
    "../data/movies_metadata.csv",
    usecols=['id', 'title', 'genres'],
)

In [39]:
# Keep only rows whose `id` parses as a number instead of dropping hard-coded
# row labels: a label-based drop raises KeyError when this cell is re-run and
# would silently remove unrelated rows once the index has changed.
# NOTE(review): assumes the three labels dropped before (19730, 29503, 35587)
# were exactly the rows with a non-numeric `id` — confirm against the CSV.
movies = (
    movies.loc[pd.to_numeric(movies['id'], errors='coerce').notna()]
    .drop_duplicates(subset="title")  # keeps the first row for each title
    .astype({'id': 'int32'})          # integer ids are needed to join with the links table
)

In [40]:
# Mapping between the ratings' movieId and the metadata's tmdbId.
links = pd.read_csv(
    "../data/links.csv",
    usecols=['movieId', 'tmdbId'],
)

In [41]:
# Preview the movieId -> tmdbId mapping loaded above.
links

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0
...,...,...
45838,176269,439050.0
45839,176271,111109.0
45840,176273,67758.0
45841,176275,227506.0


In [42]:
# Attach the ratings-side movieId to every metadata row by matching the
# metadata `id` against the links' `tmdbId`; rows without a match are dropped.
movies = movies.merge(links, how='inner', left_on='id', right_on='tmdbId')

In [46]:
# Inspect the merged frame: metadata columns plus the movieId / tmdbId link columns.
movies

Unnamed: 0,genres,id,title,movieId,tmdbId
0,"[Animation, Comedy, Family]",862,Toy Story,1,862.0
1,"[Adventure, Fantasy, Family]",8844,Jumanji,2,8844.0
2,"[Romance, Comedy]",15602,Grumpier Old Men,3,15602.0
3,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,4,31357.0
4,[Comedy],11862,Father of the Bride Part II,5,11862.0
...,...,...,...,...,...
42301,[Science Fiction],222848,Caged Heat 3000,176263,222848.0
42302,"[Drama, Family]",439050,Subdue,176269,439050.0
42303,[Drama],111109,Century of Birthing,176271,111109.0
42304,[],227506,Satan Triumphant,176275,227506.0


In [45]:
from ast import literal_eval


def _genre_names(raw_genres):
    """Return the list of genre names held in one `genres` cell."""
    # Parse only while the cell still holds the raw string literal, so this
    # notebook cell can be re-run after the column has been converted
    # (literal_eval rejects anything that is not a string or AST node).
    parsed = literal_eval(raw_genres) if isinstance(raw_genres, str) else raw_genres
    # Entries are dicts with a 'name' key before conversion, plain names after.
    return [g['name'] if isinstance(g, dict) else g for g in parsed]


movies['genres'] = movies['genres'].apply(_genre_names)

## Collaborative Filtering using Matrix Decomposition

In [35]:
# Fixed seed so the split, the learned factors and the RMSE reported below
# are reproducible after a kernel restart.
SEED = 42

# Hold out 10% of the ratings for evaluation and fit NMF on the remainder.
trainset, testset = train_test_split(data, test_size=0.1, random_state=SEED)
algo = NMF(random_state=SEED)
algo.fit(trainset)

# Predicted ratings for the held-out (user, movie) pairs.
predictions = algo.test(testset)

In [36]:
# Root-mean-squared error on the held-out ratings (printed and returned).
rmse(predictions, verbose=True)

RMSE: 0.9315


0.9314613943575001

In [20]:
# Cross-validate a fresh NMF instance rather than `algo`: cross_validate()
# calls fit() on the estimator it receives for every fold (in-process with the
# default n_jobs), which would leave `algo` trained on the last fold and out of
# sync with the `trainset` the recommendation cells below rely on.
cross_validate(NMF(), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9422  0.9438  0.9426  0.9544  0.9534  0.9473  0.0054  
MAE (testset)     0.7236  0.7241  0.7219  0.7322  0.7333  0.7270  0.0047  
Fit time          6.65    6.76    7.41    8.41    7.71    7.39    0.65    
Test time         0.13    0.16    0.19    0.28    0.15    0.18    0.05    


{'test_rmse': array([0.94222372, 0.94382909, 0.9426425 , 0.95440242, 0.95344621]),
 'test_mae': array([0.72364789, 0.72410509, 0.72193241, 0.7322496 , 0.73331114]),
 'fit_time': (6.654446125030518,
  6.760913610458374,
  7.412480592727661,
  8.411000728607178,
  7.712035417556763),
 'test_time': (0.13275480270385742,
  0.15603899955749512,
  0.1860969066619873,
  0.281998872756958,
  0.15395832061767578)}

## Top-n movie recommendations for the user with a given id
 

In [20]:
def make_predictions_for_user(userId, n_recommendations, model, movies):
    """Return the movies with the highest predicted rating for one user.

    Parameters
    ----------
    userId
        Raw user id, passed unchanged to ``model.predict``.
    n_recommendations : int
        Number of rows to keep after sorting by predicted rating.
    model
        Fitted estimator exposing ``predict(uid, iid)`` whose result has an
        ``est`` attribute (the estimated rating).
    movies : pandas.DataFrame
        Candidate movies; must contain ``movieId`` and ``title`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``prediction`` and ``title``, ordered by
        descending prediction, with a fresh 0..k-1 index. One row per
        selected row of ``movies``; an empty ``movies`` gives an empty frame.
    """
    # Score each candidate straight from the column: avoids the per-row Series
    # that iterrows() builds (slow, and it can change the dtype of movieId).
    estimates = [
        model.predict(userId, movie_id).est
        for movie_id in movies['movieId'].tolist()
    ]

    scored = movies[['movieId', 'title']].copy()
    scored['prediction'] = estimates
    # Pin the dtype so an empty candidate list still yields a sortable column.
    scored = scored.astype({'prediction': 'float64'})

    # Rank the scored rows directly instead of merging the top ids back into
    # `movies`: a merge multiplies rows when a movieId occurs more than once,
    # so the result could exceed n_recommendations.
    top = scored.sort_values(
        by='prediction', ascending=False, kind='stable'
    ).head(n_recommendations)
    return top[['movieId', 'prediction', 'title']].reset_index(drop=True)

In [25]:
# Ten highest predicted ratings for user 2 according to the fitted model.
preds = make_predictions_for_user(
    userId=2,
    n_recommendations=10,
    model=algo,
    movies=movies,
)
preds

Unnamed: 0,movieId,prediction,title
0,318,4.431211,The Shawshank Redemption
1,4993,4.373547,The Lord of the Rings: The Fellowship of the Ring
2,2064,4.368343,Roger & Me
3,1228,4.355071,Raging Bull
4,969,4.349982,The African Queen
5,1197,4.332923,The Princess Bride
6,3462,4.332846,Modern Times
7,858,4.328197,The Godfather
8,994,4.316069,Big Night
9,1178,4.303276,Paths of Glory


## Top-n most similar movies for the movie with a given id

In [21]:
from sklearn.metrics import pairwise

In [22]:
def make_predictions_for_item(itemId, n_recommendations, model, movies):
    """Return the movies whose latent factors are most similar to one movie.

    Similarity is the cosine similarity between rows of ``model.qi``.

    Parameters
    ----------
    itemId
        Raw movie id of the query movie; translated with
        ``model.trainset.to_inner_iid`` (which is expected to raise for an id
        the model was not trained on).
    n_recommendations : int
        The result holds at most ``n_recommendations - 1`` rows. This
        historical contract is kept because callers pass 11 to get ten rows.
    model
        Fitted estimator exposing ``qi`` (one factor row per inner item id)
        and ``trainset`` (the trainset it was fitted on).
    movies : pandas.DataFrame
        Must contain ``movieId`` and ``title`` columns.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId``, ``prediction`` (the cosine similarity) and
        ``title``, most similar first. Neighbours whose id is missing from
        ``movies`` are dropped by the inner merge.
    """
    # Map ids with the trainset the model was actually fitted on. A notebook
    # global can be stale (e.g. after the model has been refitted), and a stale
    # mapping pairs ids with the wrong factor rows.
    fitted_trainset = model.trainset
    inner_item_id = fitted_trainset.to_inner_iid(itemId)

    # Only the query movie's row of the similarity matrix is needed, so avoid
    # building the full item x item matrix.
    item_factors = model.qi
    similarities = pairwise.cosine_similarity(
        item_factors[inner_item_id:inner_item_id + 1], item_factors
    )[0]

    # Exclude the query movie explicitly instead of assuming it sorts first
    # (ties can reorder it); nlargest keeps the first occurrence among ties.
    neighbours = (
        pd.Series(similarities)
        .drop(inner_item_id)
        .nlargest(max(n_recommendations - 1, 0))
    )

    ranked = pd.DataFrame({
        'movieId': [fitted_trainset.to_raw_iid(inner_id) for inner_id in neighbours.index],
        'prediction': neighbours.to_numpy(),
    })
    with_titles = pd.merge(ranked, movies, on='movieId')
    return with_titles[['movieId', 'prediction', 'title']]


In [58]:
# Movies closest to movieId 1 (Toy Story) in the model's item-factor space.
# The helper returns n_recommendations - 1 rows, so 11 yields ten neighbours.
toy_story_preditions = make_predictions_for_item(
    itemId=1,
    n_recommendations=11,
    model=algo,
    movies=movies,
)
toy_story_preditions

Unnamed: 0,movieId,prediction,title
0,80350,0.965745,Vampires Suck
1,104076,0.960291,The Smurfs 2
2,69805,0.958921,The Librarian: The Curse of the Judas Chalice
3,100527,0.956361,Safe Haven
4,471,0.955492,The Hudsucker Proxy
5,1193,0.955021,One Flew Over the Cuckoo's Nest
6,85179,0.954349,Summer Wars
7,49299,0.954031,Luna de Avellaneda
8,4886,0.951325,"Monsters, Inc."
9,156609,0.950885,Neighbors 2: Sorority Rising


In [59]:
# Look up the genres of the Toy Story neighbours; the clashing right-hand
# `title` column gets the "x" suffix and is not selected.
toy_story_preditions.merge(
    movies, on='movieId', suffixes=("", "x")
)[['title', 'genres']]

Unnamed: 0,title,genres
0,Vampires Suck,"[Horror, Comedy]"
1,The Smurfs 2,"[Fantasy, Family, Comedy, Animation]"
2,The Librarian: The Curse of the Judas Chalice,"[Fantasy, Action, Adventure, Comedy]"
3,Safe Haven,[Romance]
4,The Hudsucker Proxy,"[Comedy, Drama]"
5,One Flew Over the Cuckoo's Nest,[Drama]
6,Summer Wars,[Animation]
7,Luna de Avellaneda,"[Drama, Romance]"
8,"Monsters, Inc.","[Animation, Comedy, Family]"
9,Neighbors 2: Sorority Rising,[Comedy]


In [56]:
# Same neighbour lookup for movieId 72998 (Avatar, going by the variable name).
# The helper returns n_recommendations - 1 rows, so 11 yields ten neighbours.
avatar_preditions = make_predictions_for_item(
    itemId=72998,
    n_recommendations=11,
    model=algo,
    movies=movies,
)
avatar_preditions

Unnamed: 0,movieId,prediction,title
0,6561,0.966116,The Mouse That Roared
1,527,0.956278,Schindler's List
2,30793,0.954581,Charlie and the Chocolate Factory
3,78088,0.953816,Buried
4,1274,0.953531,Akira
5,5981,0.951654,The Day of the Triffids
6,26947,0.950323,Pusher
7,6297,0.947127,Holes
8,2268,0.947114,A Few Good Men
9,4691,0.946703,Def-Con 4


In [57]:
# Look up the genres of the neighbours found above; the clashing right-hand
# `title` column gets the "x" suffix and is not selected.
avatar_preditions.merge(
    movies, on='movieId', suffixes=("", "x")
)[['title', 'genres']]

Unnamed: 0,title,genres
0,The Mouse That Roared,[Comedy]
1,Schindler's List,"[Drama, History, War]"
2,Charlie and the Chocolate Factory,"[Adventure, Comedy, Family, Fantasy]"
3,Buried,"[Drama, Thriller, Mystery]"
4,Akira,"[Science Fiction, Animation]"
5,The Day of the Triffids,"[Horror, Science Fiction, Thriller]"
6,Pusher,"[Action, Crime, Drama, Thriller]"
7,Holes,"[Adventure, Family, Drama, Comedy]"
8,A Few Good Men,[Drama]
9,Def-Con 4,"[Horror, Science Fiction, Thriller, Action]"
