In [65]:
import pandas as pd
import numpy as np

In [66]:
data = {
    "movie_id":[1,2,3,4,5],
    "title":["Inception","Intersteller","Avatar","Titanic","The Dark Knight"],
    "genre":["Sci-Fi|Action|Thriller","Drama|Sci-Fi","Action|Sci-Fi|Adventure","Romance|Drama","Action|Drama|Crime"]
}

In [67]:
movies = pd.DataFrame(data)

In [68]:
movies

Unnamed: 0,movie_id,title,genre
0,1,Inception,Sci-Fi|Action|Thriller
1,2,Intersteller,Drama|Sci-Fi
2,3,Avatar,Action|Sci-Fi|Adventure
3,4,Titanic,Romance|Drama
4,5,The Dark Knight,Action|Drama|Crime


In [69]:
ratings_data = {
    "user_id":[1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4],
    "movie_id":[2,3,4,1,1,2,5,4,3,4,2,5,4,1,2,5],
    "rating":[4,5,3,2,2,5,2,4,5,3,1,3,4,3,5,4]
}

In [70]:
ratings = pd.DataFrame(ratings_data)

In [71]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,1,2,4
1,1,3,5
2,1,4,3
3,1,1,2
4,2,1,2
5,2,2,5
6,2,5,2
7,2,4,4
8,3,3,5
9,3,4,3


In [72]:
# Content-Based

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [74]:
vectorizer = CountVectorizer( tokenizer = lambda x:x.split('|') )

In [75]:
genre_matrix = vectorizer.fit_transform(movies['genre'])



In [76]:
genre_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 13 stored elements and shape (5, 7)>

In [77]:
cos_sim = cosine_similarity(genre_matrix)

In [78]:
cos_sim

array([[1.        , 0.40824829, 0.66666667, 0.        , 0.33333333],
       [0.40824829, 1.        , 0.40824829, 0.5       , 0.40824829],
       [0.66666667, 0.40824829, 1.        , 0.        , 0.33333333],
       [0.        , 0.5       , 0.        , 1.        , 0.40824829],
       [0.33333333, 0.40824829, 0.33333333, 0.40824829, 1.        ]])

In [79]:
genre_matrix.toarray()

array([[1, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 1, 0, 1, 0],
       [1, 1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 1, 0, 0],
       [1, 0, 1, 1, 0, 0, 0]])

In [80]:
vectorizer.get_feature_names_out()

array(['action', 'adventure', 'crime', 'drama', 'romance', 'sci-fi',
       'thriller'], dtype=object)

In [81]:
genre = pd.DataFrame(
    genre_matrix.toarray() , 
    columns = vectorizer.get_feature_names_out() , 
    index = movies['title']
)

In [82]:
genre

Unnamed: 0_level_0,action,adventure,crime,drama,romance,sci-fi,thriller
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Inception,1,0,0,0,0,1,1
Intersteller,0,0,0,1,0,1,0
Avatar,1,1,0,0,0,1,0
Titanic,0,0,0,1,1,0,0
The Dark Knight,1,0,1,1,0,0,0


In [83]:
cos = pd.DataFrame(
    cos_sim , 
    columns = movies['title'],
    index = movies['title']
)

In [84]:
cos

title,Inception,Intersteller,Avatar,Titanic,The Dark Knight
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Inception,1.0,0.408248,0.666667,0.0,0.333333
Intersteller,0.408248,1.0,0.408248,0.5,0.408248
Avatar,0.666667,0.408248,1.0,0.0,0.333333
Titanic,0.0,0.5,0.0,1.0,0.408248
The Dark Knight,0.333333,0.408248,0.333333,0.408248,1.0


In [85]:
def recommend_by_genre(movie_title , top_n = 2):
    idx = movies[movies['title']==movie_title].index[0]
    sim_score = list( enumerate( cos_sim[idx] ) )
    sim_score = sorted(sim_score , key=lambda x:x[1] , reverse=True) 
    sim_score = sim_score[1:top_n+1]
    return movies.iloc[ [i[0] for i in sim_score] ][['title','genre']]

In [86]:
recommend_by_genre('Avatar')

Unnamed: 0,title,genre
0,Inception,Sci-Fi|Action|Thriller
1,Intersteller,Drama|Sci-Fi


### Collaborative Filtering

In [87]:
user_rating_matrix = ratings.pivot_table(
    index="user_id",
    columns="movie_id",
    values="rating"
).fillna(0)

In [88]:
user_rating_matrix

movie_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2.0,4.0,5.0,3.0,0.0
2,2.0,5.0,0.0,4.0,2.0
3,0.0,1.0,5.0,3.0,3.0
4,3.0,5.0,0.0,4.0,4.0


In [89]:
from sklearn.decomposition import TruncatedSVD

In [90]:
svd = TruncatedSVD( n_components=2 , random_state=42 )  
# n_components = 2  => It tells model that understand the data with 2 important features only

In [91]:
latent_matrix = svd.fit_transform(user_rating_matrix)

In [92]:
latent_matrix

array([[ 6.39244544,  2.82019248],
       [ 6.52643068, -2.37346607],
       [ 5.0344002 ,  3.76765912],
       [ 7.52342464, -2.85848129]])

In [93]:
reconstructed_matrix = np.dot(
    latent_matrix , 
    svd.components_
)

In [94]:
reconstructed_matrix

array([[ 1.26825515,  3.02612954,  4.79187445,  3.40548119,  1.86856904],
       [ 2.41718443,  4.71293042,  0.07985262,  3.64695572,  2.62039737],
       [ 0.66836121,  1.90526319,  5.19082331,  2.63192027,  1.26176669],
       [ 2.81260076,  5.47072928, -0.02012412,  4.20803867,  3.03730757]])

In [95]:
predicted_ratings = pd.DataFrame(
    reconstructed_matrix ,
    index=user_rating_matrix.index , 
    columns=user_rating_matrix.columns 
)

In [96]:
predicted_ratings

movie_id,1,2,3,4,5
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.268255,3.02613,4.791874,3.405481,1.868569
2,2.417184,4.71293,0.079853,3.646956,2.620397
3,0.668361,1.905263,5.190823,2.63192,1.261767
4,2.812601,5.470729,-0.020124,4.208039,3.037308


### Genre_Similarity + SVD_Ratings

In [97]:
def recommend(user_id , movie_title , top_n=2):
    idx = movies[ movies['title']==movie_title ].index[0]
    sim_score = list( enumerate( cos_sim[idx] ) )
    sim_scores = sorted( sim_score , key = lambda x:x[1] , reverse=True )[1:]
    recommendation = []
    for i , genre_score in sim_scores:
        movie_id = movies.iloc[i]['movie_id']
        # Predicted rating from SVD
        if movie_id in predicted_ratings.columns:
            cf_score = predicted_ratings.loc[user_id,movie_id]
        else:
            cf_score = 0
        final_score = (0.5*genre_score)+(0.5*cf_score)
        recommendation.append((movies.iloc[i]['title'],final_score))
    recommendation.sort(key=lambda x:x[1] , reverse=True)
    return recommendation[:top_n]

In [98]:
recommend(1,'Avatar')

[('Intersteller', np.float64(1.7171889150877635)),
 ('Titanic', np.float64(1.7027405941201503))]