# Content Base Filtering

In [3]:
import pandas as pd
from math import sqrt
import numpy as np

In [4]:
ratings = pd.read_csv('dataset/movie/ratings.csv')
movies = pd.read_csv('dataset/movie/movies.csv')

In [22]:
def rekomendasi_film_content(user_id):
    #Melakukan One Hot Encoding untuk dataframe baru
    #Kita copy dulu
    new_movies = movies.copy()
    #Kita looping untuk setiap row di df, lalu looping lagi di genres list dan taroh list tsb ke dalam kolom
    for index, row in movies.iterrows():
        for genre in row['genres']:
            new_movies.at[index, genre] = 1    
    #Yang tidak termasuk genrenya ke dalam list maka kita diisi dengan 0
    new_movies = new_movies.fillna(0)
    
    #Kita membuat movie input dari user
    rating_grouping = ratings.groupby('userId')
    user_id_film = rating_grouping.get_group(user_id).head(5)
    
    #Kemudian melakukan One Hot Encoding untuk input movie user
    user_movie_encoding = new_movies[new_movies['movieId'].isin(user_id_film['movieId'].tolist())]
    #Reset index
    user_movie_encoding = user_movie_encoding.reset_index(drop=True)
    #Drop unused columns
    user_movie_encoding = user_movie_encoding.drop(columns=['movieId', 'title', 'genres'])
    
    #Mencari tahu user profile berdasarkan genrenya
    #Menggunakan dot product untuk mendapatkan bobot dari setiap genre
    user_profile = user_movie_encoding.transpose().dot(user_id_film['rating'].reset_index(drop=True))
    
    #Melakukan Rekomendasi Film
    new_movies.set_index(new_movies['movieId'], inplace=True)
    new_movies = new_movies.drop('movieId', 1).drop('title', 1).drop('genres', 1)
    #Multiply the genres by the weights and then take the weighted average
    rekomendasi_film = ((new_movies*user_profile).sum(axis=1))/(user_profile.sum())
    #Urutkan berdasarkan nilai weughted average tertinggti
    rekomendasi_film = rekomendasi_film.sort_values(ascending=False)
    #Rekomendasi film final untuk User
    rekomendasi_film_final = movies.loc[movies['movieId'].isin(rekomendasi_film.head(10).keys())]
    
    return rekomendasi_film_final

In [23]:
user = int(input('Masukan User Id: '))
rekomendasi_film_content(user)

Masukan User Id: 123


Unnamed: 0,movieId,title,genres
167,198,Strange Days (1995),Action|Crime|Drama|Mystery|Sci-Fi|Thriller
4631,6902,Interstate 60 (2002),Adventure|Comedy|Drama|Fantasy|Mystery|Sci-Fi|...
5556,26701,Patlabor: The Movie (Kidô keisatsu patorebâ: T...,Action|Animation|Crime|Drama|Film-Noir|Mystery...
5808,31921,"Seven-Per-Cent Solution, The (1976)",Adventure|Comedy|Crime|Drama|Mystery|Thriller
7170,71999,Aelita: The Queen of Mars (Aelita) (1924),Action|Adventure|Drama|Fantasy|Romance|Sci-Fi|...
7372,79132,Inception (2010),Action|Crime|Drama|Mystery|Sci-Fi|Thriller|IMAX
7441,81132,Rubber (2010),Action|Adventure|Comedy|Crime|Drama|Film-Noir|...
8597,117646,Dragonheart 2: A New Beginning (2000),Action|Adventure|Comedy|Drama|Fantasy|Thriller
9169,148775,Wizards of Waverly Place: The Movie (2009),Adventure|Children|Comedy|Drama|Fantasy|Sci-Fi
9394,164226,Maximum Ride (2016),Action|Adventure|Comedy|Fantasy|Sci-Fi|Thriller


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

In [34]:
df_tag = pd.read_csv('dataset/movie/tags.csv')

In [39]:
#ngram_range untuk kombinasi kata contoh:ngram_range(1,2) very expensive watch -> very expensive, expensive watch
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(df_tag['tag'])

In [45]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix) 
results = {}
for idx, row in df_tag.iterrows():
   similar_indices = cosine_similarities[idx].argsort()[:-100:-1] 
   similar_items = [(cosine_similarities[idx][i], df_tag['movieId'][i]) for i in similar_indices] 
   results[row['movieId']] = similar_items[1:]

In [79]:
df_movie = pd.read_csv('dataset/movie/movies.csv')

In [80]:
df_tag = pd.merge(df_tag,df_movie,how='left',on='movieId')

In [82]:
df_tag.head()

Unnamed: 0,userId,movieId,tag,timestamp,title,genres
0,2,60756,funny,1445714994,Step Brothers (2008),Comedy
1,2,60756,Highly quotable,1445714996,Step Brothers (2008),Comedy
2,2,60756,will ferrell,1445714992,Step Brothers (2008),Comedy
3,2,89774,Boxing story,1445715207,Warrior (2011),Drama
4,2,89774,MMA,1445715200,Warrior (2011),Drama


In [83]:
def item(movieId):  
    return df_tag.loc[df_tag['movieId'] == movieId]['title'].tolist()[0].split(' - ')[0] 
#     return df_tag.loc[df_tag['movieId'] == movieId]['movieId']
    # Just reads the results out of the dictionary.
def recommend(movieId, num):
    print("Recommending " + str(num) + " products similar to " + item(movieId) + "...")   
    print("-------")    
    recs = results[movieId][:num]   
    for rec in recs: 
       print("Recommended: " + item(rec[1]) + " (score:" +      str(rec[0]) + ")")

In [85]:
recommend(movieId=1,num=5)

Recommending 5 products similar to Toy Story (1995)...
-------
Recommended: Pulp Fiction (1994) (score:1.0)
Recommended: Guardians of the Galaxy 2 (2017) (score:1.0)
Recommended: The Lego Movie (2014) (score:1.0)
Recommended: Toy Story (1995) (score:1.0)
Recommended: Big Hero 6 (2014) (score:0.4340087390404574)
