# Content-Based Filtering

Import libraries

In [668]:
import pandas as pd
from math import sqrt
# import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel 

Import dataset

In [669]:
# Load the tag assignments (one row per user/movie/tag) from the local dataset folder.
df_tag = pd.read_csv('dataset/movie/tags.csv')

In [670]:
# Load the movie catalogue; later cells read its movieId, title and genres columns.
df_movie = pd.read_csv('dataset/movie/movies.csv')

In [671]:
df_tag.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Aggregate each movie's tags into a single space-separated string

In [672]:
# Collapse the tag rows into one row per movie: every tag given to a movie is
# concatenated into a single space-separated string.
join_tags = ' '.join
df_tag = (
    df_tag
    .groupby(['movieId'])['tag']
    .apply(join_tags)
    .reset_index()
)

In [673]:
df_tag.head()

Unnamed: 0,movieId,tag
0,1,pixar pixar fun
1,2,fantasy magic board game Robin Williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


Merge with the movies dataset to get each movie's title and genres

In [674]:
# Replace the '|' separators in the genres column with spaces so the genres can
# be tokenised as ordinary words alongside the tags.
# regex=False is required: interpreted as a regular expression, '|' is an empty
# alternation that matches at every position, which inserts a space between all
# characters instead of replacing the literal pipe.
df_movie['genres'] = df_movie['genres'].str.replace('|', ' ', regex=False)
# Left join: keep every tagged movie and attach its title and genres.
df_tag = pd.merge(df_tag, df_movie, how='left', on='movieId')


In [675]:
df_tag.head()

Unnamed: 0,movieId,tag,title,genres
0,1,pixar pixar fun,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,fantasy magic board game Robin Williams game,Jumanji (1995),Adventure Children Fantasy
2,3,moldy old,Grumpier Old Men (1995),Comedy Romance
3,5,pregnancy remake,Father of the Bride Part II (1995),Comedy
4,7,remake,Sabrina (1995),Comedy Romance


In [676]:
# df_tag.drop('timestamp',axis=1,inplace=True)

In [677]:
df_tag[df_tag['movieId']==11]

Unnamed: 0,movieId,tag,title,genres
5,11,politics president,"American President, The (1995)",Comedy Drama Romance


In [678]:
# Text that is vectorised later: each movie's tags followed by its genres,
# separated by a single space.
description_parts = zip(df_tag['tag'], df_tag['genres'])
df_tag['description'] = [' '.join(parts) for parts in description_parts]

In [679]:
df_tag[['description']].head()

Unnamed: 0,description
0,pixar pixar fun Adventure Animation Children C...
1,fantasy magic board game Robin Williams game A...
2,moldy old Comedy Romance
3,pregnancy remake Comedy
4,remake Comedy Romance


In [680]:
df_tag.head()

Unnamed: 0,movieId,tag,title,genres,description
0,1,pixar pixar fun,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun Adventure Animation Children C...
1,2,fantasy magic board game Robin Williams game,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game Robin Williams game A...
2,3,moldy old,Grumpier Old Men (1995),Comedy Romance,moldy old Comedy Romance
3,5,pregnancy remake,Father of the Bride Part II (1995),Comedy,pregnancy remake Comedy
4,7,remake,Sabrina (1995),Comedy Romance,remake Comedy Romance


In [681]:
# ngram_range=(1, 2) indexes single words and pairs of adjacent words, e.g.
# "board game night" -> board, game, night, board game, game night.
# min_df=1 (the library default) keeps every term, which is what the previous
# integer 0 was meant to do; integer 0 is outside the range accepted by recent
# scikit-learn releases (integer min_df must be >= 1) and is rejected at fit time.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df_tag['description'])

In [682]:
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# returned by linear_kernel is the cosine similarity between descriptions.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Neighbours stored per movie. The previous slice took 99 candidates and then
# dropped the first one, i.e. it kept 98.
N_NEIGHBOURS = 98

# Work with positions throughout: row i of the similarity matrix corresponds to
# the i-th row of df_tag, regardless of what df_tag's index labels are.
movie_ids = df_tag['movieId'].to_numpy()

# results maps movieId -> list of (similarity, neighbour movieId), best first.
results = {}
for pos, movie_id in enumerate(movie_ids):
    scores = cosine_similarities[pos]
    ranked = scores.argsort()[::-1]
    # Remove the movie itself explicitly instead of assuming it is ranked
    # first: with tied scores (identical descriptions) or an all-zero vector it
    # may sit elsewhere, and blindly dropping element 0 would then discard a
    # real neighbour and keep the movie as its own recommendation.
    ranked = ranked[ranked != pos][:N_NEIGHBOURS]
    results[int(movie_id)] = [(scores[i], movie_ids[i]) for i in ranked]

In [683]:
def item(movieId):
    """Return the title of the movie with the given id, looked up in df_tag.

    Only the part of the title before the first ' - ' separator is returned.
    Raises IndexError when no row of df_tag has this movieId.
    """
    matching_titles = df_tag.loc[df_tag['movieId'] == movieId, 'title']
    full_title = matching_titles.tolist()[0]
    return full_title.split(' - ')[0]
#     return df_tag.loc[df_tag['movieId'] == movieId]['movieId']
    # Just reads the results out of the dictionary.
def recommend(movieId, num):
    """Print the first `num` stored neighbours of `movieId` with their scores.

    Reads the precomputed (score, movieId) pairs from the notebook-level
    `results` dict and resolves titles through `item`; nothing is returned.
    """
    print(f"Recommending {num!s} movies similar to {item(movieId)}...")
    print("-------")
    for score, similar_id in results[movieId][:num]:
        print(f"Recommended: {item(similar_id)} (score:{score!s})")

In [684]:
# df_tag.loc[df_tag['userId'] == 336]['title']
recommend(movieId=1,num=10)
# item(5)

Recommending 10 movies similar to Toy Story (1995)...
-------
Recommended: Bug's Life, A (1998) (score:0.5921343144967754)
Recommended: Toy Story 2 (1999) (score:0.4386220690868849)
Recommended: Aladdin (1992) (score:0.30979520735754373)
Recommended: Shrek (2001) (score:0.30645150829803924)
Recommended: Space Jam (1996) (score:0.2870084011513777)
Recommended: Cat Returns, The (Neko no ongaeshi) (2002) (score:0.28406629015409257)
Recommended: 101 Dalmatians (One Hundred and One Dalmatians) (1961) (score:0.282647780847494)
Recommended: Grand Day Out with Wallace and Gromit, A (1989) (score:0.270297206395349)
Recommended: Sinbad: Legend of the Seven Seas (2003) (score:0.2695883667083091)
Recommended: Kiki's Delivery Service (Majo no takkyûbin) (1989) (score:0.26647513249745913)


In [685]:
from functools import reduce
from operator import concat

In [714]:
# Ask which user to generate recommendations for; the typed value must parse
# as an integer (int() raises ValueError otherwise).
# NOTE(review): interactive input blocks an unattended "Run All" — consider a
# plain constant in a configuration cell instead.
user_id = int(input('Masukan User Id: '))

Masukan User Id: 2


List the movie IDs rated by the selected user

In [715]:
# df_rating is not created by any earlier cell, so on a fresh kernel this cell
# failed with NameError. Load it here when it is missing.
# NOTE(review): the path is assumed from the sibling tags.csv / movies.csv
# files — confirm that ratings.csv lives in the same folder.
if 'df_rating' not in globals():
    df_rating = pd.read_csv('dataset/movie/ratings.csv')

# Ids of every movie that has a rating row for the selected user.
list_movie = df_rating.loc[df_rating['userId'] == user_id, 'movieId'].tolist()

### Top 10 Recommended Movies for User ID 2 Using Content-Based Filtering

In [716]:
keys = list_movie
# Look up the precomputed neighbours of every movie in the user's list; movies
# with no entry in `results` are left out.
recommended_movie = {movie_id: results[movie_id] for movie_id in keys if movie_id in results}

# Flatten the per-movie lists into one list of (score, movieId) pairs. Unlike
# reduce(concat, ...) without an initial value, this does not raise when
# nothing matched.
scored_pairs = [pair for neighbours in recommended_movie.values() for pair in neighbours]

# The column labels must be an ordered list matching the (score, movieId) tuple
# layout: a set has no defined order, so the two labels could end up swapped,
# and recent pandas versions reject a set here outright.
final_recommended_movie = pd.DataFrame(scored_pairs, columns=['prediction', 'movieId'])
final_recommended_movie = (
    pd.merge(final_recommended_movie, df_movie, how='left', on='movieId')
    .sort_values('prediction', ascending=False)
)
# Rows are sorted best-first, so keeping the first row per movieId keeps each
# movie's highest score when several source movies recommend the same title.
final_recommended_movie.drop_duplicates(subset='movieId').head(25)

Unnamed: 0,prediction,movieId,title,genres
1862,0.727192,122922,Doctor Strange (2016),Action Adventure Sci-Fi
588,0.490156,6188,Old School (2003),Comedy
589,0.47979,107348,Anchorman 2: The Legend Continues (2013),Comedy
294,0.470912,7438,Kill Bill: Vol. 2 (2004),Action Drama Thriller
295,0.467944,2420,"Karate Kid, The (1984)",Drama
296,0.460738,8983,House of Flying Daggers (Shi mian mai fu) (2004),Action Drama Romance
297,0.456203,7090,Hero (Ying xiong) (2002),Action Adventure Drama
298,0.456203,2421,"Karate Kid, Part II, The (1986)",Action Adventure Drama
1666,0.426411,8914,Primer (2004),Drama Sci-Fi
1863,0.42364,5349,Spider-Man (2002),Action Adventure Sci-Fi Thriller
