TF-IDF Content Based Movie Recommender

In [0]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Loading The Data

In [0]:
# Read dataframes
df_movies = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Machine Learning Data/movies.csv')
df_links = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Machine Learning Data/links.csv')
df_ratings = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Machine Learning Data/ratings.csv')
df_genome_tags = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Machine Learning Data/genome-tags.csv')
df_genome_scores = pd.read_csv('/content/drive/My Drive/Colab Notebooks/Machine Learning Data/genome-scores.csv')


In [0]:
#Merge Score and Tags
df_genome_tags_score = pd.merge(df_genome_scores, df_genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]

In [0]:
# Only keep tags with relevance higher than 0.3
df_genome_tags_score = df_genome_scores[df_genome_scores.relevance > 0.3][['movieId', 'tagId']]

Display The first MovieId

In [79]:
df_movies[df_movies.movieId == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


Finding the Tag score for MovieId 1

In [80]:
df_genome_tags_score[df_genome_tags_score['movieId'] == 1].merge(df_genome_tags, on='tagId').sample(10)

Unnamed: 0,movieId,tagId,tag
174,1,1091,visually appealing
56,1,345,effects
171,1,1071,very funny
175,1,1092,visually stunning
147,1,961,spying
67,1,378,fantasy world
64,1,372,fairy tales
169,1,1064,unusual plot structure
101,1,588,kids
26,1,195,chase


Encode Features

In [0]:
df_genome_tags_score = pd.merge(df_genome_tags_score, df_genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
df_genome_tags_score['tagId'] = df_genome_tags_score.tagId.astype(str)

In [0]:

def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

In [0]:

df_tags_per_movie = df_genome_tags_score.groupby('movieId')['tagId'].agg(_concatenate_tags_of_movie)
df_tags_per_movie.name = 'movie_tags'
df_tags_per_movie = df_tags_per_movie.reset_index()

In [9]:
df_tags_per_movie[df_tags_per_movie['movieId'] == 1]



Unnamed: 0,movieId,movie_tags
0,1,86 743 453 1092 121 79 412 255 345 378 387 505...


In [0]:
df_avg_ratings  = df_ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'size'])
df_avg_ratings.columns = ['rating_mean', 'rating_median', 'num_ratingsdf_tags_per_movie']
df_avg_ratings = df_avg_ratings.reset_index()


In [0]:
df_movies_with_ratings = pd.merge(df_movies, df_avg_ratings, how='left', on='movieId')

In [0]:
df_data = pd.merge(df_movies_with_ratings, df_tags_per_movie, how='left', on='movieId')

In [0]:
df_data_with_tags = df_data[~df_data.movie_tags.isnull()].reset_index(drop=True)


TD-IDF vectorize

In [0]:
tf_idf = TfidfVectorizer()

In [0]:
df_movies_tf_idf = tf_idf.fit_transform(df_data_with_tags.movie_tags)

In [0]:
df1 = cosine_similarity(df_movies_tf_idf)


In [0]:
df_tf_idf_df1 = pd.DataFrame(cosine_similarity(df_movies_tf_idf))


In [0]:
index_to_movie_id = df_data_with_tags['movieId']

In [0]:
df_tf_idf_df1.columns = [str(index_to_movie_id[int(col)]) for col in df_tf_idf_df1.columns]

In [0]:
df_tf_idf_df1.index = [index_to_movie_id[idx] for idx in df_tf_idf_df1.index]

In [22]:
df_tf_idf_df1.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,127178,127196,127198,127202,127204,127206,127208,127212,127298,127319,127323,128488,128520,128542,128604,128671,128832,128975,129354,129370,129428,129657,129659,129707,129737,129779,129781,129937,130052,130073,130075,130087,130490,130496,130520,130578,130840,131013,131168,131170
1,1.0,0.408613,0.171842,0.141888,0.208914,0.244326,0.235981,0.257648,0.08478,0.25285,0.240785,0.069502,0.376399,0.176336,0.120842,0.199554,0.203921,0.21732,0.161782,0.136685,0.217783,0.169683,0.117813,0.204864,0.223395,0.214838,0.211017,0.142273,0.241967,0.202178,0.196958,0.213644,0.142287,0.50893,0.125294,0.198838,0.313781,0.188134,0.204508,0.098311,...,0.138441,0.074747,0.294902,0.240046,0.111392,0.306218,0.115647,0.199717,0.084458,0.144851,0.069799,0.076002,0.142035,0.078603,0.0429,0.127978,0.096323,0.165458,0.153872,0.254541,0.169647,0.15069,0.251237,0.064702,0.164854,0.250952,0.070755,0.126832,0.110264,0.392409,0.294736,0.233159,0.117382,0.17356,0.425415,0.058436,0.133622,0.13851,0.16046,0.221307
2,0.408613,1.0,0.114144,0.109254,0.211285,0.114038,0.151752,0.282273,0.18946,0.243929,0.164869,0.070874,0.246031,0.102127,0.271632,0.113939,0.126234,0.127425,0.150632,0.157445,0.11124,0.132741,0.114015,0.208897,0.09326,0.129239,0.135643,0.082844,0.140271,0.052601,0.236645,0.16749,0.157084,0.303697,0.053449,0.092321,0.204227,0.171169,0.135624,0.075395,...,0.075002,0.050665,0.18132,0.133132,0.075526,0.138342,0.026386,0.123044,0.021816,0.082224,0.089903,0.080916,0.040031,0.126028,0.032456,0.065615,0.074784,0.191613,0.115533,0.185693,0.096321,0.161997,0.198092,0.091791,0.082481,0.094909,0.011026,0.13709,0.15899,0.279941,0.257415,0.188582,0.142794,0.213585,0.260667,0.125768,0.12749,0.077935,0.052138,0.203258
3,0.171842,0.114144,1.0,0.253147,0.415846,0.112252,0.379635,0.194865,0.063392,0.179836,0.180066,0.146073,0.101536,0.077274,0.130253,0.101643,0.166067,0.128517,0.277179,0.138172,0.139578,0.104875,0.11324,0.112558,0.112552,0.142137,0.207748,0.085016,0.081846,0.054209,0.092987,0.056812,0.109797,0.123778,0.126144,0.089697,0.127705,0.156593,0.198006,0.103678,...,0.084615,0.078678,0.165938,0.151651,0.083844,0.18231,0.042663,0.066012,0.03729,0.131153,0.107005,0.079483,0.423512,0.044183,0.027684,0.104349,0.099029,0.261053,0.157196,0.119291,0.278657,0.101983,0.128393,0.04574,0.192004,0.060426,0.089093,0.138105,0.061475,0.159149,0.223727,0.103251,0.155037,0.116895,0.19641,0.060445,0.101306,0.210882,0.115281,0.158815
4,0.141888,0.109254,0.253147,1.0,0.260538,0.081417,0.39178,0.2488,0.061966,0.07599,0.308088,0.107578,0.120467,0.081747,0.109211,0.112652,0.327561,0.058364,0.070559,0.128741,0.084622,0.11647,0.078041,0.241271,0.165085,0.121938,0.356482,0.197595,0.069982,0.06052,0.196833,0.06007,0.180931,0.203366,0.297398,0.121407,0.198735,0.254636,0.262012,0.108579,...,0.126978,0.086918,0.131442,0.271447,0.118767,0.277238,0.082661,0.075473,0.023352,0.193039,0.106146,0.062588,0.078918,0.055402,0.065632,0.10235,0.098306,0.081277,0.170688,0.041216,0.322822,0.1714,0.215916,0.061624,0.130871,0.08965,0.09573,0.101574,0.050823,0.252853,0.226544,0.148416,0.118956,0.110497,0.184142,0.066132,0.133652,0.084864,0.180241,0.123381
5,0.208914,0.211285,0.415846,0.260538,1.0,0.047524,0.446837,0.179263,0.105187,0.133696,0.317799,0.149092,0.177083,0.061599,0.127882,0.045832,0.191977,0.115968,0.213692,0.125055,0.087908,0.139062,0.073812,0.170885,0.093066,0.072349,0.29893,0.079827,0.077202,0.043567,0.157876,0.078255,0.17845,0.223761,0.091322,0.081882,0.12066,0.379533,0.201461,0.075766,...,0.146344,0.047529,0.105705,0.161459,0.060715,0.249277,0.04853,0.086508,0.022028,0.151208,0.087835,0.035166,0.296108,0.057686,0.018766,0.086596,0.08485,0.230484,0.10698,0.11062,0.303264,0.085374,0.209068,0.034737,0.155897,0.043531,0.029945,0.127253,0.061481,0.258186,0.333483,0.220159,0.132119,0.116496,0.239599,0.030563,0.100656,0.144673,0.110727,0.190319


Movies Similar to Toy Story

In [23]:
df_tf_idf_df1.iloc[0].sort_values(ascending=False)[:10]


1        1.000000
4886     0.750146
3114     0.735659
78499    0.708184
2355     0.702002
76093    0.670875
5218     0.664707
6377     0.656608
68954    0.654471
50872    0.633375
Name: 1, dtype: float64

In [24]:
df_data_with_tags[df_data_with_tags.movieId == 3114]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
2769,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.841853,4.0,22770.0,743 1092 121 412 255 378 387 505 446 646 519 2...


In [25]:
df_data_with_tags[df_data_with_tags.movieId == 4886]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
4331,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.879444,4.0,23657.0,86 743 453 1092 121 79 412 255 378 387 481 505...


In [26]:
df_data_with_tags[df_data_with_tags.title.str.contains('Terminator 2')]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
555,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.931954,4.0,52244.0,86 54 743 1092 221 121 1022 311 412 255 345 80...


In [27]:
df_tf_idf_df1.iloc[1][555]

0.2731060638459876

Building Profile for user #1

In [0]:
df_user_ratings = df_ratings[df_ratings.userId == 1]

In [0]:
df_user_data_with_tags = df_data_with_tags.reset_index().merge(df_user_ratings, on='movieId')

In [65]:
df_user_data_with_tags[['title', 'rating']]

Unnamed: 0,title,rating
0,Jumanji (1995),3.5
1,"City of Lost Children, The (Cité des enfants p...",3.5
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),3.5
3,Seven (a.k.a. Se7en) (1995),3.5
4,"Usual Suspects, The (1995)",3.5
...,...,...
169,Freaks (1932),5.0
170,Spider-Man 2 (2004),4.5
171,Slaughterhouse-Five (1972),3.5
172,"Incredibles, The (2004)",4.0


In [0]:
df_user_data_with_tags['weight'] = df_user_data_with_tags['rating']/5.

In [0]:
import numpy as np
user_profile = np.dot(df_movies_tf_idf[df_user_data_with_tags['index'].values].toarray().T, df_user_data_with_tags['weight'].values)

In [0]:
C = cosine_similarity(np.atleast_2d(user_profile), df_movies_tf_idf)

In [0]:
R = np.argsort(C)[:, ::-1]

In [0]:
recommendations = [i for i in R[0] if i not in df_user_data_with_tags['index'].values]

In [71]:
df_data_with_tags['title'][recommendations].head(10)

7148                               Ju-on: The Curse (2000)
8345                               Dark Knight, The (2008)
8482     Let the Right One In (Låt den rätte komma in) ...
7898                                  Prestige, The (2006)
1072     Star Wars: Episode VI - Return of the Jedi (1983)
1060                            Princess Bride, The (1987)
1135                                     Highlander (1986)
10144                                   Life Itself (2014)
3153                                      Gladiator (2000)
2270                                    Matrix, The (1999)
Name: title, dtype: object