# Content-based Movie Recommender

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Load the MovieLens data

You can download the file `ml_latest.zip`[here](https://grouplens.org/datasets/movielens/) and then unzip into the `data/` directory.

In [3]:
!ls data/

genome-scores.csv links.csv         ratings.csv
genome-tags.csv   movies.csv


In [4]:
# Read dataframes
df_movies = pd.read_csv('data/movies.csv')
df_links = pd.read_csv('data/links.csv')
df_ratings = pd.read_csv('data/ratings.csv')
df_genome_tags = pd.read_csv('data/genome-tags.csv')
df_genome_scores = pd.read_csv('data/genome-scores.csv')

# Merge scores and tags
df_movie_tags_in_text = pd.merge(df_genome_scores, df_genome_tags, on='tagId')[['movieId', 'tag', 'relevance']]

# Only keep tags with relevance higher than 0.3
df_movie_tags = df_genome_scores[df_genome_scores.relevance > 0.3][['movieId', 'tagId']]

### Which is movie with Id 1?

In [5]:
df_movies[df_movies.movieId == 1]

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


### Let's have a look at a few of the tags Toy Story 

In [6]:
df_movie_tags[df_movie_tags['movieId'] == 1].merge(df_genome_tags, on='tagId').sample(10)

Unnamed: 0,movieId,tagId,tag
50,1,323,drama
114,1,743,original plot
92,1,529,humorous
95,1,548,inspirational
173,1,1092,visually stunning
104,1,623,magic
91,1,528,humor
77,1,453,good story
139,1,895,script
166,1,1053,underdog


### Encode features

In [7]:
df_tags_to_movies = pd.merge(df_movie_tags, df_genome_tags, on='tagId', how='left')[['movieId', 'tagId']]
df_tags_to_movies['tagId'] = df_tags_to_movies.tagId.astype(str)

In [8]:
def _concatenate_tags_of_movie(tags):
    tags_as_str = ' '.join(set(tags))
    return tags_as_str

In [9]:
df_tags_per_movie = df_tags_to_movies.groupby('movieId')['tagId'].agg(_concatenate_tags_of_movie)
df_tags_per_movie.name = 'movie_tags'
df_tags_per_movie = df_tags_per_movie.reset_index()

In [10]:
df_tags_per_movie[df_tags_per_movie['movieId'] == 1]

Unnamed: 0,movieId,movie_tags
0,1,216 623 215 669 760 664 497 490 691 21 1072 29...


In [11]:
df_avg_ratings  = df_ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'size'])
df_avg_ratings.columns = ['rating_mean', 'rating_median', 'num_ratingsdf_tags_per_movie']
df_avg_ratings = df_avg_ratings.reset_index()

In [12]:
df_movies_with_ratings = pd.merge(df_movies, df_avg_ratings, how='left', on='movieId')

In [13]:
df_data = pd.merge(df_movies_with_ratings, df_tags_per_movie, how='left', on='movieId')

In [14]:
df_data_with_tags = df_data[~df_data.movie_tags.isnull()].reset_index(drop=True)

### TF-IDF vectors

In [15]:
tf_idf = TfidfVectorizer()

In [16]:
df_movies_tf_idf_described = tf_idf.fit_transform(df_data_with_tags.movie_tags)

In [17]:
m2m = cosine_similarity(df_movies_tf_idf_described)

In [18]:
df_tfidf_m2m = pd.DataFrame(cosine_similarity(df_movies_tf_idf_described))

In [19]:
index_to_movie_id = df_data_with_tags['movieId']

In [20]:
df_tfidf_m2m.columns = [str(index_to_movie_id[int(col)]) for col in df_tfidf_m2m.columns]

In [21]:
df_tfidf_m2m.index = [index_to_movie_id[idx] for idx in df_tfidf_m2m.index]

In [22]:
df_tfidf_m2m.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,170957,171011,171763,171765,172547,173145,173209,174053,174055,174585
1,1.0,0.460368,0.163286,0.132615,0.22627,0.259194,0.211049,0.257855,0.079703,0.266038,...,0.406805,0.320944,0.349831,0.409419,0.345129,0.253684,0.122336,0.290244,0.360141,0.230191
2,0.460368,1.0,0.107767,0.106618,0.234752,0.125419,0.153636,0.275896,0.19799,0.262494,...,0.265312,0.182559,0.196636,0.191116,0.237748,0.19826,0.069587,0.218103,0.239306,0.265747
3,0.163286,0.107767,1.0,0.260229,0.445249,0.113001,0.355249,0.192246,0.062738,0.169872,...,0.153955,0.120308,0.162403,0.142376,0.24244,0.149103,0.066651,0.06779,0.107225,0.117665
4,0.132615,0.106618,0.260229,1.0,0.328667,0.072143,0.421189,0.22922,0.062326,0.072101,...,0.109277,0.135234,0.168605,0.224888,0.222388,0.159902,0.071548,0.073836,0.137307,0.091273
5,0.22627,0.234752,0.445249,0.328667,1.0,0.057446,0.476907,0.191526,0.126756,0.151448,...,0.205023,0.120247,0.172031,0.191384,0.310622,0.151059,0.062195,0.126224,0.151979,0.175246


### Most similar movies to Toy Story

In [23]:
df_tfidf_m2m.iloc[0].sort_values(ascending=False)[:10]

1        1.000000
3114     0.737982
4886     0.736047
2355     0.721830
78499    0.708378
76093    0.685637
5218     0.653424
4306     0.642794
6377     0.639971
68954    0.635059
Name: 1, dtype: float64

In [24]:
df_data_with_tags[df_data_with_tags.movieId == 3114]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
2769,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,3.811412,4.0,7352.0,215 669 664 496 138 490 1072 29 1070 1071 406 ...


In [25]:
df_data_with_tags[df_data_with_tags.movieId == 4886]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
4331,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,3.861679,4.0,8708.0,216 215 669 664 663 765 136 497 490 493 690 10...


### Cosine similarity for Terminator 2

The costine similarity between Toy Story 2 and Terminator 2

In [26]:
df_data_with_tags[df_data_with_tags.title.str.contains('Terminator 2')]

Unnamed: 0,movieId,title,genres,rating_mean,rating_median,num_ratingsdf_tags_per_movie,movie_tags
555,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.947648,4.0,16093.0,216 768 215 212 1085 668 452 761 663 767 132 8...


In [27]:
df_tfidf_m2m.iloc[1][555]

0.28311016453467081

### Build user profile for user #1

In [28]:
df_user_ratings = df_ratings[df_ratings.userId == 1]

In [29]:
df_user_data_with_tags = df_data_with_tags.reset_index().merge(df_user_ratings, on='movieId')

In [30]:
df_user_data_with_tags[['title', 'rating']]

Unnamed: 0,title,rating
0,Braveheart (1995),1.0
1,"Basketball Diaries, The (1995)",4.5
2,"Godfather, The (1972)",5.0
3,"Godfather: Part II, The (1974)",5.0
4,Dead Poets Society (1989),5.0
5,"Breakfast Club, The (1985)",4.0
6,"Sixth Sense, The (1999)",4.5
7,Ferris Bueller's Day Off (1986),5.0
8,Fight Club (1999),4.0
9,Memento (2000),4.0


In [31]:
df_user_data_with_tags['weight'] = df_user_data_with_tags['rating']/5.

In [32]:
user_profile = np.dot(df_movies_tf_idf_described[df_user_data_with_tags['index'].values].toarray().T, df_user_data_with_tags['weight'].values)

In [33]:
C = cosine_similarity(atleast_2d(user_profile), df_movies_tf_idf_described)

In [34]:
R = argsort(C)[:, ::-1]

In [35]:
recommendations = [i for i in R[0] if i not in df_user_data_with_tags['index'].values]

In [36]:
df_data_with_tags['title'][recommendations].head(10)

302            Shawshank Redemption, The (1994)
10955                              Logan (2017)
1119                         Stand by Me (1986)
2530                     American Beauty (1999)
10810                           11.22.63 (2016)
5189        City of God (Cidade de Deus) (2002)
49                   Usual Suspects, The (1995)
1075                          Goodfellas (1990)
1057     One Flew Over the Cuckoo's Nest (1975)
10414         Untitled Spider-Man Reboot (2017)
Name: title, dtype: object