In [99]:
# import dependencies
import pandas as pd
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from zipfile import ZipFile
import json
import os.path
import re
import pickle
import requests
import math

import data_download

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Add some convenience functions to Pandas DataFrame.
pd.options.display.max_rows = 1000
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
  """Returns a filtered dataframe, by applying function to key"""
  return df[function(df[key])]

def flatten_cols(df):
  df.columns = [' '.join(col).strip() for col in df.columns.values]
  return df

pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

In [2]:
# movies, links, ratings, tags = data_download.load_unprocessed_df(use_large=False)
movies, links, ratings, tags, genome_tags, genome_scores = data_download.load_unprocessed_df(use_large=True)

MovieLens 25M Dataset is downloaded!
MovieLens Latest Small Dataset is downloaded!
MovieLens 25M Dataset is already extracted!
MovieLens Latest Small Dataset is already extracted!


In [3]:
%%time
user_ids = ratings.userId.unique().tolist()
user2idx = {userId: idx for (idx, userId) in enumerate(user_ids)}
idx2user = {idx: userId for (idx, userId) in enumerate(user_ids)}
ratings.userId = ratings.userId.map(user2idx)

movie_ids = ratings.movieId.unique().tolist()
movie2idx = {movieId: idx for (idx, movieId) in enumerate(movie_ids)}
idx2movie = {idx: movieId for (idx, movieId) in enumerate(movie_ids)}
ratings.movieId = ratings.movieId.map(movie2idx)

# map rest
tags.movieId = tags.movieId.map(movie2idx)
movies.movieId = movies.movieId.map(movie2idx)
links.movieId = links.movieId.map(movie2idx)

CPU times: user 1.52 s, sys: 174 ms, total: 1.69 s
Wall time: 1.7 s


In [4]:
num_users = int(ratings.userId.nunique())
num_movies = int(ratings.movieId.nunique())
ratings["rating"] = ratings["rating"].values.astype(np.float32)
# min and max ratings will be used to normalize the ratings later
min_rating = min(ratings["rating"])
max_rating = max(ratings["rating"])

print(
    "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format(
        num_users, num_movies, min_rating, max_rating
    )
)

Number of users: 162541, Number of Movies: 59047, Min rating: 0.5, Max rating: 5.0


In [5]:
# drop some columns we do not care about
ratings = ratings.drop("timestamp", axis=1)
tags = tags.drop("timestamp", axis=1)

In [6]:
# drop duplicates for ratings for content 
ratings_content = ratings.drop_duplicates("movieId")

In [7]:
# we need take make sure that we have equal movies and ratings.
movies.isnull().sum()

movieId    3376
title         0
genres        0
dtype: int64

In [8]:
movies[~movies.movieId.isin(ratings_content.movieId)]

Unnamed: 0,movieId,title,genres
8371,,Break of Hearts (1935),Drama|Romance
8763,,Baby Blue Marine (1976),Drama
11556,,"Thousand and One Nights, A (1001 Nights) (1945)",Adventure
11997,,Suicide Killers (2006),Documentary
12173,,Alex in Wonder (Sex and a Girl) (2001),Comedy|Drama
...,...,...,...
62238,,Eternal Blood (2002),Action|Horror|Thriller
62239,,Big Business (1929),Comedy
62240,,The Student of Prague (1926),Horror
62298,,The Coldest Game (2019),(no genres listed)


In [9]:
movies = movies[movies['movieId'].notna()]

In [10]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [11]:
# basic stats
print("{} unique movies in ratings_content".format(len(ratings_content.movieId.unique())))
print("{} unique movies in tags.csv".format(len(tags.movieId.unique())))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

59047 unique movies in ratings_content
41876 unique movies in tags.csv
59047 unique movies in movies.csv


In [12]:
# do we have tags with movies that do NOT have a ratings?
len(tags[~tags["movieId"].isin(ratings_content.movieId)]["movieId"].unique())

1

In [13]:
# movies that are not in ratings, should be removed (using right join)
tags_content = pd.merge(tags, ratings_content, on="movieId", how="right")

In [14]:
tags_content[tags_content.movieId == 0]

Unnamed: 0,userId_x,movieId,tag,userId_y,rating
0,264.000,0.000,assassin,0,5.000
1,264.000,0.000,Black comedy,0,5.000
2,264.000,0.000,cult film,0,5.000
3,264.000,0.000,dark comedy,0,5.000
4,264.000,0.000,Quentin Tarantino,0,5.000
...,...,...,...,...,...
4762,162400.000,0.000,Oscar Nominee: Best Picture,0,5.000
4763,162400.000,0.000,Quentin Tarantino,0,5.000
4764,162400.000,0.000,satire,0,5.000
4765,162400.000,0.000,Steve Buscemi,0,5.000


In [15]:
tags_content.head()

Unnamed: 0,userId_x,movieId,tag,userId_y,rating
0,264.0,0.0,assassin,0,5.0
1,264.0,0.0,Black comedy,0,5.0
2,264.0,0.0,cult film,0,5.0
3,264.0,0.0,dark comedy,0,5.0
4,264.0,0.0,Quentin Tarantino,0,5.0


In [16]:
len(tags[~tags["movieId"].isin(ratings_content.movieId)]["movieId"].unique())

1

In [17]:
# after some clean
print("{} unique movies in ratings_content".format(len(ratings_content.movieId.unique())))
print("{} unique movies in tags_content".format(len(tags_content.movieId.unique())))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

59047 unique movies in ratings_content
59047 unique movies in tags_content
59047 unique movies in movies.csv


In [18]:
# handle genres (remove all non alphabet characters)
movies['genres'] = movies['genres'].str.replace(pat="|", repl=" ")
movies['genres'] = movies['genres'].str.replace(pat="-", repl="")
movies['genres'] = movies['genres'].str.replace(pat="(no genres listed)", repl="")
movies['genres'] = movies['genres'].str.replace(pat="(", repl="")
movies['genres'] = movies['genres'].str.replace(pat=")", repl="")
movies.head()

Unnamed: 0,movieId,title,genres
0,70.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,1103.0,Jumanji (1995),Adventure Children Fantasy
2,1017.0,Grumpier Old Men (1995),Comedy Romance
3,4270.0,Waiting to Exhale (1995),Comedy Drama Romance
4,1858.0,Father of the Bride Part II (1995),Comedy


In [19]:
# merge movies with ratings and map idx
movies_content = pd.merge(movies, ratings_content, on="movieId", how="inner")

In [20]:
movies.query("title == 'Batman Begins (2005)'")

Unnamed: 0,movieId,title,genres
10002,245.0,Batman Begins (2005),Action Crime IMAX


In [21]:
# there is some nans in tags
tags_content.isnull().sum().sort_values(ascending = False)

tag         17188
userId_x    17172
rating          0
userId_y        0
movieId         0
dtype: int64

In [22]:
# fill some nans 
tags_content.fillna("", inplace=True)

In [23]:
tags_content.isnull().sum().sort_values(ascending = False)

rating      0
userId_y    0
tag         0
movieId     0
userId_x    0
dtype: int64

In [24]:
tags_content = pd.DataFrame(tags_content.groupby('movieId')['tag'].apply(lambda x: "%s" % ' '.join(x)))
tags_content.reset_index(inplace=True)

In [25]:
tags_content.head()

Unnamed: 0,movieId,tag
0,0.0,assassin Black comedy cult film dark comedy Qu...
1,1.0,atmospheric enigmatic gentle lyrical meditativ...
2,2.0,atmospheric CRISIS OF FAITH DEATH OF A CHILD D...
3,3.0,biting cerebral cynical harsh irreverent madca...
4,4.0,Dance 50s imdb top 250 musical romance happy m...


In [26]:
movies_content.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,70.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1,3.5
1,1103.0,Jumanji (1995),Adventure Children Fantasy,8,5.0
2,1017.0,Grumpier Old Men (1995),Comedy Romance,7,4.0
3,4270.0,Waiting to Exhale (1995),Comedy Drama Romance,140,3.0
4,1858.0,Father of the Bride Part II (1995),Comedy,17,4.0


In [27]:
# group tags by movie id to create corpus
content_data = pd.merge(movies_content, tags_content, on="movieId", how="right")
content_data["corpus"] = content_data[["genres", "tag"]].apply(lambda x: " ".join(x), axis=1)

In [28]:
movies_content[movies_content.movieId == 245]

Unnamed: 0,movieId,title,genres,userId,rating
10000,245.0,Batman Begins (2005),Action Crime IMAX,1,5.0


In [29]:
content_data[content_data.movieId == 245]

Unnamed: 0,movieId,title,genres,userId,rating,tag,corpus
245,245.0,Batman Begins (2005),Action Crime IMAX,1,5.0,action batman billionaire Christian Bale comic...,Action Crime IMAX action batman billionaire Ch...


In [30]:
# drop items we do not need 
content_data = content_data.drop("rating", axis=1)
content_data = content_data.drop("userId", axis=1)

In [31]:
content_data.head()

Unnamed: 0,movieId,title,genres,tag,corpus
0,0.0,Pulp Fiction (1994),Comedy Crime Drama Thriller,assassin Black comedy cult film dark comedy Qu...,Comedy Crime Drama Thriller assassin Black com...
1,1.0,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,atmospheric enigmatic gentle lyrical meditativ...,Drama atmospheric enigmatic gentle lyrical med...
2,2.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,atmospheric CRISIS OF FAITH DEATH OF A CHILD D...,Drama atmospheric CRISIS OF FAITH DEATH OF A C...
3,3.0,Underground (1995),Comedy Drama War,biting cerebral cynical harsh irreverent madca...,Comedy Drama War biting cerebral cynical harsh...
4,4.0,Singin' in the Rain (1952),Comedy Musical Romance,Dance 50s imdb top 250 musical romance happy m...,Comedy Musical Romance Dance 50s imdb top 250 ...


In [32]:
# find ratings count and mean
movie_summary = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
movie_summary['ratings_count'] = ratings.groupby('movieId')['rating'].count()
movie_summary.head()

Unnamed: 0_level_0,rating,ratings_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4.189,79672
1,4.073,7058
2,3.981,6616
3,3.946,1269
4,4.051,10895


In [33]:
# merge with content data
content_data = pd.merge(content_data, movie_summary, on="movieId", how="left")
content_data.head()

Unnamed: 0,movieId,title,genres,tag,corpus,rating,ratings_count
0,0.0,Pulp Fiction (1994),Comedy Crime Drama Thriller,assassin Black comedy cult film dark comedy Qu...,Comedy Crime Drama Thriller assassin Black com...,4.189,79672
1,1.0,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama,atmospheric enigmatic gentle lyrical meditativ...,Drama atmospheric enigmatic gentle lyrical med...,4.073,7058
2,2.0,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama,atmospheric CRISIS OF FAITH DEATH OF A CHILD D...,Drama atmospheric CRISIS OF FAITH DEATH OF A C...,3.981,6616
3,3.0,Underground (1995),Comedy Drama War,biting cerebral cynical harsh irreverent madca...,Comedy Drama War biting cerebral cynical harsh...,3.946,1269
4,4.0,Singin' in the Rain (1952),Comedy Musical Romance,Dance 50s imdb top 250 musical romance happy m...,Comedy Musical Romance Dance 50s imdb top 250 ...,4.051,10895


In [34]:
content_data.shape

(59047, 7)

https://help.imdb.com/article/imdb/track-movies-tv/ratings-faq/G67Y87TFYYP6TWAV#ratings

The following formula is used to calculate the Top Rated 250 titles. This formula provides a true 'Bayesian estimate', which takes into account the number of votes each title has received, minimum votes required to be on the list, and the mean vote for all titles:

weighted rating (WR) = (v ÷ (v+m)) × R + (m ÷ (v+m)) × C

Where:

R = average for the movie (mean) = (rating)

v = number of votes for the movie = (votes)

m = minimum votes required to be listed in the Top Rated list (currently 25,000)

C = the mean vote across the whole report

In [35]:
C = content_data["rating"].mean()
print(C)
m = content_data["ratings_count"].quantile(0.90)
print(m)

3.071374
413.0


In [36]:
# weighted ratings based on IMDB
def weighted_rating(x):
    v = x['ratings_count']
    R = x['rating']
    return (v / (v + m) * R) + (m / (m + v) * C)


In [37]:
content_data["weighted_rating"] = content_data.apply(weighted_rating, axis=1)

In [38]:
content_data = pd.merge(content_data, links, on="movieId", how="left")

In [39]:
content_data.sort_values("weighted_rating", ascending=False).head(20)

Unnamed: 0,movieId,title,genres,tag,corpus,rating,ratings_count,weighted_rating,imdbId,tmdbId
79,79.0,"Shawshank Redemption, The (1994)",Crime Drama,bad ending stephan king freedom hope inspirati...,Crime Drama bad ending stephan king freedom ho...,4.414,81482,4.407,111161,278.0
96,96.0,"Godfather, The (1972)",Crime Drama,italian mafia italy Mafia Marlon Brando Mafia ...,Crime Drama italian mafia italy Mafia Marlon B...,4.324,52498,4.315,68646,238.0
252,252.0,"Usual Suspects, The (1995)",Crime Mystery Thriller,imdb top 250 heist suspense thriller twist end...,Crime Mystery Thriller imdb top 250 heist susp...,4.284,55366,4.275,114814,629.0
276,276.0,"Godfather: Part II, The (1974)",Crime Drama,imdb top 250 Oscar (Best Picture) 100 Greatest...,Crime Drama imdb top 250 Oscar (Best Picture) ...,4.262,34188,4.248,71562,240.0
89,89.0,Schindler's List (1993),Drama War,based on a true story true story imdb top 250 ...,Drama War based on a true story true story imd...,4.248,60411,4.24,108052,424.0
297,297.0,Fight Club (1999),Action Crime Drama Thriller,complicated mindfuck violence atmospheric dark...,Action Crime Drama Thriller complicated mindfu...,4.228,58773,4.22,137523,550.0
288,288.0,Seven Samurai (Shichinin no samurai) (1954),Action Adventure Drama,Akira Kurosawa atmospheric epic historical lon...,Action Adventure Drama Akira Kurosawa atmosphe...,4.255,13367,4.219,47478,346.0
1063,1063.0,Rear Window (1954),Mystery Thriller,50s imdb top 250 Edgar Award (Best Motion Pict...,Mystery Thriller 50s imdb top 250 Edgar Award ...,4.238,20162,4.215,47396,567.0
999,999.0,12 Angry Men (1957),Drama,classic courtroom courtroom drama group psycho...,Drama classic courtroom courtroom drama group ...,4.243,16569,4.215,50083,389.0
1844,1844.0,One Flew Over the Cuckoo's Nest (1975),Drama,imdb top 250 Oscar (Best Picture) asylum based...,Drama imdb top 250 Oscar (Best Picture) asylum...,4.219,36058,4.206,73486,510.0


In [40]:
content_data.shape

(59047, 10)

In [41]:
content_data.to_csv('data/content_data.csv')

## Similarity Calculator

In [51]:
content_embeddings = pd.read_pickle("data/autoencoder_embeddings_4.pkl")
content_embeddings = pd.DataFrame(content_embeddings)
content_embeddings.shape

(59047, 200)

In [52]:
content_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,-0.475,0.925,-0.379,5.966,0.821,-0.539,-0.497,-0.435,-0.417,0.285,...,-0.315,-0.251,-0.394,-0.562,1.853,7.514,-0.345,-0.302,-0.389,-0.567
1,-0.475,-0.378,-0.379,-0.218,-0.159,-0.539,-0.497,-0.435,-0.417,-0.368,...,-0.315,-0.251,-0.394,-0.562,-0.142,-0.221,-0.345,-0.302,-0.389,0.717
2,-0.185,-0.378,-0.379,0.859,-0.159,-0.539,-0.497,-0.435,-0.417,0.096,...,-0.315,-0.251,-0.394,-0.195,-0.142,0.641,-0.225,-0.302,-0.389,-0.54
3,-0.475,-0.378,-0.379,-0.442,-0.159,-0.539,-0.497,-0.435,-0.417,-0.368,...,-0.315,-0.251,-0.394,0.402,-0.142,-0.221,-0.345,-0.302,-0.389,-0.567
4,-0.092,-0.157,-0.269,-0.442,-0.159,-0.539,-0.396,-0.435,-0.417,-0.368,...,-0.315,-0.251,-0.394,-0.454,-0.142,-0.221,-0.144,-0.302,-0.389,-0.499


In [53]:
collaborative_embeddings = pd.read_pickle("data/collab_nn_embeddings_23.pkl")
collaborative_embeddings = pd.DataFrame(collaborative_embeddings)
collaborative_embeddings.shape

(59047, 100)

In [48]:
collaborative_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.518,0.651,0.024,-0.287,0.736,-0.55,0.344,0.258,-0.398,-0.641,...,0.858,-0.288,-0.815,0.458,-0.567,0.396,0.634,0.199,0.461,-0.488
1,0.086,0.546,-0.4,0.018,0.334,-0.449,0.289,-0.793,0.046,0.148,...,-0.047,0.216,0.489,0.597,0.374,0.315,-0.644,0.204,0.44,0.016
2,-0.016,0.789,-0.308,-0.267,0.616,-0.251,0.447,-0.847,-0.018,0.468,...,0.238,0.108,0.415,0.654,0.479,-0.119,-0.502,0.348,0.537,0.046
3,0.259,0.236,0.081,0.133,0.308,-0.364,-0.608,0.329,-0.271,0.181,...,-0.11,0.042,-0.301,-0.03,0.396,0.009,0.098,0.645,0.416,-0.457
4,0.453,0.754,0.028,-0.108,0.174,0.082,-0.137,-0.32,0.214,-0.649,...,0.529,-0.833,0.232,0.35,-0.285,0.306,0.127,0.74,0.609,0.202


In [42]:
from sklearn.metrics.pairwise import cosine_similarity

class SimilarityCalculator(object):
    def __init__(self, embeddings):
        self.embeddings = embeddings
        self.ids = embeddings.index.tolist()
        self.similarity_matrix = self.calculate_cosine_similarity_matrix()
      

    def calculate_cosine_similarity_matrix(self):
        '''Calculates a cosine similarity matrix from the embeddings'''
        similarity_matrix = pd.DataFrame(
            cosine_similarity(X=self.embeddings),
            index=self.ids
        )
        similarity_matrix.columns = self.ids
        return similarity_matrix
    

    def predict_similarity(self, seed, n):
        '''Use the similarity_matrix to return n most similar items.'''
        similar_items = pd.DataFrame(self.similarity_matrix.loc[seed])
        similar_items.columns = ["similarity_score"]
        similar_items = similar_items.sort_values('similarity_score', ascending=False)
        similar_items = similar_items.head(n)
        similar_items.reset_index(inplace=True)
        similar_items = similar_items.rename(index=str, columns={"index": "movieId"})
        return similar_items.to_dict()        

In [43]:
poster_lookup = {}

In [92]:
import re

class RecommenderEngine(object):
    def __init__(self, content_emb, collab_emb, content_data):
        self.content_emb = content_emb
        self.collab_emb = collab_emb
        self.content_data = content_data
        
        
    def get_release_year(self, row):
        # strip the year from the title 
        title = row["title"]
        year = re.search(r'\(\d{4}\)', title)
        if year:
            year = year.group(0)
        else:
            return None
        year = int(year.replace('(', '').replace(')', ''))
        return year


    # we need to fetch more data for each movie, create function that does that
    def fetch_tmdb_info(self, row):
        movie_id = int(row.movieId)
        if math.isnan(movie_id):
            return ""
        poster_path = poster_lookup.get(movie_id)
        if poster_path:
            print('{} found in cache'.format(movie_id))
            return poster_path
        # else:
            print('{} not found in cache, starting download...'.format(movie_id))
        url = "https://api.themoviedb.org/3/movie/{}?api_key=dd79e14b13a7da58c6c6a90e68a301a9&language=en-US".format(int(row.tmdbId))
        response = requests.request('GET', url)
        r = response.json()
        movie_id = int(row.movieId)
        poster_path = r.get('poster_path')
        # store for quick lookup later
        poster_lookup[movie_id] = poster_path
        print('{} downloaded and cached'.format(movie_id))
        return poster_path


    def top_list(self, n):
        # load or build top 200 based on weighted IMDB ratings formula 
        # and remove movies from before 1990
        if os.path.isfile('data/top_list.json'):
            return pd.read_json('data/top_list.json')
        else:
            # remove old movies
            release_year = self.content_data.apply(self.get_release_year, axis=1)
            year_mask = release_year >= 1990
            top_100 = self.content_data[year_mask]
            # rank by imdb ratings formula and keep top 100
            top_100 = top_100.sort_values("weighted_rating", ascending=False).head(n)
            top_100["poster_path"] = top_100.apply(self.fetch_tmdb_info, axis=1)
            # drop columns that we don't need 
            top_100 = top_100.drop("genres", axis=1)
            top_100 = top_100.drop("tag", axis=1)
            top_100 = top_100.drop("rating", axis=1)
            top_100 = top_100.drop("ratings_count", axis=1)
            top_100 = top_100.drop("weighted_rating", axis=1)
            top_100 = top_100.drop("imdbId", axis=1)
            top_100 = top_100.drop("corpus", axis=1)
            top_100 = top_100.drop("tmdbId", axis=1)
            top_100.to_json('data/top_list.json', orient='records')
            return top_100


    def recommend_movies(self, movie_id):
        file_name = '{}_recs.json'.format(str(movie_id))
        path = 'data/recs/{}'.format(file_name)
        if os.path.isfile(path):
            return pd.read_json(path)
            
        # find similar from collab
        similarity_collab = SimilarityCalculator(self.collab_emb)
        similiarty_collab = similarity_collab.predict_similarity(seed=movie_id, n=self.content_data.shape[0])
        similiarty_collab_df = pd.DataFrame(similiarty_collab)
        similiarty_collab_df.set_index('movieId', inplace=True)
        similiarty_collab_df = pd.merge(self.content_data, similiarty_collab_df, left_index=True, right_index=True)
        similiarty_collab_df.sort_values('similarity_score', ascending=False, inplace=True)
        similiarty_collab_df = similiarty_collab_df.rename(index=str, columns={"similarity_score": "collaborative_similarity_score"})
        
        # find simlar from content
        similarity_content = SimilarityCalculator(self.content_emb)
        similarity_content = similarity_content.predict_similarity(seed=movie_id, n=self.content_data.shape[0])
        similarity_content_df = pd.DataFrame(similarity_content)
        similarity_content_df.set_index('movieId', inplace=True)
        similarity_content_df = pd.merge(self.content_data, similarity_content_df, left_index=True, right_index=True)
        similarity_content_df.sort_values('similarity_score', ascending=False, inplace=True)
        similarity_content_df = similarity_content_df.rename(index=str, columns={"similarity_score": "content_similarity_score"})
        
        # average the results (please test if we should weight content higher)
        similarity_avg = pd.merge(similiarty_collab_df, pd.DataFrame(similarity_content_df['content_similarity_score']), left_index=True, right_index=True)
        similarity_avg['average_similarity_score'] = (similarity_avg['content_similarity_score'] + similarity_avg['collaborative_similarity_score'])/2
        similarity_avg.sort_values('average_similarity_score', ascending=False, inplace=True)
        
        # remove old movies
        release_year = similarity_avg.apply(self.get_release_year, axis=1)
        year_mask = release_year >= 1990
        similarity_avg = similarity_avg[year_mask]
        # save the top 10
        similarity_avg = similarity_avg.head(10)
        similarity_avg["poster_path"] = similarity_avg.apply(self.fetch_tmdb_info, axis=1)
        # drop columns that we don't need 
        similarity_avg = similarity_avg.drop("genres", axis=1)
        similarity_avg = similarity_avg.drop("tag", axis=1)
        similarity_avg = similarity_avg.drop("rating", axis=1)
        similarity_avg = similarity_avg.drop("ratings_count", axis=1)
        similarity_avg = similarity_avg.drop("weighted_rating", axis=1)
        similarity_avg = similarity_avg.drop("imdbId", axis=1)
        similarity_avg = similarity_avg.drop("corpus", axis=1)
        similarity_avg = similarity_avg.drop("tmdbId", axis=1)
        
        similarity_avg.to_json(path, orient='records')
        return similarity_avg

In [93]:
rec_engine = RecommenderEngine(content_embeddings, collaborative_embeddings, content_data)

In [94]:
%%time
top_list = rec_engine.top_list(200)
top_list.head()

CPU times: user 12.7 ms, sys: 1.04 ms, total: 13.8 ms
Wall time: 12.1 ms


Unnamed: 0,movieId,title,poster_path
0,79,"Shawshank Redemption, The (1994)",/5KCVkau1HEl7ZzfPsKAPM0sMiKc.jpg
1,252,"Usual Suspects, The (1995)",/bUPmtQzrRhzqYySeiMpv7GurAfm.jpg
2,89,Schindler's List (1993),/lKb6h6LURE33I3GRFYgHBUfjwSU.jpg
3,297,Fight Club (1999),/k1lICEYRpJeFRIRfjxYwmpO9LTu.jpg
4,907,Planet Earth (2006),


In [95]:
top_list.poster_path[0]

'/5KCVkau1HEl7ZzfPsKAPM0sMiKc.jpg'

In [96]:
content_data[content_data.movieId == 245]

Unnamed: 0,movieId,title,genres,tag,corpus,rating,ratings_count,weighted_rating,imdbId,tmdbId
245,245.0,Batman Begins (2005),Action Crime IMAX,action batman billionaire Christian Bale comic...,Action Crime IMAX action batman billionaire Ch...,3.93,30684,3.919,372784,272.0


In [97]:
content_data.query("title == 'Batman Begins (2005)'").movieId

245   245.000
Name: movieId, dtype: float64

In [100]:
%%time
rec_engine.recommend_movies(245)

245 found in cache
578 found in cache
698 found in cache
206 found in cache
6096 found in cache
1699 found in cache
233 found in cache
2548 found in cache
9279 found in cache
2578 found in cache
CPU times: user 48.7 s, sys: 1min 18s, total: 2min 7s
Wall time: 22.2 s


Unnamed: 0,movieId,title,collaborative_similarity_score,content_similarity_score,average_similarity_score,poster_path
245,245.0,Batman Begins (2005),1.0,1.0,1.0,/8RW2runSEc34IwKN2D1aPcJd2UL.jpg
578,578.0,"Dark Knight, The (2008)",0.859,0.816,0.837,/qJ2tW6WMUDux911r6m7haRef0WH.jpg
698,698.0,"Dark Knight Rises, The (2012)",0.801,0.802,0.802,/85cWkCVftiVs0BVey6pxX8uNmLt.jpg
206,206.0,Spider-Man (2002),0.394,0.848,0.621,/gh4cZbhZxyTbgxQPxD0dOudNPTn.jpg
6096,6096.0,Batman vs. Robin (2015),0.345,0.839,0.592,/aGp9XDXmVM5lCKHWtgBC15S7XLr.jpg
1699,1699.0,Batman Returns (1992),0.259,0.923,0.591,/jKBjeXM7iBBV9UkUcOXx3m7FSHY.jpg
233,233.0,Spider-Man 2 (2004),0.308,0.861,0.584,/olxpyq9kJAZ2NU1siLshhhXEPR7.jpg
2548,2548.0,Batman: Mask of the Phantasm (1993),0.266,0.867,0.567,/l4jaQjkgznu2Rz05X18f24UjPNW.jpg
9279,9279.0,Justice League: Gods and Monsters (2015),0.277,0.819,0.548,/bz9717vMiTw2EGvGQUPRK4WLQ6G.jpg
2578,2578.0,Superman Returns (2006),0.232,0.863,0.547,/4p527M0wif7dyEQr7UD01dUkYck.jpg


In [101]:
content_data.query("title == 'Planet Earth (2006)'").movieId

907   907.000
Name: movieId, dtype: float64

In [102]:
content_data.query("title == 'Shawshank Redemption, The (1994)'").movieId

79   79.000
Name: movieId, dtype: float64

In [103]:
content_data.query("title == 'Lord of the Rings: The Two Towers, The (2002)'").movieId

36   36.000
Name: movieId, dtype: float64

In [104]:
# Planet Earth (2006)
rec_engine.recommend_movies(907)

907 downloaded and cached
4018 downloaded and cached
24812 downloaded and cached
17227 downloaded and cached
6842 downloaded and cached
4055 downloaded and cached
32367 downloaded and cached
26604 downloaded and cached
6492 downloaded and cached
32542 downloaded and cached


Unnamed: 0,movieId,title,collaborative_similarity_score,content_similarity_score,average_similarity_score,poster_path
907,907.0,Planet Earth (2006),1.0,1.0,1.0,
4018,4018.0,Blue Planet II (2017),0.693,0.688,0.69,
24812,24812.0,Bab'Aziz -The Prince Who Contemplated His Soul...,0.713,0.664,0.688,/kuYuA9Wl7ZaERz4YkPUYSBNLlDb.jpg
17227,17227.0,Winter on Fire: Ukraine's Fight for Freedom (2...,0.611,0.745,0.678,/c3IMOdEzVOiGli1LpEcZyjEIGje.jpg
6842,6842.0,"First Day of the Rest of Your Life, The (Le pr...",0.555,0.778,0.666,/w6pi0wqfNnwbCWc98jW3opwl0dF.jpg
4055,4055.0,Company (2011),0.596,0.731,0.664,/kpvvHmWTnVAl9oEiGouQ1GZKvmf.jpg
32367,32367.0,Louis Theroux: America's Medicated Kids (2010),0.559,0.761,0.66,
26604,26604.0,Midnight Diner (2014),0.552,0.742,0.647,/ujVev04jwc1kB9NenXK50xPE6rG.jpg
6492,6492.0,Silenced (2011),0.617,0.675,0.646,/mbMp0oIFmYnw0i5gzRoKt8cH5ve.jpg
32542,32542.0,Root of All Evil? (2006),0.54,0.75,0.645,/gOw03UkPXB7klc9XLsqFdP2Dvyj.jpg


In [105]:
# Shawshank Redemption, The (1994)
rec_engine.recommend_movies(79)

79 found in cache
1717 found in cache
252 found in cache
1001 found in cache
89 found in cache
167 found in cache
82 found in cache
203 found in cache
260 found in cache
924 found in cache


Unnamed: 0,movieId,title,collaborative_similarity_score,content_similarity_score,average_similarity_score,poster_path
79,79.0,"Shawshank Redemption, The (1994)",1.0,1.0,1.0,/5KCVkau1HEl7ZzfPsKAPM0sMiKc.jpg
1717,1717.0,"Green Mile, The (1999)",0.664,0.967,0.815,/velWPhVMQeQKcxggNEU8YmIo52R.jpg
252,252.0,"Usual Suspects, The (1995)",0.601,0.938,0.77,/bUPmtQzrRhzqYySeiMpv7GurAfm.jpg
1001,1001.0,Good Will Hunting (1997),0.54,0.973,0.757,/ylagLHIbG0F1blqSFqY6pa56Omr.jpg
89,89.0,Schindler's List (1993),0.553,0.959,0.756,/lKb6h6LURE33I3GRFYgHBUfjwSU.jpg
167,167.0,"Sixth Sense, The (1999)",0.426,0.973,0.699,/fIssD3w3SvIhPPmVo4WMgZDVLID.jpg
82,82.0,Forrest Gump (1994),0.499,0.894,0.697,/clolk7rB5lAjs41SD0Vt6IXYLMm.jpg
203,203.0,"Beautiful Mind, A (2001)",0.444,0.932,0.688,/mV4k9qlrdvECe4COBTN256FROR3.jpg
260,260.0,"Silence of the Lambs, The (1991)",0.452,0.909,0.681,/rplLJ2hPcOQmkFhTqUte0MkEaO2.jpg
924,924.0,Seven (a.k.a. Se7en) (1995),0.398,0.962,0.68,/6yoghtyTpznpBik8EngEmJskVUO.jpg


In [106]:
# Lord of the Rings: The Two Towers, The (2002)
rec_engine.recommend_movies(36)

36 found in cache
202 found in cache
227 found in cache
722 downloaded and cached
749 downloaded and cached
5615 downloaded and cached
159 found in cache
601 downloaded and cached
5875 downloaded and cached
2552 downloaded and cached


Unnamed: 0,movieId,title,collaborative_similarity_score,content_similarity_score,average_similarity_score,poster_path
36,36.0,"Lord of the Rings: The Two Towers, The (2002)",1.0,1.0,1.0,/5VTN0pR8gcqV3EPUHHfMGnJYN9L.jpg
202,202.0,"Lord of the Rings: The Fellowship of the Ring,...",0.978,0.995,0.987,/6oom5QYQ2yQTMJIbnvbkBL9cHo6.jpg
227,227.0,"Lord of the Rings: The Return of the King, The...",0.979,0.979,0.979,/rCzpDGLbOoPwLjy3OAm5NUPOTrC.jpg
722,722.0,"Hobbit: An Unexpected Journey, The (2012)",0.567,0.971,0.769,/yHA9Fc37VmpUA5UncTxxo3rTGVA.jpg
749,749.0,"Hobbit: The Desolation of Smaug, The (2013)",0.527,0.983,0.755,/aU8pXaBa9lNICdhFCVHzSfmgqsq.jpg
5615,5615.0,The Hobbit: The Battle of the Five Armies (2014),0.496,0.918,0.707,/xT98tLqatZPQApyRmlPL12LtiWp.jpg
159,159.0,"Matrix, The (1999)",0.34,0.959,0.65,/f89U3ADr1oiB1s9GkdPOEpXUk5H.jpg
601,601.0,"Road, The (2009)",0.33,0.946,0.638,/qLaXnLzqleBWQtjvZ6JGVSaKoC3.jpg
5875,5875.0,Macbeth (2015),0.372,0.903,0.637,/fivCMGXkJY29Det2LnItrKM4Cbl.jpg
2552,2552.0,Iron Monkey (Siu nin Wong Fei-hung ji: Tit Ma ...,0.327,0.939,0.633,/gLELnixQLunQOTm9PCpNRxrhU58.jpg


In [183]:
%%time
# fetch all data
# for m in top_100["movieId"]:
    # fetch_tmdb_info(m)

CPU times: user 4 µs, sys: 5 µs, total: 9 µs
Wall time: 16.9 µs


In [107]:
%%time
# calculate recommendations for top 100, so we have all movies stored and do not need dynamic calculations
for m in top_list["movieId"]:
    rec_engine.recommend_movies(m).head(1)

252 found in cache
79 found in cache
924 found in cache
1001 found in cache
295 found in cache
251 found in cache
1094 found in cache
3114 downloaded and cached
268 found in cache
167 found in cache
89 found in cache
141 found in cache
79 found in cache
535 downloaded and cached
665 found in cache
239 found in cache
1847 found in cache
1717 found in cache
1822 downloaded and cached
203 found in cache
297 found in cache
924 found in cache
263 found in cache
1586 found in cache
336 found in cache
20482 downloaded and cached
48 found in cache
23 found in cache
21802 downloaded and cached
654 found in cache
860 found in cache
1851 found in cache
1713 found in cache
1946 downloaded and cached
1850 downloaded and cached
1987 downloaded and cached
1954 downloaded and cached
1999 downloaded and cached
2053 downloaded and cached
2035 downloaded and cached
0 found in cache
268 found in cache
1099 downloaded and cached
439 found in cache
614 found in cache
4703 downloaded and cached
451 downloade

1094 found in cache
252 found in cache
22472 downloaded and cached
13706 downloaded and cached
268 found in cache
292 found in cache
295 found in cache
3097 downloaded and cached
8348 downloaded and cached
0 found in cache
336 found in cache
292 found in cache
297 found in cache
598 downloaded and cached
867 found in cache
416 found in cache
1766 found in cache
1289 found in cache
23124 downloaded and cached
13075 downloaded and cached
480 found in cache
3705 downloaded and cached
416 found in cache
1959 downloaded and cached
1751 found in cache
23723 downloaded and cached
1948 downloaded and cached
297 found in cache
6714 downloaded and cached
2563 found in cache
82 found in cache
1717 found in cache
203 found in cache
79 found in cache
189 downloaded and cached
73 found in cache
1001 found in cache
966 downloaded and cached
89 found in cache
141 found in cache
141 found in cache
89 found in cache
1001 found in cache
73 found in cache
7089 downloaded and cached
2492 downloaded and cac

10026 downloaded and cached
3505 downloaded and cached
44064 found in cache
34544 downloaded and cached
533 found in cache
2118 found in cache
3570 found in cache
611 downloaded and cached
11429 found in cache
626 found in cache
442 found in cache
23095 downloaded and cached
1415 downloaded and cached
1822 found in cache
283 found in cache
969 found in cache
4686 downloaded and cached
4687 downloaded and cached
16361 downloaded and cached
6882 downloaded and cached
16350 downloaded and cached
886 downloaded and cached
13796 downloaded and cached
27652 downloaded and cached
2697 found in cache
46510 downloaded and cached
50989 downloaded and cached
24628 downloaded and cached
43321 downloaded and cached
24498 found in cache
28976 downloaded and cached
17338 downloaded and cached
14434 downloaded and cached
23780 downloaded and cached
4041 found in cache
12787 downloaded and cached
1958 found in cache
10950 downloaded and cached
3505 found in cache
9343 found in cache
10932 downloaded an

1298 downloaded and cached
3245 found in cache
4032 downloaded and cached
1301 found in cache
2955 found in cache
1656 found in cache
768 found in cache
678 downloaded and cached
714 found in cache
725 found in cache
616 found in cache
93 found in cache
5007 downloaded and cached
805 found in cache
4978 downloaded and cached
779 downloaded and cached
2665 found in cache
24272 downloaded and cached
46 found in cache
10901 downloaded and cached
17227 found in cache
6605 found in cache
14982 found in cache
33421 downloaded and cached
6842 found in cache
16882 found in cache
2478 found in cache
3597 downloaded and cached
10944 downloaded and cached
422 downloaded and cached
1275 downloaded and cached
15721 downloaded and cached
1814 found in cache
907 downloaded and cached
9337 downloaded and cached
15800 downloaded and cached
775 found in cache
797 downloaded and cached
4006 downloaded and cached
795 found in cache
693 downloaded and cached
760 downloaded and cached
799 downloaded and cac

2974 downloaded and cached
239 found in cache
536 found in cache
626 found in cache
424 downloaded and cached
789 downloaded and cached
9369 found in cache
601 found in cache
284 found in cache
2008 found in cache
3578 downloaded and cached
208 found in cache
2119 downloaded and cached
8785 downloaded and cached
1617 downloaded and cached
3543 found in cache
8502 downloaded and cached
916 found in cache
884 downloaded and cached
56535 found in cache
16075 downloaded and cached
7812 found in cache
6644 found in cache
15017 found in cache
2025 found in cache
5692 found in cache
6642 found in cache
18275 found in cache
9691 found in cache
137 found in cache
15680 downloaded and cached
3027 found in cache
1341 found in cache
1553 downloaded and cached
1610 downloaded and cached
10100 downloaded and cached
25057 downloaded and cached
21530 downloaded and cached
6070 downloaded and cached
5271 downloaded and cached
6512 downloaded and cached
5614 downloaded and cached
73 found in cache
141 f

830 found in cache
839 downloaded and cached
916 found in cache
900 downloaded and cached
698 found in cache
22270 downloaded and cached
578 found in cache
649 found in cache
3952 downloaded and cached
29067 found in cache
526 found in cache
18179 downloaded and cached
18917 downloaded and cached
40113 downloaded and cached
1957 downloaded and cached
2982 downloaded and cached
747 found in cache
13379 downloaded and cached
54922 downloaded and cached
32501 downloaded and cached
55 found in cache
1956 downloaded and cached
2656 found in cache
11025 found in cache
12627 downloaded and cached
27638 found in cache
1932 downloaded and cached
3732 downloaded and cached
2053 found in cache
5158 downloaded and cached
3642 found in cache
3611 downloaded and cached
12852 downloaded and cached
36871 downloaded and cached
14982 found in cache
47176 downloaded and cached
9696 found in cache
7704 found in cache
2577 downloaded and cached
18702 downloaded and cached
236 found in cache
838 found in ca

In [116]:
for m in top_list["movieId"]:
    if not os.path.isfile('data/recs/{}_recs.json'.format(m)):
        print("COULD NOT FIND RECS FOR: {}".format(m))
    else: 
        print("Recs found for: {}".format(m))

Recs found for: 79
Recs found for: 252
Recs found for: 89
Recs found for: 297
Recs found for: 907
Recs found for: 860
Recs found for: 0
Recs found for: 274
Recs found for: 37
Recs found for: 578
Recs found for: 1663
Recs found for: 159
Recs found for: 260
Recs found for: 654
Recs found for: 344
Recs found for: 152
Recs found for: 291
Recs found for: 1713
Recs found for: 534
Recs found for: 909
Recs found for: 969
Recs found for: 772
Recs found for: 295
Recs found for: 270
Recs found for: 1823
Recs found for: 29
Recs found for: 7086
Recs found for: 202
Recs found for: 227
Recs found for: 268
Recs found for: 758
Recs found for: 257
Recs found for: 537
Recs found for: 924
Recs found for: 262
Recs found for: 1001
Recs found for: 36
Recs found for: 1851
Recs found for: 48
Recs found for: 2538
Recs found for: 1094
Recs found for: 336
Recs found for: 480
Recs found for: 82
Recs found for: 141
Recs found for: 1847
Recs found for: 2154
Recs found for: 1
Recs found for: 1717
Recs found for: 805


In [119]:
!tar chvfz notebook.tar.gz *

Autoencoders.ipynb
Content Hybrid.ipynb
Keras test.ipynb
NN test.ipynb
Preprocess & EDA.ipynb
Recommender Engine.ipynb
SVD test.ipynb
TFIDF test.ipynb
__pycache__/
__pycache__/data_download.cpython-36.pyc
allfiles.tar.gz
archive.tar.gz
ckpt/
ckpt/ckpt-loss=1.11.index
ckpt/ckpt-loss=0.77.index
ckpt/ckpt-loss=1.16.data-00001-of-00002
ckpt/ckpt-loss=0.77.data-00001-of-00002
ckpt/ckpt-loss=0.67.data-00000-of-00002
ckpt/ckpt-loss=0.93.index
ckpt/checkpoint
ckpt/ckpt-loss=1.01.index
ckpt/ckpt-loss=0.67.index
ckpt/collab-nn-1/
ckpt/ckpt-loss=0.63.data-00000-of-00002
ckpt/collab-nn-2/
ckpt/ckpt-loss=0.72.data-00001-of-00002
ckpt/autoencoder-1/
ckpt/ckpt-loss=0.93.data-00000-of-00002
ckpt/ckpt-loss=0.77.data-00000-of-00002
ckpt/ckpt-loss=0.63.index
ckpt/ckpt-loss=1.11.data-00000-of-00002
ckpt/ckpt-loss=1.16.index
ckpt/ckpt-loss=1.11.data-00001-of-00002
ckpt/ckpt-loss=1.01.data-00000-of-00002
ckpt/ckpt-loss=0.63.data-00001-of-00002
ckpt/ckpt-loss=1.01.data-00001-of-00002
ckpt/ckpt-loss=1.12.data