In [4]:
# import dependencies
import pandas as pd
import numpy as np
from numpy import array
from numpy import asarray
from numpy import zeros
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from zipfile import ZipFile
import json
import os.path
import re
import pickle

import data_download

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer

# Add some convenience functions to Pandas DataFrame.
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:.3f}'.format
def mask(df, key, function):
  """Returns a filtered dataframe, by applying function to key"""
  return df[function(df[key])]

def flatten_cols(df):
  df.columns = [' '.join(col).strip() for col in df.columns.values]
  return df

pd.DataFrame.mask = mask
pd.DataFrame.flatten_cols = flatten_cols

In [2]:
# movies, links, ratings, tags = data_download.load_unprocessed_df(use_large=False)
movies, links, ratings, tags, genome_tags, genome_scores = data_download.load_unprocessed_df(use_large=False)

MovieLens 25M Dataset is downloaded!
MovieLens Latest Small Dataset is downloaded!
MovieLens 25M Dataset is already extracted!
MovieLens Latest Small Dataset is already extracted!


In [45]:
movies.sample(5)

Unnamed: 0,movieId,title,genres
7589,86332,Thor (2011),Action Adventure Drama Fantasy IMAX
417,479,Judgment Night (1993),Action Crime Thriller
2544,3405,"Night to Remember, A (1958)",Action Drama
1641,2187,Stage Fright (1950),Mystery Romance Thriller
2020,2692,Run Lola Run (Lola rennt) (1998),Action Crime


In [17]:
# handle genres (remove all non alphabet characters)
movies['genres'] = movies['genres'].str.replace(pat="|", repl=" ")
movies['genres'] = movies['genres'].str.replace(pat="-", repl="")
movies['genres'] = movies['genres'].str.replace(pat="(no genres listed)", repl="")
movies['genres'] = movies['genres'].str.replace(pat="(", repl="")
movies['genres'] = movies['genres'].str.replace(pat=")", repl="")
movies.sample(5)

Unnamed: 0,movieId,title,genres
8741,127180,"Story of Film: An Odyssey, The (2011)",Documentary
2437,3250,Alive (1993),Drama
2138,2846,"Adventures of Milo and Otis, The (Koneko monog...",Adventure Children Comedy Drama
7382,79299,"No. 1 Ladies' Detective Agency, The (2008)",Comedy Crime Mystery
3334,4517,Lady in White (a.k.a. The Mystery of the Lady ...,Horror Mystery Thriller


In [52]:
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.001, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])

In [53]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [54]:
# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])

# Function that get movie recommendations based on the cosine similarity score of movie genres
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [55]:
genre_recommendations('Saving Private Ryan (1998)').head(10)

909                     Apocalypse Now (1979)
933              Boot, Das (Boat, The) (1981)
1407    All Quiet on the Western Front (1930)
1503               Saving Private Ryan (1998)
1826                Thin Red Line, The (1998)
2216                  Dirty Dozen, The (1967)
2310                  Longest Day, The (1962)
2313                 Tora! Tora! Tora! (1970)
2573                          Red Dawn (1984)
2631            Force 10 from Navarone (1978)
Name: title, dtype: object