# Content based filtering

In [1]:
# Relevant Libraries
import os

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras.utils import plot_model

In [2]:
# Directory that holds the movie CSV files. Set the IMDB_DATA_DIR environment
# variable to use another location; the original local path stays the default
# (an empty variable also falls back to the default).
imdb_data_dir = os.environ.get('IMDB_DATA_DIR') or 'C:/Users/Ehi/Downloads/data/imdb/'

# File paths are built from this value by plain string concatenation, so make
# sure it always ends with a path separator.
if not imdb_data_dir.endswith(('/', '\\')):
    imdb_data_dir += '/'

In [3]:
# Load the movie metadata CSV.
# low_memory=False makes pandas read the whole file before inferring column
# dtypes instead of inferring them chunk by chunk.
# NOTE(review): the warning this cell printed is presumably a mixed-type
# DtypeWarning, which this option avoids - confirm on a fresh run.
metadata = pd.read_csv(imdb_data_dir + 'movies_metadata.csv', low_memory=False)

# Preview the first three rows
metadata.head(3)

  metadata = pd.read_csv(imdb_data_dir + 'movies_metadata.csv')


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [4]:
# Read the credits table and the keywords table from the data directory,
# in that order.
credits, keywords = (
    pd.read_csv(imdb_data_dir + file_name)
    for file_name in ('credits.csv', 'keywords.csv')
)

In [5]:
# Drop the metadata rows whose 'id' is not a number; they cannot be cast to
# int and would break the merge. Detecting them from the data (instead of by
# hard-coded row labels) keeps this cell re-runnable and independent of the
# exact row order of the CSV.
# NOTE(review): this is expected to remove the same rows as the previous
# hard-coded labels 19730, 29503 and 35587 - confirm against the raw file.
numeric_ids = pd.to_numeric(metadata['id'], errors='coerce')
has_valid_id = numeric_ids.notna()
metadata = metadata.loc[has_valid_id].copy()

# Convert IDs to integers so the merge keys share one dtype in all three frames
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
metadata['id'] = numeric_ids[has_valid_id].astype('int')

In [6]:
# Attach the credits columns, then the keywords columns, to the metadata by
# matching rows on 'id'.
# NOTE(review): merge() defaults to an inner join, so an id that occurs more
# than once in either table multiplies the matching rows - verify row counts.
metadata = (
    metadata
    .merge(credits, on='id')
    .merge(keywords, on='id')
)

In [7]:
# These columns hold Python literals stored as text; evaluate each cell back
# into the object it spells out (literal_eval only accepts literals, not
# arbitrary expressions).
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].map(literal_eval)

In [8]:
# Display the parsed cast column; each cell is now a list of dicts.
metadata["cast"]

0        [{'cast_id': 14, 'character': 'Woody (voice)',...
1        [{'cast_id': 1, 'character': 'Alan Parrish', '...
2        [{'cast_id': 2, 'character': 'Max Goldman', 'c...
3        [{'cast_id': 1, 'character': 'Savannah 'Vannah...
4        [{'cast_id': 1, 'character': 'George Banks', '...
                               ...                        
46623    [{'cast_id': 0, 'character': '', 'credit_id': ...
46624    [{'cast_id': 1002, 'character': 'Sister Angela...
46625    [{'cast_id': 6, 'character': 'Emily Shaw', 'cr...
46626    [{'cast_id': 2, 'character': '', 'credit_id': ...
46627                                                   []
Name: cast, Length: 46628, dtype: object

In [9]:
def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries; each one is expected to carry 'job' and 'name' keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when ``x`` is not a list/tuple,
        is empty, or contains no director entry.
    """
    # Missing or malformed crew data (e.g. NaN) is treated like "no director
    # listed", mirroring how get_list handles non-list input.
    if not isinstance(x, (list, tuple)):
        return np.nan

    for member in x:
        # .get() tolerates entries that lack a 'job' or 'name' key
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

In [10]:
def get_list(x, max_items=3):
    """Return the 'name' values of the first ``max_items`` entries of ``x``.

    Parameters
    ----------
    x : list of dict
        Entries that each carry a 'name' key.
    max_items : int or None, optional
        Maximum number of names to return (default 3). ``None`` returns
        every name.

    Returns
    -------
    list
        Up to ``max_items`` names in their original order, or an empty list
        when ``x`` is not a list (missing/malformed data) or is empty.
    """
    # Return empty list in case of missing/malformed data
    if not isinstance(x, list):
        return []

    # Slicing already copes with lists shorter than max_items, and slicing
    # first means entries beyond the limit are never touched.
    return [entry['name'] for entry in x[:max_items]]

In [11]:
# Add a 'director' column taken from the crew list, then replace the cast,
# keywords and genres columns with what get_list returns for each cell.
metadata['director'] = metadata['crew'].map(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    metadata[feature] = metadata[feature].map(get_list)

In [12]:
# Show the title next to the derived features for the first ten rows
metadata.loc[:, ['title', 'cast', 'director', 'keywords', 'genres']].head(10)

Unnamed: 0,title,cast,director,keywords,genres
0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles]",John Lasseter,"[jealousy, toy, boy]","[Animation, Comedy, Family]"
1,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",Joe Johnston,"[board game, disappearance, based on children'...","[Adventure, Fantasy, Family]"
2,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret]",Howard Deutch,"[fishing, best friend, duringcreditsstinger]","[Romance, Comedy]"
3,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devine]",Forest Whitaker,"[based on novel, interracial relationship, sin...","[Comedy, Drama, Romance]"
4,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short]",Charles Shyer,"[baby, midlife crisis, confidence]",[Comedy]
5,Heat,"[Al Pacino, Robert De Niro, Val Kilmer]",Michael Mann,"[robbery, detective, bank]","[Action, Crime, Drama]"
6,Sabrina,"[Harrison Ford, Julia Ormond, Greg Kinnear]",Sydney Pollack,"[paris, brother brother relationship, chauffeur]","[Comedy, Romance]"
7,Tom and Huck,"[Jonathan Taylor Thomas, Brad Renfro, Rachael ...",Peter Hewitt,[],"[Action, Adventure, Drama]"
8,Sudden Death,"[Jean-Claude Van Damme, Powers Boothe, Dorian ...",Peter Hyams,"[terrorist, hostage, explosive]","[Action, Adventure, Thriller]"
9,GoldenEye,"[Pierce Brosnan, Sean Bean, Izabella Scorupco]",Martin Campbell,"[cuba, falsely accused, secret identity]","[Adventure, Action, Thriller]"


In [13]:
def clean_data(x):
    """Lower-case ``x`` and remove its spaces.

    A list is cleaned element by element and returned as a new list; a
    single string is cleaned directly. Anything else (for example a missing
    director stored as NaN) yields an empty string.
    """
    def _squash(text):
        # Remove the spaces first, then lower-case the result
        return text.replace(" ", "").lower()

    if isinstance(x, list):
        return [_squash(item) for item in x]
    if isinstance(x, str):
        return _squash(x)
    return ''

In [14]:
# Normalise every text feature with clean_data (lower-case, spaces removed)
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    metadata[feature] = metadata[feature].map(clean_data)

In [15]:
def create_soup(x):
    """Combine one movie's text features into a single "soup" string.

    ``x`` is a row-like mapping with list-valued 'keywords', 'cast' and
    'genres' entries and a string 'director' entry. The result is the
    keywords, cast, director and genres, in that order, separated by single
    spaces. This is the text the recommendation engine is built on.
    """
    sections = (
        ' '.join(x['keywords']),
        ' '.join(x['cast']),
        x['director'],
        ' '.join(x['genres']),
    )
    return ' '.join(sections)

In [16]:
# Build the 'soup' column by running create_soup on every row of the frame
metadata['soup'] = metadata.apply(lambda row: create_soup(row), axis=1)

In [17]:
# Print the soup text stored under index label 0 of the metadata frame
print(metadata.loc[0, 'soup'])

jealousy toy boy tomhanks timallen donrickles johnlasseter animation comedy family


In [18]:
# Limit the data for memory size
# metadata = metadata[:20000]

Note: There are different ways to convert the soup texts to vectors. I will show how to use both the CountVectorizer and the TF-IDF Vectorizer.

### Count Vectorizer

In [19]:
# A vectorizer converts each text into a vector of numbers, in preparation for
# machine learning or other NLP processing techniques.

# stop_words='english' tells the vectorizer to ignore common English stop words.
count = CountVectorizer(stop_words='english')

# Learn the vocabulary from the soup texts and encode every soup as one row of counts.
count_matrix = count.fit_transform(metadata['soup'])

In [None]:
# Cosine similarity measures how alike two vectors are. With a single
# argument, cosine_similarity compares every row of the matrix with every
# other row of the same matrix.
# NOTE(review): the result is a dense n x n array; for the 46628 rows shown
# earlier that is roughly 17 GB (assuming float64) - check available memory.
cosine_sim1 = cosine_similarity(count_matrix)

### TF-IDF Vectorizer 

We will use the "soup" feature to find similarities between the movies, using sklearn's "Term Frequency–Inverse Document Frequency" (TF-IDF) Vectorizer.

#### How it works: 

The TF-IDF Vectorizer is considered one of the most effective vectorization algorithms. It transforms text into numeric vectors that can be used to train machine learning algorithms.

It weighs the number of times a word appears in a text document against the number of documents the word appears in.

It is split into two parts:

1. Term Frequency (TF): how often the term appears in a document. It is calculated by dividing the number of occurrences of the term in a text by the number of words in that text.
    
    
2. Inverse Document Frequency (IDF): how rare the term is across all documents. It is calculated as the log of the total number of texts divided by the number of texts containing the term.

#### Mathematical Formula:

$ \text{TF-IDF}(t, d) = \text{TF}(t, d) \times \text{IDF}(t) $

where,

$TF(t,d) = \sum \limits _{x \in d} fr (x,t) $

It returns how many times the term is present in the text.

and,

$ \text{IDF}(t) = \log \frac{|D|}{1 + |\{d : t \in d\}|} $

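Here $|D|$ is the total number of texts and $|\{d : t \in d\}|$ is the number of texts where the term appears, so the value is large for terms found in few texts and small for terms found in many.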

#### Result:

Sklearn's TF-IDF vectorizer, fitted on a text dataset, returns a sparse matrix in which each row is the TF-IDF vector of one text in the dataset, i.e. the product of tf and idf for every term.

In [19]:
# Define a TF-IDF vectorizer object that ignores English stop words such as 'the' and 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix by fitting the vectorizer on the soup texts and transforming them
tfidf_matrix = tfidf.fit_transform(metadata['soup'])

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(46628, 73881)

In [20]:
# Cosine similarity measures how alike two vectors are. With a single
# argument, cosine_similarity compares every row of the TF-IDF matrix with
# every other row of the same matrix.
# NOTE(review): the result is a dense n x n array; for the 46628 rows
# reported above that is roughly 17 GB (assuming float64) - check memory.
cosine_sim2 = cosine_similarity(tfidf_matrix)

In [21]:
# Give the frame a fresh 0..n-1 index so that index labels equal row
# positions. drop=True discards the old index instead of inserting it as a
# new column, so re-running this cell does not keep adding columns.
metadata = metadata.reset_index(drop=True)

# Reverse mapping: movie title -> row position in metadata.
# Titles are used as-is, so a title that occurs more than once maps to
# several positions.
indices = pd.Series(metadata.index, index=metadata['title'])

In [22]:
def get_recommendations(title, cosine_sim=cosine_sim2, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to ``cosine_sim2`` (derived from the
        TF-IDF matrix).
    top_n : int, optional
        Number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` movies, most similar first. The queried
        row itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions rather
    # than a single one; use the first match so the lookup below stays 1-D.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every movie position with its similarity to the queried movie,
    # then order the pairs from most to least similar
    sim_scores = sorted(
        enumerate(cosine_sim[idx]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # another movie with an identical score could otherwise take its place
    # and leave the queried movie in the results.
    movie_indices = [i for i, _ in sim_scores if i != idx][:top_n]

    # Return the titles of the most similar movies
    return metadata['title'].iloc[movie_indices]

In [23]:
# Movies most similar to 'Jumanji' according to the TF-IDF based similarity matrix
get_recommendations('Jumanji', cosine_sim=cosine_sim2)

14455                       Where the Wild Things Are
26717                                  Mostly Ghostly
30651    Mostly Ghostly: Have You Met My Ghoulfriend?
41598       Mostly Ghostly 3: One Night in Doom House
42041                                  You Are Umasou
24871               Tinker Bell and the Lost Treasure
30667                 Zenon: Girl of the 21st Century
41195       Middle School: The Worst Years of My Life
27913                            Karlsson on the Roof
18858                                       The Lorax
Name: title, dtype: object