# Content-Based Filtering Movie Recommender System

This notebook was made to accomplish my task of Recommender System. Feel free to use it for education purposes. Have a great day!

# Read the data

In [1]:
pip install rake-nltk

Collecting rake-nltk
  Downloading https://files.pythonhosted.org/packages/8e/c4/b4ff57e541ac5624ad4b20b89c2bafd4e98f29fd83139f3a81858bdb3815/rake_nltk-1.0.4.tar.gz
Building wheels for collected packages: rake-nltk
  Building wheel for rake-nltk (setup.py) ... [?25l[?25hdone
  Created wheel for rake-nltk: filename=rake_nltk-1.0.4-py2.py3-none-any.whl size=7819 sha256=c66c31aa89e9aff4ea3ee770de7ad2bf94f794858e5aa849d7aabb8e2e399a49
  Stored in directory: /root/.cache/pip/wheels/ef/92/fc/271b3709e71a96ffe934b27818946b795ac6b9b8ff8682483f
Successfully built rake-nltk
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.4


In [2]:
import pandas as pd
import numpy as np

import math
import nltk
import operator
import collections
from collections import Counter

from tqdm import tqdm
from rake_nltk import Rake


df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')

df = df[['Title','Genre','Director','Actors','Plot']]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


# Data Pre-processing

In [3]:
df.columns = df.columns.str.lower()
df['actors'] = df['actors'].str.replace(' ', '').str.lower().str.split(',').str[:3].apply(' '.join)
df['director'] = df['director'].str.replace(' ', '').str.lower()
df['genre'] = df['genre'].str.lower().str.replace(',', ' ')
df['genre'] = df['genre'].str.replace(' ', '')

df['plot'] = df['plot'].str.replace("\r", "")
df['plot'] = df['plot'].str.replace("\n", "")
df['plot'] = df['plot'].replace(" ", "")
df['plot'] = df['plot'].str.replace('"', '')

punctuation_signs = list("?:!.,;")
df['plot'] = df['plot']

for punct_sign in punctuation_signs:
    df['plot'] = df['plot'].str.replace(punct_sign, '')

# Remove possessive pronouns
df['plot'] = df['plot'].str.replace("'s", "")

df['plot'] = [item.lower() for item in df['plot']]
# df.set_index('title', inplace = True)
df.head()

Unnamed: 0,title,genre,director,actors,plot
0,The Shawshank Redemption,crimedrama,frankdarabont,timrobbins morganfreeman bobgunton,two imprisoned men bond over a number of years...
1,The Godfather,crimedrama,francisfordcoppola,marlonbrando alpacino jamescaan,the aging patriarch of an organized crime dyna...
2,The Godfather: Part II,crimedrama,francisfordcoppola,alpacino robertduvall dianekeaton,the early life and career of vito corleone in ...
3,The Dark Knight,actioncrimedrama,christophernolan,christianbale heathledger aaroneckhart,when the menace known as the joker emerges fro...
4,12 Angry Men,crimedrama,sidneylumet,martinbalsam johnfiedler leej.cobb,a jury holdout attempts to prevent a miscarria...


In [4]:
def extract_keywords(input_str):
    r = Rake()
    r.extract_keywords_from_text(input_str.lower())
    key_words_dict_scores = r.get_word_degrees()
    sorted_key_words_dict_scores = sorted(key_words_dict_scores.items(), key=operator.itemgetter(1), reverse=True)
    sorted_dict = collections.OrderedDict(sorted_key_words_dict_scores)
    return sorted_dict, list(sorted_dict.keys())[:round(len(sorted_dict.keys())/2)]

df['key_words'] = df['plot'].apply(lambda x: extract_keywords(x)[1]).apply(' '.join)
df.drop(columns=['plot'], inplace=True)
df.set_index('title', inplace = True)
df.head()

Unnamed: 0_level_0,genre,director,actors,key_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,crimedrama,frankdarabont,timrobbins morganfreeman bobgunton,two imprisoned men bond years finding
The Godfather,crimedrama,francisfordcoppola,marlonbrando alpacino jamescaan,organized crime dynasty transfers control aging
The Godfather: Part II,crimedrama,francisfordcoppola,alpacino robertduvall dianekeaton,son michael expands 1920s new york family crime
The Dark Knight,actioncrimedrama,christophernolan,christianbale heathledger aaroneckhart,dark knight must accept one greatest psycholog...
12 Angry Men,crimedrama,sidneylumet,martinbalsam johnfiedler leej.cobb,jury holdout attempts miscarriage colleagues


In [5]:
# Create dataframe of bag of contents
df['bag_of_contents'] = df['genre']+' '+df['director']+' '+df['actors']+' '+df['key_words']
corpus = df[['bag_of_contents']]
corpus.head()

Unnamed: 0_level_0,bag_of_contents
title,Unnamed: 1_level_1
The Shawshank Redemption,crimedrama frankdarabont timrobbins morganfree...
The Godfather,crimedrama francisfordcoppola marlonbrando alp...
The Godfather: Part II,crimedrama francisfordcoppola alpacino robertd...
The Dark Knight,actioncrimedrama christophernolan christianbal...
12 Angry Men,crimedrama sidneylumet martinbalsam johnfiedle...


# *Count of Words and Bag of Words*
Count the amount of words appear in the corpus and extract each of them

In [6]:
get_sentences = []
get_words = []
for i in range(len(corpus['bag_of_contents'])):
    get_sentences.append(corpus['bag_of_contents'][i].split())
    
for x in range(len(get_sentences)):
    for y in range(len(get_sentences[x])):
        get_words.append(get_sentences[x][y])

In [7]:
count = Counter(get_words)
count_of_words = dict(count)
count_of_words

{'crimedrama': 14,
 'frankdarabont': 2,
 'timrobbins': 1,
 'morganfreeman': 4,
 'bobgunton': 1,
 'two': 15,
 'imprisoned': 1,
 'men': 4,
 'bond': 2,
 'years': 7,
 'finding': 1,
 'francisfordcoppola': 3,
 'marlonbrando': 4,
 'alpacino': 4,
 'jamescaan': 1,
 'organized': 1,
 'crime': 7,
 'dynasty': 1,
 'transfers': 1,
 'control': 2,
 'aging': 3,
 'robertduvall': 2,
 'dianekeaton': 2,
 'son': 6,
 'michael': 1,
 'expands': 1,
 '1920s': 1,
 'new': 13,
 'york': 4,
 'family': 5,
 'actioncrimedrama': 2,
 'christophernolan': 7,
 'christianbale': 4,
 'heathledger': 1,
 'aaroneckhart': 1,
 'dark': 3,
 'knight': 1,
 'must': 13,
 'accept': 2,
 'one': 9,
 'greatest': 1,
 'psychological': 1,
 'wreaks': 1,
 'havoc': 1,
 'mysterious': 5,
 'past': 2,
 'physical': 1,
 'sidneylumet': 3,
 'martinbalsam': 1,
 'johnfiedler': 1,
 'leej.cobb': 3,
 'jury': 1,
 'holdout': 1,
 'attempts': 3,
 'miscarriage': 1,
 'colleagues': 1,
 'biographydramahistory': 7,
 'stevenspielberg': 7,
 'liamneeson': 2,
 'benkingsley': 

In [8]:
bag_of_words = []
for key in count_of_words:
    bag_of_words.append(key)
bag_of_words[:30]

['crimedrama',
 'frankdarabont',
 'timrobbins',
 'morganfreeman',
 'bobgunton',
 'two',
 'imprisoned',
 'men',
 'bond',
 'years',
 'finding',
 'francisfordcoppola',
 'marlonbrando',
 'alpacino',
 'jamescaan',
 'organized',
 'crime',
 'dynasty',
 'transfers',
 'control',
 'aging',
 'robertduvall',
 'dianekeaton',
 'son',
 'michael',
 'expands',
 '1920s',
 'new',
 'york',
 'family']

## Term Frequency (TF)<br>

\begin{equation}
\Large tf(t,d) = \frac{f_{t,d}}{\sum\limits_{t' \in d} {f_{t',d}}}
\end{equation}<br>

where: <br/>
$f_{t,d}$ : the frequency of the term (t) appears in the document (d)<br>
$\sum\limits_{t' \in d} {f_{t',d}} $ : number of terms (t) in the document 

Source: <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf">https://en.wikipedia.org/wiki/Tf%E2%80%93idf</a>

In [9]:
def tf(word, docs):
    result = {}
    for token in tqdm(word):
        sent_tf_vector = []
        for document in docs:
            doc_freq = 0
            for word in nltk.word_tokenize(document):
                if token == word:
                      doc_freq += 1
            word_tf = doc_freq/len(nltk.word_tokenize(document))
            sent_tf_vector.append(word_tf)
        result[token] = sent_tf_vector
    return result

In [10]:
tf_result = tf(bag_of_words, corpus['bag_of_contents'])
print("\n\nTF of the word 'imprisoned' in the first of the document: {}\n".format(tf_result['imprisoned'][0]))
print("TF of the word 'imprisoned' in the whole document:\n{}".format(tf_result))

100%|██████████| 2086/2086 [02:43<00:00, 12.78it/s]




TF of the word 'imprisoned' in the first of the document: 0.09090909090909091

TF of the word 'imprisoned' in the whole document:
{'crimedrama': [0.09090909090909091, 0.09090909090909091, 0.07692307692307693, 0.0, 0.1, 0.0, 0.0, 0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06666666666666667, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.09090909090909091, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06666666666666667, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08333333333333333, 0.0, 0.0, 0.058823529411764705, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07692307692307693, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.125, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

### *Inverse Document Frequency (IDF)*<br>

<img style="margin-bottom: 15px" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/ac67bc0f76b5b8e31e842d6b7d28f8949dab7937"><br>
where: <br/>
N: the total number of documents in the corpus<br>
$ |\{d \in D: t \in d\}| $: number of documents (d) where the term (t) appears.<br>

Sumber: <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf">https://en.wikipedia.org/wiki/Tf%E2%80%93idf</a>

In [11]:
def idf(df, word):
    result = {}
    for token in tqdm(word):
        doc_containing_word = 0
        for document in df:
            if token in nltk.word_tokenize(document):
                doc_containing_word += 1
    # return doc_containing_word
        result[token] = np.log(len(df)/(1 + doc_containing_word)) + 1
    return result

In [12]:
idf_result = idf(corpus['bag_of_contents'], count_of_words)
print("\nIDF of the word 'imprisoned' in the document: {}\n".format(idf_result['crimedrama']))
print("IDF of every words in the document:\n{}".format(idf_result))

100%|██████████| 2086/2086 [01:22<00:00, 25.13it/s]


IDF of the word 'imprisoned' in the document: 3.8134107167600364

IDF of every words in the document:
{'crimedrama': 3.8134107167600364, 'frankdarabont': 5.422848629194137, 'timrobbins': 5.8283137373023015, 'morganfreeman': 4.912023005428146, 'bobgunton': 5.8283137373023015, 'two': 3.7488721956224653, 'imprisoned': 5.8283137373023015, 'men': 4.912023005428146, 'bond': 5.422848629194137, 'years': 4.442019376182411, 'finding': 5.8283137373023015, 'francisfordcoppola': 5.135166556742356, 'marlonbrando': 4.912023005428146, 'alpacino': 4.912023005428146, 'jamescaan': 5.8283137373023015, 'organized': 5.8283137373023015, 'crime': 4.442019376182411, 'dynasty': 5.8283137373023015, 'transfers': 5.8283137373023015, 'control': 5.422848629194137, 'aging': 5.135166556742356, 'robertduvall': 5.422848629194137, 'dianekeaton': 5.422848629194137, 'son': 4.575550768806933, 'michael': 5.8283137373023015, 'expands': 5.8283137373023015, '1920s': 5.8283137373023015, 'new': 3.882403588246988, 'york': 4.91202




## TF-IDF<br>

\begin{equation}
{\Large \displaystyle \mathrm {tfidf} (t,d,D)=\mathrm {tf} (t,d)\cdot \mathrm {idf} (t,D)}
\end{equation}<br>

where: <br/>
$tf_{t,d}$: Term Frequency (TF) <br>
$idf_{t,D}$: Inverse Document Frequency (IDF)

Sumber: <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf">https://en.wikipedia.org/wiki/Tf%E2%80%93idf</a>

In [15]:
def tfidf(tf,idf):
    result = []
    for token in tqdm(tf.keys()):
        tfidf_sentences = []
        for tf_sentence in tf[token]:
            tf_idf_score = tf_sentence * idf[token]
            tfidf_sentences.append(tf_idf_score)
        result.append(tfidf_sentences)
    return result

In [20]:
tfidf_result = tfidf(tf_result,idf_result)
tfidf_result

100%|██████████| 2086/2086 [00:00<00:00, 19654.15it/s]


[[0.3466737015236397,
  0.3466737015236397,
  0.2933392859046182,
  0.0,
  0.3813410716760037,
  0.0,
  0.0,
  0.2933392859046182,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.2542273811173358,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3466737015236397,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.2542273811173358,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3177842263966697,
  0.0,
  0.0,
  0.22431827745647273,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.3813410716760037,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.2933392859046182,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,


### Vector Representation

In [19]:
def build_vec_representation(x):
    result = np.asarray(x)
    return result

vector = build_vec_representation(tfidf_result)
vector

array([[0.3466737 , 0.3466737 , 0.29333929, ..., 0.        , 0.        ,
        0.        ],
       [0.49298624, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.5298467 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.44833183],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.44833183],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.44833183]])

## Calculate the similarity

### Cosine Similarity<br>
<img width="400" height="200" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1d94e5903f7936d3c131e040ef2c51b473dd071d">

di mana:<br>
$A_i$ : Komponen vektor A<br>
$B_i$ : Komponen vektor B

In [21]:
from numpy import dot
from numpy.linalg import norm

In [23]:
def cosine_similarity(x):
    dot_result = dot(x.T, x)
    norm_result = (x * x).sum(0, keepdims=True) ** .5
    result = dot_result / norm_result / norm_result.T

    return result

In [24]:
cosine_sim = cosine_similarity(vector)
cosine_sim

array([[1.        , 0.04914897, 0.04704767, ..., 0.        , 0.        ,
        0.        ],
       [0.04914897, 1.        , 0.26999251, ..., 0.        , 0.        ,
        0.        ],
       [0.04704767, 0.26999251, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

## List of Movie Titles

In [25]:
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
np.array(corpus['bag_of_contents'].index)

array(['The Shawshank Redemption', 'The Godfather',
       'The Godfather: Part II', 'The Dark Knight', '12 Angry Men',
       "Schindler's List",
       'The Lord of the Rings: The Return of the King', 'Pulp Fiction',
       'Fight Club', 'The Lord of the Rings: The Fellowship of the Ring',
       'Forrest Gump', 'Star Wars: Episode V - The Empire Strikes Back',
       'Inception', 'The Lord of the Rings: The Two Towers',
       "One Flew Over the Cuckoo's Nest", 'Goodfellas', 'The Matrix',
       'Star Wars: Episode IV - A New Hope', 'Se7en',
       "It's a Wonderful Life", 'The Silence of the Lambs',
       'The Usual Suspects', 'Léon: The Professional',
       'Saving Private Ryan', 'City Lights', 'Interstellar',
       'American History X', 'Modern Times', 'Casablanca',
       'The Green Mile', 'Psycho', 'Raiders of the Lost Ark',
       'The Pianist', 'Rear Window', 'The Departed', 'Whiplash',
       'Terminator 2: Judgment Day', 'Back to the Future', 'Gladiator',
       'The Lio

## Recommend the film

In [27]:
def recommend_film(film, cosine_matrix, k):
    if film in corpus['bag_of_contents'].index:
        idx = np.where(corpus['bag_of_contents'].index == film)[0][0]

     # creating a Series with the similarity scores in descending order
        top_k_list = pd.Series(cosine_matrix[idx]).sort_values(ascending = False)[1:k+1].index
        
        return list(corpus['bag_of_contents'].iloc[top_k_list].index)
    else:
        print('Movie does not exist.')

# Comparison to Sci-kit Learn Library

In [28]:
recommend_film("The Maltese Falcon", cosine_matrix = cosine_sim, k=8)

['Chinatown',
 'The Big Sleep',
 'The Treasure of the Sierra Madre',
 'Guardians of the Galaxy',
 'Scarface',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Laura',
 'The Best Years of Our Lives']

In [29]:
recommend_film("Schindler's List", cosine_matrix = cosine_sim, k=8)

['Patton',
 'The Great Escape',
 'The Best Years of Our Lives',
 'The Imitation Game',
 'Gandhi',
 'The Grand Budapest Hotel',
 'The Pianist',
 'Shutter Island']

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
cos_matrix = cosine_similarity(vector.T)

In [32]:
recommend_film("The Maltese Falcon", cosine_matrix = cos_matrix, k=8)

['Chinatown',
 'The Big Sleep',
 'The Treasure of the Sierra Madre',
 'Guardians of the Galaxy',
 'Scarface',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'Laura',
 'The Best Years of Our Lives']

In [33]:
recommend_film("Schindler's List", cosine_matrix = cos_matrix, k=8)

['Patton',
 'The Great Escape',
 'The Best Years of Our Lives',
 'The Imitation Game',
 'Gandhi',
 'The Grand Budapest Hotel',
 'The Pianist',
 'Shutter Island']