# Welcome to "ML4Recsys : Intro to content-based filtering" Notebook

In this notebook we will recommend a list of films based on one film that the user has already watched. The steps are:

1. Read the data
2. Make the vector representation
3. Calculate the similarity between films based on the vector representation

# Read the data

In [None]:
pip install rake-nltk

Collecting rake-nltk
  Downloading https://files.pythonhosted.org/packages/8e/c4/b4ff57e541ac5624ad4b20b89c2bafd4e98f29fd83139f3a81858bdb3815/rake_nltk-1.0.4.tar.gz
Building wheels for collected packages: rake-nltk
  Building wheel for rake-nltk (setup.py) ... [?25l[?25hdone
  Created wheel for rake-nltk: filename=rake_nltk-1.0.4-py2.py3-none-any.whl size=7819 sha256=cc64eab60d5974837e7a3834c909f3712dd5f60a9a25528c70652f1d99ad892a
  Stored in directory: /root/.cache/pip/wheels/ef/92/fc/271b3709e71a96ffe934b27818946b795ac6b9b8ff8682483f
Successfully built rake-nltk
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.4


In [None]:
import pandas as pd
import numpy as np

import math
import nltk
import operator
import collections
from collections import Counter

from tqdm import tqdm
from rake_nltk import Rake


# Download the film table and keep only the descriptive columns that the
# following cells turn into each film's content profile.
feature_columns = ['Title', 'Genre', 'Director', 'Actors', 'Plot']
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7').loc[:, feature_columns]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


# Data Pre-processing

In [None]:
# Lower-case the column names; every later cell refers to them in lowercase.
df.columns = df.columns.str.lower()

# Fuse each person's name into a single token ("Tim Robbins" -> "timrobbins")
# and keep only the first three listed actors.
df['actors'] = (
    df['actors']
    .str.replace(' ', '', regex=False)
    .str.lower()
    .str.split(',')
    .str[:3]
    .apply(' '.join)
)
df['director'] = df['director'].str.replace(' ', '', regex=False).str.lower()

# NOTE(review): removing every space after turning commas into spaces fuses a
# film's genres into ONE token ("Crime, Drama" -> "crimedrama"), so two films
# only share a genre token when their full genre lists match. Behaviour is
# kept as-is here; confirm that this is intended.
df['genre'] = df['genre'].str.lower().str.replace(',', ' ', regex=False)
df['genre'] = df['genre'].str.replace(' ', '', regex=False)

# Strip line breaks and double quotes from the plot text.
df['plot'] = df['plot'].str.replace("\r", "", regex=False)
df['plot'] = df['plot'].str.replace("\n", "", regex=False)
df['plot'] = df['plot'].str.replace('"', '', regex=False)

# Remove punctuation. regex=False is explicit because "?" and "." are regex
# metacharacters and the default of `regex` differs between pandas versions.
punctuation_signs = list("?:!.,;")
for punct_sign in punctuation_signs:
    df['plot'] = df['plot'].str.replace(punct_sign, '', regex=False)

# Remove the possessive "'s" suffix.
df['plot'] = df['plot'].str.replace("'s", "", regex=False)

df['plot'] = df['plot'].str.lower()
df.head()

Unnamed: 0,title,genre,director,actors,plot
0,The Shawshank Redemption,crimedrama,frankdarabont,timrobbins morganfreeman bobgunton,two imprisoned men bond over a number of years...
1,The Godfather,crimedrama,francisfordcoppola,marlonbrando alpacino jamescaan,the aging patriarch of an organized crime dyna...
2,The Godfather: Part II,crimedrama,francisfordcoppola,alpacino robertduvall dianekeaton,the early life and career of vito corleone in ...
3,The Dark Knight,actioncrimedrama,christophernolan,christianbale heathledger aaroneckhart,when the menace known as the joker emerges fro...
4,12 Angry Men,crimedrama,sidneylumet,martinbalsam johnfiedler leej.cobb,a jury holdout attempts to prevent a miscarria...


In [None]:
# pip install rake_nltk

In [None]:
def extract_keywords(input_str):
    """Rank the words of a text by the word degree that RAKE reports.

    Args:
        input_str: text to analyse; it is lower-cased before extraction.

    Returns:
        A tuple ``(ranked, top_words)``: ``ranked`` is an OrderedDict mapping
        each word to its degree, highest degree first, and ``top_words`` is a
        list with the first half of those words (the count is
        ``round(len(ranked) / 2)``).
    """
    rake = Rake()
    rake.extract_keywords_from_text(input_str.lower())
    ranked = collections.OrderedDict(
        sorted(rake.get_word_degrees().items(), key=operator.itemgetter(1), reverse=True)
    )
    words = list(ranked)
    return ranked, words[:round(len(words) / 2)]

# Store the top-ranked keywords of each plot as one space-separated string.
df['key_words'] = df['plot'].apply(lambda plot: ' '.join(extract_keywords(plot)[1]))
# The raw plot is no longer needed; index the frame by film title from here on.
df = df.drop(columns=['plot']).set_index('title')
df.head()

Unnamed: 0_level_0,genre,director,actors,key_words
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Shawshank Redemption,crimedrama,frankdarabont,timrobbins morganfreeman bobgunton,two imprisoned men bond years finding
The Godfather,crimedrama,francisfordcoppola,marlonbrando alpacino jamescaan,organized crime dynasty transfers control clan...
The Godfather: Part II,crimedrama,francisfordcoppola,alpacino robertduvall dianekeaton,family crime syndicate son michael expands 192...
The Dark Knight,actioncrimedrama,christophernolan,christianbale heathledger aaroneckhart,dark knight must accept one mysterious past wr...
12 Angry Men,crimedrama,sidneylumet,martinbalsam johnfiedler leej.cobb,jury holdout attempts colleagues forcing


In [None]:
# Build the "bag of contents": one space-separated string per film that joins
# its genre, director, actors and plot keywords.
df['bag_of_contents'] = df['genre']+' '+df['director']+' '+df['actors']+' '+df['key_words']
# Keep only that column; the following cells read the text from `corpus`.
corpus = df[['bag_of_contents']]
corpus.head()

Unnamed: 0_level_0,bag_of_contents
title,Unnamed: 1_level_1
The Shawshank Redemption,crimedrama frankdarabont timrobbins morganfree...
The Godfather,crimedrama francisfordcoppola marlonbrando alp...
The Godfather: Part II,crimedrama francisfordcoppola alpacino robertd...
The Dark Knight,actioncrimedrama christophernolan christianbal...
12 Angry Men,crimedrama sidneylumet martinbalsam johnfiedle...


# Bag of Sentences and Bag of Words
Mengekstrak kalimat dan kata

In [None]:
# One token list per film. Iterate over the values directly: `corpus` is
# indexed by title, so looking rows up with an integer key would rely on a
# deprecated positional fallback of Series indexing.
get_sentences = [contents.split() for contents in corpus['bag_of_contents']]

# Every token of every film, flattened into a single list (duplicates kept).
get_words = [token for sentence in get_sentences for token in sentence]

In [None]:
get_sentences[0]

['crimedrama',
 'frankdarabont',
 'timrobbins',
 'morganfreeman',
 'bobgunton',
 'two',
 'imprisoned',
 'men',
 'bond',
 'years',
 'finding']

In [None]:
get_words[0]

'crimedrama'

# *Count of Words and Bag of Words*
Menghitung total kata dalam corpus dan mengekstraknya per kata

## Menghitung jumlah kata dalam dokumen

In [None]:
# Count how many times each token occurs across the whole corpus.
count = Counter(get_words)
# Plain-dict copy of the counts; its keys serve as the vocabulary in later cells.
count_of_words = dict(count)
count_of_words

{'crimedrama': 14,
 'frankdarabont': 2,
 'timrobbins': 1,
 'morganfreeman': 4,
 'bobgunton': 1,
 'two': 15,
 'imprisoned': 1,
 'men': 4,
 'bond': 2,
 'years': 7,
 'finding': 1,
 'francisfordcoppola': 3,
 'marlonbrando': 4,
 'alpacino': 4,
 'jamescaan': 1,
 'organized': 1,
 'crime': 7,
 'dynasty': 1,
 'transfers': 1,
 'control': 2,
 'clandestine': 1,
 'robertduvall': 2,
 'dianekeaton': 2,
 'family': 6,
 'syndicate': 1,
 'son': 5,
 'michael': 1,
 'expands': 1,
 '1920s': 1,
 'new': 13,
 'actioncrimedrama': 2,
 'christophernolan': 7,
 'christianbale': 4,
 'heathledger': 1,
 'aaroneckhart': 1,
 'dark': 4,
 'knight': 2,
 'must': 11,
 'accept': 1,
 'one': 8,
 'mysterious': 5,
 'past': 2,
 'wreaks': 1,
 'havoc': 1,
 'physical': 1,
 'tests': 1,
 'joker': 1,
 'sidneylumet': 3,
 'martinbalsam': 1,
 'johnfiedler': 1,
 'leej.cobb': 3,
 'jury': 1,
 'holdout': 1,
 'attempts': 3,
 'colleagues': 1,
 'forcing': 1,
 'biographydramahistory': 7,
 'stevenspielberg': 7,
 'liamneeson': 2,
 'benkingsley': 3,
 

In [None]:
# The vocabulary: every distinct token, in the key order of count_of_words.
bag_of_words = list(count_of_words)
bag_of_words[:30]

['crimedrama',
 'frankdarabont',
 'timrobbins',
 'morganfreeman',
 'bobgunton',
 'two',
 'imprisoned',
 'men',
 'bond',
 'years',
 'finding',
 'francisfordcoppola',
 'marlonbrando',
 'alpacino',
 'jamescaan',
 'organized',
 'crime',
 'dynasty',
 'transfers',
 'control',
 'clandestine',
 'robertduvall',
 'dianekeaton',
 'family',
 'syndicate',
 'son',
 'michael',
 'expands',
 '1920s',
 'new']

## Term Frequency (TF)<br>

<div>
    <div class="eq" style="font-size: 40px;border: 1px solid black"><br>
    \begin{equation}
    \Large tf(t,d) = \frac{f_{t,d}}{\sum\limits_{t' \in d} {f_{t',d}}}
    \end{equation}<br>
    </div><br>
    <div style="font-size: 15px">
    di mana: <br/>
    $f_{t,d}$ : frekuensi term (t) muncul dalam dokumen (d)<br>
    $\sum\limits_{t' \in d} {f_{t',d}} $ : jumlah term (t) dalam dokumen (d)
    </div>
</div><br>
Sumber: <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf">https://en.wikipedia.org/wiki/Tf%E2%80%93idf</a>

In [None]:
def tf(word, docs):
    """Compute the term frequency of every vocabulary token in every document.

    Args:
        word: iterable of vocabulary tokens.
        docs: iterable of document strings; each one is tokenised with
            ``nltk.word_tokenize``.

    Returns:
        dict mapping each token to a list with one value per document (in the
        order of ``docs``): the number of occurrences of the token divided by
        the number of tokens in that document. A document without tokens
        contributes 0.0 instead of raising ZeroDivisionError.
    """
    # Tokenise and count each document once, instead of re-tokenising every
    # document for every vocabulary token.
    doc_stats = []
    for document in docs:
        doc_tokens = nltk.word_tokenize(document)
        doc_stats.append((Counter(doc_tokens), len(doc_tokens)))

    result = {}
    for token in tqdm(word):
        # Counter returns 0 for tokens that do not occur in the document.
        result[token] = [
            counts[token] / total if total else 0.0
            for counts, total in doc_stats
        ]
    return result

In [None]:
tf_result = tf(bag_of_words, corpus['bag_of_contents'])
print("TF kata 'imprisoned' pada dokumen pertama = {}\n".format(tf_result['imprisoned'][0]))
# Print only the 'imprisoned' row so the output matches its label instead of
# dumping the TF vectors of the whole vocabulary.
print("TF kata 'imprisoned' pada seluruh dokumen = {}".format(tf_result['imprisoned']))

In [None]:
tf_result['imprisoned'][0]

0.09090909090909091

### *Inverse Document Frequency (IDF)*<br>

<div>
    <div class="eq" style="border: 1px solid black"><br>
    <img style="margin-bottom: 15px" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/ac67bc0f76b5b8e31e842d6b7d28f8949dab7937"><br>
    </div><br>
    <div style="font-size: 15px">
    di mana: <br/>
    N: jumlah total dokumen dalam korpus<br>
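    $ |\{d \in D: t \in d\}| $: jumlah dokumen (d) di mana term (t) muncul<br>
    Catatan: implementasi di bawah memakai varian yang dihaluskan, $\log\frac{N}{1 + |\{d \in D: t \in d\}|} + 1$.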
    </div>
</div><br>
Sumber: <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf">https://en.wikipedia.org/wiki/Tf%E2%80%93idf</a>

In [None]:
def idf(df, word):
    """Compute a smoothed inverse document frequency for every token.

    Args:
        df: iterable of document strings (despite the name, this is the text
            column, not the notebook's DataFrame); each document is tokenised
            with ``nltk.word_tokenize``.
        word: iterable of vocabulary tokens.

    Returns:
        dict mapping each token to ``log(N / (1 + n_t)) + 1``, where ``N`` is
        the number of documents and ``n_t`` the number of documents that
        contain the token.
    """
    documents = list(df)
    n_documents = len(documents)

    # Count, once per document, which distinct tokens it contains; this avoids
    # re-tokenising the whole corpus for every vocabulary token.
    doc_frequency = Counter()
    for document in documents:
        doc_frequency.update(set(nltk.word_tokenize(document)))

    result = {}
    for token in tqdm(word):
        result[token] = np.log(n_documents / (1 + doc_frequency[token])) + 1
    return result

In [None]:
idf_result = idf(corpus['bag_of_contents'], count_of_words)
# Look up the same word that the label names ('imprisoned').
print("\nIDF kata 'imprisoned' pada dokumen = {}\n".format(idf_result['imprisoned']))
print("IDF seluruh kata pada seluruh dokumen = {}".format(idf_result))

100%|██████████| 2083/2083 [01:25<00:00, 24.25it/s]


IDF kata 'imprisoned' pada dokumen = 3.8134107167600364

IDF seluruh kata pada seluruh dokumen = {'crimedrama': 3.8134107167600364, 'frankdarabont': 5.422848629194137, 'timrobbins': 5.8283137373023015, 'morganfreeman': 4.912023005428146, 'bobgunton': 5.8283137373023015, 'two': 3.7488721956224653, 'imprisoned': 5.8283137373023015, 'men': 4.912023005428146, 'bond': 5.422848629194137, 'years': 4.442019376182411, 'finding': 5.8283137373023015, 'francisfordcoppola': 5.135166556742356, 'marlonbrando': 4.912023005428146, 'alpacino': 4.912023005428146, 'jamescaan': 5.8283137373023015, 'organized': 5.8283137373023015, 'crime': 4.442019376182411, 'dynasty': 5.8283137373023015, 'transfers': 5.8283137373023015, 'control': 5.422848629194137, 'clandestine': 5.8283137373023015, 'robertduvall': 5.422848629194137, 'dianekeaton': 5.422848629194137, 'family': 4.575550768806933, 'syndicate': 5.8283137373023015, 'son': 4.7297014486341915, 'michael': 5.8283137373023015, 'expands': 5.8283137373023015, '1920




In [None]:
idf_result['crimedrama']

3.8134107167600364

## TF-IDF<br>

<div>
    <div class="eq" style="font-size: 25px;border: 1px solid black"><br>
    \begin{equation}
    {\Large \displaystyle \mathrm {tfidf} (t,d,D)=\mathrm {tf} (t,d)\cdot \mathrm {idf} (t,D)}
    \end{equation}<br>
    </div><br>
    <div style="font-size: 15px">
    di mana: <br/>
    $tf_{t,d}$: Hasil <i>Term Frequency (TF)</i> <br>
    $idf_{t,D}$: Hasil <i>Inverse Document Frequency (IDF)</i>
    </div>
</div><br>
Sumber: <a href="https://en.wikipedia.org/wiki/Tf%E2%80%93idf">https://en.wikipedia.org/wiki/Tf%E2%80%93idf</a>

In [None]:
def tfidf(tf,idf):
    """Multiply every term-frequency value by the token's IDF weight.

    Args:
        tf: dict mapping each token to its list of per-document TF values.
        idf: dict mapping each token to its IDF weight; it must contain every
            key of ``tf``.

    Returns:
        list of lists with one row per token (in the key order of ``tf``) and
        one TF-IDF value per document in each row.
    """
    result = []
    for token in tqdm(tf.keys()):
        weight = idf[token]
        # Plain multiplication: both operands are scalars, and this keeps the
        # function independent of the `dot` import that appears further down.
        result.append([tf_value * weight for tf_value in tf[token]])
    return result

In [None]:
 # Scalar product written as plain multiplication so the cell does not need `dot`, which is imported later.
 print("\nTF-IDF kata 'imprisoned' pada dokumen pertama = {}\n".format(tf_result['imprisoned'][0] * idf_result['imprisoned']))


TF-IDF kata 'imprisoned' pada dokumen pertama = 0.5298467033911183



In [None]:
tfidf_all_word = tfidf(tf_result,idf_result) # all words across all documents
tfidf_one_word = tf_result['imprisoned'][0]*idf_result['imprisoned'] # one word ('imprisoned') in the first document

print("\nTF-IDF kata 'imprisoned' pada dokumen pertama = {}\n".format(tfidf_one_word))
print("TF-IDF seluruh kata pada seluruh dokumen = {}".format(tfidf_all_word))

100%|██████████| 2083/2083 [00:01<00:00, 1601.91it/s]



TF-IDF kata 'imprisoned' pada dokumen pertama = 0.5298467033911183

TF-IDF seluruh kata pada seluruh dokumen = [[0.3466737015236397, 0.3466737015236397, 0.2933392859046182, 0.0, 0.3813410716760037, 0.0, 0.0, 0.2933392859046182, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2542273811173358, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3466737015236397, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2542273811173358, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3177842263966697, 0.0, 0.0, 0.22431827745647273, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3813410716760037, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2933392859046182, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.47667633959500455, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

### Vector Representation

In [None]:
def build_vec_representation(x):
    """Return ``x`` converted to a NumPy array with ``np.asarray``."""
    return np.asarray(x)

# `tfidf_all_word` is the TF-IDF result computed above (one row per vocabulary
# token, one column per film); the name `tfidf_result` was never defined.
vector = build_vec_representation(tfidf_all_word)
vector

array([[0.3466737 , 0.3466737 , 0.29333929, ..., 0.        , 0.        ,
        0.        ],
       [0.49298624, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.5298467 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.44833183],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.44833183],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.44833183]])

## Calculate the similarity

### Cosine Similarity<br>
<img width="400" height="200" src="https://wikimedia.org/api/rest_v1/media/math/render/svg/1d94e5903f7936d3c131e040ef2c51b473dd071d">

di mana:<br>
$A_i$ : Komponen vektor A<br>
$B_i$ : Komponen vektor B

In [None]:
from numpy import dot
from numpy.linalg import norm

In [None]:
def cosine_similarity(x):
    """Compute the cosine similarity between every pair of columns of ``x``.

    Args:
        x: 2-D NumPy array; each column is treated as one vector.

    Returns:
        Square array whose entry (i, j) is the dot product of columns i and j
        divided by the product of their Euclidean norms.
    """
    # Euclidean norm of each column, kept as a 1 x n row so it broadcasts.
    column_norms = (x * x).sum(0, keepdims=True) ** .5
    pairwise_dots = dot(x.T, x)
    return pairwise_dots / column_norms / column_norms.T

In [None]:
cosine_sim = cosine_similarity(vector)
cosine_sim

array([[1.        , 0.04853916, 0.04635736, 0.        , 0.04940328,
        0.        , 0.        , 0.16290492, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.10070803, 0.        , 0.        , 0.08038002, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.04803645, 0.        , 0.        , 0.09088663,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.05116523, 0.        , 0.        ,
        0.04677717, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.05401176, 0.        , 0.        , 0.09771671,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.04509146, 0.        ,
        0.        , 0.04021178, 0.        , 0.  

In [None]:
len(cosine_sim)

250

In [None]:
import sys
import numpy
# Raise NumPy's summarisation threshold so arrays are printed in full instead
# of being abbreviated; the option is global and stays in effect for later cells.
numpy.set_printoptions(threshold=sys.maxsize)
# Every film title, in the row order of `corpus`.
np.array(corpus['bag_of_contents'].index)

array(['The Shawshank Redemption', 'The Godfather',
       'The Godfather: Part II', 'The Dark Knight', '12 Angry Men',
       "Schindler's List",
       'The Lord of the Rings: The Return of the King', 'Pulp Fiction',
       'Fight Club', 'The Lord of the Rings: The Fellowship of the Ring',
       'Forrest Gump', 'Star Wars: Episode V - The Empire Strikes Back',
       'Inception', 'The Lord of the Rings: The Two Towers',
       "One Flew Over the Cuckoo's Nest", 'Goodfellas', 'The Matrix',
       'Star Wars: Episode IV - A New Hope', 'Se7en',
       "It's a Wonderful Life", 'The Silence of the Lambs',
       'The Usual Suspects', 'Léon: The Professional',
       'Saving Private Ryan', 'City Lights', 'Interstellar',
       'American History X', 'Modern Times', 'Casablanca',
       'The Green Mile', 'Psycho', 'Raiders of the Lost Ark',
       'The Pianist', 'Rear Window', 'The Departed', 'Whiplash',
       'Terminator 2: Judgment Day', 'Back to the Future', 'Gladiator',
       'The Lio

## Recommend the film

In [None]:
def recommend_film(film, cosine_matrix, k):
    """Return the titles of the ``k`` films most similar to ``film``.

    Args:
        film: title to look up in the index of the notebook-level ``corpus``.
        cosine_matrix: 2-D similarity matrix; row ``i`` is read as the
            similarities of the ``i``-th film of ``corpus`` to every film
            (assumes the matrix follows the row order of ``corpus``).
        k: number of recommendations wanted.

    Returns:
        list of at most ``k`` titles, most similar first. If ``film`` is not
        in ``corpus``, prints a message and returns None.
    """
    titles = corpus['bag_of_contents'].index
    if film not in titles:
        print('Movie does not exist!')
        return None

    idx = np.where(titles == film)[0][0]
    scores = pd.Series(cosine_matrix[idx])

    # Remove the query film by position instead of assuming it sorts first:
    # another film with an equal score (or a NaN self-similarity) could take
    # the first slot, and the query film would then be recommended to itself.
    # mergesort keeps the ordering of tied scores deterministic.
    ranked = scores.drop(idx).sort_values(ascending=False, kind='mergesort')
    top_k_positions = ranked.index[:k].to_numpy()

    return list(titles[top_k_positions])

# Perbandingan fungsi buatan sendiri dengan library sklearn

In [None]:
recommend_film("The Maltese Falcon", cosine_matrix = cosine_sim, k=8)

['Chinatown',
 'The Big Sleep',
 'The Treasure of the Sierra Madre',
 'Guardians of the Galaxy',
 'Scarface',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'The Best Years of Our Lives',
 'Casablanca']

In [None]:
recommend_film("Schindler's List", cosine_matrix = cosine_sim, k=8)

['The Great Escape',
 'The Best Years of Our Lives',
 'The Imitation Game',
 'Gandhi',
 'The Grand Budapest Hotel',
 'The Pianist',
 'Shutter Island',
 'Batman Begins']

In [None]:
# Import under an alias so the library function does not replace the
# hand-written cosine_similarity defined earlier in this notebook.
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity
# sklearn compares rows, so transpose `vector` to compare its columns with
# each other, as the hand-written function does.
cos_matrix = sk_cosine_similarity(vector.T)
cos_matrix

['The Godfather: Part II',
 'Apocalypse Now',
 'Scarface',
 'Goodfellas',
 'Guardians of the Galaxy',
 'On the Waterfront',
 'A Streetcar Named Desire',
 'Heat']

In [None]:
recommend_film("The Maltese Falcon", cosine_matrix = cos_matrix, k=8)

['Chinatown',
 'The Big Sleep',
 'The Treasure of the Sierra Madre',
 'Guardians of the Galaxy',
 'Scarface',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'The Best Years of Our Lives',
 'Casablanca']