## Content-based Movie Recommendation

In [111]:
import pandas as pd
from rake_nltk import Rake
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

### 1. Gather the data

In [123]:
# Columns needed to build the recommender; everything else in the CSV is discarded.
selected_columns = ['Title', 'Genre', 'Director', 'Actors', 'Plot']

# Download the dataset and keep only the selected columns, in this order.
df = pd.read_csv('https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7')[selected_columns]
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


In [124]:
# Extract the key words of every plot with RAKE and store them in 'Key_words'.
def _extract_key_words(plot):
    """Return the list of RAKE key words found in one plot summary.

    A fresh Rake instance is created per plot, as before; with no arguments
    it uses the English stopwords from NLTK and discards all punctuation
    characters.
    """
    rake = Rake()

    # extracting the words by passing the text
    rake.extract_keywords_from_text(plot)

    # dictionary with key words as keys and their scores as values;
    # only the words themselves are kept
    return list(rake.get_word_degrees().keys())


# Assign the whole column in one step. Writing to `row[...]` inside
# `df.iterrows()` only mutates a per-row object that pandas does not guarantee
# to be a view of the DataFrame, so those writes can be silently lost.
df['Key_words'] = df['Plot'].map(_extract_key_words)

# dropping the Plot column
df.drop(columns = ['Plot'], inplace = True)
df.head()

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...","[years, eventual, redemption, finding, solace,..."
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...","[clandestine, empire, reluctant, son, aging, p..."
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...","[1920s, new, york, grip, michael, vito, corleo..."
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...","[wreaks, havoc, gotham, menace, known, dark, k..."
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....","[miscarriage, justice, colleagues, forcing, ev..."


### 2. Data Cleaning

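Names are lowercased and their internal spaces removed, so that each person (or genre) becomes a single token. A similarity is then detected only if the person (or genre) is exactly the same.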

In [125]:
# Lowercase the actor names and remove the blanks inside them so that each
# actor becomes a single token; keep only the first three actors.
df['Actors'] = df['Actors'].map(lambda x : x.lower().replace(' ', '').split(',')[:3])

# Lowercase the genres (first three at most) and strip the blank that follows
# each comma, so that 'Crime, Drama' gives ['crime', 'drama'] instead of
# ['crime', ' drama'].
df['Genre'] = df['Genre'].map(lambda x : [genre.strip() for genre in x.lower().split(',')[:3]])

# Lowercase the director name and remove its blanks (single token per director).
df['Director'] = df['Director'].map(lambda x : x.lower().replace(' ', ''))

In [126]:
# Preview the first five rows after cleaning.
df.head(5)

Unnamed: 0,Title,Genre,Director,Actors,Key_words
0,The Shawshank Redemption,"[crime, drama]",frankdarabont,"[timrobbins, morganfreeman, bobgunton]","[years, eventual, redemption, finding, solace,..."
1,The Godfather,"[crime, drama]",francisfordcoppola,"[marlonbrando, alpacino, jamescaan]","[clandestine, empire, reluctant, son, aging, p..."
2,The Godfather: Part II,"[crime, drama]",francisfordcoppola,"[alpacino, robertduvall, dianekeaton]","[1920s, new, york, grip, michael, vito, corleo..."
3,The Dark Knight,"[action, crime, drama]",christophernolan,"[christianbale, heathledger, aaroneckhart]","[wreaks, havoc, gotham, menace, known, dark, k..."
4,12 Angry Men,"[crime, drama]",sidneylumet,"[martinbalsam, johnfiedler, leej.cobb]","[miscarriage, justice, colleagues, forcing, ev..."


### 3. Modeling

Use `CountVectorizer` rather than `TfidfVectorizer`, because in this application every word is important for detecting similarity.

In [128]:
# Every column except the title (and the output column itself, if this cell
# is re-run before the drop below) contributes words to the bag.
feature_columns = [col for col in df.columns if col not in ('Title', 'bag_of_words')]


def _join_words(values):
    """Join one movie's feature values into a single space-separated string.

    List-valued cells are joined word by word; plain strings (Director has
    only one element) are used as they are.
    """
    return ' '.join(
        value if isinstance(value, str) else ' '.join(value)
        for value in values
    )


# Rows are only read here and the column is assigned in one step. Writing to
# `row[...]` inside `df.iterrows()` mutates a per-row object that pandas does
# not guarantee to be a view of the DataFrame, so those writes can be lost.
df['bag_of_words'] = [
    _join_words(row[col] for col in feature_columns)
    for _, row in df.iterrows()
]

# keep only the title and its bag of words
df.drop(columns = [col for col in df.columns if col != 'bag_of_words' and col != 'Title'], inplace = True)

In [129]:
# Preview the first five titles with their bag of words.
df.head(5)

Unnamed: 0,Title,bag_of_words
0,The Shawshank Redemption,crime drama frankdarabont timrobbins morganfr...
1,The Godfather,crime drama francisfordcoppola marlonbrando a...
2,The Godfather: Part II,crime drama francisfordcoppola alpacino rober...
3,The Dark Knight,action crime drama christophernolan christia...
4,12 Angry Men,crime drama sidneylumet martinbalsam johnfied...


In [134]:
# Create the vectorizer with its default settings and build the count matrix
# from the bag-of-words text of every movie.
# NOTE(review): the default tokenizer may split tokens that contain
# punctuation (e.g. 'leej.cobb') - confirm this is acceptable for the
# exact-name matching described above.
count = CountVectorizer()
bag_of_words_corpus = df['bag_of_words']
count_matrix = count.fit_transform(bag_of_words_corpus)

In [135]:
# Pairwise cosine similarity between all rows of the count matrix; when only
# one matrix is passed, it is compared against itself.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim

array([[1.        , 0.15789474, 0.13764944, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.15789474, 1.        , 0.36706517, ..., 0.05263158, 0.05263158,
        0.05564149],
       [0.13764944, 0.36706517, 1.        , ..., 0.04588315, 0.04588315,
        0.04850713],
       ...,
       [0.05263158, 0.05263158, 0.04588315, ..., 1.        , 0.05263158,
        0.05564149],
       [0.05263158, 0.05263158, 0.04588315, ..., 0.05263158, 1.        ,
        0.05564149],
       [0.05564149, 0.05564149, 0.04850713, ..., 0.05564149, 0.05564149,
        1.        ]])

This matrix has two notable properties:
1. Its diagonal is all ones (every movie is identical to itself).
2. It is symmetric (the similarity between A and B is the same as the similarity between B and A).

In [137]:
# Series of movie titles associated with an ordered numerical index; it is
# used later to turn a title into the matching row of the similarity matrix.
title_column = df['Title']
indices = pd.Series(title_column)
indices.head(5)

0    The Shawshank Redemption
1               The Godfather
2      The Godfather: Part II
3             The Dark Knight
4                12 Angry Men
Name: Title, dtype: object

In [148]:
# defining the function that takes in movie title
# as input and returns the top 10 recommended movies
def recommendations(title, cosine_sim = cosine_sim) :
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact movie title, as stored in the global ``indices`` Series.
    cosine_sim : array-like
        Similarity matrix indexed by the values of ``indices.index``;
        defaults to the matrix computed earlier in the notebook.

    Returns
    -------
    list
        At most 10 titles, ordered from most to least similar. The queried
        movie itself is never part of the result.

    Raises
    ------
    IndexError
        If ``title`` is not present in ``indices``.
    """
    # getting the index of the movie that matches the title
    matches = indices[indices == title].index
    if len(matches) == 0 :
        raise IndexError('Movie title not found: {!r}'.format(title))
    idx = matches[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # Drop the queried movie by its own index instead of assuming it sorts
    # first: another movie with a similarity of 1.0 could take the first
    # position and push the queried movie into its own recommendations.
    top_10_indexes = list(score_series.drop(idx).iloc[:10].index)

    # build the title list once, then map the best indexes to their titles
    all_titles = list(df['Title'])
    return [all_titles[i] for i in top_10_indexes]

In [149]:
# Recommendations for the movie 'Rope'.
recommendations(title = 'Rope')

['Strangers on a Train',
 'The Godfather: Part II',
 'Fargo',
 'The Shawshank Redemption',
 'The Godfather',
 'Arsenic and Old Lace',
 'Pulp Fiction',
 'The Silence of the Lambs',
 'Baby Driver',
 'Casino']