## Recommendation System Using NLP

In [236]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk, string
from pprint import pprint

from nltk.corpus import stopwords
from rake_nltk import Rake
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')

In [237]:
# Read the movie CSV straight from its data.world query endpoint (needs network access).
DATASET_URL = 'https://query.data.world/s/uikepcpffyo2nhig52xxeevdialfl7'
dataset = pd.read_csv(DATASET_URL)

In [238]:
# Keep only the columns the recommender uses: Title, Genre, Director, Actors and Plot.
# .copy() detaches the selection from the frame returned by read_csv, so the column
# assignments made in the cleaning cells write to an independent frame rather than to
# a slice (the situation pandas reports with SettingWithCopyWarning, which this
# notebook silences through warnings.filterwarnings('ignore')).
dataset = dataset[['Title','Genre','Director','Actors','Plot']].copy()

In [239]:
#Printing the shape as (rows, columns), then previewing the first rows of the frame
print('Dataset Dimension:',dataset.shape)
dataset.head()

Dataset Dimension: (250, 5)


Unnamed: 0,Title,Genre,Director,Actors,Plot
0,The Shawshank Redemption,"Crime, Drama",Frank Darabont,"Tim Robbins, Morgan Freeman, Bob Gunton, Willi...",Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",Francis Ford Coppola,"Marlon Brando, Al Pacino, James Caan, Richard ...",The aging patriarch of an organized crime dyna...
2,The Godfather: Part II,"Crime, Drama",Francis Ford Coppola,"Al Pacino, Robert Duvall, Diane Keaton, Robert...",The early life and career of Vito Corleone in ...
3,The Dark Knight,"Action, Crime, Drama",Christopher Nolan,"Christian Bale, Heath Ledger, Aaron Eckhart, M...",When the menace known as the Joker emerges fro...
4,12 Angry Men,"Crime, Drama",Sidney Lumet,"Martin Balsam, John Fiedler, Lee J. Cobb, E.G....",A jury holdout attempts to prevent a miscarria...


### Data Cleaning
---

In [240]:
# Lowercase every text feature so the same word or name always produces the same token.
for column in ['Genre', 'Director', 'Actors', 'Plot']:
    dataset[column] = dataset[column].str.lower()

# Merge each director's first name and last name into one word so that two people who
# share only a first or a last name are not counted as a match.
# regex=False keeps both replacements literal regardless of the pandas version.
dataset['Director'] = dataset['Director'].str.replace(' ', '', regex=False)
# Co-directors are comma-separated; turn the commas into spaces (as is done for the
# actors) so every director remains a separate word in the bag of keywords.
dataset['Director'] = dataset['Director'].str.replace(',', ' ', regex=False)

In [257]:
#Displaying 5 randomly sampled rows
#NOTE(review): the saved output of this cell shows only the 'Bag of keywords' column and
#its execution count (257) is higher than that of the cells that follow, so it appears to
#have been re-run after the frame was reduced further down. Re-run the notebook in order
#to see the cleaned columns at this point.
dataset.sample(5)

Unnamed: 0_level_0,Bag of keywords
Title,Unnamed: 1_level_1
Interstellar,adventure drama sci-fi christophernolan elle...
The Usual Suspects,crime drama mystery bryansinger stephenbaldw...
The Wild Bunch,action adventure western sampeckinpah willia...
Alien,horror sci-fi ridleyscott tomskerritt sigourn...
Aliens,action adventure sci-fi jamescameron sigourn...


In [242]:
#Extracting important words from plot
def Extract_Keywords(plot_sent):
    """Return the RAKE keywords found in a plot summary.

    Parameters
    ----------
    plot_sent : str
        Plot text to analyse.

    Returns
    -------
    dict
        Mapping of keyword -> word degree, as returned by
        ``Rake.get_word_degrees()``. Empty when ``plot_sent`` is not a string.
    """
    # pandas hands missing text over as None/NaN (a float); report "no keywords"
    # for those instead of passing a non-string on to RAKE.
    if not isinstance(plot_sent, str):
        return {}
    rake = Rake()  # default RAKE configuration
    rake.extract_keywords_from_text(plot_sent)
    # The values are word degrees, not raw frequencies; callers here only use the keys.
    return rake.get_word_degrees()

# One space-separated keyword string per plot, built from the words Extract_Keywords returns.
dataset['plot_keywords'] = dataset['Plot'].apply(
    lambda plot: ' '.join(Extract_Keywords(plot).keys())
)
# Genres are comma-separated; swap the commas for spaces so each genre is its own word.
dataset['Genre'] = dataset['Genre'].str.replace(',', ' ')

In [258]:
#Displaying 5 randomly sampled rows
#NOTE(review): execution count 258 is out of sequence; the saved output shows the final
#'Bag of keywords' frame rather than the columns as they are at this point.
dataset.sample(5)

Unnamed: 0_level_0,Bag of keywords
Title,Unnamed: 1_level_1
Good Will Hunting,drama gusvansant mattdamon benaffleck stellans...
The Sting,comedy crime drama georgeroyhill paulnewman ...
Monty Python and the Holy Grail,"adventure comedy fantasy terrygilliam,terryj..."
Amadeus,biography drama history milosforman f.murray...
One Flew Over the Cuckoo's Nest,drama milosforman michaelberryman peterbrocco ...


In [244]:
#Collapsing every actor's name into a single word
def Actor_Name_Merge(name):
    """Turn a comma-separated actor list into space-separated one-word names.

    Spaces are removed from every name and the commas between names become
    single spaces, e.g. 'tim robbins, morgan freeman' -> 'timrobbins morganfreeman'.
    """
    merged_names = [actor.replace(' ', '') for actor in name.split(',')]
    return ' '.join(merged_names)

# Run the name merge on every row; the function is passed directly, no lambda wrapper needed.
dataset['Actors'] = dataset['Actors'].apply(Actor_Name_Merge)

In [259]:
#Displaying 5 randomly sampled rows
#NOTE(review): execution count 259 is out of sequence; the saved output shows the final
#'Bag of keywords' frame rather than the columns as they are at this point.
dataset.sample(5)

Unnamed: 0_level_0,Bag of keywords
Title,Unnamed: 1_level_1
Eternal Sunshine of the Spotless Mind,drama romance sci-fi michelgondry jimcarrey ...
12 Angry Men,crime drama sidneylumet martinbalsam johnfied...
The Lion King,"animation adventure drama rogerallers,robmin..."
12 Years a Slave,biography drama history stevemcqueen chiwete...
Rio Bravo,action drama western howardhawks johnwayne d...


In [246]:
# Concatenate genre, director, actors and plot keywords into a single text per movie.
dataset['Bag of keywords'] = (
    dataset['Genre'] + ' '
    + dataset['Director'] + ' '
    + dataset['Actors'] + ' '
    + dataset['plot_keywords']
)
# Index the frame by title and discard the source columns, leaving only the combined text.
dataset = (
    dataset
    .set_index(['Title'])
    .drop(columns=['Genre', 'Director', 'Actors', 'Plot', 'plot_keywords'])
)

### Creating BOW & Similarity Matrix
---

In [248]:
# Build the count matrix from unigrams and bigrams (ngram_range=(1, 2)) of every movie's
# bag of keywords, ignoring English stop words.
# The NLTK stop-word list is requested as 'english' (lower case), which is the corpus
# file's actual name; 'English' only resolves on case-insensitive file systems.
count_vect = CountVectorizer(ngram_range=(1, 2), stop_words=stopwords.words('english'))
count_matrix = count_vect.fit_transform(dataset['Bag of keywords'])

In [249]:
# Pairwise cosine similarity between all rows of the count matrix; given a single
# argument, cosine_similarity compares the matrix against itself.
cosine_matrix = cosine_similarity(count_matrix)
cosine_matrix

array([[1.        , 0.1025641 , 0.0915018 , ..., 0.02564103, 0.02564103,
        0.0270666 ],
       [0.1025641 , 1.        , 0.22875451, ..., 0.02564103, 0.02564103,
        0.0270666 ],
       [0.0915018 , 0.22875451, 1.        , ..., 0.02287545, 0.02287545,
        0.02414726],
       ...,
       [0.02564103, 0.02564103, 0.02287545, ..., 1.        , 0.02564103,
        0.0270666 ],
       [0.02564103, 0.02564103, 0.02287545, ..., 0.02564103, 1.        ,
        0.0270666 ],
       [0.0270666 , 0.0270666 , 0.02414726, ..., 0.0270666 , 0.0270666 ,
        1.        ]])

In [250]:
#Extracting all the titles (the frame's index) into a Series keyed by row position.
#Position i here lines up with row/column i of cosine_matrix, because the count matrix
#was built from the rows of `dataset` in this same order.
indexes = pd.Series(dataset.index)

In [251]:
#Previewing the first position -> title pairs
print(indexes.head())

0    The Shawshank Redemption
1               The Godfather
2      The Godfather: Part II
3             The Dark Knight
4                12 Angry Men
Name: Title, dtype: object


### Recommending Movies
---

In [252]:
def Recommend_Movies(title,cosine_matrix=cosine_matrix,n_results=6):
    """Return the titles most similar to ``title``, best match first.

    Parameters
    ----------
    title : str
        Movie to look up. An exact match is tried first; when there is none the
        comparison is repeated ignoring case and surrounding whitespace.
    cosine_matrix : array-like
        Similarity matrix whose row/column positions follow the order of the
        module-level ``indexes`` Series.
    n_results : int, default 6
        Number of titles to return. The queried movie is part of the ranking
        (its similarity with itself is the maximum), so it normally comes first.

    Returns
    -------
    list
        Titles taken from ``indexes``, ordered by decreasing similarity.

    Raises
    ------
    IndexError
        If no movie with that title exists in ``indexes``.
    """
    matches = indexes[indexes == title]
    if matches.empty:
        # Fall back to a case-insensitive comparison so callers do not have to
        # reproduce the dataset's exact capitalisation.
        wanted = str(title).strip().lower()
        matches = indexes[indexes.str.strip().str.lower() == wanted]
    if matches.empty:
        raise IndexError('Movie title {!r} not found'.format(title))
    idx = matches.index[0]

    # Rank every movie by its similarity to the requested one and keep the
    # positions of the n_results best scores.
    ranked = pd.Series(cosine_matrix[idx]).sort_values(ascending=False)
    top_positions = ranked.iloc[:n_results].index
    return [indexes[position] for position in top_positions]

In [270]:
# Look the movie up by the title exactly as it is spelled in the dataset. str.title() is
# avoided because it mangles some titles
# (e.g. 'The Godfather: Part II' -> 'The Godfather: Part Ii').
try:
    print(Recommend_Movies('The Dark Knight',cosine_matrix))
except IndexError:
    # Recommend_Movies raises IndexError when the title has no match; any other
    # exception is a genuine error and should surface instead of being reported
    # as a missing movie.
    print('Sorry! Movie is not found')

['The Dark Knight', 'Batman Begins', 'The Dark Knight Rises', 'The Prestige', 'The Green Mile', 'Witness for the Prosecution']
