In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv


# Content based Movie Recommendation based on Movie Plot (implementation of tfidf and cosine similarity)

For this I am using a dataset which is available in Kaggle. The dataset was prepared by scraping wikipedia and contains various information including plot of the movie.

### Dataset

In [2]:
path = '../input/wikipedia-movie-plots/wiki_movie_plots_deduped.csv'

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(path)

In [5]:
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [6]:
df.shape

(34886, 8)

In [7]:
df.dtypes

Release Year         int64
Title               object
Origin/Ethnicity    object
Director            object
Cast                object
Genre               object
Wiki Page           object
Plot                object
dtype: object

In [8]:
df.isnull().sum()

Release Year           0
Title                  0
Origin/Ethnicity       0
Director               0
Cast                1422
Genre                  0
Wiki Page              0
Plot                   0
dtype: int64

There are 1422 null values for 'Cast' field. For now we will levae it as it is, and will be solving the issue whenever required.

In [9]:
df['Origin/Ethnicity'].unique()

array(['American', 'Australian', 'Bangladeshi', 'British', 'Canadian',
       'Chinese', 'Egyptian', 'Hong Kong', 'Filipino', 'Assamese',
       'Bengali', 'Bollywood', 'Kannada', 'Malayalam', 'Marathi',
       'Punjabi', 'Tamil', 'Telugu', 'Japanese', 'Malaysian', 'Maldivian',
       'Russian', 'South_Korean', 'Turkish'], dtype=object)

In [10]:
len(df['Origin/Ethnicity'].unique())

24

As we can see there are 24 types of origin for the movies. However, for now we will be looking mainly 'English' movies and hence only movies with American, Australian, British, and Canadian origins will be considered.

In [11]:
len(df[df['Origin/Ethnicity'].isin(['American','British','Australian','Canadian'])])

22346

We can see that there are total of 22346 movies in the given dataset having origin of our interest.

Similarly, we will only take the columns with movie name and plot data because those are sufficient to make the recommendation.

In [12]:
df_eng = df[df['Origin/Ethnicity'].isin(['American','British','Australian','Canadian'])]
df_eng = df_eng.loc[:,['Plot','Title']]
df_eng.set_index('Title', inplace=True)
df_eng.head()
            

Unnamed: 0_level_0,Plot
Title,Unnamed: 1_level_1
Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr..."
Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov..."
The Martyred Presidents,"The film, just over a minute long, is composed..."
"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...
Jack and the Beanstalk,The earliest known adaptation of the classic f...


In [13]:
df_eng.sample(n=5)

Unnamed: 0_level_0,Plot
Title,Unnamed: 1_level_1
The Book of Life,"Mary Beth, a museum tour guide, takes a group ..."
Lucid,Joel Rothman (Jonas Chernick) is suffering fro...
Wind Across the Everglades,"Set in the early 20th century, the film follow..."
Frankenstein's Daughter,"Teenager Trudy Morton (Sandra Knight), who liv..."
"Do-Deca-Pentathlon, TheThe Do-Deca-Pentathlon",The film is about two brothers in their mid-30...


Let's see our first plot story.

In [14]:
df_eng['Plot'][0]

"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"

### Preprocessing the Movie Plots

For preprocessing the plots data we will follow certain steps:
1. We will tokenize the text present in the plots field using nltk's 'punkt'. This step helps to tokenize the sentences or words.
2. Then we will extract the POS tags for the tokens using 'averaged_perceptron_tagger'. This step will give the words tags about which part of speech it might belong to. This step helps in the understanding of the context of the text present in the plot field.
3. And, we will Lemmatize the tokens using WordNetLemmatizer(). This will assign the root to each word. For example: history and historical shares the same root. So both of them is Lemmatized as 'history'.
4. We will also remove commonly used word which might not add any useful value to our analysis using 'stopwords'.

In [19]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [20]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [21]:
verb_codes = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}

Now, let's preprocess the plt text.
The steps included are:
1. First we have to transform all the text into same case because it is case sensitive.
2. We need to find the pos tags of the word.
3. For each part of speech there are further types. For example for noun there are noun singular (NN), noun plural (NNS), prper noun singular (NNP), etc. For our analysis we want to assign the simplest tags. So although it might be variation for any type of part of speech, we will consider it as single type.
4. Here, we will mainly focus in the verb form so will assign all different verb types to one type. So will force our lemmatizer to lemmatize the word into verb form if the tag given by pos_tag in one of the tag in our set. If verb form then lemmatized to verb, if not simply lemmatize it as it is.
5. We also don't want stop words to be in our lemmatized sentence so we will only append the words which are out of bag of words and are alphabets.

In [33]:
def sentence_prep(text):
    text = text.lower()
    
    # replacing abbreviated word with its full form
    text = text.replace("n't", " not")
    text = text.replace("'m", " am")
    text = text.replace("'s", " is")
    text = text.replace("'re", " are")
    text = text.replace("'ll", " will")
    text = text.replace("'ve", " have")
    text = text.replace("'d", " would")
    
    temp_sent = []
    words = nltk.word_tokenize(text)
    tags = nltk.pos_tag(words)
    for i, word in enumerate(words):
        if tags[i][1] in verb_codes:
            lemmatized = lemmatizer.lemmatize(word,'v')
        else:
            lemmatized = lemmatizer.lemmatize(word)
        if lemmatized not in stop_words and lemmatized.isalpha():
            temp_sent.append(lemmatized)  
            
    final_sent = ' '.join(temp_sent) #joining the words with spaces and making it like sentence
    
    
    return final_sent

In [50]:
df_eng['plot_prep'] = df_eng['Plot'].apply(sentence_prep)
df_eng.head()

Unnamed: 0_level_0,Plot,plot_prep
Title,Unnamed: 1_level_1,Unnamed: 2_level_1
Kansas Saloon Smashers,"A bartender is working at a saloon, serving dr...",bartender work saloon serve drink customer fil...
Love by the Light of the Moon,"The moon, painted with a smiling face hangs ov...",moon paint smile face hang park night young co...
The Martyred Presidents,"The film, just over a minute long, is composed...",film minute long compose two shot first girl s...
"Terrible Teddy, the Grizzly King",Lasting just 61 seconds and consisting of two ...,last second consist two shot first shot set wo...
Jack and the Beanstalk,The earliest known adaptation of the classic f...,earliest known adaptation classic fairytale fi...


### Vectorizing the plot data using TF-IDF

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [55]:
tfidf = TfidfVectorizer()
tfidf_movieid = tfidf.fit_transform((df_eng["plot_prep"]))

### Finding Cosine similarity between vectors

In [56]:
from sklearn.metrics.pairwise import cosine_similarity

In [62]:
cos_sim = cosine_similarity(tfidf_movieid, tfidf_movieid)

In [83]:
cos_sim[1]

array([0.02874717, 1.        , 0.03720778, ..., 0.00887276, 0.04114762,
       0.00726197])

In [84]:
pd.Series(cos_sim[1])

0        0.028747
1        1.000000
2        0.037208
3        0.021211
4        0.001425
           ...   
22341    0.007924
22342    0.008296
22343    0.008873
22344    0.041148
22345    0.007262
Length: 22346, dtype: float64

In [85]:
pd.Series(cos_sim[1]).sort_values(ascending = False)

1        1.000000
14371    0.425959
9486     0.381232
19174    0.326026
7074     0.295315
           ...   
4614     0.000000
4615     0.000000
4616     0.000000
4619     0.000000
17957    0.000000
Length: 22346, dtype: float64

In [86]:
pd.Series(cos_sim[0]).sort_values(ascending = False).iloc[1:11].index

Int64Index([9228, 16469, 14017, 8128, 15523, 15929, 18409, 21212, 22067, 231], dtype='int64')

In [71]:
xxx = [[1,2,3],[2,3,4]]
pd.Series(xxx)

0    [1, 2, 3]
1    [2, 3, 4]
dtype: object

### Building Recommendation Function

In [61]:
# Storing indices of the data
indices = pd.Series(df_eng.index)
indices

0                                   Kansas Saloon Smashers
1                            Love by the Light of the Moon
2                                  The Martyred Presidents
3                         Terrible Teddy, the Grizzly King
4                                   Jack and the Beanstalk
                               ...                        
22341    Hochelaga, Land of Souls (Hochelaga terre des ...
22342                                         Indian Horse
22343    The Little Girl Who Was Too Fond of Matches (L...
22344                                      Meditation Park
22345                               Ravenous (Les Affamés)
Name: Title, Length: 22346, dtype: object

In [94]:
def recommendations(title, cosine_sim = cos_sim):
    recommended_movies = []
    index = indices[indices == title].index[0] # We are searching for the position of the first occurance of the title in our indices list
    similarity_scores = pd.Series(cosine_sim[index]).sort_values(ascending = False)
    top_10_movies = list(similarity_scores.iloc[1:11].index)
    for i in top_10_movies:
        recommended_movies.append(list(df_eng.index)[i])
    return recommended_movies

In [95]:
recommendations("Harry Potter and the Chamber of Secrets")

["Harry Potter and the Philosopher's Stone",
 "Harry Potter and the Sorcerer's Stone",
 'Harry Potter and the Deathly Hallows: Part 1',
 'Harry Potter and the Deathly Hallows: Part I',
 'Harry Potter and the Half-Blood Prince',
 'Harry Potter and the Deathly Hallows: Part II',
 'Harry Potter and the Deathly Hallows: Part 2',
 'Harry Potter and the Order of the Phoenix',
 'Harry Potter and the Goblet of Fire',
 'Harry Potter and the Prisoner of Azkaban']

In [96]:
recommendations("Ice Age")

['Ice Age: The Meltdown',
 'Ice Age: Dawn of the Dinosaurs',
 'The Wrong Man',
 'Ice Age: Continental Drift',
 'The Buttercup Chain',
 'Ice Age: Collision Course',
 'Runaway Train',
 'Corrina, Corrina',
 'Sid and Nancy',
 'Zorro, the Gay Blade']