# Movie Recommender

#### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Import Dataset

In [2]:
# lets read the dataset
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

movies_df = pd.concat([movies, credits], axis= 1)
movies_df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,title.1,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Data Cleaning

#### 1. Duplicates Removal

In [3]:
movies_df.duplicated().sum()

0

#### 2. Drop Unwanted Columns

In [4]:
# lets filter out the only the required columns from the movies dataset
movies_df = movies_df[['genres', 'keywords', 'overview', 'original_title', 'cast', 'crew']]

In [5]:
# lets check the columns 
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   genres          4803 non-null   object
 1   keywords        4803 non-null   object
 2   overview        4800 non-null   object
 3   original_title  4803 non-null   object
 4   cast            4803 non-null   object
 5   crew            4803 non-null   object
dtypes: object(6)
memory usage: 225.3+ KB


#### 3. Drop Null values

In [6]:
movies_df.dropna(inplace=True)

In [7]:
movies_df.isna().sum()

genres            0
keywords          0
overview          0
original_title    0
cast              0
crew              0
dtype: int64

In [8]:
movies_df.head(2)

Unnamed: 0,genres,keywords,overview,original_title,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


#### Extract Genres

In [19]:
# lets see the genres first sample
movies_df['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [28]:
# the genres are string, therefor we need to convert it to list before collecting the names
# ast.literal_eval(obj) : This method converts the string to list
import ast

movies_df['genres']=movies_df['genres'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)])

#### Extract Keywords

In [29]:
# lets see the keywords first sample
movies_df['keywords'][0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [30]:
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)])

#### Extract Overview

In [31]:
# lets see the overview first sample
movies_df['overview'][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [43]:
# as this have punctuations, lets remove it
import string

movies_df['overview']=movies_df['overview'].apply(lambda x: ''.join([i for i in x if i not in string.punctuation]).split())

#### Extract Cast

In [46]:
# lets check the length of the first value in cast column
len(movies_df['cast'][0])

12919

In [51]:
# lets extract only first 5 values
movies_df['cast']=movies_df['cast'].apply(lambda x: [i['name'] for i in ast.literal_eval(x)[:5]])

#### Extract Crew

In [53]:
# lets check the length of the first value in crew column
len(movies_df['crew'][0])

22831

In [59]:
# lets extract only the director value
movies_df['crew']=movies_df['crew'].apply(lambda x: [i['name'] for i in ast.literal_eval(x) if i['job'] == 'Director'])

#### Create Tags Column

In [61]:
# lets add a new column which contains the tags for a movie in tags column of the dataframe
movies_df['tags'] = movies_df['genres'] + movies_df['keywords'] + movies_df['overview'] + movies_df['cast'] + movies_df['crew']

In [66]:
# lets do the following steps in tags column
# stopwords removal --> stemming
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
ps = PorterStemmer()
sw = stopwords.words('english')

movies_df['tags']=movies_df['tags'].apply(lambda x: ' '.join([ps.stem(i.lower()) for i in x if i not in sw]))

## Data Preprocessing

#### Bag Of Words

In [118]:
# lets vectorize the tags column using bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
vectors = cv.fit_transform(movies_df['tags'])

In [119]:
vectors

<4800x27845 sparse matrix of type '<class 'numpy.int64'>'
	with 249918 stored elements in Compressed Sparse Row format>

## Cosine Similarity Model

In [120]:
# lets find out the cosine similarity between the vectorized tags of each movies
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

## Movie Recommender

In [121]:
# lets build a method which recommends 5 movies based on the similarity
def Recommender(movie):
    index = movies_df[movies_df['original_title'] == movie].index[0]
    movies_recommended = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x: x[1])[1:6]
    for i, j in movies_recommended:
        print(movies_df.loc[i]['original_title'])

#### Recommend 5 movies for 'The Avengers'

In [122]:
Recommender('The Avengers')

Avengers: Age of Ultron
Iron Man 2
Captain America: Civil War
Captain America: The Winter Soldier
Iron Man 3


#### Recommend 5 movies for 'King Kong'

In [123]:
# lets recommend 5 movies for the movie King Kong
recommender('King Kong')

Ed Wood
Top Hat
America's Sweethearts
All That Jazz
Meek's Cutoff


#### Recommend 5 movies for 'Titanic'

In [124]:
# lets recommend 5 movies for the movie Titanic
recommender('Titanic')

The Notebook
Captain Phillips
Poseidon
Ghost Ship
The Maid's Room


#### Recommend 5 movies for 'Spider-Man 2'

In [125]:
# lets recommend 5 movies for the movie 'Spider-Man 2'
recommender('Spider-Man 2')

Spider-Man 3
Spider-Man
The Amazing Spider-Man 2
The Amazing Spider-Man
Batman Begins


#### Recommend 5 movies for 'Iron Man 3'

In [126]:
# lets recommend 5 movies for the movie 'Iron Man 3'
recommender('Iron Man 3')

Iron Man
Iron Man 2
Captain America: Civil War
Avengers: Age of Ultron
The Avengers


#### Recommend 5 movies for 'Jurassic World'

In [127]:
# lets recommend 5 movies for the movie 'Jurassic World'
recommender('Jurassic World')

Jurassic Park
The Lost World: Jurassic Park
Vacation
Jurassic Park III
South Park: Bigger, Longer & Uncut
