# Movie Recommender

#### Import Libraries

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#### Import Dataset

In [70]:
# lets read the dataset
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

movies_df = pd.concat([movies, credits], axis= 1)
movies_df.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,title.1,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Data Cleaning

#### 1. Duplicates Removal

In [71]:
movies_df.duplicated().sum()

0

#### 2. Drop Unwanted Columns

In [72]:
# lets filter out the only the required columns from the movies dataset
movies_df = movies_df[['genres', 'keywords', 'overview', 'original_title', 'cast', 'crew']]

In [73]:
# lets check the columns 
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   genres          4803 non-null   object
 1   keywords        4803 non-null   object
 2   overview        4800 non-null   object
 3   original_title  4803 non-null   object
 4   cast            4803 non-null   object
 5   crew            4803 non-null   object
dtypes: object(6)
memory usage: 225.3+ KB


#### 3. Drop Null values

In [82]:
movies_df.dropna(inplace=True)

In [83]:
movies_df.isna().sum()

genres            0
keywords          0
overview          0
original_title    0
cast              0
crew              0
dtype: int64

In [84]:
movies_df.head(2)

Unnamed: 0,genres,keywords,overview,original_title,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...",Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [75]:
# so we need genres name for our recommendation system
# for example ['Action', 'Adventure', 'Fantasy', 'Science Fiction']
# lets define a function which removes the names from the dictionary and appends in a list

# the genres are string, therefor we need to convert it to list before collecting the names
import ast

# ast.literal_eval(obj) : This method converts the string to list.

def names_collector(obj):
    names = []
    for i in ast.literal_eval(obj):
        names.append(i['name'])
    return names

In [76]:
# collect and store all the names in genres and store in genres of dataframe
movies['genres'] = movies['genres'].apply(names_collector)

In [77]:
# lets check the keywords columns first row
movies['keywords'][0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [78]:
# we have to apply the same function to collect the names and store in keywords of movies dataframe
movies['keywords'] = movies['keywords'].apply(names_collector)

In [79]:
# lets check the first value of overview column
movies['overview'][0]

# the overview column is fine.

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [80]:
# lets check the first value of cast column
movies['cast'][0]

# there are many cast in a movie, therefor lets use only first 5 cast for our recommendation system

KeyError: 'cast'

In [None]:
# lets take out only 5 cast for each row
def names_collector2(obj):
    names = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 5:
            names.append(i['name'])
            counter += 1
        else:
            break
    return names

movies['cast'] = movies['cast'].apply(names_collector2)

In [None]:
# now lets check crew data
movies['crew'][0]

In [None]:
# lets take out only director for each row in crew column
def fetch_director(obj):
    names = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            names.append(i['name'])
            break
    return names

In [None]:
# lets apply the fetch director function to take out all the directors in crew
movies['crew'] = movies['crew'].apply(fetch_director)

In [None]:
# now let see the first 5 rows of the dataframe
movies.head()

In [None]:
# lets add a new column which contains the tags for a movie in tags column of the dataframe
movies['tags'] = movies['genres'] + movies['keywords'] + movies['overview'].apply(lambda x: x.split()) + movies['cast'] + movies['crew']

In [None]:
# now lets check the dataframe
movies.head()

In [None]:
# now lets drop unnecessary columns from the dataframe
movies = movies[movies.columns[[3,4,-1]]]

In [None]:
# now lets check the dataframek
movies.head()

In [None]:
# remove spaces from the spaced words in tags
movies['tags'] = movies['tags'].apply(lambda x: [i.replace(' ', '').lower() for i in x])

In [None]:
# change the list to string in tags column of the dataframe
movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x))
# now lets check the dataframe
movies.head()

In [None]:
# now lets use stemming for removing
import nltk

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
# lets define a function which carries out stemming
def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return ' '.join(y)

In [None]:
movies['tags'] = movies['tags'].apply(stem)

In [None]:
movies['tags'][0]

In [None]:
# lets vectorize the tags column
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies['tags']).toarray()
vectors

In [None]:
# lets see the features in vectors
cv.get_feature_names_out()

In [None]:
# lets find out the cosine similarity between the vectorized tags of each movies
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)
sorted(list(enumerate(similarity[0])), reverse=True, key= lambda x: x[1])[1:6]

In [None]:
# lets build a method which recommends 5 movies based on the similarity
def recommender(movie):
    index = movies[movies['title'] == movie].index[0]
    movies_recommended = sorted(list(enumerate(similarity[index])), reverse = True, key = lambda x: x[1])[1:6]
    for i, j in movies_recommended:
        print(movies.loc[i]['title'])

In [None]:
# lets see the recommendation for 17th movie
recommender(movies['title'][16])

In [None]:
# lets see some 100 titles of movies
movies['title'].values[:100]

In [None]:
# lets recommend 5 movies for the movie King Kong
recommender('King Kong')

In [None]:
# lets recommend 5 movies for the movie Titanic
recommender('Titanic')

In [None]:
# lets recommend 5 movies for the movie 'Spider-Man 2'
recommender('Spider-Man 2')

In [None]:
# lets recommend 5 movies for the movie 'Iron Man 3'
recommender('Iron Man 3')

In [None]:
# lets recommend 5 movies for the movie 'Jurassic World'
recommender('Jurassic World')