### Importing libraries

In [1]:
import pandas as pd
import json
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import warnings
warnings.filterwarnings('ignore')

### Reading data from the dataset

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
# merging two dataset
df = movies.merge(credits,on='title')

In [6]:
df.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Deleting the columns which are less important

In [7]:
# important attributes
# 1- genres
# 2- id
# 3- keywords
# 4- title
# 5- overview
# 6- cast
# 7 crew

df = df[['title','overview','id','genres','keywords','cast','crew']]

In [8]:
df.head(1)

Unnamed: 0,title,overview,id,genres,keywords,cast,crew
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",19995,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


### Preprocessing the dataset

In [9]:
# removing missing and duplicate values
df.isnull().sum()

title       0
overview    3
id          0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
df.dropna(inplace=True)

In [11]:
df.duplicated().sum()

0

In [12]:
# making a funtion to get the important keywords from genres and keywords column
def get_name(obj):
    lst = []
    for i in json.loads(obj):
        lst.append(i['name'])
    return lst

In [13]:
df['genres'] = df['genres'].apply(get_name)
df['keywords'] = df['keywords'].apply(get_name)

In [14]:
# making a funtion to get the first 5 cast name from cast column
def get_cast(obj):
    lst = []
    cnt=0
    for i in json.loads(obj):
        if(cnt==5):
            break
        lst.append(i['name'])
        cnt+=1;
    return lst

In [15]:
df['cast'] = df['cast'].apply(get_name)

In [16]:
# making a funtion to get the drirector name from crew column
def get_dir(obj):
    lst = []
    for i in json.loads(obj):
        if(i['job']=='Director'):
            lst.append(i['name'])
    return lst

In [17]:
df['crew'] = df['crew'].apply(get_dir)

In [18]:
df['overview'] = df['overview'].apply(lambda x:x.split())

In [19]:
df.head()

Unnamed: 0,title,overview,id,genres,keywords,cast,crew
0,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...",19995,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...",285,"[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,Spectre,"[A, cryptic, message, from, Bond’s, past, send...",206647,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...",49026,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,John Carter,"[John, Carter, is, a, war-weary,, former, mili...",49529,"[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [20]:
# removing stop words
# df['overview'] = df['overview'].apply(lambda x:[w for w in x if w.lower() not in stopwords.words('english')])

In [21]:
df['genres'] = df['genres'].apply(lambda x:[i.replace(" ","") for i in x])
df['keywords'] = df['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
df['cast'] = df['cast'].apply(lambda x:[i.replace(" ","") for i in x])
df['crew'] = df['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [22]:
df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast'] + df['crew']
df['tags'] = df['tags'].apply(lambda x : ' '.join(x))

In [23]:
df = df[['title','id','tags']]

In [24]:
df['tags'] = df['tags'].apply(lambda x:x.lower())

In [25]:
df.head()

Unnamed: 0,title,id,tags
0,Avatar,19995,"in the 22nd century, a paraplegic marine is di..."
1,Pirates of the Caribbean: At World's End,285,"captain barbossa, long believed to be dead, ha..."
2,Spectre,206647,a cryptic message from bond’s past sends him o...
3,The Dark Knight Rises,49026,following the death of district attorney harve...
4,John Carter,49529,"john carter is a war-weary, former military ca..."


In [26]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')
# stop_words are used to remove the stop words present in the tag section

In [27]:
movies_vector = cv.fit_transform(df['tags']).toarray()

In [28]:
movies_vector.shape

(4806, 5000)

In [29]:
# getting all the features name from the bag of words
cv.get_feature_names()

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1970s',
 '1980s',
 '19th',
 '19thcentury',
 '20',
 '20th',
 '24',
 '25',
 '30',
 '3d',
 '40',
 '50',
 '60',
 '60s',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'aasifmandvi',
 'abandoned',
 'abducted',
 'abigailbreslin',
 'ability',
 'able',
 'abuse',
 'abusive',
 'academy',
 'accept',
 'accepts',
 'access',
 'accident',
 'accidentally',
 'accompanied',
 'account',
 'accountant',
 'accused',
 'ace',
 'achieve',
 'act',
 'action',
 'actions',
 'activist',
 'activities',
 'actor',
 'actors',
 'actress',
 'acts',
 'actual',
 'actually',
 'adam',
 'adambrody',
 'adamgoldberg',
 'adamlefevre',
 'adams',
 'adamsandler',
 'adamscott',
 'adamshankman',
 'adaptation',
 'addict',
 'addicted',
 'addiction',
 'adewaleakinnuoye',
 'adolescence',
 'adopted',
 'adoption',
 'adrianmartinez',
 'adrienbrody',
 'adult',
 'adultery',
 'adulthood',
 'adults',
 'ad

###  Stemming and lemmetization

In [30]:
# stemming :- normalize words into its base form which may not have any meaning
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# helper function for stemming
def stemmer_fun(text):
    y = []
    for i in text.split():
        y.append(stemmer.stem(i))
    return ' '.join(y)

In [31]:
# lemmetization :- makes the stem word a meaningful word
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# helper function for lemmatizing
def lemmatizer_fun(text):
    y = []
    for i in text.split():
        y.append(lemmatizer.lemmatize(i))
    return ' '.join(y)

In [32]:
df['tags'] = df['tags'].apply(stemmer_fun)
df['tags'] = df['tags'].apply(lemmatizer_fun)

### Finding similarities between the movies using cosine similarity

In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
# Here we are using cosine similarity to measure the similarity between each movies.
m_similarity = cosine_similarity(movies_vector)
m_similarity.shape

(4806, 4806)

In [35]:
# function to recommend 8 movies for each movie
def recommend(movie):
    movie_index = df[df['title']==movie].index[0]
    distance = m_similarity[movie_index]
    # sorting on the basis of vectors of movie
    movie_list = sorted(list(enumerate(distance)),reverse=True, key= lambda x:x[1])[1:9]
    # enumerator returns an object that contains a counter as a key for each value within an object
                        
    for i in movie_list:
        print(df.iloc[i[0]].title)   

In [36]:
recommend('Batman Begins')

The Dark Knight
The Dark Knight Rises
Amidst the Devil's Wings
Batman
Batman & Robin
Batman
Mi America
Defendor


### Saving files

In [37]:
import pickle

In [38]:
# Storing the movies data in the form of dictionary in a pickle file
pickle.dump(df.to_dict(),open('movies_dict.pkl','wb'))

In [39]:
# importing the pickle file for the similarity 
pickle.dump(m_similarity,open('similarity.pkl','wb'))