In [35]:
# importing dependencies
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Pre Processing

In [36]:
# loading the data from the csv file to pandas dataframe
movies_data = pd.read_csv('movies.csv')

In [37]:
# first five rows of the dataframe
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [38]:
# no.of rows and columns 
movies_data.shape

(4803, 24)

In [39]:
# selecting the relevant features for recommendation
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director', 'overview']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director', 'overview']


In [40]:
movies_data.isnull().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [41]:
# replacing the null values with null string

for feature in selected_features:
  movies_data[feature] = movies_data[feature].fillna('')

In [42]:
# containing all the 5 selected features

combined_features = movies_data['genres'] + movies_data['keywords'] + movies_data['tagline'] + movies_data['cast'] + movies_data['director'] + movies_data['overview']

In [43]:
print(combined_features)

0       Action Adventure Fantasy Science Fictioncultur...
1       Adventure Fantasy Actionocean drug abuse exoti...
2       Action Adventure Crimespy based on novel secre...
3       Action Crime Drama Thrillerdc comics crime fig...
4       Action Adventure Science Fictionbased on novel...
                              ...                        
4798    Action Crime Thrillerunited states\u2013mexico...
4799    Comedy RomanceA newlywed couple's honeymoon is...
4800    Comedy Drama Romance TV Moviedate love at firs...
4801    A New Yorker in ShanghaiDaniel Henney Eliza Co...
4802    Documentaryobsession camcorder crush dream gir...
Length: 4803, dtype: object


In [44]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [45]:
feature_vector = vectorizer.fit_transform(combined_features)

In [66]:
# print(feature_vector)

Cosine Similarity

In [47]:
# getiing similarity score using cosine similarity

similarity = cosine_similarity(feature_vector)

In [67]:
# print(similarity)

In [49]:
print(similarity.shape)

(4803, 4803)


In [50]:
# getting the movie name from the user

movie_name = input("Enter your favourite movie name : ")

In [51]:
# creating a list with all the movie given in the dataset

list_of_all_titles = movies_data['title'].tolist()

In [71]:
# print(list_of_all_titles)

In [53]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

In [54]:
print(find_close_match)

['Thor', '25th Hour', 'Stoker']


In [55]:
close_match = find_close_match[0]
print(close_match)


Thor


In [56]:
from pandas._libs import index
# finding the index of the movie with the title

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

print(index_of_the_movie)

129


In [70]:
# Getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))

# print(similarity_score)

In [58]:
len(similarity_score)

4803

In [72]:
# sorting movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse=True)

# print(sorted_similar_movies)

In [60]:
# printing name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1
for movie in sorted_similar_movies:

  index = movie[0]
  
  title_from_index = movies_data[movies_data.index == index]['title'].values[0]

  if i < 20:
    print(i, '.', title_from_index )
    i += 1 

Movies suggested for you : 

1 . Thor
2 . Thor: The Dark World
3 . Man of Steel
4 . The Avengers
5 . Pirates of the Caribbean: At World's End
6 . Little Nicky
7 . Avengers: Age of Ultron
8 . Galaxina
9 . The Amazing Spider-Man
10 . Armageddon
11 . Men in Black
12 . Angels & Demons
13 . Ponyo
14 . Iron Man 2
15 . The Legend of Hercules
16 . Cinderella
17 . Jason X
18 . Batman v Superman: Dawn of Justice
19 . Hellboy II: The Golden Army


Movie Recommendation System

In [61]:
def movie_recommendation():

  movie_name = input("Enter your favourite movie name : ")

  list_of_all_titles = movies_data['title'].tolist()

  find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

  close_match = find_close_match[0]

  index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

  similarity_score = list(enumerate(similarity[index_of_the_movie]))

  sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse=True)

  i = 1
  for movie in sorted_similar_movies:

    index = movie[0]
    
    title_from_index = movies_data[movies_data.index == index]['title'].values[0]

    if i < 11:
      x = print(i, '.', title_from_index) 
      i += 1 


In [63]:
final = movie_recommendation()

1 . Thor
2 . Thor: The Dark World
3 . Man of Steel
4 . The Avengers
5 . Pirates of the Caribbean: At World's End
6 . Little Nicky
7 . Avengers: Age of Ultron
8 . Galaxina
9 . The Amazing Spider-Man
10 . Armageddon


In [73]:
import pickle

In [76]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))


In [77]:
pickle.dump(movies_data, open('movies_data.pkl', 'wb'))
