Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib # to get the closest value matching with user input
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [2]:
# Load the movie metadata into a DataFrame; the path is relative to the notebook's working directory.
movies_df = pd.read_csv('datasets/movies.csv')

In [3]:
# Preview five randomly drawn rows. No random_state is passed, so a different sample is shown on each run.
movies_df.sample(5)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
873,873,46000000,Action Adventure Crime Thriller,,479,corruption black people italo-american brother...,en,Shaft,New York police detective John Shaft arrests W...,19.643365,...,99.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,"Still the man, any questions?",Shaft,5.5,308,Samuel L. Jackson Jeffrey Wright Christian Bal...,"[{'name': 'Richard Price', 'gender': 0, 'depar...",John Singleton
1663,1663,30000000,Drama Crime,,311,life and death corruption street gang rape sad...,en,Once Upon a Time in America,A former Prohibition-era Jewish gangster retur...,49.336397,...,229.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,"Crime, passion and lust for power - Sergio Leo...",Once Upon a Time in America,8.2,1069,Robert De Niro James Woods Elizabeth McGovern ...,"[{'name': 'Arnon Milchan', 'gender': 2, 'depar...",Sergio Leone
1997,1997,23000000,Romance Science Fiction Drama,http://www.herthemovie.com/,152601,artificial intelligence computer love loneline...,en,Her,"In the not so distant future, Theodore, a lone...",53.682367,...,126.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A Spike Jonze Love Story,Her,7.9,4097,Joaquin Phoenix Scarlett Johansson Rooney Mara...,"[{'name': 'K.K. Barrett', 'gender': 0, 'depart...",Spike Jonze
2504,2504,15600000,Comedy Drama,http://www.juwannamann.com/,35696,sport basketball beautiful woman hit in crotch...,en,Juwanna Mann,A basketball star is booted out of the NBA whe...,2.244947,...,91.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Juwanna Mann,4.2,20,"Miguel A. N\u00fa\u00f1ez, Jr. Vivica A. Fox K...","[{'name': 'Jesse Vaughan', 'gender': 0, 'depar...",Jesse Vaughan
1238,1238,40000000,Fantasy Horror Science Fiction,,9306,monster experiment island mutation genetics,en,The Island of Dr. Moreau,A shipwrecked sailor stumbles upon a mysteriou...,13.210898,...,96.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The gates of hell are unlocked.,The Island of Dr. Moreau,4.6,147,Marlon Brando Val Kilmer David Thewlis Fairuza...,"[{'name': 'Valerie McCaffrey', 'gender': 1, 'd...",John Frankenheimer


In [4]:
# (number of rows, number of columns) of the loaded dataset
movies_df.shape

(4803, 24)

In [5]:
# Text columns used to describe each movie for the recommendation
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']

In [6]:
# Missing entries (NaN) in the selected text columns become empty strings,
# so the columns can later be concatenated as plain text.
movies_df[selected_features] = movies_df[selected_features].fillna('')

In [7]:
# Build one text document per movie by joining the selected feature columns
# with single spaces. The columns come from `selected_features`, so the list of
# features is defined in exactly one place.
# Every joined value must be a string, i.e. missing values must already have
# been replaced with ''.
# Note: despite the `_df` suffix the result is a pandas Series of strings.
movies_sel_df = movies_df[selected_features].apply(' '.join, axis=1)

In [8]:
# Spot-check five randomly drawn combined strings (long values are truncated in the display).
movies_sel_df.sample(5)

1414    Comedy Romance isolation atomic bomb bunker sh...
1055    Comedy Family Music musical the muppets robot ...
4351    Action Drama History War sergeant iwo jima A G...
425     Adventure Action Thriller paris london england...
2515    Adventure Drama Action Romance flying martial ...
dtype: object

In [9]:
# TF-IDF vectorizer created with its default settings; it is fitted on the combined text in the next cell.
vectorizer = TfidfVectorizer()

In [10]:
# Fit the vectorizer on the combined text and transform that text in one step.
# The result holds one row of TF-IDF weights per movie.
feature_extraction = vectorizer.fit_transform(movies_sel_df)

In [11]:
# Print the stored entries of the TF-IDF matrix as "(row, column)  weight" lines.
print(feature_extraction)

  (0, 2432)	0.17272411194153
  (0, 7755)	0.1128035714854756
  (0, 13024)	0.1942362060108871
  (0, 10229)	0.16058685400095302
  (0, 8756)	0.22709015857011816
  (0, 14608)	0.15150672398763912
  (0, 16668)	0.19843263965100372
  (0, 14064)	0.20596090415084142
  (0, 13319)	0.2177470539412484
  (0, 17290)	0.20197912553916567
  (0, 17007)	0.23643326319898797
  (0, 13349)	0.15021264094167086
  (0, 11503)	0.27211310056983656
  (0, 11192)	0.09049319826481456
  (0, 16998)	0.1282126322850579
  (0, 15261)	0.07095833561276566
  (0, 4945)	0.24025852494110758
  (0, 14271)	0.21392179219912877
  (0, 3225)	0.24960162956997736
  (0, 16587)	0.12549432354918996
  (0, 14378)	0.33962752210959823
  (0, 5836)	0.1646750903586285
  (0, 3065)	0.22208377802661425
  (0, 3678)	0.21392179219912877
  (0, 5437)	0.1036413987316636
  :	:
  (4801, 17266)	0.2886098184932947
  (4801, 4835)	0.24713765026963996
  (4801, 403)	0.17727585190343226
  (4801, 6935)	0.2886098184932947
  (4801, 11663)	0.21557500762727902
  (4801, 1672

Cosine Similarity

In [12]:
# Pairwise cosine similarity between the TF-IDF rows of all movies:
# entry [i, j] is the similarity between the movie in row i and the movie in row j.

similarity = cosine_similarity(feature_extraction)

In [13]:
# Square matrix: one row and one column per movie
similarity.shape

(4803, 4803)

In [14]:
# Ask the user for a movie title; it is matched approximately against the dataset titles further down.
movie_name = input('Enter your favourite movie name')

Enter your favourite movie name Iron Man


In [15]:
# All movie titles from the dataset as a plain Python list, in DataFrame row order
list_titles = movies_df['title'].tolist()

In [16]:
# Fuzzy-match the user's input against the dataset titles. With the defaults used
# here, difflib returns at most 3 candidates with a similarity ratio of at least 0.6,
# best match first; the list is empty when nothing is close enough.
find_close_match = difflib.get_close_matches(movie_name, list_titles)

In [17]:
# Show the candidate titles returned by the fuzzy match
print(find_close_match)

['Iron Man', 'Iron Man 3', 'Iron Man 2']


In [18]:
# get_close_matches returns its candidates best-first, so element 0 is the closest title.
# The list is empty when no title is similar enough to the input; fail with a clear
# message in that case instead of an opaque "list index out of range".
if not find_close_match:
    raise ValueError("No movie title close to " + repr(movie_name) + " was found in the dataset")
close_match = find_close_match[0]
print(close_match)

Iron Man


In [19]:
# Look up the value of the 'index' column for the first row whose title equals the chosen match.
# NOTE(review): this value is used further down as a row number of `similarity`, which presumably
# relies on the 'index' column being identical to the row order of the CSV — confirm.

index_movie = movies_df[movies_df.title == close_match]['index'].values[0]
print(index_movie)

68


`movies_df[movies_df.title == close_match]` filters the DataFrame down to the rows whose title equals `close_match`.
`['index']` selects the column named 'index' from the filtered DataFrame.
`.values` returns the values of that column as a NumPy array.
`[0]` takes the first value of that array, i.e. the 'index' value of the first movie that matched the condition.

In [20]:
# Pair every similarity score in the chosen movie's row with its position:
# [(0, score_0), (1, score_1), ...]

similarity_score = list(enumerate(similarity[index_movie]))

`enumerate(similarity[index_movie])` pairs each similarity score in the selected movie's row with its position in that row.

In [21]:
# Rank the (position, score) pairs from most similar to least similar.
# pair[1] is the second element of each pair, i.e. the similarity score used as the sort key.
sorted_similar_movies = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)

In [22]:
# Top-ranked (position, score) pair
sorted_similar_movies[0]

(68, 1.0000000000000002)

In [23]:
# Print the titles of the 10 most similar movies, best match first.
# Only the head of the ranking is visited, so the number of title lookups
# equals the number of printed lines.

print("Movies suggested for you : ")

# The first element of every ranked pair is a row position in `similarity`,
# so the title is fetched by position (.iloc) rather than by index label.
for i, (index, _score) in enumerate(sorted_similar_movies[:10], start=1):
    title_from_index = movies_df['title'].iloc[index]
    print(i, ".", title_from_index)

Movies suggested for you : 
1 . Iron Man
2 . Iron Man 2
3 . Iron Man 3
4 . Avengers: Age of Ultron
5 . The Avengers
6 . Captain America: Civil War
7 . Captain America: The Winter Soldier
8 . Ant-Man
9 . X-Men
10 . Made


MOVIE RECOMMENDATION SYSTEM

In [24]:
def recommend_movies(movie_name, top_n=10):
    """Return the titles of the movies most similar to ``movie_name``.

    The name is fuzzy-matched against the ``title`` column of ``movies_df``
    with ``difflib.get_close_matches``. The best match selects one row of the
    precomputed ``similarity`` matrix, and the scores in that row are ranked
    in descending order.

    Parameters
    ----------
    movie_name : str
        Title typed by the user; it does not have to be spelled exactly.
    top_n : int, default 10
        Maximum number of titles to return.

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first. The list is empty when no
        title in the dataset is close enough to ``movie_name``.
    """
    titles = movies_df['title'].tolist()
    close_matches = difflib.get_close_matches(movie_name, titles)
    if not close_matches:
        # Nothing similar enough: let the caller decide what to tell the user.
        return []

    # Row position of the best match. `similarity` is addressed by row position,
    # so the position is taken from the title list itself.
    row = titles.index(close_matches[0])

    ranked = sorted(enumerate(similarity[row]), key=lambda pair: pair[1], reverse=True)
    # Only the first `top_n` pairs are needed; each one is turned into its title.
    return [titles[position] for position, _score in ranked[:top_n]]


movie_name = input('Enter your favourite movie name')
suggestions = recommend_movies(movie_name)

if suggestions:
    print("Movies suggested for you : ")
    for i, title in enumerate(suggestions, start=1):
        print(i, ".", title)
else:
    print("No movie title close to", repr(movie_name), "was found in the dataset.")

Enter your favourite movie name avatar


Movies suggested for you : 
1 . Avatar
2 . Alien
3 . Aliens
4 . Guardians of the Galaxy
5 . Star Trek Beyond
6 . Star Trek Into Darkness
7 . Galaxy Quest
8 . Alien³
9 . Cargo
10 . Trekkies
