### Movie Recommendation System - Cosine Similairty

In [1]:
import numpy as np
import pandas as pd
import difflib #if user gives spelling mistake while searching for a movie, it predicts the nearest possiblity of the searched word
from sklearn.feature_extraction.text import TfidfVectorizer #textual data into numerical data
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-processing

In [2]:
movies_data = pd.read_csv("/Users/pavansaipendry/Desktop/Master's/Sem 1/Machine Learning/Projects/Movie Recommendation System/movies.csv")

In [3]:
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
movies_data.shape

(4803, 24)

In [5]:
movies_data.isna().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [27]:
movies_data["vote_average"].value_counts()

vote_average
6.5    216
6.0    216
6.7    213
6.3    207
6.1    201
      ... 
2.7      1
0.5      1
2.2      1
1.9      1
2.4      1
Name: count, Length: 71, dtype: int64

Selecting the relevant features for Recommendation

In [7]:
selected_features = ['genres' , 'keywords' , 'tagline', 'cast', 'crew' , 'director']
selected_features

['genres', 'keywords', 'tagline', 'cast', 'crew', 'director']

Replacing null values with string

In [8]:
for features in selected_features:
    movies_data[features] = movies_data[features].fillna('')


Combining all the Selected features

In [23]:
combined_movieData = movies_data['genres'] + movies_data['keywords'] + movies_data['tagline'] + movies_data['cast'] + movies_data['crew'] + movies_data['director']


object


Converting the Text data into Numerical data.

We use `TfidfVectorizer`, to convert text data into numerical data

In [10]:
vectorizer = TfidfVectorizer()

In [11]:
feature_vector = vectorizer.fit_transform(combined_movieData)

Cosine Similarity

In [12]:
similarity = cosine_similarity(feature_vector)

In [13]:
print(similarity.shape)

(4803, 4803)


Getting the movie name from user

In [14]:
movie_name = input('Give me your favourite movie name')

In [15]:
list_of_all_titles = movies_data['title'].tolist()
# list_of_all_titles

Now that we got all the movie name, we need to find the nearest match

In [16]:
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

In [17]:
print(find_close_match)

['Iron Man', 'Iron Man 3', 'Iron Man 2']


In [18]:
close_match = find_close_match[0]
print(close_match)

Iron Man


Find the `index` of the movie with title

In [19]:
index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
print(index_of_the_movie)

68


Getting list of similar movies

In [20]:
similarity_score = list(enumerate(similarity[index_of_the_movie]))
print("similarity_score",similarity_score)

similarity_score [(0, 0.48149069957258245), (1, 0.4052521022259182), (2, 0.5015450427013423), (3, 0.5113976569568972), (4, 0.49212305962826713), (5, 0.4706430640913124), (6, 0.43727434733936466), (7, 0.5164531648803312), (8, 0.37000235289749384), (9, 0.512844448944907), (10, 0.37812041563617715), (11, 0.3387003691496535), (12, 0.366003247112039), (13, 0.40303423374594655), (14, 0.45509858058127955), (15, 0.47383073584371155), (16, 0.5149973982025331), (17, 0.41790102113969546), (18, 0.3587377793090094), (19, 0.495058701771347), (20, 0.5064402754247072), (21, 0.32305639526535285), (22, 0.49756743840327805), (23, 0.3988045218177907), (24, 0.41248770957906306), (25, 0.45423466186099315), (26, 0.5267736523771356), (27, 0.4953481600581914), (28, 0.5115175862622154), (29, 0.39435688076012215), (30, 0.5206558253492365), (31, 0.508128148896907), (32, 0.3699605847402115), (33, 0.47465214467335015), (34, 0.28995152969511684), (35, 0.4600075129482396), (36, 0.4749046479774919), (37, 0.39057628090

Check for the Highest similarity

In [21]:
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse=True)

i=1
print(f"Suggested movies for: {movie_name}")
for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movies_data[movies_data.index == index]['title'].values[0]

    if i<21:
        print(i, '-', title_from_index)
        i+=1
    

Suggested movies for: Iron man
1 - Iron Man
2 - Iron Man 2
3 - Ant-Man
4 - Hulk
5 - Blade: Trinity
6 - Captain America: Civil War
7 - Spider-Man
8 - Terminator 3: Rise of the Machines
9 - Spider-Man 2
10 - Catwoman
11 - Alexander
12 - The Core
13 - Elektra
14 - Blade II
15 - Bad Boys II
16 - The Punisher
17 - Avengers: Age of Ultron
18 - Cursed
19 - Blade
20 - The Chronicles of Riddick


Movie Recommendation System

In [22]:
movie_name = input('Give me your favourite movie name')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]
index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse=True)

i=1
print(f"Suggested movies for: {movie_name}")
for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movies_data[movies_data.index == index]['title'].values[0]

    if i<21:
        print(i, '-', title_from_index)
        i+=1
    


Suggested movies for: Iron man
1 - Iron Man
2 - Iron Man 2
3 - Ant-Man
4 - Hulk
5 - Blade: Trinity
6 - Captain America: Civil War
7 - Spider-Man
8 - Terminator 3: Rise of the Machines
9 - Spider-Man 2
10 - Catwoman
11 - Alexander
12 - The Core
13 - Elektra
14 - Blade II
15 - Bad Boys II
16 - The Punisher
17 - Avengers: Age of Ultron
18 - Cursed
19 - Blade
20 - The Chronicles of Riddick
