Importing the dependencies

In [31]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [12]:
from google.colab import files
uploaded = files.upload()

Saving movies.xlsx to movies.xlsx


Data Collection and Pre-Processing

In [13]:
import pandas as pd

# Adjust the file path as needed
try:
    movies_data = pd.read_excel('movies.xlsx')
    print(movies_data.head())
except FileNotFoundError:
    print("File not found. Please check the file path.")
except pd.errors.EmptyDataError:
    print("No data in file. The file is empty.")
except pd.errors.ParserError:
    print("Error parsing file. Please check the file format.")
except Exception as e:
    print(f"An error occurred: {e}")


  index     budget                                    genres  \
0     0  237000000  Action Adventure Fantasy Science Fiction   
1     1  300000000                  Adventure Fantasy Action   
2     2  245000000                    Action Adventure Crime   
3     3  250000000               Action Crime Drama Thriller   
4     4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel secret ag

In [14]:
# printing the first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,Unnamed: 659,Unnamed: 660,Unnamed: 661,Unnamed: 662,Unnamed: 663,Unnamed: 664,Unnamed: 665,Unnamed: 666,Unnamed: 667,Unnamed: 668
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,,,,,,,,,,
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,,,,,,,,,,
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bondâ€™s past sends him...,107.376788,...,,,,,,,,,,
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,,,,,,,,,,
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,,,,,,,,,,


In [15]:
# number of rows and columns in the data frame

movies_data.shape

(4807, 669)

In [16]:
# selecting the relevant features for recommendation

selected_features = ['genres','keywords','tagline','cast','director']
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [17]:
# replacing the null valuess with null string
# Data Cleaning: Ensuring that there are no NaN values can make subsequent data processing steps easier.
# Consistency: It standardizes the representation of missing values.
# Compatibility: Some algorithms and functions may not handle NaN values well, so replacing them can help avoid errors.

for feature in selected_features:
  movies_data[feature] = movies_data[feature].fillna('')

In [18]:
# combining all the 5 selected features

combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['tagline']+' '+movies_data['cast']+' '+movies_data['director']

In [19]:
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4802    Action Crime Thriller united states\u2013mexic...
4803    Comedy Romance  A newlywed couple's honeymoon ...
4804    Comedy Drama Romance TV Movie date love at fir...
4805      A New Yorker in Shanghai Daniel Henney Eliza...
4806    Documentary obsession camcorder crush dream gi...
Length: 4807, dtype: object


In [20]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [21]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [22]:
print(feature_vectors)

  (0, 2438)	0.17272254420852876
  (0, 7772)	0.112811923361296
  (0, 13045)	0.19423107703567274
  (0, 10247)	0.1605872955430688
  (0, 8774)	0.2270795907528114
  (0, 14630)	0.15150866870941743
  (0, 16695)	0.1984268159727737
  (0, 14086)	0.20595383419815566
  (0, 13340)	0.21773803283801318
  (0, 17321)	0.20197271475420336
  (0, 17037)	0.2364211486676096
  (0, 13370)	0.15021479989378655
  (0, 11522)	0.2720950793822958
  (0, 11211)	0.09050524353480881
  (0, 17028)	0.1282184332517196
  (0, 15283)	0.07099572703389416
  (0, 4956)	0.24024577715275539
  (0, 14293)	0.2139134043528674
  (0, 3232)	0.2495873350675536
  (0, 16614)	0.1255005745211272
  (0, 14400)	0.33962535023778595
  (0, 5848)	0.16467485510936142
  (0, 3072)	0.22207403899583422
  (0, 3686)	0.2139134043528674
  (0, 5448)	0.10371179266351029
  :	:
  (4805, 17297)	0.28866962459935924
  (4805, 4846)	0.24719298256838088
  (4805, 408)	0.17732364786543078
  (4805, 6947)	0.28866962459935924
  (4805, 11682)	0.21562693510809375
  (4805, 1677)

Cosine Similarity

In [23]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [24]:
print(similarity)

[[1.         0.07223087 0.03775287 ... 0.         0.         0.        ]
 [0.07223087 1.         0.03283857 ... 0.03575488 0.         0.        ]
 [0.03775287 0.03283857 1.         ... 0.         0.05369947 0.        ]
 ...
 [0.         0.03575488 0.         ... 1.         0.         0.02651715]
 [0.         0.         0.05369947 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651715 0.         1.        ]]


In [25]:
print(similarity.shape)

(4807, 4807)


Getting the movie name from the user

In [41]:
movies_data['title'] = movies_data['title'].astype(str)

# Getting the movie name from the user
movie_name = input('Enter your favourite movie name: ')

# Creating a list with all the movie names given in the dataset
list_of_all_titles = movies_data['title'].tolist()
print(list_of_all_titles)

# Find close matches
close_matches = difflib.get_close_matches(movie_name, list_of_all_titles)

# Print the result and find the index of the closest match
if close_matches:
    close_match = close_matches[0]  # Take the first close match
    index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]
    print(f"Close matches: {close_matches}")
    print(f"Index of the closest match: {index_of_the_movie}")
else:
    print("No close matches found.")

Enter your favourite movie name: iron man
['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy

In [42]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.03221081479683828), (1, 0.07063232923093056), (2, 0.03645869484968703), (3, 0.007833596897130144), (4, 0.05484027772753234), (5, 0.07905765295831037), (6, 0.02226707787097095), (7, 0.023641627986854218), (8, 0.037177468614276545), (9, 0.04077055968226686), (10, 0.020493679345863915), (11, 0.012362760376390582), (12, 0.01420561517897943), (13, 0.01679725179719729), (14, 0.037158020383051826), (15, 0.047191836710603494), (16, 0.022417741544486315), (17, 0.01359282738849083), (18, 0.05414409118380707), (19, 0.04715822147162058), (20, 0.032077843548450787), (21, 0.011679257514830722), (22, 0.04522663805521953), (23, 0.047917030394940656), (24, 0.07408351613651513), (25, 0.012303485785546887), (26, 0.014495330814863674), (27, 0.016598171621317045), (28, 0.024509961814708353), (29, 0.05452777496583945), (30, 0.03148051908595748), (31, 0.03810183038803887), (32, 0.023496964078980824), (33, 0.008199909693222915), (34, 0.014653739671484317), (35, 0.013203549683627712), (36, 0.02313129104

In [43]:
len(similarity_score)

4807

In [44]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar_movies)

[(68, 1.0), (261, 0.19890036022026827), (1744, 0.16586440108895845), (3730, 0.16172826181941483), (824, 0.15726447541368646), (1330, 0.15667857354068648), (4416, 0.1523661093014194), (1389, 0.14747606469733585), (557, 0.1467895690174872), (1617, 0.14382539603329908), (2310, 0.14318586123554214), (512, 0.1427868424250011), (2924, 0.14109045332544706), (1724, 0.14018203463426238), (3343, 0.1396313866830381), (640, 0.1380240433514586), (652, 0.13565264279252703), (442, 0.13458994353900514), (145, 0.13246106719756237), (3491, 0.12964025766764598), (2212, 0.1289687381962807), (2319, 0.12591949010614661), (1424, 0.11872331319695045), (1851, 0.11572321976810925), (4356, 0.1136226963704355), (1133, 0.11344137451748033), (2003, 0.11221314340713788), (761, 0.10990907752014241), (1381, 0.10969735694031241), (52, 0.10787778066769084), (3783, 0.10616239615770355), (3441, 0.10574339624471515), (4221, 0.10441063256612881), (3990, 0.10433605789783576), (110, 0.10306596389505353), (1180, 0.101481412588

In [45]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Hugo
2 . Ender's Game
3 . Kick-Ass 2
4 . Sexy Beast
5 . The Equalizer
6 . The 5th Wave
7 . The Poker House
8 . Neighbors 2: Sorority Rising
9 . Talladega Nights: The Ballad of Ricky Bobby
10 . Carrie
11 . The Amityville Horror
12 . Madagascar
13 . If I Stay
14 . Kick-Ass
15 . (500) Days of Summer
16 . Les MisÃ©rables
17 . The Dictator
18 . Shutter Island
19 . Madagascar 3: Europe's Most Wanted
20 . Shattered Glass
21 . Let Me In
22 . Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan
23 . Nanny McPhee and the Big Bang
24 . GoodFellas
25 . Shine a Light
26 . BrÃ¼no
27 . The Adventurer: The Curse of the Midas Box
28 . Edge of Darkness
29 . Species


Movie Recommendation Sytem

In [46]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['title'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite movie name : iron man
Movies suggested for you : 

1 . Hugo
2 . Ender's Game
3 . Kick-Ass 2
4 . Sexy Beast
5 . The Equalizer
6 . The 5th Wave
7 . The Poker House
8 . Neighbors 2: Sorority Rising
9 . Talladega Nights: The Ballad of Ricky Bobby
10 . Carrie
11 . The Amityville Horror
12 . Madagascar
13 . If I Stay
14 . Kick-Ass
15 . (500) Days of Summer
16 . Les MisÃ©rables
17 . The Dictator
18 . Shutter Island
19 . Madagascar 3: Europe's Most Wanted
20 . Shattered Glass
21 . Let Me In
22 . Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan
23 . Nanny McPhee and the Big Bang
24 . GoodFellas
25 . Shine a Light
26 . BrÃ¼no
27 . The Adventurer: The Curse of the Midas Box
28 . Edge of Darkness
29 . Species
