In [2]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the movie metadata from 'movies.csv' (a path relative to the kernel's
# working directory) into a pandas DataFrame.
movies_data = pd.read_csv('movies.csv')

# Preview the first rows to check that the file was parsed as expected.
movies_data.head()

In [4]:
# number of rows and columns in the data frame, as a (rows, columns) tuple;
# the recorded run shows (4803, 24)
movies_data.shape

(4803, 24)

In [23]:
# Text columns whose contents are merged to describe each movie
# for the recommendation step.
selected_features = [
  'genres',
  'keywords',
  'tagline',
  'cast',
  'director',
]
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [24]:
# Replace missing values with an empty string in every selected feature
# column, so the later string concatenation never meets a NaN.

movies_data[selected_features] = movies_data[selected_features].fillna('')

In [25]:
# combining all the 5 selected features into one space-separated string per movie;
# the columns are taken from `selected_features` (in list order) so the text
# always matches the chosen feature list. Missing values must already have
# been replaced by '' or the concatenation yields NaN for that movie.

combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
  combined_features = combined_features + ' ' + movies_data[feature]

In [26]:
# Inspect the combined text: one string per movie (pandas abbreviates long
# values and the middle rows when printing a Series).
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [27]:
# converting the text data to feature vectors:
# create a TF-IDF vectorizer with default settings; it is only constructed
# here and is fitted on the combined text in the next cell

vectorizer = TfidfVectorizer()

In [28]:
# Fit the vectorizer on the combined text and transform it in one step;
# the result is a sparse matrix with one row per entry of combined_features.
feature_vectors = vectorizer.fit_transform(combined_features)

In [11]:
print(feature_vectors)
# Reading the output: "(0, 1183)  0.2771..." means that for the first movie (row 0)
# the term stored at vocabulary column index 1183 (indices are 0-based) has a
# TF-IDF value of 0.2771429775697421.

  (0, 1183)	0.2771429775697421
  (0, 243)	0.07630361845708403
  (0, 322)	0.08752209522589645
  (0, 5882)	0.10791658457595885
  (0, 15011)	0.10068472429000817
  (0, 6055)	0.10056743537917023
  (0, 4103)	0.2078190465468255
  (0, 3408)	0.21574818782392277
  (0, 6519)	0.15884357175977007
  (0, 15901)	0.3255851082321633
  (0, 18385)	0.1206723617514615
  (0, 3587)	0.2424810120582838
  (0, 15785)	0.20449485056097091
  (0, 5519)	0.22061174669983705
  (0, 16904)	0.05365726945306952
  (0, 18835)	0.12179929157015999
  (0, 12356)	0.07571236182305312
  (0, 12700)	0.2552737122112953
  (0, 14749)	0.14568185359096344
  (0, 18845)	0.22968831190527225
  (0, 19148)	0.19502634639381394
  (0, 14718)	0.21153518149440187
  (0, 15542)	0.20008526614580363
  (0, 18477)	0.1927717674394528
  (0, 16170)	0.1471845509560594
  :	:
  (4801, 2698)	0.2411630217643225
  (4801, 14565)	0.25682086501772416
  (4801, 18999)	0.2692491943968893
  (4801, 3911)	0.2692491943968893
  (4801, 15406)	0.2692491943968893
  (4801, 8099)	

In [29]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)
print(similarity)
# similarity[i][j] is the cosine similarity between movie i and movie j, e.g.
# similarity[0][1] = 0.07219487: the first movie and the second movie have a
# similarity score of approximately 7.2%. The diagonal is 1.0 because every
# movie is compared with itself.

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [30]:
# One row and one column per movie, so the matrix is square
# (4803 x 4803 in the recorded run).
print(similarity.shape)

(4803, 4803)


In [31]:
# getting the movie name from the user; the title does not need to be exact,
# because a later cell fuzzy-matches it against the titles in the dataset

movie_name = input(' Enter your favourite movie name : ')

In [32]:
# creating a list with all the movie names given in the dataset
# (these are the candidates for fuzzy-matching the user's input)

list_of_all_titles = movies_data['title'].tolist()
# NOTE: this prints every title in the dataset, so the output is very long
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [33]:
# finding the close match for the movie name given by the user;
# difflib.get_close_matches returns the best matches first (at most 3 with the
# default n=3) and an empty list when no title is similar enough
# (default cutoff=0.6)

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Memento', 'Detention', 'Atonement']


In [34]:
# Take the best fuzzy match. get_close_matches returns an empty list when no
# title is similar enough, so stop here with a clear message instead of a bare
# IndexError from [0]; the following cells all depend on close_match.
if not find_close_match:
  raise ValueError('No movie title close to ' + repr(movie_name) + ' was found in the dataset')
close_match = find_close_match[0]
print(close_match)

Memento


In [35]:
# Look up the value of the dataset's 'index' column for the matched title
# (the first matching row wins if a title occurs more than once).

title_matches = movies_data.title == close_match
index_of_the_movie = movies_data.loc[title_matches, 'index'].values[0]
print(index_of_the_movie)

3573


In [36]:
# Pair every movie's row position with its similarity score against the
# chosen movie, giving a list of (position, score) tuples.
similarity_score = [
  (position, score)
  for position, score in enumerate(similarity[index_of_the_movie])
]
print(similarity_score)

[(0, 0.022238879581807963), (1, 0.0), (2, 0.0), (3, 0.11354700071117123), (4, 0.0), (5, 0.04317304446747912), (6, 0.0), (7, 0.01893239271801742), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.004960303346153563), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.042859041372914006), (17, 0.0), (18, 0.01881274329305227), (19, 0.0), (20, 0.0), (21, 0.015741418253464208), (22, 0.0), (23, 0.017026543831485413), (24, 0.0), (25, 0.005621922739998061), (26, 0.0), (27, 0.005086512406840143), (28, 0.005584390364907041), (29, 0.005842103843543599), (30, 0.0), (31, 0.07232542019548455), (32, 0.024424744552739668), (33, 0.00560104609717222), (34, 0.027882879157407746), (35, 0.0), (36, 0.017797038671457616), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0608972781962054), (41, 0.02344737663155489), (42, 0.0), (43, 0.0056757992541223625), (44, 0.0), (45, 0.0050652977819350955), (46, 0.0), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.0), (58,

In [37]:
# One (index, score) pair per movie in the dataset (4803 in the recorded run)
len(similarity_score)

4803

In [38]:
# Order the (position, score) pairs from most to least similar. The sort key is
# the second tuple element (the score); ties keep their original relative order.

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(3573, 1.0), (1604, 0.16376859982844238), (2001, 0.1481021185337802), (4271, 0.14714074447833247), (1767, 0.13085878979145354), (1280, 0.12721742760590674), (1306, 0.12174523797751946), (1196, 0.12024092682472871), (793, 0.11883761951620501), (3288, 0.11881096737823177), (659, 0.11798874034189094), (3, 0.11354700071117123), (3275, 0.1102868122737419), (3853, 0.10926330244201923), (2056, 0.10524899219804938), (2180, 0.1044103642580375), (2198, 0.10418390706145234), (123, 0.10263553502861608), (2879, 0.10167398045918534), (125, 0.10104178927535945), (466, 0.10059588507857799), (634, 0.10042616729016901), (3700, 0.09742016508884471), (3415, 0.09517093417458375), (487, 0.09477920524685526), (3851, 0.09387053689592764), (989, 0.0936637308398623), (1259, 0.09304737271539076), (1103, 0.09135374236619369), (3642, 0.09056463902182818), (491, 0.09030325786835791), (2664, 0.08889550213790486), (95, 0.08822553423314944), (96, 0.08801388548731201), (1033, 0.0867723250761221), (1374, 0.086061975379

In [39]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

# Only the 29 best-scoring entries are shown, so slice them off up front
# instead of walking every movie and filtering the whole frame each time.
# The entries of sorted_similar_movies carry row positions of `similarity`,
# which follow the row order of movies_data, so a positional lookup fetches
# the title directly.
# NOTE: the first entry is the chosen movie itself (similarity 1.0).
for i, movie in enumerate(sorted_similar_movies[:29], start=1):
  index = movie[0]
  title_from_index = movies_data['title'].iloc[index]
  print(i, '.',title_from_index)

Movies suggested for you : 

1 . Memento
2 . 30 Days of Night
3 . The Crew
4 . Trees Lounge
5 . Suspect Zero
6 . Disturbia
7 . Dragon Nest: Warriors' Dawn
8 . The Prestige
9 . The Flintstones in Viva Rock Vegas
10 . Fido
11 . The Long Kiss Goodnight
12 . The Dark Knight Rises
13 . Clay Pigeons
14 . 2:13
15 . Before I Go to Sleep
16 . Silent Hill: Revelation 3D
17 . Lockout
18 . The Matrix Revolutions
19 . Ravenous
20 . The Matrix Reloaded
21 . The Time Machine
22 . The Matrix
23 . Bound
24 . Remember Me, My Love
25 . Red Planet
26 . Taxman
27 . Baby's Day Out
28 . Memoirs of an Invisible Man
29 . The Fugitive


In [43]:
# End-to-end recommendation in one cell: read a title, fuzzy-match it against
# the dataset, rank all movies by cosine similarity to the match and print the
# 29 best-scoring titles (the first one is the matched movie itself).
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  # get_close_matches returns [] when no title is similar enough; report that
  # instead of crashing with an IndexError on find_close_match[0].
  print('No close match found for : ' + movie_name)
else:
  close_match = find_close_match[0]

  # Value of the dataset's 'index' column for the matched title; it is used
  # below as the row position in `similarity`.
  index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

  similarity_score = list(enumerate(similarity[index_of_the_movie]))

  # Highest similarity first; x[1] is the score of each (position, score) pair.
  sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

  print('Movies suggested for you : \n')

  # Slice the 29 entries that are actually printed instead of looping over
  # every movie and filtering the whole frame on each iteration.
  for i, movie in enumerate(sorted_similar_movies[:29], start=1):
    index = movie[0]
    title_from_index = movies_data['title'].iloc[index]
    print(i, '.',title_from_index)

Movies suggested for you : 

1 . Baby's Day Out
2 . Containment
3 . Unaccompanied Minors
4 . GoldenEye
5 . Phone Booth
6 . Daylight
7 . Sex and the City 2
8 . Rumble in the Bronx
9 . Gangster Squad
10 . Mission: Impossible - Ghost Protocol
11 . Dylan Dog: Dead of Night
12 . Beer League
13 . Clerks II
14 . The Net
15 . Doubt
16 . When the Cat's Away
17 . Men in Black II
18 . The Incredibles
19 . Hansel and Gretel Get Baked
20 . Childless
21 . Final Destination
22 . The Men Who Stare at Goats
23 . Christmas with the Kranks
24 . Poltergeist III
25 . Scary Movie 2
26 . Spaced Invaders
27 . Benji
28 . The Adventures of Pluto Nash
29 . Sabotage


In [48]:
# Look up and print the full dataset row for the movie that best matches the
# title typed by the user.
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  # get_close_matches returns [] when no title is similar enough; report that
  # instead of crashing with an IndexError on find_close_match[0].
  print('No close match found for : ' + movie_name)
else:
  close_match = find_close_match[0]
  # Value of the dataset's "index" column for the matched title.
  # NOTE(review): it is used as a row label below, which assumes the column
  # mirrors the frame's own index - confirm against the CSV.
  index = movies_data[movies_data.title == close_match]["index"].values[0]

  row_data = movies_data.loc[index]  # Access the row data by index
  print(row_data)

index                                                                  96
budget                                                          160000000
genres                  Action Thriller Science Fiction Mystery Adventure
homepage                            http://inceptionmovie.warnerbros.com/
id                                                                  27205
keywords                loss of lover dream kidnapping sleep subconsci...
original_language                                                      en
original_title                                                  Inception
overview                Cobb, a skilled thief who commits corporate es...
popularity                                                      167.58371
production_companies    [{"name": "Legendary Pictures", "id": 923}, {"...
production_countries    [{"iso_3166_1": "GB", "name": "United Kingdom"...
release_date                                                   2010-07-14
revenue                               