Importing the dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [3]:

movies_data = pd.read_csv('/content/IMDb Movies India.csv', encoding='latin-1')

In [4]:
# printing the first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [5]:
# number of rows and columns in the data frame

movies_data.shape

(15509, 10)

In [16]:
# selecting the relevant features for recommendation

selected_features = ['Genre','Rating','Votes','Director','Actor 1','Actor 2','Actor 3']
print(selected_features)

['Genre', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3']


In [17]:
# replacing the null valuess with null string

# Check if the column exists before trying to fill null values
for feature in selected_features:
  if feature in movies_data.columns:  # Check if column exists
    movies_data[feature] = movies_data[feature].fillna('')
  else:
    print(f"Warning: Column '{feature}' not found in the DataFrame.")

In [18]:
# printing the first 5 rows of the dataframe
movies_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [22]:
# combining all the 5 selected features

# Convert numerical columns to string type before concatenation
combined_features = movies_data['Genre'].astype(str) + ' ' + \
                     movies_data['Rating'].astype(str) + ' ' + \
                     movies_data['Votes'].astype(str) + ' ' + \
                     movies_data['Director'].astype(str) + ' ' + \
                     movies_data['Actor 1'].astype(str) + ' ' + \
                     movies_data['Actor 2'].astype(str) + ' ' + \
                     movies_data['Actor 3'].astype(str)

In [23]:
print(combined_features)

0        Drama   J.S. Randhawa Manmauji Birbal Rajendra...
1        Drama 7.0 8 Gaurav Bakshi Rasika Dugal Vivek G...
2        Drama, Musical   Soumyajit Majumdar Sayani Gup...
3        Comedy, Romance 4.4 35 Ovais Khan Prateik Ishi...
4        Drama   Amol Palekar Rajat Kapoor Rituparna Se...
                               ...                        
15504    Action 4.6 11 Mahendra Shah Naseeruddin Shah S...
15505    Action, Drama 4.5 655 Kuku Kohli Akshay Kumar ...
15506                Action   Kiran Thej Sangeeta Tiwari  
15507                                         Action      
15508    Action, Drama 6.2 20 K.C. Bokadia Dharmendra J...
Length: 15509, dtype: object


In [24]:
vectorizer = TfidfVectorizer()

In [25]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [26]:
print(feature_vectors)

  (0, 2462)	0.4033873866083307
  (0, 8192)	0.3768760382326694
  (0, 2630)	0.3926314980507225
  (0, 6425)	0.5548518397371268
  (0, 8328)	0.46927472364250905
  (0, 3683)	0.11403615461934238
  (1, 4938)	0.44371931738321657
  (1, 1777)	0.2899616026660809
  (1, 4142)	0.44371931738321657
  (1, 11016)	0.2711891233635194
  (1, 3697)	0.3819354848158827
  (1, 8381)	0.3819354848158827
  (1, 2101)	0.2659398144444615
  (1, 4092)	0.28278932012854
  (1, 3683)	0.07849744901578116
  (2, 1552)	0.42807925324005125
  (2, 8672)	0.20527960524413624
  (2, 2710)	0.3982762044752275
  (2, 7786)	0.3982762044752275
  (2, 4338)	0.2166934662921647
  (2, 9227)	0.32907566823488504
  (2, 6293)	0.28183895353547533
  (2, 9909)	0.42807925324005125
  (2, 6961)	0.1835580193335261
  (2, 3683)	0.07573059823966008
  :	:
  (15505, 10597)	0.40001426512592986
  (15505, 5895)	0.42574561459386395
  (15505, 1767)	0.3086524895540163
  (15505, 5787)	0.31569258937332356
  (15505, 4755)	0.26098682210460494
  (15505, 5673)	0.23204440015

In [27]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [28]:
print(similarity)

[[1.         0.00895155 0.00863603 ... 0.         0.         0.01164137]
 [0.00895155 1.         0.00594466 ... 0.         0.         0.00801341]
 [0.00863603 0.00594466 1.         ... 0.         0.         0.00773095]
 ...
 [0.         0.         0.         ... 1.         0.16682422 0.02394282]
 [0.         0.         0.         ... 0.16682422 1.         0.14352125]
 [0.01164137 0.00801341 0.00773095 ... 0.02394282 0.14352125 1.        ]]


In [29]:
print(similarity.shape)

(15509, 15509)


Getting the movie name from the user

In [31]:
# getting the movie name from the user

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : iron man


In [33]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['Name'].tolist()
print(list_of_all_titles)



In [34]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Nirmaan', 'Nirmaan', 'Woman']


In [35]:
close_match = find_close_match[0]
print(close_match)

Nirmaan


In [42]:
# finding the director of the movie with title

index_of_the_movie = movies_data[movies_data['Name'] == close_match]['Director'].values[0]
print(index_of_the_movie)

Ravi Tandon


In [46]:
# Create an index column if needed
movies_data.reset_index(inplace=True)

index_of_the_movie = movies_data[movies_data.Name == close_match]['index'].values[0]
print(index_of_the_movie)

10208


In [47]:
# getting a list of similar movies

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)

[(0, 0.01242331545001353), (1, 0.008551661307763827), (2, 0.008250235324841069), (3, 0.0), (4, 0.009567310366312446), (5, 0.009450347303721797), (6, 0.007958028822782577), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.009478182854209786), (11, 0.0), (12, 0.0), (13, 0.007551507878428422), (14, 0.0), (15, 0.010128689144237677), (16, 0.0), (17, 0.009497216994083449), (18, 0.0077658885494487), (19, 0.0), (20, 0.007870163293847543), (21, 0.11311257607397612), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.008253968143682728), (28, 0.009852653141396774), (29, 0.0), (30, 0.00878680456038023), (31, 0.0), (32, 0.010105594512939695), (33, 0.0), (34, 0.0), (35, 0.010509797809938694), (36, 0.010186035974647957), (37, 0.01211161281751626), (38, 0.0), (39, 0.0), (40, 0.008171524080884238), (41, 0.0), (42, 0.00940053120557503), (43, 0.08584371234112531), (44, 0.0), (45, 0.0), (46, 0.009904506291897595), (47, 0.0), (48, 0.10894190085138432), (49, 0.1295758052724285), (50, 0.0), (51, 0.0), (52, 0

In [48]:
len(similarity_score)

15509

In [49]:
# sorting the movies based on their similarity score

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar_movies)

[(10208, 1.0), (12412, 0.5735193608770744), (14530, 0.43321820061325594), (4365, 0.40572566281567024), (1881, 0.34935778552362373), (2965, 0.3485632447483249), (2019, 0.34777027470145455), (3730, 0.3457405750950091), (5997, 0.33917071647265645), (10483, 0.3333952009061826), (9245, 0.3329408994073275), (14845, 0.3296770118208612), (13044, 0.3235116801489714), (8255, 0.32280268441806037), (9525, 0.3205532868202412), (4123, 0.32002533320237025), (12625, 0.31713088757364394), (4888, 0.31621348128516225), (3798, 0.31511263923917743), (3617, 0.3141383455150141), (9850, 0.3088670837703997), (3619, 0.30814048417155765), (7510, 0.3078174372596894), (221, 0.30419284222184934), (15492, 0.30209539886054415), (15096, 0.2997115460056714), (3621, 0.2995518879456081), (119, 0.2979480305113417), (2585, 0.2938952070224782), (13697, 0.2920243760998543), (4244, 0.2903560950657972), (13937, 0.28856876939530485), (5474, 0.28313900448009627), (10633, 0.2782992963384433), (1687, 0.27671199542942126), (5579, 0

In [50]:
# print the name of similar movies based on the index

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['Name'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

Movies suggested for you : 

1 . Nirmaan
2 . Sansar
3 . Tumhari Kassam
4 . Ek Baar Kaho
5 . Barkha Bahar
6 . Chhalia
7 . Berahem
8 . Dharma
9 . I Wonder...
10 . Paise Ki Gudiya
11 . Mere Sajna
12 . Victoria No. 203
13 . Shiv Charan
14 . Log Kya Kahenge
15 . Mounto
16 . Doosra Roop
17 . Sawan Bhadon
18 . Ganga Tera Pani Amrit
19 . Dhund
20 . Desh Drohee
21 . Nadaan
22 . Desh Ke Dushman
23 . Khanjar
24 . Aafat
25 . Zorro
26 . Woh Main Nahin
27 . Desh Premee
28 . 7 Saal Baad
29 . Buddha Mil Gaya


Movie Recommendation Sytem

In [52]:
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['Name'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

close_match = find_close_match[0]

# Use 'Name' instead of 'title' to access the correct column
index_of_the_movie = movies_data[movies_data['Name'] == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

i = 1

for movie in sorted_similar_movies:
  index = movie[0]
  title_from_index = movies_data[movies_data.index==index]['Name'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your favourite movie name : iron man
Movies suggested for you : 

1 . Nirmaan
2 . Sansar
3 . Tumhari Kassam
4 . Ek Baar Kaho
5 . Barkha Bahar
6 . Chhalia
7 . Berahem
8 . Dharma
9 . I Wonder...
10 . Paise Ki Gudiya
11 . Mere Sajna
12 . Victoria No. 203
13 . Shiv Charan
14 . Log Kya Kahenge
15 . Mounto
16 . Doosra Roop
17 . Sawan Bhadon
18 . Ganga Tera Pani Amrit
19 . Dhund
20 . Desh Drohee
21 . Nadaan
22 . Desh Ke Dushman
23 . Khanjar
24 . Aafat
25 . Zorro
26 . Woh Main Nahin
27 . Desh Premee
28 . 7 Saal Baad
29 . Buddha Mil Gaya
