# Movie Recommendation system for Bollywood movies

### based on name of actors

## Importing the dependencies

In [304]:
import numpy as np
import pandas as pd
import difflib
from scipy.sparse import find

from sklearn.feature_extraction.text import TfidfVectorizer 
# for transforming the textual data to numerical feature vector to find present similarity values 
from sklearn.metrics.pairwise import cosine_similarity

## Dataset Loading and Preprocessing


In [313]:
# loading the dataset

dataset = pd.read_csv('bollywood_movie_dataset.csv')

In [314]:
dataset.shape

(4329, 23)

In [318]:
dataset.head()

Unnamed: 0,index,title_x,imdb_id,poster_path,wiki_link,title,original_title,is_adult,year_of_release,runtime,...,story,summary,tagline,actors,wins_nominations,release_date,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,1,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,...,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA),,,,
1,2,Battalion 609,tt9472208,,https://en.wikipedia.org/wiki/Battalion_609,Battalion 609,Battalion 609,0,2019,131,...,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,11 January 2019 (India),,,,
2,3,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Accidental_P...,The Accidental Prime Minister,The Accidental Prime Minister,0,2019,112,...,Based on the memoir by Indian policy analyst S...,Explores Manmohan Singh's tenure as the Prime ...,,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,,11 January 2019 (USA),,,,#VALUE!
3,4,Why Cheat India,tt8108208,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Why_Cheat_India,Why Cheat India,Why Cheat India,0,2019,121,...,The movie focuses on existing malpractices in ...,The movie focuses on existing malpractices in ...,,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,,18 January 2019 (USA),,,,
4,5,Evening Shadows,tt6028796,,https://en.wikipedia.org/wiki/Evening_Shadows,Evening Shadows,Evening Shadows,0,2018,102,...,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,17 wins & 1 nomination,11 January 2019 (India),,,,


### Removing unnecessary details

In [319]:
dataset = dataset.drop(['imdb_id','imdb_rating', 'poster_path','is_adult', 'wiki_link','title','original_title', 'imdb_votes',  'year_of_release','wins_nominations', 'release_date'], axis = 1)

In [320]:
dataset.head()

Unnamed: 0,index,title_x,runtime,genres,story,summary,tagline,actors,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,1,Uri: The Surgical Strike,138,Action|Drama|War,Divided over five chapters the film chronicle...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,,,,
1,2,Battalion 609,131,War,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,,,
2,3,The Accidental Prime Minister (film),112,Biography|Drama,Based on the memoir by Indian policy analyst S...,Explores Manmohan Singh's tenure as the Prime ...,,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,,,,#VALUE!
3,4,Why Cheat India,121,Crime|Drama,The movie focuses on existing malpractices in ...,The movie focuses on existing malpractices in ...,,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,,,,
4,5,Evening Shadows,102,Drama,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,,,,


In [321]:
dataset.tail()

Unnamed: 0,index,title_x,runtime,genres,story,summary,tagline,actors,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
4324,4325,Samadhi (1950 film),165,Drama,The story is based on the true incident at INA...,The story is based on the true incident at INA...,,Ashok Kumar|Nalini Jaywant|Kuldip Kaur|Shyam|M...,,,,
4325,4326,Sangram (1950 film),139,Drama,After the death of his wife a policeman fails...,After the death of his wife a policeman fails...,,Ashok Kumar|Nalini Jaywant|Nawab|Sajjan|Tiwari...,,,,
4326,4327,Sargam (1950 film),135,Drama|Family,,Add a Plot »,,Raj Kapoor|Rehana|Om Prakash|David Abraham|Rad...,,,,
4327,4328,Sheesh Mahal (1950 film),144,Drama,Thakur Jaspal Singh lives in the prestigious a...,Thakur Jaspal Singh lives in the prestigious a...,,Sohrab Modi|Naseem Banu|Pushpa Hans|Nigar Sult...,,,,
4328,4329,Meena Bazaar (film),\N,Drama,,Add a Plot »,,,,,,


In [322]:
# columns in dataset

dataset.columns

Index(['index', 'title_x', 'runtime', 'genres', 'story', 'summary', 'tagline',
       'actors', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22'],
      dtype='object')

In [323]:
# shape of dataset frame

dataset.shape

(4329, 12)

In [324]:
# information about dataset

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4329 entries, 0 to 4328
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        4329 non-null   int64  
 1   title_x      4329 non-null   object 
 2   runtime      4329 non-null   object 
 3   genres       4329 non-null   object 
 4   story        4065 non-null   object 
 5   summary      4329 non-null   object 
 6   tagline      685 non-null    object 
 7   actors       4320 non-null   object 
 8   Unnamed: 19  0 non-null      float64
 9   Unnamed: 20  0 non-null      float64
 10  Unnamed: 21  0 non-null      float64
 11  Unnamed: 22  1 non-null      object 
dtypes: float64(3), int64(1), object(8)
memory usage: 406.0+ KB


In [325]:
# checking for null values in dataset

dataset.isnull().sum()

index             0
title_x           0
runtime           0
genres            0
story           264
summary           0
tagline        3644
actors            9
Unnamed: 19    4329
Unnamed: 20    4329
Unnamed: 21    4329
Unnamed: 22    4328
dtype: int64

In [326]:
# Total count of null values

dataset.isnull().sum().sum()

21232

In [328]:
# shape of dataset now 

dataset.shape

(4329, 12)

In [329]:
# changing the title of movies to lowercase 

dataset['title_x'] = dataset['title_x'].str.lower()
print(dataset['title_x'].head())

0                uri: the surgical strike
1                           battalion 609
2    the accidental prime minister (film)
3                         why cheat india
4                         evening shadows
Name: title_x, dtype: object


In [340]:
# changing the actors names to lowercase 

dataset['actors'] = dataset['actors'].str.lower()
print(dataset['actors'].head())

0    vicky kaushal|paresh rawal|mohit raina|yami ga...
1    vicky ahuja|shoaib ibrahim|shrikant kamat|elen...
2    anupam kher|akshaye khanna|aahana kumra|atul s...
3    emraan hashmi|shreya dhanwanthary|snighdadeep ...
4    mona ambegaonkar|ananth narayan mahadevan|deva...
Name: actors, dtype: object


In [342]:
# changing the story to lowercase 

dataset['story'] = dataset['story'].str.lower()
print(dataset['story'].head())

0    divided over five chapters  the film chronicle...
1    the story revolves around a cricket match betw...
2    based on the memoir by indian policy analyst s...
3    the movie focuses on existing malpractices in ...
4    while gay rights and marriage equality has bee...
Name: story, dtype: object


In [343]:
# changing the summary to lowercase 

dataset['summary'] = dataset['summary'].str.lower()
print(dataset['summary'].head())

0    indian army special forces execute a covert op...
1    the story of battalion 609 revolves around a c...
2    explores manmohan singh's tenure as the prime ...
3    the movie focuses on existing malpractices in ...
4    under the 'evening shadows'  truth often plays...
Name: summary, dtype: object


In [344]:
dataset.head()

Unnamed: 0,index,title_x,runtime,genres,story,summary,tagline,actors,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,1,uri: the surgical strike,138,Action|Drama|War,divided over five chapters the film chronicle...,indian army special forces execute a covert op...,,vicky kaushal|paresh rawal|mohit raina|yami ga...,,,,
1,2,battalion 609,131,War,the story revolves around a cricket match betw...,the story of battalion 609 revolves around a c...,,vicky ahuja|shoaib ibrahim|shrikant kamat|elen...,,,,
2,3,the accidental prime minister (film),112,Biography|Drama,based on the memoir by indian policy analyst s...,explores manmohan singh's tenure as the prime ...,,anupam kher|akshaye khanna|aahana kumra|atul s...,,,,#VALUE!
3,4,why cheat india,121,Crime|Drama,the movie focuses on existing malpractices in ...,the movie focuses on existing malpractices in ...,,emraan hashmi|shreya dhanwanthary|snighdadeep ...,,,,
4,5,evening shadows,102,Drama,while gay rights and marriage equality has bee...,under the 'evening shadows' truth often plays...,,mona ambegaonkar|ananth narayan mahadevan|deva...,,,,


In [345]:
selected_features = ['genres', 'story', 'actors','runtime', 'summary']
print(selected_features)

['genres', 'story', 'actors', 'runtime', 'summary']


In [346]:
# replacing the null values with null string

for features in selected_features :
  dataset[features] = dataset[features].fillna('')

In [347]:
dataset[features].isnull().sum()

0

In [348]:
dataset['runtime'] = dataset['runtime'].astype(str)

In [349]:
combined_features = dataset['genres']+''+dataset['story']+''+dataset['actors']+''+dataset['runtime']+''+dataset['summary']

In [350]:
# first 5 values of dataset

combined_features.head()

0    Action|Drama|Wardivided over five chapters  th...
1    Warthe story revolves around a cricket match b...
2    Biography|Dramabased on the memoir by indian p...
3    Crime|Dramathe movie focuses on existing malpr...
4    Dramawhile gay rights and marriage equality ha...
dtype: object

In [351]:
# first row of dataset

combined_features[0]

'Action|Drama|Wardivided over five chapters  the film chronicles the events of the surgical strike conducted by the indian military against suspected militants in pakistan occupied kashmir. it tells the story of the 11 tumultuous events over which the operation was carried out. indian army special forces carry out a covert operation to avenge the killing of fellow army men at their base by a terrorist group.vicky kaushal|paresh rawal|mohit raina|yami gautam|kirti kulhari|rajit kapoor|ivan rodrigues|manasi parekh|swaroop sampat|riva arora|yogesh soman|fareed ahmed|akashdeep arora|kallol banerjee|138indian army special forces execute a covert operation  avenging the killing of fellow army men at their base by a terrorist group.'

In [352]:
print(combined_features)

0       Action|Drama|Wardivided over five chapters  th...
1       Warthe story revolves around a cricket match b...
2       Biography|Dramabased on the memoir by indian p...
3       Crime|Dramathe movie focuses on existing malpr...
4       Dramawhile gay rights and marriage equality ha...
                              ...                        
4324    Dramathe story is based on the true incident a...
4325    Dramaafter the death of his wife  a policeman ...
4326    Drama|Familyraj kapoor|rehana|om prakash|david...
4327    Dramathakur jaspal singh lives in the prestigi...
4328                                  Drama\Nadd a plot »
Length: 4329, dtype: object


## Converting the text data into features vector using TfidfVectorizer

In [353]:
# Transforms text to feature vectors that can be used as input to estimator.

vectorizer = TfidfVectorizer()
feature_vector = vectorizer.fit_transform(combined_features )

In [354]:
print(feature_vector)

  (0, 4628)	0.11056112975602572
  (0, 11540)	0.11258520734395237
  (0, 970)	0.13157467040336976
  (0, 5016)	0.06653960541939301
  (0, 16332)	0.13157467040336976
  (0, 3463)	0.11258520734395237
  (0, 3382)	0.08542560005357036
  (0, 12092)	0.12106790007969773
  (0, 27619)	0.10103263855926892
  (0, 31988)	0.10877576679362172
  (0, 4235)	0.1339213797401411
  (0, 24696)	0.12106790007969773
  (0, 25838)	0.10005435943235372
  (0, 28760)	0.09744944199168287
  (0, 21712)	0.07725545582260569
  (0, 18301)	0.10320177051193972
  (0, 24759)	0.11258520734395237
  (0, 15535)	0.11492183343665972
  (0, 16461)	0.0318775622180689
  (0, 23539)	0.08954758910868171
  (0, 17117)	0.10877576679362172
  (0, 16907)	0.0921229298269117
  (0, 12995)	0.07904081878500971
  (0, 31934)	0.1044150631129877
  (0, 23473)	0.09002883795542357
  :	:
  (4327, 27642)	0.08286550576862918
  (4327, 14035)	0.04811204249053485
  (4327, 14304)	0.12963536666550554
  (4327, 27290)	0.07463532174402272
  (4327, 31707)	0.024504796231823583

## Cosine Similarity

In [355]:
# similarity confidence value using cosine similiarity

similarity = cosine_similarity(feature_vector)

In [356]:
print(similarity)

[[1.         0.17219709 0.04617793 ... 0.00685765 0.03572243 0.01023735]
 [0.17219709 1.         0.05877757 ... 0.         0.04842137 0.        ]
 [0.04617793 0.05877757 1.         ... 0.         0.05624512 0.        ]
 ...
 [0.00685765 0.         0.         ... 1.         0.         0.14936306]
 [0.03572243 0.04842137 0.05624512 ... 0.         1.         0.        ]
 [0.01023735 0.         0.         ... 0.14936306 0.         1.        ]]


In [357]:
print(similarity.shape)
# compare with all the other movies present there

(4329, 4329)


In [358]:
# creating a list with all the moviee names in dataset

list_of_titles = dataset['title_x'].tolist()
print(list_of_titles)

['uri: the surgical strike', 'battalion 609', 'the accidental prime minister (film)', 'why cheat india', 'evening shadows', 'soni (film)', 'fraud saiyaan', 'bombairiya', 'manikarnika: the queen of jhansi', 'thackeray (film)', 'amavas', 'gully boy', 'hum chaar', 'total dhamaal', 'sonchiriya', 'badla (2019 film)', 'mard ko dard nahi hota', 'hamid (film)', 'photograph (film)', 'risknamaa', 'mere pyare prime minister', '22 yards', 'kesari (film)', 'notebook (2019 film)', 'junglee (2019 film)', 'gone kesh', 'albert pinto ko gussa kyun aata hai?', 'the tashkent files', 'kalank', 'setters (film)', 'student of the year 2', 'pm narendra modi', 'de de pyaar de', "india's most wanted (film)", 'yeh hai india', 'khamoshi (2019 film)', 'kabir singh', 'article 15 (film)', 'one day: justice delivered', 'hume tumse pyaar kitna', 'super 30 (film)', 'family of thakurganj', 'batla house', 'jhootha kahin ka', 'judgementall hai kya', 'chicken curry law', 'arjun patiala', 'jabariya jodi', 'pranaam', 'the sky

## Recommedation

In [359]:
# getting the name as input for recommendation

movie_name = input('Enter your movie name : ')

Enter your movie name : blue


In [360]:
# finding the movie name given by the user

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['billu']


In [361]:
close_match = find_close_match[0]
print(close_match)

billu


In [362]:
# Index of movie with title and data

index_of_movie = dataset[dataset.title_x == close_match]['index'].values[0]
# we are finding the index of movie by comparing the title of each movie with closly matched string and returing its index 
print(index_of_movie)

925


In [363]:
# finding similariy score 

score = list(enumerate(similarity[index_of_movie]))
print(score)

[(0, 0.0504603935512987), (1, 0.05598679100700425), (2, 0.05008182954465237), (3, 0.03091013033161837), (4, 0.04338666268313743), (5, 0.021535770823434677), (6, 0.03315927165210743), (7, 0.03495016744334574), (8, 0.05396719645150364), (9, 0.08491605128992966), (10, 0.041881722861691145), (11, 0.07698273909067933), (12, 0.023594408176935467), (13, 0.06429101255829549), (14, 0.0732936832379373), (15, 0.07073957309329039), (16, 0.0338109621413548), (17, 0.057734160657368125), (18, 0.04362084199199213), (19, 0.06277161640147189), (20, 0.03849491106322587), (21, 0.013384318042419463), (22, 0.04181166792143568), (23, 0.061930549802034704), (24, 0.047648353531583844), (25, 0.03710164697565409), (26, 0.09630699719611745), (27, 0.050498808327166606), (28, 0.07162441411533162), (29, 0.027876387811893888), (30, 0.021585084100274262), (31, 0.003907695911260164), (32, 0.04798942471085578), (33, 0.027231715289926364), (34, 0.042141298665423016), (35, 0.038348949972527246), (36, 0.03607553377738772),

In [364]:
len(score)

4329

In [365]:
# finding movies having higher similarity score 

sorted_list = sorted(score, key = lambda x:x[1], reverse = True)
# sort based on non increasing order, key is based on first index of score in list element which is score in (index, score)
print(sorted_list)

[(925, 1.0), (2170, 0.1940246564412141), (1481, 0.16153595332923268), (2858, 0.1590545409726467), (1424, 0.15488437882685321), (1960, 0.1534661501253374), (1883, 0.15296839428114012), (1835, 0.15279288009894762), (1003, 0.15122027087397583), (2587, 0.1510280014703504), (1592, 0.14683591660244746), (1017, 0.1466111389976885), (1480, 0.14598884573079698), (3190, 0.14585979089787734), (3533, 0.14406733147881737), (3461, 0.1437170883183096), (1728, 0.14303500796056887), (2912, 0.143017195927542), (802, 0.1405041129377433), (1345, 0.13914888392759092), (3246, 0.13894225363541857), (1538, 0.13851916737216016), (1068, 0.13800353717221164), (3521, 0.1378768255921755), (3272, 0.13716936573276675), (3714, 0.13681861756146893), (2293, 0.1362201687020892), (1299, 0.13550964177517638), (1029, 0.13506432910501284), (1641, 0.13430485760041547), (3906, 0.13384161397298266), (1337, 0.1334983926603454), (3836, 0.13329284773158204), (1010, 0.13323047337205257), (3450, 0.13316962073562358), (1061, 0.13301

In [366]:
print(sorted_list[0], sorted_list[1], sorted_list[2], sorted_list[3], sorted_list[4])
# top five score with index 

(925, 1.0) (2170, 0.1940246564412141) (1481, 0.16153595332923268) (2858, 0.1590545409726467) (1424, 0.15488437882685321)


In [367]:
# printing the name of similar movies based on the index

print("Movies Results : \n")

i = 1

for movie in sorted_list:
  index = movie[0]
  title_of_movie = dataset[dataset.index == index]['title_x'].values[0]
  if(i<=10):
    print(i, '.', title_of_movie)
    i += 1

Movies Results : 

1 . the stoneman murders
2 . jeevan ki shatranj
3 . praan jaye par shaan na jaye
4 . ardh satya
5 . baaz: a bird in danger
6 . vijeta (1996 film)
7 . mahaanta
8 . zulm-o-sitam
9 . woodstock villa
10 . jalwa


## Recommendation function 

In [373]:
def recommend(name):
  find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
  
  try:
    close_match = find_close_match[0]
  except IndexError:
    close_match = 'null'
    print("No result found with this title")
    return 
  index_of_movie = dataset[dataset.title_x == close_match]['index'].values[0]
  score = list(enumerate(similarity[index_of_movie]))
  sorted_list = sorted(score, key = lambda x:x[1], reverse = True)


  print("Movies Results : \n")

  i = 1

  for movie in sorted_list:
    index = movie[0]
    title_of_movie = dataset[dataset.index == index]['title_x'].values[0]
    if(i<=3):
      print(i, '. ', title_of_movie)
      i += 1

In [381]:
movie_name = input('Enter movie/chracter/ clue : ')

recommend(movie_name)

Enter movie/chracter/ clue : london
Movies Results : 

1 .  azhar (film)
2 .  malik ek
3 .  22 yards
