**Movie recommendation system**

**Importing the neccessary liberaries** 

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# difflib is used to get the closest match of the text 
# Tfidvectorizer is used to convert text to numeriacal values
# cosine similarity is used to find the similarity score for the movie 

**Data Preprocessing**

In [3]:
path = 'movies.csv'
data = pd.read_csv(path)

In [4]:
data.head(4)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan


In [5]:
print(data.columns)

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


In [6]:
print(data.shape)

(4803, 24)


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [8]:
data.isnull().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [9]:
data = data.fillna(' ')

In [10]:
print(data.isnull().sum())

index                   0
budget                  0
genres                  0
homepage                0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title                   0
vote_average            0
vote_count              0
cast                    0
crew                    0
director                0
dtype: int64


In [11]:
# Selecting the relevant features 

features = data[['genres','keywords','tagline','cast','director']]
features.head(3)

Unnamed: 0,genres,keywords,tagline,cast,director
0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Enter the World of Pandora.,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron
1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,"At the end of the world, the adventure begins.",Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski
2,Action Adventure Crime,spy based on novel secret agent sequel mi6,A Plan No One Escapes,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes


In [12]:
# Combining the selected features assigned to a new column called combined features 

combined_features = features['genres']+' '+features['keywords']+' '+features['tagline']+' '+features['cast']+' '+features['director']
combined_features.head(4)

0    Action Adventure Fantasy Science Fiction cultu...
1    Adventure Fantasy Action ocean drug abuse exot...
2    Action Adventure Crime spy based on novel secr...
3    Action Crime Drama Thriller dc comics crime fi...
dtype: object

In [13]:
# Converting the combined text to feature vectors

vectorizer = TfidfVectorizer() # creating a vectorizer object to convert the text into numerical values 
feature_vectors = vectorizer.fit(combined_features)
feature_vectors = feature_vectors.transform(combined_features)
print(feature_vectors)

  (0, 17290)	0.20197912553916564
  (0, 17007)	0.23643326319898794
  (0, 16998)	0.12821263228505786
  (0, 16668)	0.1984326396510037
  (0, 16587)	0.12549432354918996
  (0, 15261)	0.07095833561276564
  (0, 14608)	0.15150672398763912
  (0, 14378)	0.3396275221095982
  (0, 14271)	0.21392179219912874
  (0, 14064)	0.2059609041508414
  (0, 13599)	0.10364139873166359
  (0, 13349)	0.15021264094167083
  (0, 13319)	0.21774705394124838
  (0, 13024)	0.19423620601088706
  (0, 11503)	0.2721131005698365
  (0, 11192)	0.09049319826481456
  (0, 10229)	0.160586854000953
  (0, 8756)	0.22709015857011813
  (0, 7755)	0.11280357148547558
  (0, 5836)	0.1646750903586285
  (0, 5437)	0.10364139873166359
  (0, 5274)	0.11108562744414444
  (0, 4945)	0.24025852494110755
  (0, 3678)	0.21392179219912874
  (0, 3225)	0.24960162956997733
  :	:
  (4801, 10929)	0.13504166990041588
  (4801, 7474)	0.11307961713172225
  (4801, 7269)	0.3025765103586468
  (4801, 6935)	0.2886098184932947
  (4801, 4835)	0.24713765026963996
  (4801, 3

**Cosine similarity**

In [14]:
# getting the similarity using the cosine similarity library 

similarity = cosine_similarity(feature_vectors) # creating a similarity object and passing the feature_vectors argw

print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [15]:
print(similarity.shape)
print(similarity.size)
print(type(similarity))

(4803, 4803)
23068809
<class 'numpy.ndarray'>


In [16]:
# creating a list of all the movie names in the dataset 

movie_title_list = data['title'].to_list() # making the title column in the data intio a list 
movie_title_list[0:20] # showing the first 20 items in the list 

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies']

**Getting the Movie input from the user** 

In [17]:
movie_name = input("Enter your favorite movie name :")

Enter your favorite movie name :The expendables


**Finding the closest match of the movie input by the user**

In [18]:
close_match = difflib.get_close_matches(movie_name,movie_title_list)
print(close_match)

['The Expendables', 'The Expendables 3', 'The Expendables 2']


In [19]:
closest_match = close_match[0]
print(closest_match)

The Expendables


**Finding the index of the movie with title**

In [20]:
index_of_the_movie = data[data['title']==closest_match]['index'].values[0]
print(index_of_the_movie)

424


**List of similar movies**

In [21]:
# Getting the list of similar movies 

similarity_score = list(enumerate(similarity[index_of_the_movie]))
print(similarity_score)
print(len(similarity_score))

[(0, 0.01403703739087843), (1, 0.0465899822799404), (2, 0.014361685524021495), (3, 0.013073384711849972), (4, 0.013668530235040768), (5, 0.014541271758325175), (6, 0.008076519625876098), (7, 0.014168568560970854), (8, 0.008241736414895263), (9, 0.013842696925028066), (10, 0.015788435474238423), (11, 0.01748979991048322), (12, 0.04635599169412653), (13, 0.012940285266428953), (14, 0.013945570085001923), (15, 0.00761844371546751), (16, 0.013434814807655914), (17, 0.013710730672975307), (18, 0.005787486387843901), (19, 0.01217290427628337), (20, 0.013978777626185341), (21, 0.01178051644570373), (22, 0.007206800124244286), (23, 0.006899942811484633), (24, 0.01324259969792444), (25, 0.005689622224365427), (26, 0.014623194943058837), (27, 0.017934807214322817), (28, 0.051057166571564026), (29, 0.038888050768438445), (30, 0.014147221524059297), (31, 0.013794866558793455), (32, 0.00738247713760235), (33, 0.019749029180825904), (34, 0.0), (35, 0.014781038928141758), (36, 0.013318895627958575), 

In [22]:
sorted_similar_movies = sorted(similarity_score,key=lambda x:x[1],reverse=True)
print(sorted_similar_movies)

[(424, 1.0), (290, 0.35159825393432365), (1960, 0.2685504132518771), (307, 0.22423949819379072), (1297, 0.223555393266472), (3591, 0.22270470537569412), (2933, 0.21501897672423595), (866, 0.20351546211886287), (3115, 0.19565528061361787), (644, 0.17926356833774468), (2543, 0.17423535172082788), (1064, 0.17411350369573222), (478, 0.1715436555336831), (609, 0.1707724653369826), (863, 0.17025522507361363), (1254, 0.16934661897968337), (560, 0.16929895072381146), (1027, 0.16504034830070466), (3505, 0.16404405962338237), (4623, 0.1616391186086228), (969, 0.15933516614112386), (1106, 0.15671785217474535), (3209, 0.15670686880575954), (460, 0.1559990829734575), (1327, 0.15100206396595497), (1155, 0.14820278039181967), (4333, 0.14680907359098788), (4147, 0.14484409948282356), (831, 0.14044037953031283), (794, 0.1383724696102731), (1118, 0.1354448419386307), (2646, 0.13010441971602482), (1076, 0.11887937715509135), (1513, 0.11591236762564053), (1894, 0.11499265003578536), (4220, 0.1119199008795

**Similar movies based on index**

In [23]:
print("movies suggested for you:\n")
i = 1
for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = data[data.index==index]['title'].values[0]
    if(i<30):
        print(i,' ',title_from_index)
        i+=1

movies suggested for you:

1   The Expendables
2   The Expendables 2
3   Rocky Balboa
4   The Expendables 3
5   Grudge Match
6   Nighthawks
7   F.I.S.T.
8   Bullet to the Head
9   An Alan Smithee Film: Burn, Hollywood, Burn
10   Cliffhanger
11   Cop Land
12   The Specialist
13   Daylight
14   Escape Plan
15   D-Tox
16   Get Carter
17   Driven
18   First Blood
19   Men of War
20   Death Race 2000
21   Assassins
22   Rambo: First Blood Part II
23   Skin Trade
24   Zookeeper
25   Creed
26   Spy Kids 3-D: Game Over
27   Rocky
28   Small Apartments
29   Tango & Cash


**Thank You!**  
**Olatunde**