In [None]:
import numpy as np
import pandas as pd
import difflib                      #To match the incorrect spelling by user to recommend movies
from sklearn.feature_extraction.text import TfidfVectorizer   # To convert textral data into numerical data
from sklearn.metrics.pairwise import cosine_similarity       #Gives similarity score for all the movies

Types of Recommendation System

1) *Content Based Recommendation System* - recommends films based on the content of the films the user has watched, e.g. superhero-type movies.

2) *Popularity Based Recommendation System* - recommends movies/series that are more popular, e.g. those with famous directors or actors.
This is the type we are building in this project.

3) *Collaborative Based Recommendation System* - groups users by their watching patterns and recommends based on what similar users watched, e.g. the most viewed titles.

Work Flow

1) *Data collecting*

2) *Data preprocessing* - means we have to clean data or check for missing values

3) *Feature Extraction* - converting textual data into numerical data so that each movie can be given a similarity score.

4) *User input* - to recommend movies based on their liking

5) *Cosine Similarity* - this algorithm is used to find the similarity between movies by representing each of them as a vector.

6) *List of Movies* - based on user input

In [None]:
# 1) Data Collection

In [None]:
# Read the movies CSV into a DataFrame; the path is kept in its own variable
# so it is easy to spot and change.
movies_csv_path = '/content/drive/MyDrive/Machine Learning/movies (1).csv'
movies_data = pd.read_csv(movies_csv_path)

# Preview the first rows of the loaded data.
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [None]:

# Preview the last rows of the dataset.
movies_data.tail()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,label
4798,4798,220000,Action Crime Thriller,,9367,united states\u2013mexico barrier legs arms pa...,es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]","[{""iso_3166_1"": ""MX"", ""name"": ""Mexico""}, {""iso...",1992-09-04,2040920,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,Carlos Gallardo Jaime de Hoyos Peter Marquardt...,"[{'name': 'Robert Rodriguez', 'gender': 0, 'de...",Robert Rodriguez,1
4799,4799,9000,Comedy Romance,,72766,,en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],[],2011-12-26,0,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,Edward Burns Kerry Bish\u00e9 Marsha Dietlein ...,"[{'name': 'Edward Burns', 'gender': 2, 'depart...",Edward Burns,1
4800,4800,0,Comedy Drama Romance TV Movie,http://www.hallmarkchannel.com/signedsealeddel...,231617,date love at first sight narration investigati...,en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2013-10-13,0,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,Eric Mabius Kristin Booth Crystal Lowe Geoff G...,"[{'name': 'Carla Hetland', 'gender': 0, 'depar...",Scott Smith,1
4801,4801,0,,http://shanghaicalling.com/,126186,,en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],"[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-05-03,0,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,Daniel Henney Eliza Coupe Bill Paxton Alan Ruc...,"[{'name': 'Daniel Hsia', 'gender': 2, 'departm...",Daniel Hsia,1
4802,4802,0,Documentary,,25975,obsession camcorder crush dream girl,en,My Date with Drew,Ever since the second grade when he first saw ...,1.929883,"[{""name"": ""rusty bear entertainment"", ""id"": 87...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2005-08-05,0,90.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,My Date with Drew,6.3,16,Drew Barrymore Brian Herzlinger Corey Feldman ...,"[{'name': 'Clark Peterson', 'gender': 2, 'depa...",Brian Herzlinger,1


2) Data Pre Processing

In [None]:
# Summary of every column: name, non-null count and dtype.
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [None]:
# (number of rows, number of columns) of the loaded data.
movies_data.shape

(4803, 25)

In [None]:
# Count the missing values in each column.
movies_data.isnull().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
label                      0
dtype: int64

3) Feature Selection

In [None]:
# Text columns that are combined to describe each movie for the recommender.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [None]:
# Replace missing values in the selected text columns with an empty string,
# handling all of the columns in a single assignment.

movies_data[selected_features] = movies_data[selected_features].fillna('')

In [None]:
# Re-count missing values per column to check the result of the fill above.
movies_data.isnull().sum()

index                      0
budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                    0
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
director                   0
dtype: int64

In [None]:
# Join the five selected text columns into one space-separated string per
# movie, in the order genres, keywords, tagline, cast, director.

text_columns = ['genres', 'keywords', 'tagline', 'cast', 'director']

combined_features = movies_data[text_columns[0]]
for column in text_columns[1:]:
  combined_features = combined_features + ' ' + movies_data[column]

print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [None]:
# Converting text data into numerical data
# TfidfVectorizer is used with its default settings; fit_transform fits it on
# the combined text and returns the transformed features, one row per movie.
vectorizer = TfidfVectorizer() 
feature_vectors = vectorizer.fit_transform(combined_features)
# Printed as (row, column) positions followed by the stored weight.
print(feature_vectors)

  (0, 2432)	0.17272411194153
  (0, 7755)	0.1128035714854756
  (0, 13024)	0.1942362060108871
  (0, 10229)	0.16058685400095302
  (0, 8756)	0.22709015857011816
  (0, 14608)	0.15150672398763912
  (0, 16668)	0.19843263965100372
  (0, 14064)	0.20596090415084142
  (0, 13319)	0.2177470539412484
  (0, 17290)	0.20197912553916567
  (0, 17007)	0.23643326319898797
  (0, 13349)	0.15021264094167086
  (0, 11503)	0.27211310056983656
  (0, 11192)	0.09049319826481456
  (0, 16998)	0.1282126322850579
  (0, 15261)	0.07095833561276566
  (0, 4945)	0.24025852494110758
  (0, 14271)	0.21392179219912877
  (0, 3225)	0.24960162956997736
  (0, 16587)	0.12549432354918996
  (0, 14378)	0.33962752210959823
  (0, 5836)	0.1646750903586285
  (0, 3065)	0.22208377802661425
  (0, 3678)	0.21392179219912877
  (0, 5437)	0.1036413987316636
  :	:
  (4801, 17266)	0.2886098184932947
  (4801, 4835)	0.24713765026963996
  (4801, 403)	0.17727585190343226
  (4801, 6935)	0.2886098184932947
  (4801, 11663)	0.21557500762727902
  (4801, 1672

5) Cosine similarity

In [None]:
# Giving similarity score using cosine similarity
# Called with a single matrix, cosine_similarity compares every row of
# feature_vectors with every row, so similarity[i][j] is the score between
# movie i and movie j.
similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


In [None]:
# Shape of the similarity matrix: one row and one column per movie, so
# similarity[i] holds movie i's score against every movie in the dataset.
similarity.shape        

(4803, 4803)

MOVIE RECOMMENDATION SYSTEM


In [None]:
# Ask for a movie name, fuzzy-match it against the dataset titles, and list
# the movies whose feature vectors are most similar to the matched title.
movie_name = input( 'Enter your favourite movie name: ')

list_of_all_titles = movies_data['title'].tolist()

# get_close_matches returns an empty list when no title is similar enough to
# the input, so check before indexing instead of failing with an IndexError.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  print('No close match found for', repr(movie_name), '- please try another title.')
else:
  most_close_match = find_close_match[0]

  # Rows of `similarity` follow the row order of movies_data (it was computed
  # from feature vectors built row by row), so use the row position of the
  # first movie carrying the matched title.
  index_of_the_movie = list_of_all_titles.index(most_close_match)

  # (row position, similarity score) pairs, highest score first.
  similarity_score = list(enumerate(similarity[index_of_the_movie]))
  sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

  print('Movies suggested for you: \n')

  # Only the first 29 entries are shown, so slice the sorted list instead of
  # walking every movie and looking up titles that are never printed.
  max_suggestions = 29

  for i, movie in enumerate(sorted_similar_movies[:max_suggestions], start=1):
    indexx = movie[0]
    title_from_index = list_of_all_titles[indexx]
    print(i, '-', title_from_index)

Enter your favourite movie name: Hello
Movies suggested for you: 

1 - Hellboy
2 - Hellboy II: The Golden Army
3 - Blade II
4 - Mirrormask
5 - Pacific Rim
6 - Superhero Movie
7 - The Helix... Loaded
8 - The Dog Lover
9 - How the Grinch Stole Christmas
10 - Twister
11 - Pan's Labyrinth
12 - Blade: Trinity
13 - My Boss's Daughter
14 - Harry Potter and the Philosopher's Stone
15 - Harry Potter and the Half-Blood Prince
16 - Tangled
17 - American Outlaws
18 - Darling Lili
19 - Highway
20 - Spring Breakers
21 - Licence to Kill
22 - X-Men: First Class
23 - American Hero
24 - Beautiful Creatures
25 - Pitch Black
26 - The Sweetest Thing
27 - Captain America: The Winter Soldier
28 - Alien: Resurrection
29 - The Craft


EXPLANATION OF CODES USED IN MOVIE RECOMMENDATION SYSTEM

Getting name of movie from user



In [None]:
# Getting the movie name from the user. It does not have to be an exact
# title: it is matched against the dataset titles with difflib afterwards.

movie_name = input( 'Enter your favourite movie name: ')

Enter your favourite movie name: hero


In [None]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['title'].tolist()
# Show only a short preview: printing every title floods the cell output.
print(list_of_all_titles[:10])

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [None]:
# Finding the close matches for the movie name given by the user.
# get_close_matches returns the most similar titles first, and an empty list
# when no title is similar enough.

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Hero', 'The Crow', 'Cheri']


In [None]:
# Most similar movie name: the first entry of the match list.
# NOTE: raises IndexError if find_close_match is empty (no title matched).
most_close_match = find_close_match[0]
print(most_close_match)

Hero


In [None]:
# Finding the index of the matched movie from its title.
# The boolean mask keeps the rows whose title equals the match; .values[0]
# then takes the 'index' column value of the first of those rows.
index_of_the_movie = movies_data[movies_data.title == most_close_match]['index'].values[0]
print(index_of_the_movie)

1136


In [None]:
# getting the similarity scores of every movie against the matched movie

# enumerate pairs each score with its position in the row, producing
# (movie position, similarity score) tuples.
similarity_score = list(enumerate(similarity[index_of_the_movie]))
# Show only a short preview: the full list has one tuple per movie.
print(similarity_score[:10])

[(0, 0.01909030731635588), (1, 0.03933216892034341), (2, 0.024829338580970733), (3, 0.008856529409171622), (4, 0.011825072578315039), (5, 0.03233577320976134), (6, 0.0069872494784649275), (7, 0.012257671357729072), (8, 0.014304886002702908), (9, 0.04353562432308547), (10, 0.013659068836990893), (11, 0.039217717358302866), (12, 0.01239952119285493), (13, 0.011195045102021297), (14, 0.041965167595641326), (15, 0.006590953695830392), (16, 0.011622878059667177), (17, 0.011861581495721047), (18, 0.018465050855965066), (19, 0.016555094718052162), (20, 0.038071664598852584), (21, 0.02764199407460675), (22, 0.01250858423902771), (23, 0.018149674038800325), (24, 0.02054890010193295), (25, 0.0027097455670786023), (26, 0.012650982845633253), (27, 0.011062470223422372), (28, 0.012145286826524878), (29, 0.012705778467075613), (30, 0.03846046816476813), (31, 0.01193437007936041), (32, 0.006386811636565025), (33, 0.012181510770849686), (34, 0.0), (35, 0.012787538608949416), (36, 0.011522592755426806)

In [None]:
# Number of (position, score) pairs in the list.
len(similarity_score)

4803

In [None]:
# sorting the movies based on their similarity score
# Each element x is a (position, score) tuple, so x[1] is the score;
# reverse = True orders the list from the highest score to the lowest.
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)
# Show only the top of the ranking: the full list has one tuple per movie.
print(sorted_similar_movies[:10])

[(1136, 1.0000000000000002), (2884, 0.3175438054893153), (1304, 0.29560422847860446), (2863, 0.2571291105307117), (1357, 0.2373732273904522), (3892, 0.23289049039173151), (317, 0.20887481050756493), (1298, 0.182464407416641), (2592, 0.17850354861636955), (2013, 0.17771716027151768), (2515, 0.1614510320945296), (1095, 0.14966615146157278), (345, 0.14835385600545137), (448, 0.13882379751005533), (404, 0.1347371638336988), (2896, 0.13366615189986356), (3300, 0.1311333008707427), (1836, 0.13042837751135472), (729, 0.12944910554309755), (627, 0.12787960263483067), (2717, 0.1269699738510326), (1193, 0.12110048788289245), (1002, 0.12007471068663342), (836, 0.11907550497512341), (71, 0.11782406072882487), (1986, 0.11510923821626434), (4441, 0.1121826265003816), (1284, 0.10202835901783294), (2300, 0.10186660249381284), (1672, 0.10074274468133833), (164, 0.0965622664400744), (3704, 0.09473106101045704), (3607, 0.0940688903042413), (1114, 0.09309855392759958), (4323, 0.08991520423186068), (1860, 

In [None]:
# print the name of similar movies based on the index

print('Movies suggested for you: \n')        # '\n' adds a blank line after the heading

# Only the first 29 entries are shown, so slice the sorted list instead of
# walking every movie and looking up titles that are never printed.
max_suggestions = 29

for i, movie in enumerate(sorted_similar_movies[:max_suggestions], start=1):
  indexx = movie[0]                    # position part of the tuple, e.g. 1136 in (1136, 1.0)
  # The positions come from enumerate over a similarity row, so they are row
  # positions in movies_data; fetch the title positionally.
  title_from_index = movies_data['title'].iloc[indexx]
  print(i, '-', title_from_index)

Movies suggested for you: 

1 - Hero
2 - 2046
3 - The Grandmaster
4 - House of Flying Daggers
5 - Ip Man 3
6 - Coming Home
7 - The Flowers of War
8 - Red Cliff
9 - Highlander: Endgame
10 - Bodyguards and Assassins
11 - Crouching Tiger, Hidden Dragon
12 - Curse of the Golden Flower
13 - Rush Hour 2
14 - Cold Mountain
15 - Memoirs of a Geisha
16 - A Woman, a Gun and a Noodle Shop
17 - My Lucky Star
18 - Tombstone
19 - A Civil Action
20 - The Last Legion
21 - Brokeback Mountain
22 - The Count of Monte Cristo
23 - The One
24 - The Forbidden Kingdom
25 - The Mummy: Tomb of the Dragon Emperor
26 - Faster
27 - Bambi
28 - The Warlords
29 - The Fall of the Roman Empire


MOVIE RECOMMENDATION SYSTEM

In [None]:
# Complete recommender in one cell: read a movie name, fuzzy-match it to a
# dataset title, rank all movies by similarity to it, and print the top ones.
movie_name = input( 'Enter your favourite movie name: ')

list_of_all_titles = movies_data['title'].tolist()

# An empty result means no title was similar enough to the input; handle that
# explicitly rather than letting find_close_match[0] raise an IndexError.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  print('No close match found for', repr(movie_name), '- please try another title.')
else:
  most_close_match = find_close_match[0]

  # `similarity` is ordered like the rows of movies_data, so the position of
  # the matched title in the title list is also its row in the matrix.
  index_of_the_movie = list_of_all_titles.index(most_close_match)

  # Pair every score with its row position, then rank from highest to lowest.
  similarity_score = list(enumerate(similarity[index_of_the_movie]))
  sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

  print('Movies suggested for you: \n')

  # 29 suggestions are printed; slicing avoids iterating over the remaining
  # movies and doing a title lookup for each of them.
  max_suggestions = 29

  for i, movie in enumerate(sorted_similar_movies[:max_suggestions], start=1):
    indexx = movie[0]
    title_from_index = list_of_all_titles[indexx]
    print(i, '-', title_from_index)

Enter your favourite movie name: bomb
Movies suggested for you: 

1 - Womb
2 - Another Year
3 - All or Nothing
4 - Mr. Turner
5 - For Greater Glory - The True Story of Cristiada
6 - Being Julia
7 - The Helix... Loaded
8 - The Christmas Candle
9 - Leap Year
10 - Jakob the Liar
11 - Kingdom of Heaven
12 - The Spirit
13 - The Golden Compass
14 - Ocean's Thirteen
15 - The Other Woman
16 - Zack and Miri Make a Porno
17 - 300: Rise of an Empire
18 - Hitch
19 - The Limey
20 - Letters to Juliet
21 - Predator 2
22 - The Butterfly Effect
23 - The Devil's Own
24 - Without a Paddle
25 - The Adjustment Bureau
26 - Hook
27 - They Live
28 - Chronicle
29 - I Am Legend
