### Constant

In [1]:
DATA_PATH = './Data/movies.csv'

# Import packages

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import difflib
%matplotlib inline

# Load Your Data

In [3]:
# First to load our data we call read_csv function from pandas
movieData = pd.read_csv(DATA_PATH)
# show first five observation for the data
movieData.head()


Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
# information about dataset
movieData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

# Data Exploration (EDA)

### **Data Preprocessing**

In [5]:
#select column that might help in Recommendation

selectedFeatures = ["genres","keywords","original_title","tagline","cast","director"]


In [6]:
# fill nulls with empty space
for f in selectedFeatures:
    movieData[f].fillna("", inplace=True)

In [7]:
combineFeatures = movieData["genres"] + " " + movieData["keywords"] + " " + movieData["original_title"] + " " + movieData["tagline"] + " " + movieData["cast"] + " " +movieData["director"]

combineFeatures

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  Newlyweds A newlywed couple's ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      Shanghai Calling A New Yorker in Shanghai Da...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# convert text to numerical Values
vec = TfidfVectorizer()

featureVector = vec.fit_transform(combineFeatures)

##### Cosine Similarity

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

sim = cosine_similarity(featureVector)
sim

array([[1.        , 0.07355546, 0.03531556, ..., 0.        , 0.        ,
        0.        ],
       [0.07355546, 1.        , 0.02790734, ..., 0.04430756, 0.        ,
        0.        ],
       [0.03531556, 0.02790734, 1.        , ..., 0.        , 0.04635788,
        0.        ],
       ...,
       [0.        , 0.04430756, 0.        , ..., 1.        , 0.        ,
        0.05548941],
       [0.        , 0.        , 0.04635788, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.05548941, 0.        ,
        1.        ]])

In [10]:
# find the similarity for movie given by use
movieName = input("Enter Movie Name: ")


In [11]:
#list of all movies in dataset
listOfMovies = movieData["title"].to_list()
listOfMovies

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

In [12]:
# find the close match for the given movie name
closestMatch = difflib.get_close_matches(movieName, listOfMovies)

closestMatch

['John Carter', 'John Q', 'Monte Carlo']

In [13]:
match = closestMatch[0]

# getting the index of matching movie
matchedMovieIndex = movieData[movieData["title"] == match]['index'].values[0]
matchedMovieIndex

4

In [14]:
# get similarty score for matched movie

similartyScore = list(enumerate(sim[matchedMovieIndex]))
similartyScore

[(0, 0.10131144911654695),
 (1, 0.04043980281049077),
 (2, 0.05483989653603711),
 (3, 0.006115713755511957),
 (4, 1.0),
 (5, 0.12441422908068121),
 (6, 0.007663641390288833),
 (7, 0.052976006591584426),
 (8, 0.007362113402001646),
 (9, 0.03339524948170556),
 (10, 0.05228450370250097),
 (11, 0.011642425639094659),
 (12, 0.0125778694781116),
 (13, 0.04214494009046859),
 (14, 0.07032975118450543),
 (15, 0.06670796131763868),
 (16, 0.054412168976750865),
 (17, 0.021160144970003847),
 (18, 0.09018636046972248),
 (19, 0.0109221427934986),
 (20, 0.039124644690894),
 (21, 0.010985733222075626),
 (22, 0.005948818233729869),
 (23, 0.03154554403086447),
 (24, 0.046077177540274745),
 (25, 0.009469905500451565),
 (26, 0.03158452420211537),
 (27, 0.10697027089435227),
 (28, 0.04825272323413164),
 (29, 0.024207819235755695),
 (30, 0.03287553632001705),
 (31, 0.042969537004042595),
 (32, 0.10227748639496366),
 (33, 0.05475644295693372),
 (34, 0.009696695824521585),
 (35, 0.032723774434774616),
 (36, 0

In [15]:
# soring similarty score list
similartyScore.sort(key=lambda x:x[1], reverse=True)
similartyScore

[(4, 1.0),
 (373, 0.17424337603487106),
 (2904, 0.16926513664201295),
 (461, 0.16782443094770466),
 (3158, 0.1643826893438796),
 (2964, 0.1425458353206738),
 (4395, 0.14050748700452026),
 (328, 0.13970817221062912),
 (3257, 0.13960268223249955),
 (2121, 0.13636035712580277),
 (1473, 0.13363593062042994),
 (1144, 0.12881538957634697),
 (111, 0.12816308619594213),
 (1373, 0.12795075408629544),
 (816, 0.1274231596524745),
 (1748, 0.12728781631537583),
 (270, 0.1252071783286354),
 (5, 0.12441422908068121),
 (2407, 0.1241243918136993),
 (4555, 0.12162880929664985),
 (473, 0.12107518261651927),
 (3367, 0.12099765068964635),
 (3506, 0.11421548331444878),
 (643, 0.11319023330533728),
 (1075, 0.11291381010619367),
 (305, 0.11257337945796903),
 (3378, 0.11253267756650195),
 (2163, 0.11204376302221036),
 (916, 0.10960023549978175),
 (2157, 0.10839300162522561),
 (2658, 0.10827588495465668),
 (1158, 0.1077089462901872),
 (239, 0.1074379373670018),
 (269, 0.10736439561534869),
 (857, 0.107306711618

In [16]:
# get first highest recommended movies for you

print("Here it is the List Of Movies That Recommended For You, You May Love To Watch Also")

for i in range(10):
    index = similartyScore[i][0]
    title = movieData[movieData["index"] == index]["title"].values[0]

    print(i + 1 , " - ", title)

Here it is the List Of Movies That Recommended For You, You May Love To Watch Also
1  -  John Carter
2  -  Mission to Mars
3  -  Heaven is for Real
4  -  Lost in Space
5  -  Alien
6  -  The Last Days on Mars
7  -  The Specials
8  -  Finding Nemo
9  -  American Psycho
10  -  Max
