In [1]:
#importing modules used
import numpy as np
import pandas as pd
import pickle
import ast
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#loading movie datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
#a tuple of movies table
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
#a tuple of credits
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
#merging two tables based on title
movies = movies.merge( credits, on = 'title' )

In [6]:
#dataframe after merging movies and credits in movies
movies.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [7]:
#omitting all other columns other than the ones mentioned below
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [8]:
#dataframe after selecting only 7 attributes
movies.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [9]:
#checking the data for any null values or redundancies
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [10]:
#getting rid of the null values
movies.dropna( inplace = True)

In [11]:
#format of data in the genres column
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [12]:
#function to convert and store the objects in form of list
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
        L.append(i['name'])
    return L

In [13]:
#after applying convert function
movies['genres'] = movies['genres'].apply(convert)
movies.iloc[0].genres

['Action',
 'Action',
 'Adventure',
 'Adventure',
 'Fantasy',
 'Fantasy',
 'Science Fiction',
 'Science Fiction']

In [14]:
#format of data in the keywords column
movies.iloc[0].keywords

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

In [15]:
#after applying convert function
movies['keywords'] = movies['keywords'].apply(convert)
movies.iloc[0].keywords

['culture clash',
 'culture clash',
 'future',
 'future',
 'space war',
 'space war',
 'space colony',
 'space colony',
 'society',
 'society',
 'space travel',
 'space travel',
 'futuristic',
 'futuristic',
 'romance',
 'romance',
 'space',
 'space',
 'alien',
 'alien',
 'tribe',
 'tribe',
 'alien planet',
 'alien planet',
 'cgi',
 'cgi',
 'marine',
 'marine',
 'soldier',
 'soldier',
 'battle',
 'battle',
 'love affair',
 'love affair',
 'anti war',
 'anti war',
 'power relations',
 'power relations',
 'mind and soul',
 'mind and soul',
 '3d',
 '3d']

In [16]:
#format of data in the cast column
movies.iloc[0].cast

'[{"cast_id": 242, "character": "Jake Sully", "credit_id": "5602a8a7c3a3685532001c9a", "gender": 2, "id": 65731, "name": "Sam Worthington", "order": 0}, {"cast_id": 3, "character": "Neytiri", "credit_id": "52fe48009251416c750ac9cb", "gender": 1, "id": 8691, "name": "Zoe Saldana", "order": 1}, {"cast_id": 25, "character": "Dr. Grace Augustine", "credit_id": "52fe48009251416c750aca39", "gender": 1, "id": 10205, "name": "Sigourney Weaver", "order": 2}, {"cast_id": 4, "character": "Col. Quaritch", "credit_id": "52fe48009251416c750ac9cf", "gender": 2, "id": 32747, "name": "Stephen Lang", "order": 3}, {"cast_id": 5, "character": "Trudy Chacon", "credit_id": "52fe48009251416c750ac9d3", "gender": 1, "id": 17647, "name": "Michelle Rodriguez", "order": 4}, {"cast_id": 8, "character": "Selfridge", "credit_id": "52fe48009251416c750ac9e1", "gender": 2, "id": 1771, "name": "Giovanni Ribisi", "order": 5}, {"cast_id": 7, "character": "Norm Spellman", "credit_id": "52fe48009251416c750ac9dd", "gender": 

In [17]:
#function to extract the top 5 cast of the movie
def extract_top5_cast(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 5:
            L.append(i['name'])
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [18]:
#after extracting the top 5 function
movies['cast'] = movies['cast'].apply(extract_top5_cast)
movies.iloc[0].cast

['Sam Worthington',
 'Sam Worthington',
 'Zoe Saldana',
 'Zoe Saldana',
 'Sigourney Weaver',
 'Sigourney Weaver',
 'Stephen Lang',
 'Stephen Lang',
 'Michelle Rodriguez',
 'Michelle Rodriguez']

In [19]:
#function to fetch the director of the movie from crew column
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            L.append(i['name'])
            break
    return L

In [20]:
#after fetching the director
movies['crew'] = movies['crew'].apply(fetch_director)
movies.iloc[0].crew

['James Cameron', 'James Cameron']

In [21]:
#format of data in the overview column
movies.iloc[0].overview

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.'

In [22]:
#converting it into list of strings
movies['overview'] = movies['overview'].apply( lambda x:x.split())
movies.iloc[0].overview

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.']

In [23]:
#format of data in the movie dataframe as of now
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Action, Adventure, Adventure, Fantasy...","[culture clash, culture clash, future, future,...","[Sam Worthington, Sam Worthington, Zoe Saldana...","[James Cameron, James Cameron]"
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Adventure, Fantasy, Fantasy, Actio...","[ocean, ocean, drug abuse, drug abuse, exotic ...","[Johnny Depp, Johnny Depp, Orlando Bloom, Orla...","[Gore Verbinski, Gore Verbinski]"
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Action, Adventure, Adventure, Crime, ...","[spy, spy, based on novel, based on novel, sec...","[Daniel Craig, Daniel Craig, Christoph Waltz, ...","[Sam Mendes, Sam Mendes]"
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Action, Crime, Crime, Drama, Drama, T...","[dc comics, dc comics, crime fighter, crime fi...","[Christian Bale, Christian Bale, Michael Caine...","[Christopher Nolan, Christopher Nolan]"
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Action, Adventure, Adventure, Science...","[based on novel, based on novel, mars, mars, m...","[Taylor Kitsch, Taylor Kitsch, Lynn Collins, L...","[Andrew Stanton, Andrew Stanton]"


In [24]:
#getting rid of space in the words which consists of two or more spaces
#for example "Science Fiction" to "ScienceFiction" or "Sam Mendes" to "SamMendes"
movies['genres'] = movies['genres'].apply( lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply( lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply( lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply( lambda x:[i.replace(" ","") for i in x])

In [25]:
#format of data in the movie dataframe after removing spaces
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Action, Adventure, Adventure, Fantasy...","[cultureclash, cultureclash, future, future, s...","[SamWorthington, SamWorthington, ZoeSaldana, Z...","[JamesCameron, JamesCameron]"
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Adventure, Fantasy, Fantasy, Actio...","[ocean, ocean, drugabuse, drugabuse, exoticisl...","[JohnnyDepp, JohnnyDepp, OrlandoBloom, Orlando...","[GoreVerbinski, GoreVerbinski]"
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Action, Adventure, Adventure, Crime, ...","[spy, spy, basedonnovel, basedonnovel, secreta...","[DanielCraig, DanielCraig, ChristophWaltz, Chr...","[SamMendes, SamMendes]"
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Action, Crime, Crime, Drama, Drama, T...","[dccomics, dccomics, crimefighter, crimefighte...","[ChristianBale, ChristianBale, MichaelCaine, M...","[ChristopherNolan, ChristopherNolan]"
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Action, Adventure, Adventure, Science...","[basedonnovel, basedonnovel, mars, mars, medal...","[TaylorKitsch, TaylorKitsch, LynnCollins, Lynn...","[AndrewStanton, AndrewStanton]"


In [26]:
#adding a new column tags consisting of the all remaining columns
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [27]:
#format of data in the movie dataframe after adding tags column
movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Action, Adventure, Adventure, Fantasy...","[cultureclash, cultureclash, future, future, s...","[SamWorthington, SamWorthington, ZoeSaldana, Z...","[JamesCameron, JamesCameron]","[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Adventure, Fantasy, Fantasy, Actio...","[ocean, ocean, drugabuse, drugabuse, exoticisl...","[JohnnyDepp, JohnnyDepp, OrlandoBloom, Orlando...","[GoreVerbinski, GoreVerbinski]","[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Action, Adventure, Adventure, Crime, ...","[spy, spy, basedonnovel, basedonnovel, secreta...","[DanielCraig, DanielCraig, ChristophWaltz, Chr...","[SamMendes, SamMendes]","[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Action, Crime, Crime, Drama, Drama, T...","[dccomics, dccomics, crimefighter, crimefighte...","[ChristianBale, ChristianBale, MichaelCaine, M...","[ChristopherNolan, ChristopherNolan]","[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Action, Adventure, Adventure, Science...","[basedonnovel, basedonnovel, mars, mars, medal...","[TaylorKitsch, TaylorKitsch, LynnCollins, Lynn...","[AndrewStanton, AndrewStanton]","[John, Carter, is, a, war-weary,, former, mili..."


In [28]:
#creating a new dataframe consisting of movie_id, title and tags
new_df = movies[['movie_id','title','tags']]
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [29]:
#converting the individual strings into a paragraph
new_df['tags'] = new_df['tags'].apply( lambda x:" ".join(x))
new_df['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply( lambda x:" ".join(x))


'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. Action Action Adventure Adventure Fantasy Fantasy ScienceFiction ScienceFiction cultureclash cultureclash future future spacewar spacewar spacecolony spacecolony society society spacetravel spacetravel futuristic futuristic romance romance space space alien alien tribe tribe alienplanet alienplanet cgi cgi marine marine soldier soldier battle battle loveaffair loveaffair antiwar antiwar powerrelations powerrelations mindandsoul mindandsoul 3d 3d SamWorthington SamWorthington ZoeSaldana ZoeSaldana SigourneyWeaver SigourneyWeaver StephenLang StephenLang MichelleRodriguez MichelleRodriguez JamesCameron JamesCameron'

In [30]:
#converting the paragraph into full lowercase
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())
new_df['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action action adventure adventure fantasy fantasy sciencefiction sciencefiction cultureclash cultureclash future future spacewar spacewar spacecolony spacecolony society society spacetravel spacetravel futuristic futuristic romance romance space space alien alien tribe tribe alienplanet alienplanet cgi cgi marine marine soldier soldier battle battle loveaffair loveaffair antiwar antiwar powerrelations powerrelations mindandsoul mindandsoul 3d 3d samworthington samworthington zoesaldana zoesaldana sigourneyweaver sigourneyweaver stephenlang stephenlang michellerodriguez michellerodriguez jamescameron jamescameron'

In [31]:
#intialising ps as a PorterStemmer
ps = PorterStemmer()

In [32]:
#PorterStemmer is used in the following function to stem words to its root word
#for example dancing,dancer,dance will be converted to danc
def stem(text) :
    y = []
    
    for i in text.split() :
        y.append( ps.stem(i))
        
    return " ".join(y)

In [33]:
#applying stemming to the tags column
new_df['tags'] = new_df['tags'].apply(stem)
new_df['tags'][0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action action adventur adventur fantasi fantasi sciencefict sciencefict cultureclash cultureclash futur futur spacewar spacewar spacecoloni spacecoloni societi societi spacetravel spacetravel futurist futurist romanc romanc space space alien alien tribe tribe alienplanet alienplanet cgi cgi marin marin soldier soldier battl battl loveaffair loveaffair antiwar antiwar powerrel powerrel mindandsoul mindandsoul 3d 3d samworthington samworthington zoesaldana zoesaldana sigourneyweav sigourneyweav stephenlang stephenlang michellerodriguez michellerodriguez jamescameron jamescameron'

In [34]:
#inbuilt function to remove stopwords i.e. 'are','at','is' etc and take count of 10000 most frequent words
cv = CountVectorizer(max_features=10000,stop_words='english')

In [35]:
#converting the count vectorizer into a 2d array format
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [36]:
#retrieving all the top 10000 words in alphabetical order
cv.get_feature_names()



['000',
 '007',
 '10',
 '100',
 '10th',
 '11',
 '12',
 '12th',
 '13',
 '14',
 '15',
 '150',
 '16',
 '16th',
 '16thcenturi',
 '17',
 '17th',
 '17thcenturi',
 '18',
 '1890',
 '18th',
 '18thcenturi',
 '19',
 '1910',
 '1920',
 '1930',
 '1930s',
 '1937',
 '1940',
 '1941',
 '1944',
 '1945',
 '1950',
 '1950s',
 '1955',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1964',
 '1965',
 '1967',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1976',
 '1977',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1985',
 '1990',
 '1994',
 '1995',
 '1997',
 '1999',
 '19th',
 '19thcenturi',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2008',
 '2009',
 '20th',
 '21st',
 '21stcenturi',
 '23',
 '24',
 '25',
 '27',
 '30',
 '300',
 '35',
 '3d',
 '40',
 '47',
 '50',
 '500',
 '50cent',
 '51',
 '60',
 '60s',
 '70',
 '7th',
 '80',
 'aaron',
 'aaroneckhart',
 'aaronpaul',
 'aarontaylor',
 'aaronyoo',
 'aasifmandvi',
 'abandon',
 'abbi',
 'abbiecornish',
 'abduct',
 'abhishekbachchan',
 'abigailbreslin',
 'abil',
 'abl

In [37]:
#inbuilt function which calculates the similarity on 10000d space and calculates the cosine angle between every two movies.
#based on similarity of words, it gives similarity score between 0 and 1.
# 1 meaning the movie is the same and more near the score is to 1, the more similar the movie.
similarity = cosine_similarity(vectors)
similarity

array([[1.        , 0.09237878, 0.09529862, ..., 0.05288859, 0.        ,
        0.        ],
       [0.09237878, 1.        , 0.08392737, ..., 0.00970371, 0.        ,
        0.01077451],
       [0.09529862, 0.08392737, 1.        , ..., 0.0120125 , 0.        ,
        0.        ],
       ...,
       [0.05288859, 0.00970371, 0.0120125 , ..., 1.        , 0.02842676,
        0.03701166],
       [0.        , 0.        , 0.        , ..., 0.02842676, 1.        ,
        0.0631273 ],
       [0.        , 0.01077451, 0.        , ..., 0.03701166, 0.0631273 ,
        1.        ]])

In [38]:
#recommend function takes movie as input and outputs top 5 movies to be recommended to the user
def recommend(movie):
    #movie_index gets the movie_id from the new_df table based on the title match
    movie_index = new_df[new_df['title'] == movie].index[0]
    #the similarity of the current movie with respect to all the others is stored in distances
    distances = similarity[movie_index]
    #we sort the distances in reverse order and store 1-5 indexed movies in variable movies_list 
    #at index 0, it would be the same movie, hence 1-5
    #we use enumerate function and store it in list so that we don't loose the actual corresponding index of the movie
    movies_list = sorted( list(enumerate(distances)), reverse = True, key = lambda x:x[1])[1:6]
    #function to print the titles of the movies present in movies_list
    for i in movies_list:
        print( new_df.iloc[i[0]].title )

In [39]:
#to create a .pkl file for storing the new_df dataframe
pickle.dump(new_df,open('movies.pkl','wb'))

In [40]:
#to create a .pkl file for storing the new_df dataframe after converting it into dictionary
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [41]:
#to create a .pkl file for storing the values of similarity
pickle.dump(similarity,open('similarity.pkl','wb'))