In [1]:
# importing the required libraries
import numpy as np
import pandas as pd

In [2]:
# reading the datasets
movies = pd.read_csv("10000 Movies Data.csv")
credits = pd.read_csv("10000 Credits Data.csv")

In [3]:
# checking the head of movies dataset
movies.head(2)

Unnamed: 0.1,Unnamed: 0,Movie_id,title,Genres,release_date,Keywords,overview,poster_path,Budget,Revenue,popularity,vote_average,vote_count
0,0,238,The Godfather,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1972-03-14,"[{'id': 131, 'name': 'italy'}, {'id': 697, 'na...","Spanning the years 1945 to 1955, a chronicle o...",/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,245066411,245066411,93.552,8.7,16814
1,1,278,The Shawshank Redemption,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1994-09-23,"[{'id': 378, 'name': 'prison'}, {'id': 417, 'n...",Framed in the 1940s for the double murder of h...,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,28341469,28341469,78.664,8.7,22542


In [4]:
# checking the head of credits dataset
credits.head(2)

Unnamed: 0.1,Unnamed: 0,Movie_id,title,Cast,Crew
0,0,238,The Godfather,"[{'adult': False, 'gender': 2, 'id': 3084, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno..."
1,1,278,The Shawshank Redemption,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno..."


In [5]:
# getting an overview of movies dataset
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9980 entries, 0 to 9979
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    9980 non-null   int64  
 1   Movie_id      9980 non-null   int64  
 2   title         9980 non-null   object 
 3   Genres        9980 non-null   object 
 4   release_date  9980 non-null   object 
 5   Keywords      9980 non-null   object 
 6   overview      9974 non-null   object 
 7   poster_path   9980 non-null   object 
 8   Budget        9980 non-null   int64  
 9   Revenue       9980 non-null   int64  
 10  popularity    9980 non-null   float64
 11  vote_average  9980 non-null   float64
 12  vote_count    9980 non-null   int64  
dtypes: float64(2), int64(5), object(6)
memory usage: 1013.7+ KB


In [6]:
# getting an overview of credits data
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9980 entries, 0 to 9979
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  9980 non-null   int64 
 1   Movie_id    9980 non-null   int64 
 2   title       9980 non-null   object
 3   Cast        9980 non-null   object
 4   Crew        9980 non-null   object
dtypes: int64(2), object(3)
memory usage: 390.0+ KB


* The movies dataset has several columns with missing values. We need to address them.
* The **credits** dataset does not have any missing values.

We can see that both cast and crew columns have values which are included within inverted commas (""). We can use *eval()* function to access the list items.

For ease of analysis, we will merge the two dataset as they have a couple of columns in common.

In [7]:
movies_data = pd.merge(movies, credits, on = "Movie_id")

In [8]:
# checking the information of the merged dataframe
movies_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9980 entries, 0 to 9979
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0_x  9980 non-null   int64  
 1   Movie_id      9980 non-null   int64  
 2   title_x       9980 non-null   object 
 3   Genres        9980 non-null   object 
 4   release_date  9980 non-null   object 
 5   Keywords      9980 non-null   object 
 6   overview      9974 non-null   object 
 7   poster_path   9980 non-null   object 
 8   Budget        9980 non-null   int64  
 9   Revenue       9980 non-null   int64  
 10  popularity    9980 non-null   float64
 11  vote_average  9980 non-null   float64
 12  vote_count    9980 non-null   int64  
 13  Unnamed: 0_y  9980 non-null   int64  
 14  title_y       9980 non-null   object 
 15  Cast          9980 non-null   object 
 16  Crew          9980 non-null   object 
dtypes: float64(2), int64(6), object(9)
memory usage: 1.4+ MB


In [9]:
# checking for null values
movies_data.isnull().sum().sort_values(ascending=False)

overview        6
Unnamed: 0_x    0
Revenue         0
Cast            0
title_y         0
Unnamed: 0_y    0
vote_count      0
vote_average    0
popularity      0
Budget          0
Movie_id        0
poster_path     0
Keywords        0
release_date    0
Genres          0
title_x         0
Crew            0
dtype: int64

Since, we only have 6 NaN values in the entire dataset, we will get rid of them in order to make the recommendation system more efficient.

In [10]:
movies_data.dropna(inplace=True)

In [11]:
movies_data.rename(columns = {"title_x":"Title"}, inplace = True)

In [12]:
movies_data.head(2)

Unnamed: 0,Unnamed: 0_x,Movie_id,Title,Genres,release_date,Keywords,overview,poster_path,Budget,Revenue,popularity,vote_average,vote_count,Unnamed: 0_y,title_y,Cast,Crew
0,0,238,The Godfather,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1972-03-14,"[{'id': 131, 'name': 'italy'}, {'id': 697, 'na...","Spanning the years 1945 to 1955, a chronicle o...",/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,245066411,245066411,93.552,8.7,16814,0,The Godfather,"[{'adult': False, 'gender': 2, 'id': 3084, 'kn...","[{'adult': False, 'gender': 2, 'id': 154, 'kno..."
1,1,278,The Shawshank Redemption,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",1994-09-23,"[{'id': 378, 'name': 'prison'}, {'id': 417, 'n...",Framed in the 1940s for the double murder of h...,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,28341469,28341469,78.664,8.7,22542,1,The Shawshank Redemption,"[{'adult': False, 'gender': 2, 'id': 504, 'kno...","[{'adult': False, 'gender': 2, 'id': 153, 'kno..."


In [13]:
# storing only the required columns
movies_data = movies_data[["Movie_id", "Title", "Genres", "Keywords", "overview", "Cast", "Crew"]]

In [14]:
# checking for duplicated records
movies_data.duplicated().sum()

0

Since, each record in the columns genre, keywords, cast and crew is a list enclosed within " ", we can use **eval()** function to get the list item out of each record.

In [15]:
eval(movies_data["Genres"][0])[0]["name"]

'Drama'

In [16]:
for item in eval(movies_data["Genres"][29]):
    print(item["name"])

Animation
Drama
War


In [17]:
# function to extract the values of a column
def extract(col):
    list_ = []
    for item in eval(col):
        list_.append(item["name"])
    return list_

In [18]:
movies_data["Genres"] = movies_data["Genres"].apply(extract)

In [19]:
movies_data["Keywords"] = movies_data["Keywords"].apply(extract)

In [20]:
def top_actors(col):
    list_ = []
    counter = 0
    for item in eval(col):
        if counter < 3:
            list_.append(item["name"])
            counter+=1
    return list_

In [21]:
# getting the top 3 actors
movies_data["Cast"] = movies_data["Cast"].apply(top_actors)

In [22]:
counter = 0
for item in movies_data["Cast"].values:
    
    if len(item)<3:
        counter+=1
        #print(item)
print(counter)

68


In [23]:
def get_director(col):
    list_ = []
    for crew in eval(col):
        if crew["job"] == "Director":
            list_.append(crew["name"])
    return list_

In [24]:
movies_data["director"] = movies_data["Crew"].apply(get_director)

In [25]:
movies_data["overview"] = movies_data["overview"].apply(lambda x:x.split())

In [34]:
movies_data["Keywords"].value_counts(10)

[]                                                                                                           0.060457
[woman director]                                                                                             0.003609
[anime]                                                                                                      0.001604
[based on novel or book]                                                                                     0.000902
[short film]                                                                                                 0.000902
                                                                                                               ...   
[mermaid, princess, based on toy]                                                                            0.000100
[video game]                                                                                                 0.000100
[prison, based on novel or book, biography, cincinnati, 

In [33]:
movies_data["Keywords"] = movies_data["Keywords"].apply(lambda x:x.split())

AttributeError: 'list' object has no attribute 'split'

In [26]:
movies_data.drop(columns=['Crew'], inplace=True)

In [27]:
# function to replace the spaces between words
def formatter(col):
    list_ = []
    for item in col:
        list_.append(item.replace(" ", ""))
    return list_

In [28]:
movies_data["Genres"] = movies_data["Genres"].apply(formatter)
movies_data["Cast"] = movies_data["Cast"].apply(formatter)
movies_data["director"] = movies_data["director"].apply(formatter)

In [29]:
movies_data.head(2)

Unnamed: 0,Movie_id,Title,Genres,Keywords,overview,Cast,director
0,238,The Godfather,"[Drama, Crime]","[italy, loss of loved one, love at first sight...","[Spanning, the, years, 1945, to, 1955,, a, chr...","[MarlonBrando, AlPacino, JamesCaan]",[FrancisFordCoppola]
1,278,The Shawshank Redemption,"[Drama, Crime]","[prison, corruption, police brutality, based o...","[Framed, in, the, 1940s, for, the, double, mur...","[TimRobbins, MorganFreeman, BobGunton]",[FrankDarabont]


In [30]:
movies_data["Keywords"]

0       [italy, loss of loved one, love at first sight...
1       [prison, corruption, police brutality, based o...
2       [italy, italian american, cuba, symbolism, gan...
3                                                      []
4       [based on novel or book, factory, concentratio...
                              ...                        
9975                                  [crime boss, heist]
9976    [monster, darkness, desert, archaeologist, fou...
9977                                          [christmas]
9978            [competition, career, sports, automobile]
9979    [poltergeist, haunted house, remake, duringcre...
Name: Keywords, Length: 9974, dtype: object

In [31]:
movies_data["overview"]

0       [Spanning, the, years, 1945, to, 1955,, a, chr...
1       [Framed, in, the, 1940s, for, the, double, mur...
2       [In, the, continuing, saga, of, the, Corleone,...
3       [Raj, is, a, rich,, carefree,, happy-go-lucky,...
4       [The, true, story, of, how, businessman, Oskar...
                              ...                        
9975    [After, a, botched, heist,, Eddie, a, murderou...
9976    [An, archaeological, team, attempt, to, unlock...
9977    [8-year-old, Finn, is, terrified, to, learn, h...
9978    [Talented, rookie, race-car, driver, Jimmy, Bl...
9979    [A, family's, suburban, home, is, invaded, by,...
Name: overview, Length: 9974, dtype: object

In [31]:
movies_data["combination"] = movies_data["Genres"] + movies_data["Keywords"] + movies_data["overview"] + movies_data["Cast"] + movies_data["director"]

In [32]:
movies_final = movies_data[["Movie_id", "Title", "combination"]]

In [33]:
movies_final.head(2)

Unnamed: 0,Movie_id,Title,combination
0,238,The Godfather,"[Drama, Crime, italy, loss of loved one, love ..."
1,278,The Shawshank Redemption,"[Drama, Crime, prison, corruption, police brut..."


In [34]:
movies_final.loc[:, "combination"] = movies_final.loc[:, "combination"].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_final.loc[:, "combination"] = movies_final.loc[:, "combination"].apply(lambda x:" ".join(x))


In [35]:
movies_final["combination"] = movies_final["combination"].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_final["combination"] = movies_final["combination"].apply(lambda x:x.lower())


In [36]:
movies_final["combination"]

0       drama crime italy loss of loved one love at fi...
1       drama crime prison corruption police brutality...
2       drama crime italy italian american cuba symbol...
3       comedy drama romance raj is a rich, carefree, ...
4       drama history war based on novel or book facto...
                              ...                        
9975    action crime thriller crime boss heist after a...
9976    horror monster darkness desert archaeologist f...
9977    comedy family tvmovie christmas 8-year-old fin...
9978    action competition career sports automobile ta...
9979    horror poltergeist haunted house remake during...
Name: combination, Length: 9974, dtype: object

In [37]:
import nltk
from nltk.stem import PorterStemmer

In [38]:
movies_final["combination"][0]

'drama crime italy loss of loved one love at first sight based on novel or book europe symbolism patriarch organized crime mafia lawyer religion revenge motive crime family sicilian mafia religious hypocrisy gun violence rise to power dead horse gang violence 1940s 1950s mafia war part of trilogy spanning the years 1945 to 1955, a chronicle of the fictional italian-american corleone crime family. when organized crime family patriarch, vito corleone barely survives an attempt on his life, his youngest son, michael steps in to take care of the would-be killers, launching a campaign of bloody revenge. marlonbrando alpacino jamescaan francisfordcoppola'

In [39]:
def stem(word):
    ps = PorterStemmer()
    return ps.stem(word)

In [40]:
movies_final["combination"] = movies_final["combination"].apply(lambda x:" ".join([stem(word) for word in x.split()]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_final["combination"] = movies_final["combination"].apply(lambda x:" ".join([stem(word) for word in x.split()]))


In [41]:
movies_final["combination"]

0       drama crime itali loss of love one love at fir...
1       drama crime prison corrupt polic brutal base o...
2       drama crime itali italian american cuba symbol...
3       comedi drama romanc raj is a rich, carefree, h...
4       drama histori war base on novel or book factor...
                              ...                        
9975    action crime thriller crime boss heist after a...
9976    horror monster dark desert archaeologist found...
9977    comedi famili tvmovi christma 8-year-old finn ...
9978    action competit career sport automobil talent ...
9979    horror poltergeist haunt hous remak duringcred...
Name: combination, Length: 9974, dtype: object

**CountVectorizer** is a machine learning algorithm that is used to convert text data into a numerical format that can be used by other machine learning algorithms. It does this by counting the number of times each word appears in a document. The resulting numerical representation of the text is called a bag-of-words model.

In [42]:
# importing CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=10000, stop_words="english")

In [43]:
vectors = cv.fit_transform(movies_final["combination"]).toarray()

In [44]:
vectors.shape

(9974, 10000)

In [45]:
# def stem(text):
#     list_ = []
#     for i in text.split():
#         list_.append(PorterStemmer.stem(i))
        
#     return " ".join(list_)

In [None]:
#movies_final["tags"] = movies_final["tags"].apply(stem)

**Cosine similarity** is a measure of similarity between two non-zero vectors defined in an inner product space. Cosine similarity is the cosine of the angle between the vectors; that is, it is the dot product of the vectors divided by the product of their lengths. It follows that the cosine similarity does not depend on the magnitudes of the vectors, but only on their angle. The cosine similarity always belongs to the interval [-1, 1].

In [46]:
from sklearn.metrics.pairwise import cosine_similarity
cs = cosine_similarity(vectors)

In [47]:
cs.shape

(9974, 9974)

In [48]:
cs

array([[1.        , 0.18114575, 0.48560714, ..., 0.11468293, 0.09157015,
        0.07777628],
       [0.18114575, 1.        , 0.07587039, ..., 0.15203028, 0.20135435,
        0.02025271],
       [0.48560714, 0.07587039, 1.        , ..., 0.05822225, 0.        ,
        0.05429253],
       ...,
       [0.11468293, 0.15203028, 0.05822225, ..., 1.        , 0.18298126,
        0.13987572],
       [0.09157015, 0.20135435, 0.        , ..., 0.18298126, 1.        ,
        0.        ],
       [0.07777628, 0.02025271, 0.05429253, ..., 0.13987572, 0.        ,
        1.        ]])

In [50]:
sorted(list(enumerate(cs[0])), key=lambda x:x[1], reverse=True)[1:6]

[(2, 0.4856071418767538),
 (1633, 0.42294442611014477),
 (6583, 0.3956282840374722),
 (9782, 0.3808554565920136),
 (5642, 0.3581885095735786)]

In [51]:
def recommender(movie):
    movie_index = movies_final[movies_final["Title"]==movie].index[0] # to extract movie information
    similiarity = cs[movie_index] # access the list element with index = movie_index
    index_dist_pair = sorted(list(enumerate(similiarity)), key = lambda x:x[1], reverse=True)[1:6] # extract the top 5 recommended movies
    
    for i in index_dist_pair:
        print(movies_final.iloc[i[0]].Title)

In [52]:
movies_final["Title"][0:5]

0                  The Godfather
1       The Shawshank Redemption
2          The Godfather Part II
3    Dilwale Dulhania Le Jayenge
4               Schindler's List
Name: Title, dtype: object

In [53]:
recommender("The Godfather")

The Godfather Part II
The Godfather Part III
The Air I Breathe
Gotti
The Informer


In [54]:
import pickle

In [55]:
pickle.dump(movies_final.to_dict(), open("movies.pkl", "wb"))
pickle.dump(cs, open("similarity.pkl", "wb"))