In [91]:
# Import all the required libraries
import numpy as np
import pandas as pd
from IPython.display import display
import ast
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from snowballstemmer import EnglishStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [58]:
# Reading the csv file that contain all the credits
credit = pd.read_csv('tmdb_credits.csv')

In [59]:
# Reading the csv file that contain all the movies
movie = pd.read_csv('tmdb_movies.csv')

In [61]:
# Merge both dataframes on title column
movies = movie.merge(credit, on='title')

In [62]:
### These are the columns that are useful for us
# genres
# id
# keywords
# title
# overview
# cast
# crew

In [63]:
# Just for checking which language is dominating in languages
# movies['original_language'].value_counts()

In [64]:
# Making our final dataframe
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [65]:
# For checking and removing duplicates and null values
movies.duplicated().sum()
movies.isna().sum()
movies = movies.dropna()
movies.isna().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [69]:
# The function’s goal is to extract only the name field from each dictionary within the list and 
# return these names as a simple list of strings
def convert_genres_and_keywords(obj):
    lst = []
    for i in ast.literal_eval(obj):
        lst.append(i['name'])
    return lst

In [67]:
# Applying convert_genres_and_keywords function 
movies['genres'] = movies['genres'].apply(convert_genres_and_keywords)

In [68]:
# Applying convert_genres_and_keywords function 
movies['keywords'] = movies['keywords'].apply(convert_genres_and_keywords)

In [72]:
# The function’s goal is to extract only the name field from each dictionary within the list and 
# return these names as a simple list of strings (only first 3)
def convert_cast(obj):
    lst = []
    for i in ast.literal_eval(obj)[0:3]:
        lst.append(i['name'])
    return lst    

In [71]:
# Applying convert_cast function 
movies['cast'] = movies['cast'].apply(convert_cast)

In [73]:
# The function’s goal is to extract only the name field from each dictionary within the list and 
# return these names as a simple list of strings (where job is director)
def convert_crew(obj):
    lst = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            lst.append(i['name'])
    return lst        

In [26]:
movies['crew'] = movies['crew'].apply(convert_crew)

In [74]:
# For making a list of overview
movies['overview'] = movies['overview'].apply(lambda x : x.split())

In [75]:
# To remove spaces from these columns
movies['genres'] = movies['genres'].apply(lambda x : [i.replace(' ', '') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x : [i.replace(' ', '') for i in x])
movies['cast'] = movies['cast'].apply(lambda x : [i.replace(' ', '') for i in x])
movies['crew'] = movies['crew'].apply(lambda x : [i.replace(' ', '') for i in x])

In [76]:
# Making our main column of tag
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [77]:
# Creating a new dataframe that contain these three columns : 'movie_id', 'title', 'tags'
new_df = movies[['movie_id', 'title', 'tags']]

In [78]:
# For converting each list of tags in the tags column into a single string,
# where tags are separated by spaces
new_df['tags'] = new_df['tags'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: ' '.join(x))


In [79]:
# Converting it to lower case
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())


In [80]:
# For removing english stop words
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [84]:
# import nltk
# from nltk.stem.porter import PorterStemmer
# ps = PorterStemmer()

# We can also use the method above

# In easy words it just reduce the length of words

stemmer = EnglishStemmer()
def stem(text):
    y = [stemmer.stemWord(word) for word in text.split()]
    return ' '.join(y)   
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [85]:
# This line converts the text data in new_df['tags'] into a numerical matrix where each row 
# represents a document and each column represents the count of a specific word from the 
# vocabulary.
vectors = cv.fit_transform(new_df['tags']).toarray()

In [87]:
# It gives you the list of vocabulary words that were used to create the word count vectors
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [88]:
# Calculates similarity in each document/(row) to every other row based on the word count vectors.
similarity = cosine_similarity(vectors)

In [89]:
# It is recommending the most similar 5 movies 
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x:x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [90]:
# For checking
recommend('Batman')

Batman
Batman & Robin
The Dark Knight Rises
Batman Begins
Batman Returns


In [92]:
# Creating a file for saving and deploying
pickle.dump(new_df.to_dict(),open('movie_dict.pkl','wb'))

In [93]:
# Creating a file for saving and deploying
pickle.dump(similarity,open('similarity.pkl','wb'))