In [None]:
# import libraries
import numpy as np
import pandas as pd

In [None]:
# Load the credits and movies datasets
credits=pd.read_csv("./data/tmdb_5000_credits.csv")
movies=pd.read_csv("./data/tmdb_5000_movies.csv")

In [None]:
movies.head()

In [None]:
movies.dtypes

In [None]:
credits.head(1)

In [None]:
credits['crew']

In [None]:
credits.dtypes

In [None]:
# Join the credits columns onto the movies frame using the shared 'title' column.
# NOTE(review): merge() performs an inner join by default and pairs every matching
# row, so a title that occurs more than once in either frame produces extra rows.
# Confirm titles are unique, or consider joining on the movie id columns instead.
movies = movies.merge(credits,on='title')

In [None]:
movies.shape

In [None]:
movies.head()

In [None]:
#genres
#id
#title
#overview (most important according to content)
#cast
#crew
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [None]:
movies.isnull().sum()

In [None]:
# 'overview' is the most important column, so rows with missing values are
# dropped rather than filled in by hand.
# Reassign instead of using inplace=True: 'movies' was produced by a column
# selection, and in-place edits on such a frame can raise SettingWithCopyWarning.
# The index is reset so that row labels match row positions, which the
# similarity-matrix lookups later in the notebook rely on.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.isnull().sum()

In [None]:
movies.duplicated().sum()

In [None]:
movies['genres']

In [None]:
movies.iloc[0].genres

In [None]:
import ast # used to convert string in to list

def convert(obj):
    """Parse a stringified list of dicts and return each dict's 'name' value.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key.

    Returns:
        list: The 'name' values, in their original order.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
movies['genres']=movies['genres'].apply(convert) #convert function is applied to each row of genres column


In [None]:
movies.head()

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)
movies.head()

In [None]:
movies.iloc[0].cast

In [None]:
# function to extract the leading actors (the first three by default) from the 'cast' column
def actor(obj, limit=3):
    """Return the names of the first ``limit`` entries of a stringified cast list.

    Args:
        obj: String holding a Python-literal list of dicts, each with a
            'name' key.
        limit: Maximum number of names to return; must be non-negative.
            Defaults to 3.

    Returns:
        list: Up to ``limit`` names, in the order they appear in ``obj``.

    Raises:
        ValueError: If ``limit`` is negative.
    """
    if limit < 0:
        # A negative slice bound would silently drop entries from the end.
        raise ValueError("limit must be non-negative")
    return [member['name'] for member in ast.literal_eval(obj)[:limit]]

In [None]:
movies['cast']=movies['cast'].apply(actor)

In [None]:
movies.head()

In [None]:
movies.iloc[0].crew

In [None]:
# Pull the director's name out of the stringified 'crew' list.
def fetch_director(obj):
    """Return a one-element list with the first crew member whose job is 'Director'.

    Args:
        obj: String holding a Python-literal list of dicts, each with 'job'
            and 'name' keys.

    Returns:
        list: ``[name]`` of the first entry with job 'Director', or an empty
        list when there is none.
    """
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [None]:
movies['crew']=movies['crew'].apply(fetch_director)

In [None]:
movies['crew'].isnull().sum()

In [None]:
movies.head()

In [None]:
movies['overview'][0]

In [None]:
# tokenize the 'overview' column because we have to convert the string to list
movies['overview']=movies['overview'].apply(lambda x:x.split()) # string to list

In [None]:
movies.head()

In [None]:
# Collapse each multi-word entry of the genre, cast and crew lists into a single
# token by deleting its spaces (a value like "First Last" becomes "FirstLast").
for column in ('genres', 'cast', 'crew'):
    movies[column] = movies[column].apply(
        lambda names: [name.replace(" ","") for name in names]
    )

In [None]:
movies.head()

In [None]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']


In [None]:
movies.head()

In [None]:
# Take an explicit copy so that later assignments to new_df['tags'] act on an
# independent frame rather than a slice of 'movies' (avoids SettingWithCopyWarning).
new_df = movies[['movie_id','title','tags']].copy()

In [None]:
new_df['tags']=new_df['tags'].apply(lambda x:" ".join(x))  # use " ".join to convert list to string with spaces

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [None]:
# Stemming reduces each word to its root form, so inflected variants of a word
# end up as the same token.
def stem(text):
    """Stem every whitespace-separated word of ``text`` with the stemmer ``ps``.

    Args:
        text: Input string.

    Returns:
        str: The stemmed words joined by single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
# apply stemming to the 'tags' column
new_df['tags']=new_df['tags'].apply(stem)

In [None]:
new_df['tags'][0]

In [None]:
# convert text to lowercase in the 'tags' column
new_df['tags']=new_df['tags'].apply(lambda x :x.lower())

In [None]:
new_df['tags'][0]

In [None]:
new_df['tags'][1]

In [None]:
# Text Vectorization

# Next, convert each movie's tags into a vector, so every movie becomes one vector (roughly 5,000 vectors for this dataset).
# To recommend movies, we pick the vectors closest to the chosen movie's vector.
# A bag-of-words representation is used for now,
# with English stop words removed.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [None]:
vectors=cv.fit_transform(new_df['tags']).toarray()

In [None]:
vectors.shape

In [None]:
vectors[0] # so many zero because we have 5000 words and our tag has not that much words

In [None]:
# Vocabulary kept by the vectorizer. Without stemming, variants such as
# accept / accepted / accepts would each take their own column; the tags were
# stemmed before vectorizing for that reason.
cv.get_feature_names_out()

In [None]:
# Stemming maps inflected forms to a common root, e.g. ['loved','loving','love'] => ['love','love','love'].
# After that, we compute the cosine similarity between the vectors.

In [None]:
# calculate cosine similarity between vectors
from sklearn.metrics.pairwise import cosine_similarity # it is between 0 and 1

In [None]:
vectors #every row of the array has 5000 columns

In [None]:
# calculate cosine similarity matrix
similarity=cosine_similarity(vectors)

In [None]:
similarity.shape 

In [None]:
similarity

In [None]:
sorted(list(enumerate(similarity[0])),reverse=True,key = lambda x: x[1])[1:6]

In [None]:
# Row position of 'Avatar'. The similarity matrix is indexed by row position,
# so look the title up positionally rather than by index label.
index = np.flatnonzero((new_df['title'] == 'Avatar').to_numpy())[0]

# (position, score) pairs ordered from most to least similar.
distances = sorted(enumerate(similarity[index]), key=lambda pair: pair[1], reverse=True)

In [None]:
distances

In [None]:
# a function to recommend similar movies
def recommend(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Args:
        movie: Exact title to look up in ``new_df['title']``.
        top_n: Number of recommendations to print. Defaults to 5.

    Raises:
        IndexError: If no row of ``new_df`` has the given title.
    """
    # similarity is indexed by row position, so find the position of the title
    # rather than its index label (the two can differ, e.g. after rows are dropped).
    positions = np.flatnonzero((new_df['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"no movie titled {movie!r} in new_df")
    position = positions[0]

    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie explicitly instead of assuming it sorts first;
    # another row with an equal score could otherwise push it into the results.
    neighbours = [row for row, _score in ranked if row != position][:top_n]
    for row in neighbours:
        print(new_df.iloc[row].title)

In [None]:
recommend('Avatar')

In [None]:
recommend('Batman Begins')

In [None]:
# Save the new dataset and similarity matrix to pickle files
import pickle

# Write each object inside a context manager so the file is flushed and closed
# as soon as the dump finishes.
with open('movie_list.pkl','wb') as movie_list_file:
    pickle.dump(new_df, movie_list_file)
with open('similarity.pkl','wb') as similarity_file:
    pickle.dump(similarity, similarity_file)