In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
import gc
gc.collect()

0

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
#movies.head(1)

In [4]:
#credits.shape
#movies.shape

In [5]:
# Merge both dateset into single data frame
movies = movies.merge(credits, on='title')

In [6]:
#movies.shape

In [7]:
#movies.head(1)
#movies.info()

In [8]:
# Only Neccessary columns will keep
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [9]:
#movies.head(1)

In [10]:
# Missing Data
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [11]:
# Drop Blank Data 
movies.dropna(inplace=True)

In [12]:
# find duplicate data
movies.duplicated().sum()

np.int64(0)

In [13]:
#movies.iloc[0].genres

In [14]:
# need to update json object to simple values format
# ast.literal_eval() is a function in Python's ast (Abstract Syntax Tree) module. 
# Its primary purpose is to safely evaluate a string containing a Python literal or a container display into its corresponding Python object. 
def convertObjectToValues(obj):
    list_values = []
    for i in ast.literal_eval(obj):
        list_values.append(i['name'])
    return list_values
        

In [15]:
movies['genres'] = movies['genres'].apply(convertObjectToValues)

In [16]:
# movies.head()

In [17]:
# movies['keywords'].apply(convertObjectToValues)

In [18]:
movies['keywords']=movies['keywords'].apply(convertObjectToValues)

In [19]:
# movies.head()

In [20]:
def convertTop3ObjectToValues(obj):
    list_Values = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
           list_Values.append(i['name'])
           counter+=1
        else:
            break
    return list_Values        
            
            
        

In [21]:
# movies['cast'].apply(convertTop3ObjectToValues)

In [22]:
movies['cast']=movies['cast'].apply(convertTop3ObjectToValues)

In [23]:
# movies.head()

In [24]:
# movies['crew'][0]

In [25]:
def fetchCrewDirector(obj):
    list_Values = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
           list_Values.append(i['name'])
           break
    return list_Values

In [26]:
#movies['crew'].apply(fetchCrewDirector)

In [27]:
movies['crew']=movies['crew'].apply(fetchCrewDirector)

In [28]:
#movies.head()

In [29]:
#movies['overview'][0]

In [30]:
#movies['overview'].apply(lambda x:x.split())

In [31]:
movies['overview']=movies['overview'].apply(lambda x:x.split())

In [32]:
#movies.head()

In [33]:
#movies.head()

In [34]:
# Remove space between words
#movies['genres'].apply(lambda x:[i.replace(" ","")for i in x])

In [35]:
movies['genres']=movies['genres'].apply(lambda x:[i.replace(" ","")for i in x])
movies['keywords']=movies['keywords'].apply(lambda x:[i.replace(" ","")for i in x])
movies['cast']=movies['cast'].apply(lambda x:[i.replace(" ","")for i in x])
movies['crew']=movies['crew'].apply(lambda x:[i.replace(" ","")for i in x])

In [36]:
#movies.head()

In [37]:
# Create New Column as Tags
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [38]:
# Create New Data Frame
newMovieDF = movies[['movie_id','title','tags']]

In [39]:
#newMovieDF.head()

In [40]:
#newMovieDF['tags'].apply(lambda x:" ".join(x))

In [41]:
newMovieDF['tags']=newMovieDF['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newMovieDF['tags']=newMovieDF['tags'].apply(lambda x:" ".join(x))


In [42]:
# newMovieDF.head()

In [43]:
# convert string into lower case
newMovieDF['tags'] = newMovieDF['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newMovieDF['tags'] = newMovieDF['tags'].apply(lambda x:x.lower())


In [44]:
#newMovieDF.head()

In [45]:
# Word to Vectorization via SKLearn and import CountVectorizer
#cv = CountVectorizer(max_features=5000,stop_words='english')

In [46]:
#vectors = cv.fit_transform(newMovieDF['tags']).toarray()

In [47]:
#vectors

In [48]:
#cv.get_feature_names_out()

In [49]:
# need to use stem function for same type of words like 
# ['loved','loving','love'] if we will apply stem function on these then it will look like
# ['love','love','love']
import nltk

In [50]:
from nltk.stem.porter import PorterStemmer

In [51]:
ps = PorterStemmer()

In [52]:
# create helper function for every words stem
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)
        

In [53]:
#newMovieDF['tags'].apply(stem)

In [54]:
newMovieDF['tags'] = newMovieDF['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newMovieDF['tags'] = newMovieDF['tags'].apply(stem)


In [55]:
#newMovieDF.head().tags

In [56]:
# Word to Vectorization via SKLearn and import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [57]:
vectors = cv.fit_transform(newMovieDF['tags']).toarray()

In [58]:
#vectors

In [59]:
#cv.get_feature_names_out()

In [60]:
# now we will get the distance for each movie vector with cosine formula from SKLearn
from sklearn.metrics.pairwise import cosine_similarity

In [61]:
similarity = cosine_similarity(vectors)

In [62]:
#similarity[0].shape

In [63]:
#vectors

In [64]:
# fetch top 10 similar movies
def recommend(movie):
    index = newMovieDF[newMovieDF['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:11]:
        print(newMovieDF.iloc[i[0]].title)          

In [65]:
#recommend('Batman Begins')

In [66]:
import pickle, gzip

In [69]:
import sys
print(sys.getsizeof(similarity))

184781152


In [None]:
#pickle.dump(similarity,open('similaritylist.pkl','wb'))
with open('similarity.pkl', 'wb') as f:
    pickle.dump(similarity, f)

#with gzip.open('similarity.pkl.gz', 'rb') as f:
 #   similarity = pickle.load(f)

In [None]:
# pickle.dump(newMovieDF,open('movies.pkl','wb'))