In [1]:
import numpy as np
import pandas as pd
import ast

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies = movies.merge(credits, on="title")

In [5]:
#columns to keep
# genres
# id
# keywords
# title
# overview
# cast
# crew

In [9]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [11]:
# Handling misssing datas and then removing it
movies.isnull().sum()
movies.dropna(inplace = True)
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [13]:
movies.duplicated().sum()

0

In [15]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [17]:
# Removing the genres of the movies

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [19]:
movies['genres'] = movies['genres'].apply(convert)

In [21]:
movies['keywords'] = movies['keywords'].apply(convert) 

In [23]:
# this function is just taking the first three cast (the main 3 actors) and just keeping it in the final

def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

In [25]:
movies['cast'] = movies['cast'].apply(convert3) 

In [27]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [29]:
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

In [31]:
movies['crew'] = movies['crew'].apply(fetch_director)

In [32]:
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [33]:
# we are replacing the spaces with no space and making sure that the tags being created are not repative withe the first names
# and making sure the uniqueness of the tags e.g. Sam worthington -> SamWorthington and Sam Mendes to SamMendes

movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x]) 
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x]) 
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x]) 
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [37]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [39]:
new_df = movies[['movie_id','title', 'tags']]

In [41]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [43]:
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [45]:
import nltk

In [46]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [49]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [51]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [52]:
from sklearn.feature_extraction.text import CountVectorizer 
cv = CountVectorizer(max_features=5000, stop_words='english')

In [53]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [54]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [55]:
# to measure the fistance between 2 movie we won't be using the eucledian distance beacuse of the curse of dimensionality. 
# rather than calculating the distance between the tips of the movies in the vecotr form we will be calculating the angle between 
# the vectors of the movies. This is known as the cosine distance between the vectors.

In [61]:
from sklearn.metrics.pairwise import cosine_similarity

In [63]:
similarity = cosine_similarity(vectors)

In [65]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate (distances)), reverse=True, key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)

# sorted(list(enumerate (similarity[0])), reverse=True, key=lambda x:x[1])
# Here similiarit[0] will be done by the distances variable

In [67]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [76]:
import pickle

In [82]:
pickle.dump(new_df.to_dict(), open('movie_dict.pkl','wb'))

In [84]:
pickle.dump(similarity, open('similarity.pkl','wb'))