In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv("movies.csv")
credits = pd.read_csv("credits.csv")


In [3]:
movies = movies.merge(credits, on='title')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [4]:
movies = movies[['genres', 'movie_id', 'keywords', 'title', 'overview', 'cast', 'crew']]

In [5]:
movies.isnull().sum()

genres      0
movie_id    0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

In [6]:
movies.dropna(inplace=True)

In [7]:
movies.duplicated().sum()

np.int64(0)

In [8]:
import ast
def convert(obj):
    L = []
    for d in ast.literal_eval(obj):
        n = d['name']
        s = ''
        for i in n.split(" "):
            s += i.lower()
        L.append(s)
    return L

In [9]:
movies['genres'] = movies['genres'].apply(convert)

In [10]:
movies['keywords'] = movies['keywords'].apply(convert)

In [11]:
def convert_cast(obj):
    L = []
    counter = 0
    for d in ast.literal_eval(obj):
        n = d['name']
        s = ''
        for i in n.split(" "):
            s += i.lower()
        L.append(s)
        
        counter += 1
        if counter == 3:
            break
    return L

In [12]:
movies['cast'] = movies['cast'].apply(convert_cast)

In [13]:
def convert_crew(obj):
    L = []
    for d in ast.literal_eval(obj):
        if d["job"] == "Director":
            n = d['name']
            s = ''
            for i in n.split(" "):
                s += i.lower()
            L.append(s)
            break
    return L

In [14]:
movies['crew'] = movies['crew'].apply(convert_crew)

In [15]:
def convert_overview(s):
    L = []
    for i in s.split(" "):
        L.append(i.lower())
    return L

In [16]:
movies['overview'] = movies['overview'].apply(convert_overview)

In [17]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [18]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [19]:
movies.head(1)

Unnamed: 0,genres,movie_id,keywords,title,overview,cast,crew,tags
0,"[action, adventure, fantasy, sciencefiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[in, the, 22nd, century,, a, paraplegic, marin...","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron],"in the 22nd century, a paraplegic marine is di..."


In [20]:
new_df = movies[['movie_id', 'title', 'tags']]

In [21]:
new_df.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."


In [22]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [23]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [24]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [25]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')

In [26]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [27]:
print(cv.get_feature_names_out())

['000' '007' '10' ... 'zone' 'zoo' 'zooeydeschanel']


In [28]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [29]:
similarity

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]], shape=(4806, 4806))

In [30]:
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    sorted_distances = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])
    top_5_idx = [sorted_distances[i][0] for i in range(1,6)]
    top_5 = [new_df.iloc[x]['title'] for x in top_5_idx]
    return top_5

In [31]:
recommend('Se7en')

['Zodiac',
 'The Bone Collector',
 '2:13',
 'The Secret in Their Eyes',
 'In the Valley of Elah']

In [32]:
new_df['vectors'] = vectors.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['vectors'] = vectors.tolist()


In [33]:
import pickle 
pickle.dump(new_df.to_dict(), open('movies.pkl', 'wb'))

In [None]:
new_df['vectors']

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
4804    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4805    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4806    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4807    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4808    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: vectors, Length: 4806, dtype: object