In [1]:
# Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import warnings
import nltk
import pickle
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

In [2]:
# Load and read data
path = './data/NETFLIX MOVIES AND TV SHOWS CLUSTERING.csv'
df = pd.read_csv(path)
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."


In [3]:
# Know your data
print('Shape : ',df.shape)
print()
print('# of duplicated rows : ',df.duplicated().sum())
print()
print('Columns : ',df.columns)
print()
print('Missing/Null count :')
print(df.isna().sum())
print()
print('Unique count :')
print(df.nunique())

Shape :  (7787, 12)

# of duplicated rows :  0

Columns :  Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

Missing/Null count :
show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

Unique count :
show_id         7787
type               2
title           7787
director        4049
cast            6831
country          681
date_added      1565
release_year      73
rating            14
duration         216
listed_in        492
description     7769
dtype: int64


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [5]:
df.describe(include='all')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
count,7787,7787,7787,5398,7069,7280,7777,7787.0,7780,7787,7787,7787
unique,7787,2,7787,4049,6831,681,1565,,14,216,492,7769
top,s1,Movie,3%,"Raúl Campos, Jan Suter",David Attenborough,United States,"January 1, 2020",,TV-MA,1 Season,Documentaries,Multiple women report their husbands as missin...
freq,1,5377,1,18,18,2555,118,,2863,1608,334,3
mean,,,,,,,,2013.93258,,,,
std,,,,,,,,8.757395,,,,
min,,,,,,,,1925.0,,,,
25%,,,,,,,,2013.0,,,,
50%,,,,,,,,2017.0,,,,
75%,,,,,,,,2018.0,,,,


In [6]:
def get_cast(string_,index):
    try:
        return string_.split()[index]
    except:
        return 'unknown'
# Columns to keep
col_to_keep =['type','title', 'director', 'cast', 'country', 'release_year', 'listed_in', 'description'] # I will keep only necessory columns
df = df[col_to_keep]
df['director'] = df['director'].fillna('unknown') # filling nan with unknown director
df['new_director'] = df['director'].apply(lambda x : x.replace(' ','').lower()) # concatenating director's name
df['new_cast'] = df['cast'].fillna('unknown') # filling nan with unknown cast
df['new_cast'] = df['new_cast'].apply(lambda x :' '.join([s.replace(' ','').lower() for s in x.split(',')])) # concateing cast's name
# Taking only three casts/actor/actress for this model
df['cast1'] = df['new_cast'].apply(lambda x: get_cast(x,0)) # cast1
df['cast2'] = df['new_cast'].apply(lambda x: get_cast(x,1)) # cast2
df['cast3'] = df['new_cast'].apply(lambda x: get_cast(x,3)) # cast3
df['country'] = df['country'].fillna('unknown') # fill na with unknown
df['tags'] = df['type']+' '+df['title']+' '+df['new_director']+' '+df['cast1']+' '+df['cast2']+' '+df['cast3']+' '+df['listed_in']
df['tags']= df['tags']+' '+df['country']+' '+df['description'] # making tags
df['tags'] = df['tags'].apply(lambda x: ''.join([s for s in x if s not in string.punctuation])) # Remove punctuations

In [7]:
# Preprocessing
def text_preprocess(texts):
    stemmer = SnowballStemmer(language='english')
    texts = [stemmer.stem(t).lower() for t in texts.split()]
    texts = [t for t in texts if t.isnumeric()==False]
    texts = [t for t in texts if t not in string.punctuation]
    return ' '.join(texts)

df['tags'] = df['tags'].apply(text_preprocess) # applying preprocess function for tags
df['tags'] = df['tags']+' '+str(df['release_year']) # concate year column

In [8]:
col_to_keep.extend(['tags'])
preprocessed_df = df[col_to_keep]
preprocessed_df.head(3)

Unnamed: 0,type,title,director,cast,country,release_year,listed_in,description,tags
0,TV Show,3%,unknown,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,2020,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...,tv show unknown joãomiguel biancacomparato rod...
1,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...,movi jorgemichelgrau demiánbichir héctorbonill...
2,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2011,"Horror Movies, International Movies","When an army recruit is found dead, his fellow...",movi gilbertchan teddchan stellachung lawrence...


In [9]:
count_vectorizer = CountVectorizer(stop_words='english', max_features=10000)
vectors = count_vectorizer.fit_transform(df['tags']).toarray()
similarity_scores = cosine_similarity(vectors)

In [10]:
# Save pickle file
!mkdir pickles
cols = ['title','director','country','release_year','listed_in']
pickle.dump(df[cols],open('./pickles/result_df.pkl','wb'))
pickle.dump(similarity_scores,open('./pickles/similarity_scores.pkl','wb')) # This is a large file 462MB therefore I am not uploading on github

A subdirectory or file pickles already exists.
