# Steps

1. General dataset exploration
2. Basic data visualisation
3. Tokenize text data
4. Build autencoder and clustering layer
5. Visualize the cluster with Seaborn
6. Interactive scatterplot with Bokeh

# Load the Dataset and Validate 5 Samples

In [2]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML


In [3]:
INPUT_DIR_NETFLIX = "data/netflix_titles.csv"

df_netflix = pd.read_csv(INPUT_DIR_NETFLIX)
df_netflix.sample(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
51,s52,Movie,InuYasha the Movie 2: The Castle Beyond the Lo...,Toshiya Shinohara,"Kappei Yamaguchi, Satsuki Yukino, Mieko Harada...",Japan,"September 15, 2021",2002,TV-14,99 min,"Action & Adventure, Anime Features, Internatio...","With their biggest foe seemingly defeated, Inu..."
7454,s7455,Movie,Midnight Special,Jeff Nichols,"Michael Shannon, Joel Edgerton, Kirsten Dunst,...","United States, Greece","September 7, 2020",2016,PG-13,112 min,"Action & Adventure, Sci-Fi & Fantasy","When his son exhibits supernatural powers, a f..."
3983,s3984,Movie,ReMastered: The Miami Showband Massacre,Stuart Sender,,United States,"March 22, 2019",2019,TV-MA,71 min,"Documentaries, Music & Musicals",The killing of three members of the Miami Show...
6176,s6177,Movie,Anwar,Manish Jha,"Siddharth Koirala, Nauheed Cyrusi, Manisha Koi...",India,"December 31, 2019",2007,TV-14,134 min,"Dramas, Independent Movies, International Movies",Seeking refuge in a Hindu temple after the tra...
671,s672,Movie,Mobile Suit Gundam III: Encounters in Space,"Yoshiyuki Tomino, Yoshikazu Yasuhiko","Toru Furuya, Shuichi Ikeda, Hirotaka Suzuoki, ...",,"June 19, 2021",1982,TV-MA,140 min,"Action & Adventure, Anime Features, Internatio...",The Earth Federation prepares to take the war ...


# Data Cleansing and Fetching Movie Data

1. Remove duplications
2. Replace missing director data with 'no data'
3. Drop NA records 

In [4]:
df_netflix['director'].replace(np.nan, 'No Data',inplace  = True)

# Drops

df_netflix.dropna(inplace=True)

# Drop Duplicates

df_netflix.drop_duplicates(inplace= True)

df_movies_netflix = df_netflix.loc[df_netflix['type'] == 'Movie']

# verify dataframe
print(df_movies_netflix.isnull().sum())
print(df_movies_netflix.info())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5277 entries, 7 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       5277 non-null   object
 1   type          5277 non-null   object
 2   title         5277 non-null   object
 3   director      5277 non-null   object
 4   cast          5277 non-null   object
 5   country       5277 non-null   object
 6   date_added    5277 non-null   object
 7   release_year  5277 non-null   int64 
 8   rating        5277 non-null   object
 9   duration      5277 non-null   object
 10  listed_in     5277 non-null   object
 11  description   5277 non-null   object
dtypes: int64(1), object(11)
memory usage: 535.9+ KB
None


# Feature Selection for Clustering

- Lets start with just the description.

- Preprocess and tokenize the description


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans

In [6]:
text_content = df_movies_netflix['description']
vector = TfidfVectorizer(max_df = 0.4,           # drop words that occur more than max_df %
                         stop_words = 'english', # remove stop words
                         lowercase = True,       # everything to lowercase
                         use_idf = True,
                         norm = u'l2',
                         smooth_idf = True       # prevent divide by zero errors
                         )

tfidf = vector.fit_transform(text_content)

In [7]:
k = 5
kmeans = MiniBatchKMeans(n_clusters = k)
kmeans.fit(tfidf)
centres = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vector.get_feature_names()

request_transform = vector.transform(df_movies_netflix['description'])
df_movies_netflix['cluster'] = kmeans.predict(request_transform)
df_movies_netflix['cluster'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_netflix['cluster'] = kmeans.predict(request_transform)


0    4075
2     700
4     204
1     171
3     127
Name: cluster, dtype: int64

## Classification

### Preprocessing

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [9]:
n_most_common_words = 8000
max_len = 130
tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df_movies_netflix['description'].values)
sequences = tokenizer.texts_to_sequences(df_movies_netflix['description'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = pad_sequences(sequences, maxlen=max_len)

Found 15155 unique tokens.


In [10]:
categorized_clustered_labels = to_categorical(df_movies_netflix['cluster'], num_classes=5)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X , categorized_clustered_labels, test_size=0.25, random_state=42)

In [12]:
epochs = 10
emb_dim = 128
batch_size = 256

In [13]:
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

In [15]:
print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))

model = Sequential()
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

((3957, 130), (3957, 5), (1320, 130), (1320, 5))
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 130, 128)          1024000   
                                                                 
 spatial_dropout1d_1 (Spatia  (None, 130, 128)         0         
 lDropout1D)                                                     
                                                                 
 lstm_1 (LSTM)               (None, 64)                49408     
                                                                 
 dense_1 (Dense)             (None, 5)                 325       
                                                                 
Total params: 1,073,733
Trainable params: 1,073,733
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10


2022-04-11 00:10:47.594566: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2022-04-11 00:19:07.537276: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
