# Steps

1. General dataset exploration
2. Basic data visualisation
3. Tokenize text data
4. Build autencoder and clustering layer
5. Visualize the cluster with Seaborn
6. Interactive scatterplot with Bokeh

# Load the Dataset and Validate 5 Samples

In [49]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML

In [50]:
INPUT_DIR_NETFLIX = "netflix_titles.csv"

df_netflix = pd.read_csv(INPUT_DIR_NETFLIX)
df_netflix.sample(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
813,s814,TV Show,The Adventures of Sonic the Hedgehog,,"Jaleel White, Long John Baldry, Garry Chalk","United States, Canada","June 2, 2021",1993,TV-Y7,1 Season,Kids' TV,"Hyper hedgehog Sonic and his cohort Miles ""Tai..."
5582,s5583,Movie,Amy Schumer: The Leather Special,Amy Schumer,Amy Schumer,United States,"March 7, 2017",2017,TV-MA,57 min,Stand-Up Comedy,"Comic sensation Amy Schumer riffs on sex, dati..."
289,s290,TV Show,The Crowned Clown,,"Yeo Jin-goo, Lee Se-young, Kim Sang-kyung, Jun...",South Korea,"August 10, 2021",2019,TV-14,1 Season,"International TV Shows, Romantic TV Shows, TV ...","Standing in for an unhinged Joseon king, a loo..."
5272,s5273,TV Show,VeggieTales in the City,,"Phil Vischer, Mike Nawrocki, Rob Paulsen, Tres...",United States,"September 15, 2017",2017,TV-Y,2 Seasons,Kids' TV,"With exciting trips to the big city, the ski s..."
1173,s1174,TV Show,Men on a Mission,Jung-ah Im,"Ho-dong Kang, Soo-geun Lee, Sang-min Lee, Youn...",South Korea,"March 23, 2021",2021,TV-14,6 Seasons,"International TV Shows, Korean TV Shows, Stand...",Male celebs play make-believe as high schooler...


# Data Cleansing and Fetching Movie Data

1. Remove duplications
2. Replace missing director data with 'no data'
3. Drop NA records 

In [51]:
df_netflix['director'].replace(np.nan, 'No Data',inplace  = True)

# Drops

df_netflix.dropna(inplace=True)

# Drop Duplicates

df_netflix.drop_duplicates(inplace= True)

df_movies_netflix = df_netflix.loc[df_netflix['type'] == 'Movie']

# verify dataframe
print(df_movies_netflix.isnull().sum())
print(df_movies_netflix.info())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5277 entries, 7 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       5277 non-null   object
 1   type          5277 non-null   object
 2   title         5277 non-null   object
 3   director      5277 non-null   object
 4   cast          5277 non-null   object
 5   country       5277 non-null   object
 6   date_added    5277 non-null   object
 7   release_year  5277 non-null   int64 
 8   rating        5277 non-null   object
 9   duration      5277 non-null   object
 10  listed_in     5277 non-null   object
 11  description   5277 non-null   object
dtypes: int64(1), object(11)
memory usage: 535.9+ KB
None


# Feature Selection for Clustering

- Lets start with just the description.

- Preprocess and tokenize the description


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans

In [53]:
text_content = df_movies_netflix['description']
vector = TfidfVectorizer(max_df = 0.4,           # drop words that occur more than max_df %
                         stop_words = 'english', # remove stop words
                         lowercase = True,       # everything to lowercase
                         use_idf = True,
                         norm = u'l2',
                         smooth_idf = True       # prevent divide by zero errors
                         )

tfidf = vector.fit_transform(text_content)

In [54]:
k = 5
kmeans = MiniBatchKMeans(n_clusters = k)
kmeans.fit(tfidf)
centres = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vector.get_feature_names()

request_transform = vector.transform(df_movies_netflix['description'])
df_movies_netflix['cluster'] = kmeans.predict(request_transform)
df_movies_netflix['cluster'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies_netflix['cluster'] = kmeans.predict(request_transform)


3    4546
4     481
2     224
1      25
0       1
Name: cluster, dtype: int64

## Classification

### Preprocessing

In [55]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer
import tensorflow as tf

In [56]:
categorized_clustered_labels = to_categorical(df_movies_netflix['cluster'], num_classes=5)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(df_movies_netflix['description'],
                                                    categorized_clustered_labels, 
                                                    test_size = 0.10, 
                                                    random_state = 0)

In [58]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(sentence):
    tokens = tokenizer.encode_plus(sentence, 
                                 max_length=512,
                                 truncation=True, 
                                 padding='max_length',
                                 add_special_tokens=True, 
                                 return_token_type_ids=False,
                                 return_tensors='tf')  
    return tokens['input_ids'], tokens['attention_mask']


X_train_ids = np.zeros((len(X_train), 512))
X_train_mask = np.zeros((len(X_train), 512))
X_test_ids = np.zeros((len(X_test), 512))
X_test_mask = np.zeros((len(X_test), 512))

print("="*50)
print("shape of X_train_ids:",X_train_ids.shape)
print("-"*50)
print("shape of X_train_mask:",X_train_mask.shape)
print("="*50)
print("="*50)
print("shape of X_test_ids:",X_test_ids.shape)
print("-"*50)
print("shape of X_test_mask:",X_test_mask.shape)
print("="*50)

for i, sequence in enumerate(X_train):
    tokens = tokenize(sequence)
    X_train_ids[i, :], X_train_mask[i, :] = tokens[0], tokens[1]
    
for i, sequence in enumerate(X_test):
    tokens = tokenize(sequence)
    X_test_ids[i, :], X_test_mask[i, :] = tokens[0], tokens[1]

shape of X_train_ids: (4749, 512)
--------------------------------------------------
shape of X_train_mask: (4749, 512)
shape of X_test_ids: (528, 512)
--------------------------------------------------
shape of X_test_mask: (528, 512)


In [59]:
# Converting the tokenized ids and Mask into tensorflow Tensors
X_train_ids = tf.convert_to_tensor(X_train_ids)
X_train_mask = tf.convert_to_tensor(X_train_mask)

X_test_ids = tf.convert_to_tensor(X_test_ids)
X_test_mask = tf.convert_to_tensor(X_test_mask)

In [60]:
data_train = tf.data.Dataset.from_tensor_slices((X_train_ids, 
                                                 X_train_mask, 
                                                 y_train))

data_test = tf.data.Dataset.from_tensor_slices((X_test_ids, 
                                                X_test_mask, 
                                                y_test))

In [61]:
SHUFFLE = 100000
BATCH_SIZE = 32

def map_func(input_ids, masks, labels):
    return {'input_ids': input_ids, 'attention_mask': masks}, labels
    
data_train = data_train.map(map_func)
data_test = data_test.map(map_func)

train = data_train.shuffle(SHUFFLE).batch(BATCH_SIZE) #, drop_remainder=True)
val = data_test.shuffle(SHUFFLE).batch(BATCH_SIZE)

In [62]:
from transformers import TFAutoModel
bert = TFAutoModel.from_pretrained('bert-base-uncased')
bert.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [63]:
input_ids = tf.keras.layers.Input(shape=(512,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(512,), name='attention_mask', dtype='int32')
embeddings = bert.bert(input_ids, attention_mask=mask)[0]  # we access the transformer model within our bert object using the bert attribute (eg bert.bert instead of bert)

x = tf.keras.layers.Dropout(0.4)(embeddings)
x = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(768))(x)
y = tf.keras.layers.Dense(5, activation='softmax', name='outputs')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)
model.layers[2].trainable = False

In [64]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [65]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[acc, f1_m,precision_m, recall_m])

from keras.callbacks import EarlyStopping,ModelCheckpoint
early_stopping = EarlyStopping(monitor='val_accuracy',
                               min_delta=0,
                               patience=3,
                               verbose=2, 
                               mode='max')
checkpoint = ModelCheckpoint("best_model.hdf5",
                             monitor='val_accuracy',
                             verbose=1,
                             save_best_only=True, 
                             mode='max', 
                             period=1)
callbacks_1 = [early_stopping,checkpoint]

history = model.fit(train,
                    validation_data=val,
                    epochs=20,
                    callbacks=callbacks_1)

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.83712, saving model to best_model.hdf5
Epoch 2/20
Epoch 2: val_accuracy improved from 0.83712 to 0.91856, saving model to best_model.hdf5
Epoch 3/20
Epoch 3: val_accuracy improved from 0.91856 to 0.97538, saving model to best_model.hdf5
Epoch 4/20
Epoch 4: val_accuracy improved from 0.97538 to 0.98295, saving model to best_model.hdf5
Epoch 5/20
Epoch 5: val_accuracy did not improve from 0.98295
Epoch 6/20
Epoch 6: val_accuracy improved from 0.98295 to 0.99242, saving model to best_model.hdf5
Epoch 7/20
Epoch 7: val_accuracy did not improve from 0.99242
Epoch 8/20
Epoch 8: val_accuracy did not improve from 0.99242
Epoch 9/20
Epoch 9: val_accuracy did not improve from 0.99242
Epoch 9: early stopping
