# Steps

1. General dataset exploration
2. Basic data visualisation
3. Tokenize text data
4. Build autoencoder and clustering layer
5. Visualize the clusters with Seaborn
6. Interactive scatterplot with Bokeh

# Load the Dataset and Validate 5 Samples

In [16]:
import pandas as pd
import numpy as np
from IPython.display import display, HTML


In [10]:
# Location of the Netflix titles CSV, relative to the notebook.
INPUT_DIR_NETFLIX = "data/netflix_titles.csv"

# Read the catalogue into a DataFrame and show five random rows as a sanity check.
df_netflix = pd.read_csv(filepath_or_buffer=INPUT_DIR_NETFLIX)
df_netflix.sample(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
167,s168,Movie,Open Season 2,"Matthew O'Callaghan, Todd Wilderman","Joel McHale, Mike Epps, Jane Krakowski, Billy ...","United States, Canada","September 1, 2021",2008,PG,76 min,"Children & Family Movies, Comedies",Elliot the buck and his forest-dwelling cohort...
5322,s5323,Movie,Undefeated,"Daniel Lindsay, T.J. Martin","Montrail 'Money' Brown, O.C. Brown, Bill Court...",United States,"August 18, 2017",2011,PG-13,113 min,"Documentaries, Sports Movies",An inspirational profile of an inner-city high...
2800,s2801,Movie,Masameer - The Movie,Malik Nejer,"Malik Nejer, Shahad Alahmari, Abdulaziz Almuza...",Saudi Arabia,"March 19, 2020",2020,TV-14,100 min,"Comedies, International Movies",A young girl passionate about AI sets out to m...
7947,s7948,Movie,Savage Raghda,Mahmoud Karim,"Ramez Galal, Riham Hagag, Bayyumi Fuad, Entess...",Egypt,"June 13, 2019",2018,TV-MA,93 min,"Comedies, International Movies, Romantic Movies","Desperate to support his son, a single father ..."
654,s655,Movie,#Selfie,Cristina Jacob,"Flavia Hojda, Crina Semciuc, Olimpia Melinte, ...",Romania,"June 21, 2021",2014,TV-MA,125 min,"Comedies, Dramas, International Movies","Two days before their final exams, three teen ..."


# Data Cleansing and Fetching Movie Data

1. Replace missing director data with 'No Data'
2. Drop records that still contain missing values
3. Remove duplicates

In [11]:
# Fill missing directors first so those rows survive the dropna below, then
# drop rows with any remaining missing value and exact duplicate rows.
# The result is assigned back instead of calling an inplace method on a column
# selection: chained inplace calls are deprecated in pandas and do not modify
# the parent frame once copy-on-write is enabled.
df_netflix = (
    df_netflix
    .fillna({'director': 'No Data'})
    .dropna()
    .drop_duplicates()
)

# Keep movies only; .copy() makes this an independent frame so later column
# assignments on it cannot trigger SettingWithCopyWarning.
df_movies_netflix = df_netflix.loc[df_netflix['type'] == 'Movie'].copy()

# verify dataframe
print(df_movies_netflix.isnull().sum())
# info() prints its report itself and returns None, so it is not wrapped in print().
df_movies_netflix.info()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Int64Index: 5277 entries, 7 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       5277 non-null   object
 1   type          5277 non-null   object
 2   title         5277 non-null   object
 3   director      5277 non-null   object
 4   cast          5277 non-null   object
 5   country       5277 non-null   object
 6   date_added    5277 non-null   object
 7   release_year  5277 non-null   int64 
 8   rating        5277 non-null   object
 9   duration      5277 non-null   object
 10  listed_in     5277 non-null   object
 11  description   5277 non-null   object
dtypes: int64(1), object(11)
memory usage: 535.9+ KB
None


# Feature Selection for Clustering

- Let's start with just the description.

- Preprocess and tokenize the description


In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler

In [24]:
# Text to cluster on: the description of every movie.
df_token = df_movies_netflix["description"]

# Upper bound on the number of words the tokenizer keeps when encoding.
max_words = 10000

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_token)

# Encode every description as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(df_token)

# word_index maps each distinct word seen while fitting to its integer id.
word_index = tokenizer.word_index
n_unique_tokens = len(word_index)
print(f"{n_unique_tokens} unique tokens found - vocabulary size")

# Pad the variable-length id lists into one 2D integer array (one row per movie).
data = pad_sequences(sequences)
print(data)


15154 unique tokens found - vocabulary size
[[   0    0    0 ...    8 1900   94]
 [   0    0    0 ...    1  111 1902]
 [   0    0    0 ...   25  593   52]
 ...
 [   0    0    0 ...    5 6768  509]
 [   0    0    0 ...    1 3125 1588]
 [   0    0    0 ...   51    7   94]]


## Rescaling

- Rescale the data to the 0-1 range so that convergence happens faster

In [85]:
# Learn each column's min/max from the padded sequences, then map every
# feature into the range [0, 1].
scaler = MinMaxScaler()
scaler.fit(data)
x = scaler.transform(data)

- Autoencoder for encoding word vectors + Clustering layer for generating labels [Also known as DEC - Deep Embedded Clustering]

In [86]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

In [109]:
def autoencoder(dims, act='relu', init='glorot_uniform'):
    """
    Build a fully connected, symmetric auto-encoder and its encoder half.

    dims: list of layer sizes of the encoder, e.g. [500, 500, 2000, 10].
          dims[0] is the input dimension and dims[-1] is the size of the
          latent hidden layer. The decoder mirrors the encoder back to
          dims[0]. Must contain at least two entries.

    act: activation used for the internal encoder and decoder layers. The
         latent layer and the final reconstruction layer are created without
         an activation argument.

    init: kernel initializer passed to every Dense layer.

    return:
        (autoencoder_model, encoder_model): Model of autoencoder and model of
        encoder. Both models share the same input and encoder layers.

    raises:
        ValueError: if dims has fewer than two entries.
    """
    # With fewer than two sizes there is no latent layer distinct from the
    # input; fail early instead of building a degenerate model.
    if len(dims) < 2:
        raise ValueError(
            "dims must contain at least an input size and a latent size, "
            f"got {len(dims)} entries"
        )

    n_stacks = len(dims) - 1

    input_data = Input(shape=(dims[0],), name='input')
    x = input_data

    # internal layers of encoder
    for i in range(n_stacks - 1):
        x = Dense(dims[i + 1], activation=act, kernel_initializer=init, name=f'encoder_{i}')(x)

    # latent hidden layer (no activation argument)
    encoded = Dense(dims[-1], kernel_initializer=init, name=f'encoder_{n_stacks - 1}')(x)

    x = encoded
    # internal layers of decoder, mirroring the encoder sizes in reverse
    for i in range(n_stacks - 1, 0, -1):
        x = Dense(dims[i], activation=act, kernel_initializer=init, name=f'decoder_{i}')(x)

    # decoder output: reconstruction with the same width as the input
    decoded = Dense(dims[0], kernel_initializer=init, name='decoder_0')(x)

    autoencoder_model = Model(inputs=input_data, outputs=decoded, name='autoencoder')
    encoder_model = Model(inputs=input_data, outputs=encoded, name='encoder')

    return autoencoder_model, encoder_model

Defining hyper-parameters

In [110]:
# max numbers of clusters
n_clusters = 20
# epchos for autencoder training 
n_epochs   = 10 
# batch size
batch_size = 32

In [111]:
from tensorflow.keras.initializers import VarianceScaling

# Encoder layer sizes: input width taken from the scaled data, latent size 10.
dims = [x.shape[-1], 500, 500, 1000, 10] 
# this helps with better initialization of the weights
init = VarianceScaling(scale=1. / 3., mode='fan_in',
                           distribution='uniform')
# Optimizer used for autoencoder pre-training, passed to compile() by name.
# NOTE(review): an earlier comment here mentioned SGD(lr=1, momentum=0.9);
# the value actually used is "rmsprop".
pretrain_optimizer = "rmsprop" 
# Pre-training runs for the number of epochs set in the hyper-parameter cell.
pretrain_epochs = n_epochs

Define a clustering layer that can be placed inside the pipeline to generate soft labels in parallel

In [112]:
from tensorflow.keras.layers import Layer, InputSpec
import tensorflow.keras.backend as K_end

class ClusteringLayer(Layer):
    '''
    Clustering layer converts input sample (feature) to soft label, i.e. a vector that represents the probability of the
    sample belonging to each cluster. The probability is calculated with student's t-distribution.

    The layer owns a single trainable weight, `clusters`, of shape
    (n_clusters, n_features), holding one centroid per cluster.
    '''

    def __init__(self, n_clusters, weights=None, alpha=1.0, **kwargs):
        '''
        n_clusters: number of clusters, i.e. the width of the layer's output.
        weights: optional list of initial weight arrays; applied with
                 set_weights() when the layer is built.
        alpha: divisor of the squared distance and part of the exponent in
               the soft assignment computed by call().
        kwargs: forwarded to the base Layer; `input_dim` is translated to
                `input_shape` when the latter is absent.
        '''
        if 'input_shape' not in kwargs and 'input_dim' in kwargs:
            kwargs['input_shape'] = (kwargs.pop('input_dim'),)
        super(ClusteringLayer, self).__init__(**kwargs)
        self.n_clusters = n_clusters
        self.alpha = alpha
        self.initial_weights = weights
        self.input_spec = InputSpec(ndim=2)

    def build(self, input_shape):
        '''Create the centroid weight matrix for 2D inputs of shape (batch, n_features).'''
        assert len(input_shape) == 2
        input_dim = input_shape[1]
        self.input_spec = InputSpec(dtype=K_end.floatx(), shape=(None, input_dim))
        self.clusters = self.add_weight(name='clusters', shape=(self.n_clusters, input_dim), initializer='glorot_uniform')

        if self.initial_weights is not None:
            self.set_weights(self.initial_weights)
            # The initial arrays are only needed once; drop the reference.
            del self.initial_weights
        self.built = True

    def call(self, inputs, **kwargs):
        '''
        student t-distribution, as used in t-SNE algorithm.
        It measures the similarity between embedded point z_i and centroid µ_j.
                 q_ij = (1 + dist(z_i, µ_j)^2 / alpha) ^ (-(alpha + 1) / 2), then normalize it over j.
                 q_ij can be interpreted as the probability of assigning sample i to cluster j.
                 (i.e., a soft assignment)

        inputs: the variable containing data, shape=(n_samples, n_features)

        Return: student's t-distribution, or soft labels for each sample. shape=(n_samples, n_clusters)
        '''
        # Broadcast (n_samples, 1, n_features) against (n_clusters, n_features)
        # to get the squared distance of every sample to every centroid.
        squared_dist = K_end.sum(K_end.square(K_end.expand_dims(inputs, axis=1) - self.clusters), axis=2)
        q = 1.0 / (1.0 + squared_dist / self.alpha)
        q **= (self.alpha + 1.0) / 2.0
        # Make sure all of the values of each sample sum up to 1.
        q = q / K_end.sum(q, axis=1, keepdims=True)

        return q

    def compute_output_shape(self, input_shape):
        '''Output keeps the batch dimension and has one column per cluster.'''
        assert input_shape and len(input_shape) == 2
        return input_shape[0], self.n_clusters

    def get_config(self):
        '''
        Return the constructor arguments needed to rebuild the layer.
        `alpha` is included so that a layer recreated from its config computes
        the same soft assignments instead of falling back to the default.
        '''
        config = {'n_clusters': self.n_clusters, 'alpha': self.alpha}
        base_config = super(ClusteringLayer, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

Initialize and generate autoencoder and encoder models

In [113]:
# Build the full autoencoder and the encoder-only model (they share layers).
# NOTE: this rebinds the name `autoencoder` from the factory function to the
# returned Keras model, so re-running this cell requires re-running the cell
# that defines the function first.
autoencoder, encoder = autoencoder(dims, init=init)

Start encoding

In [114]:
# The decoder's output layer is linear (it is created without an activation),
# so its reconstructions are not confined to (0, 1). Binary cross-entropy
# expects probabilities and is not a meaningful loss for unbounded outputs;
# mean squared error is used for the reconstruction objective instead.
autoencoder.compile(optimizer=pretrain_optimizer, loss='mse')
print(len(x))
# Train the autoencoder to reproduce its own input.
autoencoder.fit(x, x, batch_size=batch_size, epochs=pretrain_epochs)

5277
Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x294dd9bb0>

In [116]:
from tensorflow.keras.optimizers import SGD

In [117]:
# Attach the soft-assignment head to the encoder's latent output and wrap both
# in a single model that maps an input sample to its cluster probabilities.
clustering_layer = ClusteringLayer(n_clusters, name='clustering')(encoder.output)
model = Model(inputs=encoder.input, outputs=clustering_layer)

# SGD with learning rate 0.01 and momentum 0.9, trained on KL divergence.
clustering_optimizer = SGD(learning_rate=0.01, momentum=0.9)
model.compile(optimizer=clustering_optimizer, loss='kld')

In [118]:
from sklearn.cluster import KMeans

In [119]:
# Run k-means on the encoder's latent representation of every sample; the
# resulting labels are the initial hard cluster assignments.
latent_features = encoder.predict(x)
kmeans = KMeans(n_clusters=n_clusters, n_init=20)
y_pred = kmeans.fit_predict(latent_features)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


In [120]:
# Seed the clustering layer's centroid weights with the k-means cluster centres.
model.get_layer(name='clustering').set_weights([kmeans.cluster_centers_])


In [121]:
# computing an auxiliary target distribution
def target_distribution(q):
    """
    Compute the auxiliary target distribution p from soft assignments q.

    q: 2D array of soft assignments, one row per sample and one column per
       cluster.

    Each entry is squared and divided by its column total (the soft cluster
    frequency); every row is then rescaled so that it sums to 1.

    return: array with the same shape as q.
    """
    soft_cluster_frequency = q.sum(axis=0)
    weight = q ** 2 / soft_cluster_frequency
    row_totals = weight.sum(axis=1, keepdims=True)
    return weight / row_totals

In [122]:
# State and settings for the clustering training loop in the next cell.
loss = 0  # most recent value returned by train_on_batch
index = 0  # number of the current mini-batch within index_array
maxiter = 1000 # total number of training steps (8000 noted as an alternative)
update_interval = 100 # steps between refreshes of the target distribution (140 noted as an alternative)
index_array = np.arange(x.shape[0])  # row indices of x, sliced to form mini-batches
# tolerance threshold to stop training
# NOTE(review): tol is defined here but not referenced by the training loop in the next cell.
tol = 0.001


In [123]:
n_samples = x.shape[0]

for ite in range(int(maxiter)):
    # Periodically recompute the soft assignments on the full data set and
    # derive a fresh auxiliary target distribution from them.
    if ite % update_interval == 0:
        q = model.predict(x, verbose=0)
        p = target_distribution(q)  # update the auxiliary target distribution p

    # Rows of the current mini-batch; the last batch of a pass may be shorter.
    batch_start = index * batch_size
    idx = index_array[batch_start: min(batch_start + batch_size, n_samples)]
    loss = model.train_on_batch(x=x[idx], y=p[idx])

    # Advance only while another non-empty batch remains, otherwise wrap to the
    # start. A strict '<' is required: with '<=' an exact multiple of
    # batch_size would schedule one extra, empty batch at the end of each pass.
    index = index + 1 if (index + 1) * batch_size < n_samples else 0

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: unsupported operand type(s) for -: 'NoneType' and 'int'


In [None]:
# Eval.
q = model.predict(x, verbose=0)
p = target_distribution(q)  # update the auxiliary target distribution p

# evaluate the clustering performance
y_pred = q.argmax(1)

In [None]:
# Independent copy of df_netflix (the frame the movie subset was selected from).
# NOTE(review): data_all is not used in the cells visible here.
data_all = df_netflix.copy()

In [None]:
from sklearn.manifold import TSNE

# Project the scaled input features down to two dimensions for plotting.
tsne = TSNE(n_components=2)
x_embedded = tsne.fit_transform(x)

# Show the shape of the 2D embedding as the cell output.
x_embedded.shape

In [None]:
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

# sns settings
sns.set(rc={'figure.figsize': (15, 15)})

# colors: one per distinct predicted cluster label
palette = sns.color_palette("bright", len(set(y_pred)))

# plot
# x and y are passed by keyword: recent seaborn releases no longer accept them
# as positional arguments to scatterplot.
sns.scatterplot(x=x_embedded[:, 0], y=x_embedded[:, 1], hue=y_pred, legend='full', palette=palette)
# The title describes what is actually plotted: movies only, clustered on
# tokenized descriptions (no tf-idf features are computed in this notebook).
plt.title("Netflix Movies, Clustered (Autoencoder and custom Keras Layer), Tokenized Descriptions")
plt.show()