# CNN-Kim Model
I can't get it exactly the same now because I'm using custom-trained word vectors instead of pre-trained ones, but I can still try to build a similar architecture.

Since there are different variations of his model, I'll start with a model with the following architecture:
- Using custom-trained word vectors (dimension = 200)
- Keeping those word vectors static during the trainin process (maybe - TBD while implementing)
- Single-channel
- Filter windows height of 3, 4, 5 with 100 feature maps each
- MaxPooling to extract the features from each feature map
- Dropout rate of 0.5
- ReLu activation function 

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import ast
import tensorflow as tf
# from tensorflow_addons.metrics.hamming import hamming_loss_fn
# from tensorflow_addons.metrics import HammingLoss
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Conv1D, GlobalMaxPooling1D, concatenate, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.metrics import Precision, Recall
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Check that a GPU is available
tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Load in the data

In [3]:
# Load in the data
# data_list = [a, b, c, d, e, f, g, h, i ,f] = [None, None, None, None, None, None, None, None, None, None]
# data_list = [a, b, c] = [None, None, None]
data_list = [a] = [None]
data_location = '../Datasets/AmazonCat-13K/processed/' 
for i in range(len(data_list)):
    data_list[i] = pd.read_csv(data_location + f'tokenized_no{i + 1}.csv', encoding='latin1')[0:1000]
    
# Concatenate all the data and reset the index
data = pd.concat(data_list, sort=False)
data = data.reset_index()

# Delete unused var (so save memory)
del data_list

In [4]:
# Convery tringged arrays to arrays
data['tokenized_title_and_description'] = data['tokenized_title_and_description'].apply(lambda stringged_array: ast.literal_eval(stringged_array))
data['labels'] = data['labels'].apply(lambda stringged_array: ast.literal_eval(stringged_array))

In [5]:
# Check the first 3 rows
data.head(n=3)

Unnamed: 0,index,item_id,tokenized_title_and_description,labels
0,0,ID:B0027DQHA0,"[29260, 21551, 12365, 3328, 4450, 19, 237, 211...","[Music, TV, Movies & TV, Classical]"
1,1,ID:0756400120,"[381, 15160, 38609, 41, 5949, 10, 477, 1179, 3...","[Books, General, Science Fiction, United State..."
2,2,ID:B00024YAOQ,"[646, 150, 56, 73, 5, 99, 1, 883, 3, 4, 3470, ...","[Motivation & Self-Improvement, Business & Inv..."


In [6]:
# Check the shape
data.shape

(448308, 4)

In [7]:
# Load in the embedding matrix
embedding_matrix = np.loadtxt(data_location + 'embedding_matrix.csv', delimiter=',')

In [8]:
# Check the embedding matrix size
embedding_matrix.shape

(200000, 200)

### Prepare the X data

In [9]:
sequences = list(data['tokenized_title_and_description'])

In [10]:
# Add padding to the sequences
MAX_SEQUENCE_LENGTH = 500
padded_sequences = pad_sequences(sequences,
                                 maxlen=MAX_SEQUENCE_LENGTH,
                                 padding='post') # Add padding to the end if needs padding

### Prepare the y data

In [11]:
# Get the labels
all_labels = list(data['labels'])

In [12]:
# Get count of unique labels
unique_labels = []
for labels in all_labels:
    for label in labels:
        unique_labels.append(label)
count = len(set(unique_labels))

# Delete unused var (to save memory)
del unique_labels

In [13]:
# Convert the tages into a binary vectors
mlb = MultiLabelBinarizer(sparse_output=True)
placeholder1 = mlb.fit_transform(all_labels)
placeholder2 = placeholder1.astype('int8')
del placeholder1
place_holder3 = np.zeros((data.shape[0], count), dtype='int8')
binary_vectors = placeholder2.todense(out=place_holder3)
del placeholder2

In [14]:
# Delete unused vars (to save memory)
del labels
del data

### Prepare the data for training and testing

In [15]:
# Get the data
X = padded_sequences
y = binary_vectors

In [16]:
# Create the train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

In [17]:
# Check X train and X test
print(f'X train shape: {X_train.shape}')
print(f'X test shape: {X_test.shape}')

X train shape: (336231, 500)
X test shape: (112077, 500)


In [18]:
# Check y train and y test
print(f'y train shape: {y_train.shape}')
print(f'y test shape: {y_test.shape}')

y train shape: (336231, 12836)
y test shape: (112077, 12836)


### Create the embedding layer

In [19]:
# Create the embedding layer definition
class PretrainedEmbedding(tf.keras.layers.Layer):
    """Non-trainable embedding layer."""

    def __init__(self, embeddings, dropout_rate=0.2, **kwargs):
        """"Instantiate the layer using a pre-defined embedding matrix."""
        super().__init__(**kwargs)
        self.embeddings = tf.constant(embeddings)
        # if you want to add some dropout (or normalization, etc.)
        self.dropout = tf.keras.layers.Dropout(rate=dropout_rate)

    def call(self, inputs, training=None):
        """Embed some input tokens and optionally apply dropout."""
        output = tf.nn.embedding_lookup(self.embeddings, inputs)
        return self.dropout(output, training=training)

### Define the model layers

In [20]:
# Define the Input and Embedding layers
i = Input(shape=(MAX_SEQUENCE_LENGTH), dtype=tf.int32)
x = PretrainedEmbedding(embedding_matrix, dropout_rate=0)(i)

# Convolution with window size = 3  
x3 = Conv1D(filters=100, 
            kernel_size=3, 
            strides=1,
            padding='valid',
            activation='relu',
            use_bias=True
           )(x)
x3 = GlobalMaxPooling1D()(x3)

# Convolution with window size = 4
x4 = Conv1D(filters=100, 
            kernel_size=4, 
            strides=1,
            padding='valid',
            activation='relu',
            use_bias=True
           )(x)
x4 = GlobalMaxPooling1D()(x4)

# Convolution with window size = 5
x5 = Conv1D(filters=100, 
            kernel_size=5, 
            strides=1,
            padding='valid',
            activation='relu',
            use_bias=True
           )(x)
x5 = GlobalMaxPooling1D()(x5)

# Concatenated max-pooling layers and final Dense layer
concatenated = concatenate([x3, x4, x5])
x = Dropout(rate=0.5)(concatenated)
x = Dense(y_train.shape[1], activation='softmax')(x)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [21]:
# Build the model
model = Model(i, x)

In [22]:
# Have a look at the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 500)]        0                                            
__________________________________________________________________________________________________
pretrained_embedding (Pretraine (None, 500, 200)     0           input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 498, 100)     60100       pretrained_embedding[0][0]       
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 497, 100)     80100       pretrained_embedding[0][0]       
______________________________________________________________________________________________

### Define how the model should be trained

In [23]:
# Define how the model should be trained
model.compile(optimizer=tf.keras.optimizers.Adadelta(learning_rate=0.001), # CNN-Kim uses SGD with Adelta update ruel
              loss='binary_crossentropy',
              metrics=[Precision(), Recall()])

### Train the model

In [24]:
# Train the model
result = model.fit(X_train, y_train,
                   epochs=20,
                   validation_data=(X_test, y_test),
                   batch_size=128);

Train on 336231 samples, validate on 112077 samples
Epoch 1/20
   128/336231 [..............................] - ETA: 1:55:02

UnknownError: 2 root error(s) found.
  (0) Unknown:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node model/conv1d_2/conv1d (defined at /home/maxitron/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1751) ]]
	 [[metrics/recall/assert_greater_equal/Assert/AssertGuard/pivot_f/_23/_67]]
  (1) Unknown:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node model/conv1d_2/conv1d (defined at /home/maxitron/anaconda3/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py:1751) ]]
0 successful operations.
0 derived errors ignored. [Op:__inference_distributed_function_1885]

Function call stack:
distributed_function -> distributed_function


In [None]:
# Plot the precision metric at each itteration
plt.plot(result.history['precision'], label='train_precision');
plt.plot(result.history['val_precision'], label='test_precision');
plt.legend();

In [None]:
# Plot the recall metric at each itteration
plt.plot(result.history['recall'], label='train_recall');
plt.plot(result.history['val_recall'], label='test_recall');
plt.legend();