## Library

Reference 1: https://keras.io/examples/nlp/text_classification_with_transformer/

Reference 2: http://peterbloem.nl/blog/transformers





In [1]:
import numpy as np
import pandas as pd

In [2]:
pip install -U tensorflow-addons



In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split

## Transformer Block

In [4]:
# source: https://keras.io/examples/nlp/text_classification_with_transformer/
class TransformerBlock(layers.Layer):
    """Single transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added back to the
    sub-layer's input (residual connection) and is then layer-normalised.

    Args:
        embed_dim: Size of the last axis of the inputs; also used as the
            per-head `key_dim` of the attention layer and as the output width
            of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after attention and after the
            feed-forward network.
        **kwargs: Standard `keras.layers.Layer` keyword arguments
            (e.g. `name`, `dtype`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads,
                                             key_dim=embed_dim)
        self.ffn = keras.Sequential([
            layers.Dense(ff_dim, activation="relu"),
            layers.Dense(embed_dim),
        ])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs` and return a tensor of the same shape.

        `training` defaults to None so the layer can also be called directly
        (`block(x)`); Keras then decides whether dropout is active.
        """
        attn_output = self.att(inputs, inputs)  # self-attention: query = value = inputs
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + layer norm
        ffn_output = self.ffn(out1)  # position-wise feed-forward
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + layer norm

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

## Embedding and Position



In [5]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of rows in the position-embedding table, i.e. the
            number of distinct positions that can be embedded.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Width of both embedding vectors.
        **kwargs: Standard `keras.layers.Layer` keyword arguments
            (e.g. `name`, `dtype`).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config() can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(input_dim=vocab_size,
                                          output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the integer token ids in `x` and add position embeddings.

        Positions 0..L-1 are generated from the size of the last axis of `x`
        and their embeddings are broadcast-added to the token embeddings.
        """
        # Runtime length of the last axis; named differently from the
        # constructor's `maxlen` to avoid confusing the two.
        # NOTE(review): presumably seq_len must not exceed `maxlen`, otherwise
        # positions fall outside the position table - confirm with callers.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(positions)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and re-created."""
        config = super().get_config()
        config.update({
            "maxlen": self.maxlen,
            "vocab_size": self.vocab_size,
            "embed_dim": self.embed_dim,
        })
        return config

## Data

### Preprocessing Steps

In [6]:
# Read the tab-separated train / dev / test splits, in that order, into DataFrames.
train, valid, test = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('train.tsv', 'dev.tsv', 'test.tsv')
)

In [7]:
# vocab_size is handed to the tokenizer as `num_words`; MAX_LEN is the fixed
# length every sequence is padded / truncated to.
vocab_size, MAX_LEN = 20000, 256

# Pull the raw training comments and their labels out of the DataFrame as NumPy arrays.
txt = np.array(train["comment_text"].tolist())
labels = np.asarray(list(train["label"]))

In [8]:
# Case-sensitive tokenizer (lower=False), fitted on the training text only,
# with its vocabulary limited through num_words=vocab_size.
tokenizer = keras.preprocessing.text.Tokenizer(lower=False,
                                               num_words=vocab_size)
tokenizer.fit_on_texts(txt)

# Map each training comment to a list of integer ids, then pad / cut every
# list at its end so all rows have exactly MAX_LEN entries.
texts_to_seq = tokenizer.texts_to_sequences(txt)
texts_to_seq_pad = keras.preprocessing.sequence.pad_sequences(
    texts_to_seq,
    padding='post',
    truncating='post',
    maxlen=MAX_LEN,
)

In [9]:
# Training inputs are the padded id sequences; targets are the label array.
x_train, y_train = texts_to_seq_pad, labels

In [10]:
# Encode the validation comments with the tokenizer fitted on the training
# text, then pad / truncate them the same way as the training sequences.
valid_seqs = tokenizer.texts_to_sequences(np.array(list(valid["comment_text"])))
x_valid = keras.preprocessing.sequence.pad_sequences(
    valid_seqs,
    padding='post',
    truncating='post',
    maxlen=MAX_LEN,
)

y_valid = np.array(list(valid["label"]))


## Define Classifier Architecture


In [11]:
# Model hyper-parameters.
embed_ = 32        # width of each token / position embedding vector
head_nums = 2      # attention heads inside the transformer block
feed_forward = 32  # hidden width of the block's feed-forward net; also reused
                   # below as the width of the classifier's hidden Dense layer

## Using Sequential API
# The layers are listed up front instead of being appended one by one with .add().
# NOTE(review): the hidden Dense layer uses a sigmoid activation - confirm this
# is intended rather than relu.
model = keras.Sequential([
    layers.Input(shape=(MAX_LEN, )),
    TokenAndPositionEmbedding(MAX_LEN, vocab_size, embed_),
    TransformerBlock(embed_, head_nums, feed_forward),
    layers.GlobalAveragePooling1D(),  # average over the sequence axis
    layers.Dropout(0.1),
    layers.Dense(feed_forward, activation='sigmoid'),
    layers.Dropout(0.1),
    layers.Dense(1, activation='sigmoid'),  # single output unit in (0, 1)
])

In [12]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 token_and_position_embeddin  (None, 256, 32)          648192    
 g (TokenAndPositionEmbeddin                                     
 g)                                                              
                                                                 
 transformer_block (Transfor  (None, 256, 32)          10656     
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 32)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 32)                1

## Training

In [13]:
# NOTE(review): `tqdm` is not referenced directly in the visible cells -
# presumably imported because the TensorFlow Addons progress bar needs it; verify.
import tqdm
import tensorflow_addons as tfa
# Progress-bar callback from TensorFlow Addons; it is passed to model.fit via `callbacks`.
tqdm_callback = tfa.callbacks.TQDMProgressBar()

In [14]:
# Binary cross-entropy on the single sigmoid output, optimised with Adam,
# tracking accuracy.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# verbose=0 turns off Keras' built-in logging; progress is reported by
# tqdm_callback instead. The returned History object is kept in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=128,
    verbose=0,
    callbacks=[tqdm_callback],
    validation_data=(x_valid, y_valid),
)

Training:   0%|           0/10 ETA: ?s,  ?epochs/s

Epoch 1/10


0/501           ETA: ?s - 

Epoch 2/10


0/501           ETA: ?s - 

Epoch 3/10


0/501           ETA: ?s - 

Epoch 4/10


0/501           ETA: ?s - 

Epoch 5/10


0/501           ETA: ?s - 

Epoch 6/10


0/501           ETA: ?s - 

Epoch 7/10


0/501           ETA: ?s - 

Epoch 8/10


0/501           ETA: ?s - 

Epoch 9/10


0/501           ETA: ?s - 

Epoch 10/10


0/501           ETA: ?s - 

In [15]:
# Persist the trained model to disk.
# NOTE(review): '/content' is a hard-coded absolute path (looks like the Colab
# working directory) - confirm, and consider a dedicated, configurable sub-directory.
model.save('/content')



INFO:tensorflow:Assets written to: /content/assets


INFO:tensorflow:Assets written to: /content/assets


## Make Predictions

In [16]:
# Encode the test comments with the tokenizer fitted on the training text,
# then pad / truncate them the same way as the training sequences.
test_seqs = tokenizer.texts_to_sequences(np.array(list(test["comment_text"])))
x_test = keras.preprocessing.sequence.pad_sequences(
    test_seqs,
    padding='post',
    truncating='post',
    maxlen=MAX_LEN,
)

y_test = np.array(list(test["label"]))


In [17]:
# Run the trained model on the padded test sequences; the model's last layer
# is a single sigmoid unit, so each prediction is one score per test example.
preds = model.predict(x_test)

In [18]:
# Expand each sigmoid score p into a two-column row [1 - p, p]
# (column 0: score for label 0, column 1: score for label 1).
# This is vectorised instead of calling float() on every row of `preds`:
# converting a length-1 array with float() is deprecated since NumPy 1.25.
# The final cast keeps the result float64, as the list-of-floats version did.
scores = np.asarray(preds).reshape(-1, 1)
preds_final = np.hstack([1 - scores, scores]).astype(float)

In [19]:
# Hard label per row: 0 only when the label-0 score is strictly larger than
# the label-1 score, otherwise 1 (ties therefore go to label 1).
final_preds_converted = [
    0 if score_neg > score_pos else 1
    for score_neg, score_pos in preds_final
]

In [20]:
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Compute the three views of test performance first, then print them
# separated by blank lines: overall accuracy, the per-class
# precision / recall / F1 table, and the confusion matrix.
test_accuracy = sklearn.metrics.accuracy_score(y_test, final_preds_converted)
test_report = classification_report(y_test, final_preds_converted)
test_confusion = confusion_matrix(y_test, final_preds_converted)

print("Prediction Accuracy is", test_accuracy)
print("\n\n")
print(test_report)
print("\n\n")
print(test_confusion)

Prediction Accuracy is 0.7844036697247706



              precision    recall  f1-score   support

           0       0.78      0.78      0.78       428
           1       0.79      0.79      0.79       444

    accuracy                           0.78       872
   macro avg       0.78      0.78      0.78       872
weighted avg       0.78      0.78      0.78       872




[[335  93]
 [ 95 349]]
