### Building the model

For this project I'll be comparing the performance of two models: a Transformer Encoder built from scratch  
and a pretrained BERT model (DistilBERT)

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

We'll start by loading in our data and preprocessing it for our model

In [2]:
# Read the cleaned training reviews from disk, then print a summary of the
# resulting frame (column dtypes and non-null counts).
train_csv_path = 'data/train_gr/train_clean.csv'
train_data = pd.read_csv(train_csv_path)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17316 entries, 0 to 17315
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_review      17310 non-null  object
 1   user_suggestion  17316 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 270.7+ KB


In [3]:
# A handful of reviews are missing. Replace them with an empty string before
# casting to str: calling astype(str) directly on a missing value would turn
# it into the literal text 'nan', which the tokenizers below would then treat
# as a real word.
train_data['user_review'] = train_data['user_review'].fillna('').astype(str)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17316 entries, 0 to 17315
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_review      17316 non-null  object
 1   user_suggestion  17316 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 270.7+ KB


In [4]:
# Learn a word-level vocabulary from the reviews, convert every review to a
# list of integer token ids, and pad/truncate each list to 512 ids.
# pad_sequences is called with its default padding and truncating modes.
reviews = train_data['user_review']

tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(reviews)

text_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(reviews),
    maxlen=512,
)

In [5]:
# text_sequences is a 2-D array: one row per review, one column per token
# position (after padding), so its shape gives both numbers at once.
num_seq, len_seq = text_sequences.shape

print(f'Max Review Length: {len_seq}')
print(f'Number of reviews: {num_seq}')

Max Review Length: 512
Number of reviews: 17316


Now that we've tokenized our text, let's make it so we can access our learned vocabulary

In [6]:
# Build word <-> index lookups from the fitted tokenizer.
# Work on a COPY of tokenizer.word_index: assigning the dict directly would
# alias it, and adding the 'PAD' entry would silently modify the tokenizer's
# own vocabulary as a side effect.
word2idx = dict(tokenizer.word_index)
# Index 0 is the value pad_sequences fills with, so reserve it for padding.
word2idx['PAD'] = 0
# Reverse mapping (index -> word), including the padding entry at index 0.
idx2word = {idx: word for word, idx in word2idx.items()}

In [7]:
# One-hot encode the integer recommendation label into two columns.
suggestions = train_data['user_suggestion']
labels = tf.keras.utils.to_categorical(suggestions,num_classes=2)

Our data is now ready to be put into a dataset for our Transformer Encoder

In [8]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Number of leading examples used for training (90%); the rest is validation.
# Computed once so that take() and skip() are guaranteed to agree.
train_size = int(num_seq*0.9)

dataset = tf.data.Dataset.from_tensor_slices((text_sequences,labels))

# Only the training split is shuffled; prefetch overlaps input preparation
# with model execution.
train_dataset = (dataset.take(train_size)
                 .shuffle(BUFFER_SIZE)
                 .batch(BATCH_SIZE)
                 .prefetch(tf.data.AUTOTUNE))

# Validation metrics do not depend on example order, so no shuffle here.
val_dataset = (dataset.skip(train_size)
               .batch(BATCH_SIZE)
               .prefetch(tf.data.AUTOTUNE))

Let's now move onto creating our model

We begin by defining the first portion of our model, the token embedding and positional embedding layer  
I'll be using positional embeddings over static positional encodings for this model

The token embedding layer maps each token id (an integer in the range [0, vocab_size)) to a vector of size hidden_size.  
The positional embedding layer does the same for each position index in the range [0, max_pos_embeddings), where we set  
max_pos_embeddings to the max size of a sequence (maxlen); it also outputs vectors of size hidden_size.

The input to the layer will have shape (batch_size, sequence_length) and the output will have shape (batch_size, sequence_length, hidden_size)

In [9]:
class PositionalEmbeddings(tf.keras.layers.Layer):
    """Token embedding plus learned positional embedding.

    Each token id is looked up in a token embedding table and each position
    index in a positional embedding table; the two are summed, layer
    normalized and passed through dropout.

    Args:
        vocab_size: number of rows in the token embedding table.
        hidden_size: size of every embedding vector.
        max_pos_emb: number of rows in the positional embedding table, i.e.
            the longest sequence the layer can embed.
        dropout_rate: dropout rate applied to the summed embeddings.
        **kwargs: forwarded to the base Layer constructor (e.g. ``name``).
    """

    def __init__(self,vocab_size,hidden_size,max_pos_emb,dropout_rate=0.3,**kwargs):
        super().__init__(**kwargs)

        self.token_emb = tf.keras.layers.Embedding(input_dim=vocab_size,
                                                  output_dim=hidden_size)
        self.pos_emb = tf.keras.layers.Embedding(input_dim=max_pos_emb,
                                                 output_dim=hidden_size)
        self.ln = tf.keras.layers.LayerNormalization()
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
    
    def call(self,x,training=None):
        """Embed a batch of token ids.

        Args:
            x: integer token ids whose last axis is the sequence axis.
            training: optional flag forwarded to dropout; when None the
                dropout layer decides for itself.

        Returns:
            The normalized sum of token and positional embeddings, with one
            hidden_size vector per input token.
        """
        # Prefer the static sequence length; it is None when the input was
        # declared with a variable length, so fall back to the runtime shape.
        seq_length = x.shape[-1]
        if seq_length is None:
            seq_length = tf.shape(x)[-1]
        pos_ids = tf.range(0,seq_length,delta=1)

        token_emb = self.token_emb(x)
        pos_emb = self.pos_emb(pos_ids)

        # pos_emb has no batch axis; the addition broadcasts it over the batch.
        emb = tf.add(token_emb,pos_emb)
        emb = self.ln(emb)
        emb = self.dropout(emb,training=training)
        return emb

After our positional embedding layer we have our encoder layer.  
Keras contains a MultiHeadAttention layer, so we'll only need to supply it with the necessary  
hyperparameters, which are the number of heads and the embedding dimension (hidden_size).  
After this we create a feed forward (dense) network with units = ffn_dim for the first dense layer  
and units = embed_dim for the second dense layer

Since skip connections are used, during the forward pass we'll add the output of the MultiHeadAttention layer  
to its input before normalization, and use another skip connection around the feed forward network

In [10]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention then a feed forward network.

    Both sub-layers are followed by dropout, a residual (skip) connection
    and layer normalization.

    Args:
        embed_dim: width of the input/output vectors; also passed to the
            attention layer as ``key_dim``.
        num_heads: number of attention heads.
        ffn_dim: units in the first (ReLU) dense layer of the feed forward
            network; the second dense layer projects back to embed_dim.
        dropout_rate: dropout rate used after each sub-layer.
    """

    def __init__(self,embed_dim, num_heads, ffn_dim, dropout_rate=0.5):
        super().__init__()

        # NOTE(review): Keras documents key_dim as the size of EACH head, so
        # every head here is embed_dim wide rather than embed_dim // num_heads
        # - confirm this is intended.
        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
        )

        expand = tf.keras.layers.Dense(units=ffn_dim,activation='relu')
        project = tf.keras.layers.Dense(units=embed_dim)
        self.ffn = tf.keras.Sequential([expand, project])

        self.ln1 = tf.keras.layers.LayerNormalization()
        self.ln2 = tf.keras.layers.LayerNormalization()
        self.dropout1 = tf.keras.layers.Dropout(dropout_rate)
        self.dropout2 = tf.keras.layers.Dropout(dropout_rate)
    
    def call(self,x):
        """Run the block on x and return a tensor of the same width."""
        # Self-attention: x is used as both query and value.
        attended = self.dropout1(self.mha(x,x))
        # First skip connection, then normalization.
        normed = self.ln1(x + attended)

        # Feed forward network with the second skip connection.
        transformed = self.dropout2(self.ffn(normed))
        return self.ln2(normed + transformed)

Now that we have our encoder layer defined, we can move onto defining the Transformer Encoder

In [11]:
class TransformerEncoder(tf.keras.layers.Layer):
    """Embedding layer followed by a stack of encoder layers.

    Args:
        num_layers: how many EncoderLayer instances to stack.
        emb_params: keyword arguments for PositionalEmbeddings.
        enc_params: keyword arguments shared by every EncoderLayer.
    """

    def __init__(self,num_layers,emb_params,enc_params):
        super().__init__()

        self.emb = PositionalEmbeddings(**emb_params)

        # Every layer is built from the same hyperparameters but is a
        # separate instance.
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(**enc_params))
        self.enc_layers = stack
        
    def call(self,x):
        """Embed the token ids in x and pass them through each layer in order."""
        hidden = self.emb(x)
        for encoder_layer in self.enc_layers:
            hidden = encoder_layer(hidden)
        return hidden

For sentiment analysis, we create a model consisting of the encoder, a flatten layer, and a  
dense layer with units = 2 and sigmoid activation function

In [14]:
# Hyperparameters for the from-scratch encoder.
hidden_size = 8     # embedding width
ffn_dim = 4         # units in the first feed forward dense layer
num_heads = 2       # attention heads
max_len = 512       # sequence length fed to the model
num_layers = 1      # number of stacked encoder layers

# Keyword arguments for the embedding layer.
emb_params = dict(
    vocab_size=len(word2idx),
    hidden_size=hidden_size,
    max_pos_emb=max_len,
)

# Keyword arguments shared by every encoder layer.
enc_params = dict(
    embed_dim=hidden_size,
    num_heads=num_heads,
    ffn_dim=ffn_dim,
)

# Encoder -> flatten all token vectors -> 2-unit sigmoid classifier.
enc_model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(max_len,)),
    TransformerEncoder(num_layers,emb_params,enc_params),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=2,activation='sigmoid'),
])

enc_model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 transformer_encoder_1 (Tran  (None, 512, 8)           676588    
 sformerEncoder)                                                 
                                                                 
 flatten_1 (Flatten)         (None, 4096)              0         
                                                                 
 dense_5 (Dense)             (None, 2)                 8194      
                                                                 
Total params: 684,782
Trainable params: 684,782
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Configure training: Adam optimizer, binary cross-entropy over the two
# sigmoid outputs, and accuracy as the reported metric.
enc_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [16]:
# Train for 10 epochs, evaluating on the validation split after each one.
# vanilla_encoder holds whatever fit() returns for this run.
vanilla_encoder = enc_model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


It's clear our model is overfitting the training data, let's compare our model to DistilBERT

In [126]:
# Persist the trained encoder's weights to an HDF5 file.
weights_path = 'models/vanilla_enc.h5'
enc_model.save_weights(weights_path)

In [16]:
from transformers import DistilBertTokenizer
from transformers import TFAutoModelForSequenceClassification

# Checkpoint shared by the tokenizer and the model so the two always match.
model_ckpt = 'distilbert-base-uncased'

# Tokenizer that goes with the checkpoint.
dbert_tokenizer = DistilBertTokenizer.from_pretrained(model_ckpt)

# Sequence classification model with a two-label head on top of the
# pretrained checkpoint.
dbert_model = TFAutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=2,
)

  from .autonotebook import tqdm as notebook_tqdm
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_transform', 'vocab_layer_norm', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_22', 'pre_classifier']
You should probably TRAIN th

In [None]:
from sklearn.metrics import accuracy_score

# Tokenize the first 100 reviews as a quick sanity check.
# The keyword is `truncation` (not `truncated`); without it max_length is
# not enforced and long reviews are not cut down to 200 tokens.
sample_reviews = train_data['user_review'].to_list()[:100]
inputs = dbert_tokenizer(sample_reviews,return_tensors='tf',
                         truncation=True,padding=True,max_length=200)

In [17]:
def tokenize_batch(batch):
    """Tokenize the 'user_review' entries of a batch with the DistilBERT tokenizer.

    Reviews are padded and truncated, with max_len as the length limit.

    Args:
        batch: mapping that contains the reviews under the 'user_review' key.

    Returns:
        Whatever dbert_tokenizer returns for those reviews.
    """
    reviews = batch['user_review']
    return dbert_tokenizer(
        reviews,
        max_length=max_len,
        truncation=True,
        padding=True,
    )

In [18]:
from datasets import Dataset

# Wrap the pandas frame in a Hugging Face Dataset.
h_train_data = Dataset.from_pandas(train_data)

# Apply tokenize_batch in batched mode. batch_size=None is meant to hand the
# whole dataset over as one batch, so padding=True pads every review to the
# same length - TODO confirm against the installed datasets version.
h_train_data_enc = h_train_data.map(
    tokenize_batch,
    batch_size=None,
    batched=True,
)

                                                                  

In [26]:
# Display the tokenized dataset's summary (feature names and row count).
h_train_data_enc

Dataset({
    features: ['user_review', 'user_suggestion', 'input_ids', 'attention_mask'],
    num_rows: 17316
})

In [28]:
tokenizer_columns = dbert_tokenizer.model_input_names


def _as_tf_dataset(hf_dataset,shuffle):
    """Convert a tokenized Hugging Face dataset into a batched tf.data.Dataset.

    label_cols is passed as a plain string (not a list) so that the labels
    come back as a tensor instead of a {'user_suggestion': tensor} dict.
    """
    return hf_dataset.to_tf_dataset(
        columns=tokenizer_columns,
        label_cols="user_suggestion",
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
    )


# Full dataset, kept for any later cell that still refers to it.
dbert_ds = _as_tf_dataset(h_train_data_enc,shuffle=True)

# Hold out 10% of the rows for validation BEFORE converting to tf.data.
# Splitting a shuffled tf dataset with take()/skip() is unsafe: the data is
# reshuffled on every pass, so rows used for validation in one epoch can be
# trained on in the next. A fixed seed keeps the split reproducible.
dbert_splits = h_train_data_enc.train_test_split(test_size=0.1,seed=42)

# Only the training split needs shuffling.
dbert_tr_ds = _as_tf_dataset(dbert_splits['train'],shuffle=True)
dbert_val_ds = _as_tf_dataset(dbert_splits['test'],shuffle=False)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [31]:
# The model was created with num_labels=2, so its classification head emits
# two raw logits per review, while the 'user_suggestion' labels are plain
# integers. Sparse categorical cross-entropy with from_logits=True matches
# that pairing; 'binary_crossentropy' would treat the logits as probabilities
# and compare them against labels of a different shape.
dbert_model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
                    metrics=['accuracy'])

In [195]:
# Fine-tune DistilBERT for 3 epochs, evaluating on the validation split
# after each epoch. dbert_enc holds whatever fit() returns for this run.
dbert_enc = dbert_model.fit(
    dbert_tr_ds,
    epochs=3,
    validation_data=dbert_val_ds,
)

Epoch 1/2


Given the lack of computing power I'll finetune the model using a Kaggle notebook