# Download Data

In [1]:
!wget -q https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/EfREjZqiZTlPqhqUPICBbPABdlgPumlaUVxPncm-_9aWIw?download=1 -O "Project 1 - data.zip"
!unzip -q "Project 1 - data.zip"

# Import Libraries

In [2]:
!pip -q install keras-layer-normalization

  Building wheel for keras-layer-normalization (setup.py) ... [?25l[?25hdone


In [3]:
import os
import nltk
import math
import pandas as pd
import numpy as np
import keras
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from itertools import chain
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, BatchNormalization,\
    Activation, Input, Add, Concatenate, Embedding, Conv1D, MaxPool1D,\
    Flatten, LSTM, Bidirectional, MaxPooling1D, SimpleRNN, GRU, SpatialDropout1D
from keras_layer_normalization import LayerNormalization
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV


In [4]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
stopwords = set(stopwords.words("english"))
ps = PorterStemmer()

# Data Loader

In [6]:
def load_data(split_name='train', columns=['text', 'stars']):
    try:
        print(f"select [{', '.join(columns)}] columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        df = df.loc[:,columns]
        print("succeed!")
        return df
    except:
        print("Failed, then try to ")
        print(f"select all columns from the {split_name} split")
        df = pd.read_csv(f'data_2021_spring/{split_name}.csv')
        return df

In [7]:
train_df = load_data('train', columns=['full'])
valid_df = load_data('valid', columns=['full'])
test_df = load_data('test', columns=['full'])

select [full] columns from the train split
Failed, then try to 
select all columns from the train split
select [full] columns from the valid split
Failed, then try to 
select all columns from the valid split
select [full] columns from the test split
Failed, then try to 
select all columns from the test split


# Following https://towardsdatascience.com/multi-label-multi-class-text-classification-with-bert-transformer-and-keras-c6355eccb63a

In [8]:
!pip -q install transformers

[K     |████████████████████████████████| 2.0MB 7.3MB/s 
[K     |████████████████████████████████| 870kB 46.4MB/s 
[K     |████████████████████████████████| 3.2MB 52.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [9]:
#######################################
### -------- Load libraries ------- ###
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast
# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam, RMSprop, Nadam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
# And pandas for data import + sklearn because you allways need sklearn
import pandas as pd
from sklearn.model_selection import train_test_split

# Bert without Attention Mask

In [14]:
#######################################
### --------- Setup BERT ---------- ###
# Name of the BERT model to use
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.4.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [None]:
transformer_model.summary()

Model: "tf_bert_model_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [15]:
#######################################
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model
# Load the MainLayer
bert = transformer_model.layers[0]
# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
inputs = {'input_ids': input_ids}
# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
# dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
dropout = Dropout(0.45, name='pooled_output')
# pooled_output = dropout(bert_model, training=True)
pooled_output = dropout(bert_model, training=False)
# Then build your model output
outputs = Dense(units=5, kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='stars')(pooled_output)
# product = Dense(units=len(data.Product_label.value_counts()), kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='product')(pooled_output)
# outputs = {'issue': issue, 'product': product}
# And combine it all in a model object
model = Model(inputs=inputs, outputs=outputs, name='BERT_MultiClass')
# Take a look at the model
model.summary()

Model: "BERT_MultiClass"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_ids (InputLayer)       [(None, 100)]             0         
_________________________________________________________________
bert (TFBertMainLayer)       TFBaseModelOutputWithPool 109482240 
_________________________________________________________________
pooled_output (Dropout)      (None, 768)               0         
_________________________________________________________________
stars (Dense)                (None, 5)                 3845      
Total params: 109,486,085
Trainable params: 109,486,085
Non-trainable params: 0
_________________________________________________________________


In [16]:
#######################################
### ------- Train the model ------- ###
# Set an optimizer
optimizer = RMSprop(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)
# Set loss and metrics
loss = {'issue': CategoricalCrossentropy(from_logits = True), 'product': CategoricalCrossentropy(from_logits = True)}
metric = {'issue': CategoricalAccuracy('accuracy'), 'product': CategoricalAccuracy('accuracy')}
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = CategoricalCrossentropy(from_logits = True), 
    metrics = CategoricalAccuracy('accuracy'))
# Ready output data for the model
# y_issue = to_categorical(data['Issue'])
# y_product = to_categorical(data['Product'])
y_train = to_categorical(train_df["stars"]-1, num_classes=5)
y_valid = to_categorical(valid_df["stars"]-1, num_classes=5)
# Tokenize the input (takes some time)
x_train = tokenizer(
    text=train_df["text"].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)

os.makedirs("models", exist_ok=True)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights_bert_without_attantion_mask.hdf5"),
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True)
earlystopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=3,
    verbose=1)

# Fit the model
history = model.fit(
    x={'input_ids': x_train['input_ids']},
    y={'stars': y_train},
    validation_split=0.2,
    batch_size=64,
    epochs=10,
    callbacks=[checkpointer, earlystopping])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.60500, saving model to models/weights_bert_without_attantion_mask.hdf5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.60500 to 0.60650, saving model to models/weights_bert_without_attantion_mask.hdf5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.60650 to 0.61800, saving model to models/weights_bert_without_attantion_mask.hdf5
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.61800
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.61800
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.61800
Epoch 00006: early stopping


In [17]:
#######################################
### ----- Evaluate the model ------ ###
# Ready test data
# y_valid
x_valid = tokenizer(
    text=valid_df["text"].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = False,
    verbose = True)
# Run evaluation
model_eval = model.evaluate(
    x={'input_ids': x_valid['input_ids']},
    y={'stars': y_valid}
)



In [18]:
model = keras.models.load_model(os.path.join("models", "weights_bert_without_attantion_mask.hdf5"))

train_score = model.evaluate(
    x={'input_ids': x_train['input_ids']},
    y={'stars': y_train})

valid_score = model.evaluate(
    x={'input_ids': x_valid['input_ids']},
    y={'stars': y_valid})

print("training loss:", train_score[0], "training accuracy", train_score[1])
print("valid loss:", valid_score[0], "valid accuracy", valid_score[1])

training loss: 0.635367214679718 training accuracy 0.7609000205993652
valid loss: 0.9575802683830261 valid accuracy 0.6115000247955322


Load the bert model without attention - Adam, Drop=0.1, LR=5e-5

In [None]:
!wget -q https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/Ebw_XPCer7xHrLTrKafKAC4BGecTS0vORrJXhu_2dM8P_g?download=1 -O "weights_bert_without_attention_mask.hdf5"

model = keras.models.load_model(os.path.join("weights_bert_without_attention_mask.hdf5"))

train_score = model.evaluate(
    x={'input_ids': x_train['input_ids']},
    y={'stars': y_train})

valid_score = model.evaluate(
    x={'input_ids': x_valid['input_ids']},
    y={'stars': y_valid})

print("training loss:", train_score[0], "training accuracy", train_score[1])
print("valid loss:", valid_score[0], "valid accuracy", valid_score[1])

training loss: 0.72379469871521 training accuracy 0.7186999917030334
valid loss: 0.923902153968811 valid accuracy 0.6175000071525574


Load the bert model without attention - RMSprop, Drop=0.5, LR=5e-5

In [None]:
!wget -q https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/EWQ7aIv2qZJHuvJcPYHeYSQBR9kxTakulpMixT9JozV0Hw?download=1 -O "weights_bert_without_attention_mask_62.hdf5"

model = keras.models.load_model(os.path.join("weights_bert_without_attention_mask_62.hdf5"))

train_score = model.evaluate(
    x={'input_ids': x_train['input_ids']},
    y={'stars': y_train})

valid_score = model.evaluate(
    x={'input_ids': x_valid['input_ids']},
    y={'stars': y_valid})

print("training loss:", train_score[0], "training accuracy", train_score[1])
print("valid loss:", valid_score[0], "valid accuracy", valid_score[1])

training loss: 0.7079257965087891 training accuracy 0.7211999893188477
valid loss: 0.9042415022850037 valid accuracy 0.6200000047683716


Load the bert model without attention - RMSprop, Drop=0.7, LR=5e-5

In [None]:
!wget -q https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/ETeQLB1tzXtJtB8amwOMnhQBzpjMDTbStbn3Zht0sb54oA?download=1 -O "weights_bert_without_attention_mask_62_3.hdf5"

model = keras.models.load_model(os.path.join("weights_bert_without_attention_mask_62_3.hdf5"))

train_score = model.evaluate(
    x={'input_ids': x_train['input_ids']},
    y={'stars': y_train})

valid_score = model.evaluate(
    x={'input_ids': x_valid['input_ids']},
    y={'stars': y_valid})

print("training loss:", train_score[0], "training accuracy", train_score[1])
print("valid loss:", valid_score[0], "valid accuracy", valid_score[1])

training loss: 0.7118481397628784 training accuracy 0.7178999781608582
valid loss: 0.9178875088691711 valid accuracy 0.6230000257492065


# Bert with attention mask

In [None]:
#######################################
### --------- Setup BERT ---------- ###
# Name of the BERT model to use
model_name = 'bert-base-uncased'
# Max length of tokens
max_length = 100
# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
config.output_hidden_states = False
# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
# Load the Transformers BERT model
transformer_model = TFBertModel.from_pretrained(model_name, config = config)

In [11]:
#######################################
### ------- Build the model ------- ###
# TF Keras documentation: https://www.tensorflow.org/api_docs/python/tf/keras/Model

# Load the MainLayer
bert = transformer_model.layers[0]

# Build your model input
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32') 
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}

# Load the Transformers BERT model as a layer in a Keras model
bert_model = bert(inputs)[1]
dropout = Dropout(config.hidden_dropout_prob, name='pooled_output')
# dropout = Dropout(0.5, name='pooled_output')
pooled_output = dropout(bert_model, training=False)

# Then build your model output
outputs = Dense(units=5, kernel_initializer=TruncatedNormal(stddev=config.initializer_range), name='stars')(pooled_output)

# And combine it all in a model object
model_attention = Model(inputs=inputs, outputs=outputs, name='BERT_MultiClass')

# Take a look at the model
model_attention.summary()

Model: "BERT_MultiClass"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   attention_mask[0][0]             
                                                                 input_ids[0][0]                  
__________________________________________________________________________________________________
pooled_output (Dropout)         (None, 768)          0           bert[0][1]         

In [12]:
#######################################
### ------- Train the model ------- ###
# Set an optimizer
optimizer = RMSprop(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# Compile the model
model_attention.compile(
    optimizer = optimizer,
    loss = CategoricalCrossentropy(from_logits = True), 
    metrics = CategoricalAccuracy('accuracy'))

# Ready output data for the model
y_train = to_categorical(train_df["stars"]-1, num_classes=5)
y_valid = to_categorical(valid_df["stars"]-1, num_classes=5)

# Tokenize the input (takes some time)
x_train_attention_mask = tokenizer(
    text=train_df["text"].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

x_valid_attention_mask = tokenizer(
    text=valid_df["text"].to_list(),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

os.makedirs("models", exist_ok=True)
checkpointer = keras.callbacks.ModelCheckpoint(
    filepath=os.path.join("models", "weights_bert_with_attention_mask.hdf5"),
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True)
earlystopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1)

# Fit the model
history = model_attention.fit(
    x={'input_ids': x_train_attention_mask['input_ids'], 'attention_mask': x_train_attention_mask['attention_mask']},
    y={'stars': y_train},
    validation_split=0.2,
    batch_size=64,
    epochs=10,
    callbacks=[checkpointer, earlystopping])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.62000, saving model to models/weights_bert_with_attention_mask.hdf5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.62000 to 0.62050, saving model to models/weights_bert_with_attention_mask.hdf5
Epoch 3/10

Epoch 00003: val_accuracy improved from 0.62050 to 0.62250, saving model to models/weights_bert_with_attention_mask.hdf5
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.62250
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.62250
Epoch 00005: early stopping


In [13]:
model = keras.models.load_model(os.path.join("models", "weights_bert_with_attention_mask.hdf5"))

train_score = model.evaluate(
    x={'input_ids': x_train_attention_mask['input_ids'], 'attention_mask': x_train_attention_mask['attention_mask']},
    y={'stars': y_train})

valid_score = model.evaluate(
    x={'input_ids': x_valid_attention_mask['input_ids'], 'attention_mask': x_valid_attention_mask['attention_mask']},
    y={'stars': y_valid})

print("training loss:", train_score[0], "training accuracy", train_score[1])
print("valid loss:", valid_score[0], "valid accuracy", valid_score[1])

training loss: 0.6308273673057556 training accuracy 0.7638000249862671
valid loss: 0.9697353839874268 valid accuracy 0.6265000104904175


Load the Bert model with attention - Drop=0.1, Adam, LR=5e-5

In [None]:
!wget -q https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/EePikR9TtXhCu5uVzRyZ9P8B2jpezGfALcGQTz_emtudXQ?download=1 -O "weights_bert_with_attention_mask.hdf5"

model = keras.models.load_model(os.path.join("weights_bert_with_attention_mask.hdf5"))

train_score = model.evaluate(
    x={'input_ids': x_train_attention_mask['input_ids'], 'attention_mask': x_train_attention_mask['attention_mask']},
    y={'stars': y_train})

valid_score = model.evaluate(
    x={'input_ids': x_valid_attention_mask['input_ids'], 'attention_mask': x_valid_attention_mask['attention_mask']},
    y={'stars': y_valid})

print("training loss:", train_score[0], "training accuracy", train_score[1])
print("valid loss:", valid_score[0], "valid accuracy", valid_score[1])

training loss: 0.5933009386062622 training accuracy 0.7847999930381775
valid loss: 0.973475456237793 valid accuracy 0.6209999918937683


Load the Bert model with attention - Drop=0.1, RMSprop, LR=5e-5

In [20]:
!wget -q https://hkustconnect-my.sharepoint.com/:u:/g/personal/nnanda_connect_ust_hk/EXrPZ5UNreNKsJhqoZgBbuABcVH8WpSlYWxgiNC23weYJg?download=1 -O "weights_bert_with_attention_mask_62_65.hdf5"

model = keras.models.load_model(os.path.join("weights_bert_with_attention_mask_62_65.hdf5"))

train_score = model.evaluate(
    x={'input_ids': x_train_attention_mask['input_ids'], 'attention_mask': x_train_attention_mask['attention_mask']},
    y={'stars': y_train})

valid_score = model.evaluate(
    x={'input_ids': x_valid_attention_mask['input_ids'], 'attention_mask': x_valid_attention_mask['attention_mask']},
    y={'stars': y_valid})

print("training loss:", train_score[0], "training accuracy", train_score[1])
print("valid loss:", valid_score[0], "valid accuracy", valid_score[1])

training loss: 0.6308273673057556 training accuracy 0.7638000249862671
valid loss: 0.9697353839874268 valid accuracy 0.6265000104904175
