In [None]:
!pip install tensorflow-gpu
!pip install --upgrade grpcio
!pip install bert-for-tf2

In [None]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

In [None]:
# Load the training dataframe from Google Drive (file name suggests a cleaned, augmented GoEmotions set).
# NOTE(review): the absolute /content/drive path presumably requires Drive to be mounted first; no mount cell is visible here.
train = pd.read_csv('/content/drive/My Drive/goemotions_aug_dairai_train_cleaned.csv')

In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
!unzip uncased_L-12_H-768_A-12.zip

--2020-10-12 13:48:16--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 74.125.142.128, 74.125.195.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-10-12 13:48:18 (173 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [None]:
# Directory of the unpacked pretrained checkpoint and the checkpoint prefix inside it.
model_dir = './uncased_L-12_H-768_A-12/'
# os.path.join keeps the path valid whether or not model_dir ends with a slash.
bert_ckpt_file = os.path.join(model_dir, "bert_model.ckpt")
# Layer parameters derived from the pretrained checkpoint directory.
# (load_stock_weights is already imported in the import cell at the top, so it is not re-imported here.)
bert_params = bert.params_from_pretrained_ckpt(model_dir)

In [None]:
# Add adapter layers to the BERT model by setting an adapter size of 64 on its parameters.
# freeze_bert_layers() later marks the "adapter-down"/"adapter-up" layers as trainable.
bert_params.adapter_size=64

In [None]:

def flatten_layers(root_layer):
    """Yield every Keras layer reachable from ``root_layer``, parents before children.

    ``root_layer`` itself is yielded only when it is a ``keras.layers.Layer``;
    whatever it holds in its private ``_layers`` list is always descended into,
    so non-layer containers are traversed without being yielded themselves.
    """
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for child in root_layer._layers:
        # Delegate to the recursive generator for the whole sub-tree.
        yield from flatten_layers(child)


def freeze_bert_layers(l_bert):
    """
    Freezes all but LayerNorm and adapter layers - see arXiv:1902.00751.

    Args:
        l_bert: the BERT layer to modify in place. It must be traversable by
            ``flatten_layers`` and expose an ``embeddings_layer`` attribute.

    Layers named "LayerNorm", "adapter-down" or "adapter-up" are marked
    trainable, every other layer without sub-layers is frozen, and the
    embeddings layer is frozen last.
    """
    trainable_names = ("LayerNorm", "adapter-down", "adapter-up")
    for layer in flatten_layers(l_bert):
        if layer.name in trainable_names:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
    # Loop-invariant, so it is applied once after the walk instead of on every
    # iteration. Coming last, it still overrides any LayerNorm inside the
    # embeddings that the loop marked trainable (Keras applies `trainable` to
    # nested layers), which is the same end state as before.
    l_bert.embeddings_layer.trainable = False

In [None]:
def get_model(max_seq_len, params, num_labels=4):
  """Build the adapter-BERT classifier and load the pretrained BERT weights.

  Args:
    max_seq_len: fixed length of the int32 token-id sequences fed to the model.
    params: BERT layer parameters passed to ``BertModelLayer.from_params``.
    num_labels: width of the final sigmoid layer, one unit per label.
      Defaults to 4, the value that was previously hard-coded.

  Returns:
    An uncompiled ``keras.models.Model`` mapping token ids to ``num_labels``
    sigmoid outputs. The BERT layer has the stock checkpoint weights from the
    module-level ``bert_ckpt_file`` loaded and has been passed through
    ``freeze_bert_layers``.
  """
  input_tensor = keras.layers.Input((max_seq_len, ), dtype='int32')
  bert_layer = bert.BertModelLayer.from_params(params, name='bert')
  bert_output = bert_layer(input_tensor)
  # Keep only the first sequence position (the "[CLS]" token the data pipeline prepends).
  x = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
  x = keras.layers.Dropout(0.5)(x)
  x = keras.layers.Dense(768, activation='tanh')(x)
  x = keras.layers.Dropout(0.5)(x)
  # Independent sigmoid per label: several labels (or none) may be active at once.
  x = keras.layers.Dense(num_labels, activation='sigmoid')(x)

  model = keras.models.Model(input_tensor, x)
  model.build(input_shape=(None, max_seq_len))
  # Weights are loaded only after the model is built, then the BERT layer is frozen.
  load_stock_weights(bert_layer, bert_ckpt_file)
  freeze_bert_layers(bert_layer)

  return model

In [None]:
# Tokenizer built from the vocab.txt shipped in the checkpoint directory.
# FullTokenizer is already imported in the import cell at the top; model_dir is reused
# so the checkpoint location is spelled out in one place only.
tokenizer = FullTokenizer(os.path.join(model_dir, 'vocab.txt'))

In [None]:
# Expose the 'cleaned_processed' column under the name 'text', which EmotionDetectionData.DATA_COLUMN expects.
train = train.rename(columns={'cleaned_processed':'text'})

In [None]:
class EmotionDetectionData:
  """Converts a dataframe of texts and emotion labels into fixed-length model inputs.

  After construction:
    train_x: array of shape (rows, max_seq_len) holding token ids, each row being
      "[CLS]" + tokens + "[SEP]" truncated to fit and right-padded with 0.
    train_y: int32 array of shape (rows, len(LABEL_COLUMNS)) with the label columns.
  """
  DATA_COLUMN = "text"
  LABEL_COLUMNS = ['anger', 'fear', 'joy', 'sadness']

  def __init__(self, train, tokenizer: "FullTokenizer", max_seq_len):
    """Tokenize ``train`` right away.

    Args:
      train: dataframe containing DATA_COLUMN and every column in LABEL_COLUMNS.
      tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
      max_seq_len: length of every encoded row, including "[CLS]" and "[SEP]".

    Raises:
      ValueError: if ``max_seq_len`` cannot hold the two special tokens.
    """
    if max_seq_len < 2:
      raise ValueError("max_seq_len must be at least 2 to fit [CLS] and [SEP]")
    self.tokenizer = tokenizer
    self.max_seq_len = max_seq_len
    self.train_x, self.train_y = self._prepare(train)

  def _encode(self, text):
    """Return the token ids for one text, truncated and zero-padded to max_seq_len."""
    budget = self.max_seq_len - 2  # room left once [CLS] and [SEP] are added
    tokens = ["[CLS]"] + self.tokenizer.tokenize(text)[:budget] + ["[SEP]"]
    token_ids = list(self.tokenizer.convert_tokens_to_ids(tokens))
    return token_ids + [0] * (self.max_seq_len - len(token_ids))

  def _prepare(self, df):
    """Encode every row of ``df``; returns ``(x, y)`` as numpy arrays."""
    # Iterate over the text column only: iterrows() builds a Series per row, which is
    # slow on large frames, and it has no length, so tqdm could not show a progress bar.
    texts = df[self.DATA_COLUMN]
    x = [self._encode(text) for text in tqdm(texts, total=len(df))]
    # Labels are converted in one step instead of once per row.
    y = df[self.LABEL_COLUMNS].astype('int32').to_numpy()
    # The reshape keeps a 2-D (0, max_seq_len) result for an empty dataframe.
    return np.array(x).reshape(-1, self.max_seq_len), y

In [None]:
# Number of space-separated words per text. The vectorised .str accessor yields NaN for a
# missing text instead of raising, and such rows then fail the `text_len < 100` filter below.
train['text_len'] = train['text'].str.split(' ').str.len()

In [None]:
# Keep only the rows whose text has fewer than 100 space-separated words.
train = train.loc[train['text_len'] < 100]

In [None]:
# Tokenize the whole training frame into fixed-length (100) id sequences plus label matrix.
data = EmotionDetectionData(train=train, tokenizer=tokenizer, max_seq_len=100)

460548it [07:21, 1043.66it/s]


In [None]:
# Display the sequence length stored by the EmotionDetectionData constructor.
data.max_seq_len

100

In [None]:
# Build the classifier with the same sequence length the data was encoded with, read from
# `data` (as the later `predicter` cell does) instead of repeating the literal 100.
model = get_model(data.max_seq_len, bert_params)

In [None]:
# Save weights only, and only when the monitored validation accuracy reaches a new maximum.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='goemotions_adapter_bert_weights_100000.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)
# Stop training once validation accuracy has gone 2 epochs without improving.
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    mode='max',
)

In [None]:
# The targets are multi-label (one sigmoid per emotion, several or none may be set), so accuracy
# must be counted per label. The bare 'accuracy' string lets Keras pick the accuracy variant
# itself, which for a 4-unit output is not the element-wise binary one. The metric keeps the
# name 'accuracy' so the callbacks monitoring 'val_accuracy' continue to work.
model.compile(
    keras.optimizers.Adam(5e-5),
    loss='binary_crossentropy',
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Train for at most 5 epochs with 30% of the rows held out for validation;
# the checkpoint and early-stopping callbacks defined above are attached.
history = model.fit(
    x=data.train_x,
    y=data.train_y,
    batch_size=32,
    epochs=5,
    shuffle=True,
    validation_split=0.3,
    callbacks=[model_checkpoint_callback, early_stop_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Persist the weights the model holds after training (not necessarily the best checkpoint) to Google Drive.
model.save_weights('/content/drive/My Drive/adapter_bert_final_weights.h5')

In [None]:
# Rebuild the network, restore the weights saved above, and predict on the first 10 rows.
# NOTE(review): the chained assignment also rebinds `model` to this new, uncompiled instance,
# so every later cell that uses `model` operates on the reloaded copy — confirm this is intended.
predicter = model = get_model(data.max_seq_len, bert_params)
predicter.load_weights('/content/drive/My Drive/adapter_bert_final_weights.h5')
y_predict = predicter.predict(data.train_x[:10])

In [None]:
# Sigmoid outputs for the first 10 rows, one column per entry of LABEL_COLUMNS.
y_predict

array([[2.1813128e-03, 1.7206159e-03, 7.1748692e-01, 2.9481815e-03],
       [6.2209209e-03, 4.8038445e-02, 5.0082576e-02, 1.2255999e-02],
       [7.5943983e-01, 1.0496391e-03, 1.3680718e-02, 2.4896829e-01],
       [3.0582529e-03, 2.9010135e-03, 7.9828161e-01, 1.1252617e-03],
       [1.7251832e-03, 2.3815947e-02, 6.7448523e-03, 9.8140830e-01],
       [2.9195060e-03, 2.6431173e-01, 2.9565291e-03, 8.9544791e-01],
       [1.7085606e-02, 1.2962701e-04, 4.0250272e-03, 9.9405175e-01],
       [9.1257156e-04, 4.2398172e-04, 9.0980041e-01, 9.8832138e-04],
       [7.7782618e-03, 9.9403834e-01, 6.6346517e-03, 6.9292807e-03],
       [4.7041266e-03, 4.9519225e-04, 9.1201359e-01, 5.9784058e-04]],
      dtype=float32)

In [None]:
# Ground-truth labels for the same 10 rows, for eyeballing against the predictions above.
data.train_y[:10]

array([[0, 0, 1, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0]], dtype=int32)

In [None]:
# `model` was rebuilt by the reload cell, so it has to be compiled again before evaluate().
# The targets are multi-label, so accuracy is counted per label with BinaryAccuracy rather than
# leaving the choice to the bare 'accuracy' string. The metric keeps the name 'accuracy' so
# callbacks monitoring 'val_accuracy' continue to work.
model.compile(
    keras.optimizers.Adam(5e-5),
    loss='binary_crossentropy',
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Loss and metric on every row from index 9000 onward.
# NOTE(review): these rows come from the same arrays that were passed to fit(), so this is not
# a clean held-out score — confirm which rows were actually unseen.
model.evaluate(data.train_x[9000:],  data.train_y[9000:])



[0.1526949107646942, 0.7950000166893005]

In [None]:
# Predictions and matching labels for rows 6000-7999.
# NOTE(review): the slice is taken from the arrays used for fitting — confirm it was not trained on.
y_predict = predicter.predict(data.train_x[6000:8000])
y_true = data.train_y[6000:8000]

In [None]:
# Binarise the sigmoid outputs: a label counts as predicted when its score exceeds 0.4.
y_predict = (y_predict > 0.4).astype(int)

In [None]:
from sklearn.metrics import f1_score, recall_score, precision_score

In [None]:
# Ground-truth label matrix for rows 6000-7999.
y_true

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       ...,
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1]], dtype=int32)

In [None]:
# Thresholded (0/1) predictions for the same rows.
y_predict

array([[0, 0, 1, 0],
       [0, 0, 1, 0],
       [1, 0, 0, 0],
       ...,
       [1, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 0, 1]])

In [None]:
# Weighted-average recall of the 0.4-thresholded predictions on rows 6000-7999.
recall_score(y_true=y_true, y_pred=y_predict, average='weighted')

0.8833922261484098

In [None]:
# Weighted-average precision of the 0.4-thresholded predictions on rows 6000-7999.
precision_score(y_true=y_true, y_pred=y_predict, average='weighted')

0.8123453877810835

In [None]:
# Weighted-average F1 of the 0.4-thresholded predictions on rows 6000-7999.
f1_score(y_true=y_true, y_pred=y_predict, average='weighted')

0.8436428197007676

In [None]:
# NOTE(review): same call, with the same arguments, as the earlier `data = EmotionDetectionData(...)`
# cell; `train` is not reassigned in between in the visible cells, so this several-minute
# tokenization pass looks redundant — confirm before keeping it.
data = EmotionDetectionData(train, tokenizer, 100)

460548it [07:58, 961.92it/s] 


In [None]:
# Cache the encoded token-id matrix on Google Drive.
np.save('/content/drive/My Drive/train_x_100.npy', data.train_x)

In [None]:
# Cache the matching label matrix on Google Drive.
np.save('/content/drive/My Drive/train_y_100.npy', data.train_y)

In [None]:
# Checkpoint to Google Drive: weights only, written when validation accuracy reaches a new maximum.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/drive/My Drive/goemotions_adapter_bert_weights_full_dataset.h5',
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)
# Stop training once validation accuracy has gone 2 epochs without improving.
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    mode='max',
)

In [None]:
# Continue training on the full arrays with batch size 64; 30% of the rows are held out for validation.
model.fit(
    x=data.train_x,
    y=data.train_y,
    batch_size=64,
    epochs=5,
    shuffle=True,
    validation_split=0.3,
    callbacks=[model_checkpoint_callback, early_stop_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5

In [None]:
# Restore the weights written by the checkpoint callback (the best epoch by its monitored metric).
model.load_weights('/content/drive/My Drive/goemotions_adapter_bert_weights_full_dataset.h5')

In [None]:
# Short aliases for the encoded inputs and labels, used by the cells below.
train_x = data.train_x
train_y = data.train_y
# NOTE(review): these are the same arrays already saved above as train_x_100.npy / train_y_100.npy,
# written again under different file names — confirm both copies are needed.
np.save('/content/drive/My Drive/goemotions_train_x_100_full_dataset', train_x)
np.save('/content/drive/My Drive/goemotions_train_y_full_dataset', train_y)

In [None]:
# Recompile after loading the checkpointed weights. The targets are multi-label, so accuracy is
# counted per label with BinaryAccuracy rather than leaving the choice to the bare 'accuracy'
# string. The metric keeps the name 'accuracy' so the callbacks monitoring 'val_accuracy' still work.
model.compile(
    keras.optimizers.Adam(5e-5),
    loss='binary_crossentropy',
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy')],
)

In [None]:
# Loss and metric on an 800-row slice starting at row 90000.
# NOTE(review): the slice comes from the arrays passed to fit() — confirm these rows were not trained on.
model.evaluate(train_x[90000:90800],  train_y[90000:90800])



[0.06508126854896545, 0.8162500262260437]

In [None]:
# Another training pass with the same arguments and callback objects as the previous fit() call.
# The recorded run was interrupted manually (KeyboardInterrupt in the output below).
model.fit(data.train_x, data.train_y, validation_split=0.3, batch_size=64, shuffle=True, epochs=5, callbacks=[model_checkpoint_callback, early_stop_callback])

Epoch 1/5
Epoch 2/5
Epoch 3/5

KeyboardInterrupt: ignored

In [None]:
# Recompile so that the reported metric is the per-label 'binary_accuracy'.
model.compile(
    keras.optimizers.Adam(5e-5),
    loss='binary_crossentropy',
    metrics=['binary_accuracy'],
)

In [None]:
# Loss and binary accuracy on an 800-row slice starting at row 390000.
# NOTE(review): the recorded output below shows 2159 steps, which does not look like an 800-row
# evaluation — the output is presumably from a different version of this cell; re-run to confirm.
model.evaluate(train_x[390000:390800],  train_y[390000:390800])

  86/2159 [>.............................] - ETA: 16:36 - loss: 0.0552 - binary_accuracy: 0.9738

KeyboardInterrupt: ignored

In [None]:
# Sigmoid outputs for the 800-row slice starting at row 90000.
y_pred = model.predict(train_x[90000:90800])

In [None]:
# Shape check: one row per sample, one column per label.
y_pred.shape


(800, 4)

In [None]:
# Binarise the sigmoid outputs at a 0.3 threshold in one vectorised step. This works for any
# array shape (no hard-coded 800 x 4 loop bounds) and produces integer 0/1 labels.
y_pred = (y_pred > 0.3).astype('int32')

In [None]:
# Display the thresholded predictions as int32. The result is not assigned, so `y_pred` itself is unchanged by this cell.
y_pred.astype('int32')

array([[0, 1, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       ...,
       [0, 0, 0, 1],
       [0, 0, 0, 0],
       [0, 0, 0, 1]], dtype=int32)

In [None]:
# Ground-truth labels for the same 800 rows that y_pred was computed on.
y_true = train_y[90000:90800]

In [None]:
# Weighted-average recall of the 0.3-thresholded predictions on rows 90000-90799.
recall_score(y_true=y_true, y_pred=y_pred, average='weighted')

0.9666666666666667

In [None]:
# Weighted-average precision of the 0.3-thresholded predictions on rows 90000-90799.
precision_score(y_true=y_true, y_pred=y_pred, average='weighted')

0.930098918303591

In [None]:
# Weighted-average F1 of the 0.3-thresholded predictions on rows 90000-90799.
f1_score(y_true=y_true, y_pred=y_pred, average='weighted')


0.9469667723570174

In [None]:
# Same checkpoint file as before, now tracking 'val_binary_accuracy' to match the recompiled metric.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='/content/drive/My Drive/goemotions_adapter_bert_weights_full_dataset.h5',
    monitor='val_binary_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)
# Stop training once validation binary accuracy has gone 2 epochs without improving.
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_binary_accuracy',
    patience=2,
    mode='max',
)

In [None]:
# Further training restricted to rows 390000-399999, with 30% of that slice held out for validation.
model.fit(
    x=train_x[390000:400000],
    y=train_y[390000:400000],
    batch_size=64,
    epochs=5,
    shuffle=True,
    validation_split=0.3,
    callbacks=[model_checkpoint_callback, early_stop_callback],
)

Epoch 1/5

KeyboardInterrupt: ignored