In [None]:
!pip install tensorflow-gpu
!pip install --upgrade grpcio
!pip install bert-for-tf2

Collecting tensorflow-gpu
[?25l  Downloading https://files.pythonhosted.org/packages/18/99/ac32fd13d56e40d4c3e6150030132519997c0bb1f06f448d970e81b177e5/tensorflow_gpu-2.3.1-cp36-cp36m-manylinux2010_x86_64.whl (320.4MB)
[K     |████████████████████████████████| 320.4MB 54kB/s 
Installing collected packages: tensorflow-gpu
Successfully installed tensorflow-gpu-2.3.1
Requirement already up-to-date: grpcio in /usr/local/lib/python3.6/dist-packages (1.32.0)
Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/af/c1/015648a2186b25c6de79d15bec40d3d946fcf1dd5067d1c1b28009506486/bert-for-tf2-0.14.6.tar.gz (40kB)
[K     |████████████████████████████████| 40kB 2.7MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c

In [None]:
import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

In [None]:
# Load the pre-cleaned training set.
# NOTE(review): the path assumes Google Drive is already mounted at
# /content/drive -- no mount call is visible in this notebook; confirm.
train = pd.read_csv('/content/drive/My Drive/goemotions_aug_dairai_train_cleaned.csv')

In [None]:
# Preview the first rows to check which columns were loaded.
train.head()

Unnamed: 0.1,Unnamed: 0,anger,augmented,cleaned_processed,datasource,fear,joy,sadness
0,206509,0.0,0,i feel that some korea guy are handsome and so...,dairai,0.0,1.0,0.0
1,375242,0.0,0,i put my pen to paper and made a list of thing...,dairai,0.0,0.0,0.0
2,166570,1.0,0,i wish i only had to feel the pain of the pett...,dairai,0.0,0.0,0.0
3,200580,0.0,0,i feel passionate about this journey and stand...,dairai,0.0,1.0,0.0
4,300766,0.0,0,i feel like i have convinced myself of these f...,dairai,0.0,1.0,0.0


In [None]:
# (rows, columns) of the loaded frame.
train.shape

(460551, 8)

In [None]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

--2020-10-08 20:08:29--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 74.125.142.128, 74.125.195.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-10-08 20:08:32 (153 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]



In [None]:
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [None]:
# Directory produced by unzipping the pre-trained BERT archive.
model_dir = './uncased_L-12_H-768_A-12/'
# Checkpoint path that get_model() hands to load_stock_weights.
bert_ckpt_file = os.path.join(model_dir, "bert_model.ckpt")
# Same name as already imported in the notebook's import cell; kept so this
# cell still runs on its own.
from bert.loader import load_stock_weights
# Derive the BERT layer parameters from the checkpoint directory.
bert_params = bert.params_from_pretrained_ckpt(model_dir)

In [None]:
# Emotion labels handled by this notebook, in the same order as
# IntentDetectionData.LABEL_COLUMNS.
classes = ['anger', 'fear', 'joy', 'sadness']

In [None]:
def get_model(max_seq_len, params, num_classes=4, ckpt_file=None):
  """Build a BERT-based multi-label classifier and load pre-trained BERT weights.

  Token ids go through a ``BertModelLayer``; the output at position 0 of every
  sequence (the ``[CLS]`` token that ``IntentDetectionData`` prepends) is fed
  through Dropout -> Dense(768, tanh) -> Dropout -> Dense(num_classes, sigmoid).

  Args:
    max_seq_len: Length of the padded token-id sequences fed to the model.
    params: BERT layer parameters, passed to ``BertModelLayer.from_params``.
    num_classes: Number of sigmoid output units. Defaults to 4, the value that
      used to be hard-coded.
    ckpt_file: Checkpoint to load the BERT weights from. Defaults to the
      notebook-level ``bert_ckpt_file``.

  Returns:
    An uncompiled ``keras.models.Model`` taking ``(batch, max_seq_len)`` int32
    token ids and producing ``(batch, num_classes)`` sigmoid outputs.
  """
  if ckpt_file is None:
    # Resolved at call time so the notebook-level path stays the default.
    ckpt_file = bert_ckpt_file

  input_tensor = keras.layers.Input((max_seq_len, ), dtype='int32')
  bert_layer = bert.BertModelLayer.from_params(params, name='bert')
  bert_output = bert_layer(input_tensor)

  # Keep only the first position of every sequence as the sentence summary.
  x = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
  x = keras.layers.Dropout(0.5)(x)
  x = keras.layers.Dense(768, activation='tanh')(x)
  x = keras.layers.Dropout(0.5)(x)
  # Independent sigmoid per class: an example may carry zero or several labels.
  x = keras.layers.Dense(num_classes, activation='sigmoid')(x)

  model = keras.models.Model(input_tensor, x)
  model.build(input_shape=(None, max_seq_len))
  # The pre-trained weights are loaded only after the model has been built.
  load_stock_weights(bert_layer, ckpt_file)

  return model

In [None]:
from bert.tokenization.bert_tokenization import FullTokenizer
# Take the vocabulary from the same directory as the checkpoint (model_dir) so
# the tokenizer and the loaded weights cannot drift apart.
tokenizer = FullTokenizer(os.path.join(model_dir, 'vocab.txt'))

In [None]:
# Smoke-test the tokenizer: split a sample sentence into tokens ...
tokens = tokenizer.tokenize('hello bert! hello world!')
print(tokens)
# ... and map each token to its vocabulary id.
sequence = tokenizer.convert_tokens_to_ids(tokens)
print(sequence)

['hello', 'bert', '!', 'hello', 'world', '!']
[7592, 14324, 999, 7592, 2088, 999]


In [None]:
# Expose the cleaned text under the column name IntentDetectionData reads
# (DATA_COLUMN = "text").
train = train.rename(columns={'cleaned_processed':'text'})

In [None]:
class IntentDetectionData:
  """Turn a text DataFrame into fixed-length BERT inputs plus label arrays.

  After construction:
    * ``train_x`` is a 2-D array of token ids, one row per example, every row
      exactly ``max_seq_len`` long (zero-padded, or truncated with the closing
      token kept).
    * ``train_y`` is an int32 array with the ``LABEL_COLUMNS`` values.
    * ``max_seq_len`` is the smaller of the requested limit and the longest
      tokenised example (``[CLS]`` and ``[SEP]`` included).
  """
  # Column holding the raw text of each example.
  DATA_COLUMN = "text"
  # Columns holding the per-class labels, in output order.
  LABEL_COLUMNS = ['anger', 'fear', 'joy', 'sadness']

  def __init__(self, train, tokenizer: "FullTokenizer", max_seq_len):
    """Tokenise ``train`` and build the padded inputs and labels.

    Args:
      train: DataFrame containing ``DATA_COLUMN`` and every ``LABEL_COLUMNS``.
      tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
      max_seq_len: Upper bound on the sequence length of ``train_x``.
    """
    self.tokenizer = tokenizer
    # _prepare raises this to the longest tokenised example it sees.
    self.max_seq_len = 0
    self.train_x, self.train_y = self._prepare(train)
    # Never pad beyond the longest example, never exceed the caller's limit.
    self.max_seq_len = min(self.max_seq_len, max_seq_len)
    self.train_x = self._pad(self.train_x)
    self.train_y = self.train_y.astype('int32')

  def _prepare(self, df):
    """Tokenise every text in ``df``.

    Returns:
      ``(token_id_lists, labels)``: a list with one variable-length list of
      token ids per row (wrapped in ``[CLS]`` ... ``[SEP]``) and a 2-D array of
      the label columns. Updates ``self.max_seq_len`` as a side effect.
    """
    token_id_lists = []
    # Iterate over the text column only; the labels are taken in one go below.
    for text in tqdm(df[self.DATA_COLUMN]):
      tokens = ["[CLS]"] + self.tokenizer.tokenize(text) + ["[SEP]"]
      token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
      self.max_seq_len = max(self.max_seq_len, len(token_ids))
      token_id_lists.append(token_ids)

    # Sequences have different lengths, so they stay a plain list here; they
    # become a rectangular array only after _pad.
    labels = df[self.LABEL_COLUMNS].to_numpy()
    return token_id_lists, labels

  def _pad(self, ids):
    """Return ``ids`` as a 2-D array whose rows are all ``max_seq_len`` long.

    Shorter sequences are right-padded with 0. Longer sequences are cut to
    ``max_seq_len`` while keeping their final id, so the ``[SEP]`` appended by
    ``_prepare`` survives truncation. Sequences that already fit are left
    untouched.
    """
    max_len = self.max_seq_len
    padded = []
    for input_ids in ids:
      input_ids = list(input_ids)
      if len(input_ids) > max_len:
        # [CLS]/[SEP] are already part of the sequence, so no extra room is
        # reserved: keep the head and re-attach the closing id.
        input_ids = (input_ids[:max_len - 1] + input_ids[-1:]) if max_len > 0 else []
      input_ids = input_ids + [0] * (max_len - len(input_ids))
      padded.append(input_ids)
    return np.array(padded)

In [None]:
# Rough length of each text: number of pieces after splitting on single spaces
# (not the number of BERT word-piece tokens).
train['text_len'] = train['text'].apply(lambda x: len(x.split(' ')))

In [None]:
# Inspect the shortest and longest texts to choose a length cut-off.
train.text_len.sort_values()

308960      1
198631      1
304858      1
164110      1
219996      1
         ... 
110501     80
224855     82
225137    101
78205     102
78768     180
Name: text_len, Length: 460551, dtype: int64

In [None]:
# Drop the few very long texts (100 or more space-separated pieces).
# Note: this threshold counts whitespace-split words, while the max_seq_len
# passed to IntentDetectionData counts tokenizer output plus [CLS]/[SEP].
train = train[train.text_len < 100]

In [None]:
# Tokenise and pad the first 100,000 rows, capping sequences at 100 token ids.
data = IntentDetectionData(train[:100000], tokenizer, 100)

100000it [01:24, 1188.07it/s]


In [None]:
# Persist the padded token-id matrix so tokenisation need not be repeated.
np.save('train_x_100000.npy', data.train_x)

In [None]:
# Persist the matching int32 label matrix next to the inputs.
np.save('train_y_100000.npy', data.train_y)

In [None]:
# Sequence length actually used: min(longest tokenised example, requested cap).
data.max_seq_len

91

In [None]:
# Build the classifier for the sequence length found above; get_model also
# loads the pre-trained BERT weights (see the loader log below).
model = get_model(data.max_seq_len, bert_params)

Done loading 196 BERT weights from: ./uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fbf414dacc0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


In [None]:
# Write the weights to local disk every time validation accuracy improves.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='goemotions_bert_weights_100000.h5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True
)
# Stop once validation accuracy has not improved for 2 epochs. With
# restore_best_weights the model is rolled back to its best epoch when that
# happens, so weights saved from `model` afterwards are not the degraded
# last-epoch ones.
early_stop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode='max',
    patience=2,
    restore_best_weights=True
)


In [None]:
# Multi-label setup: one sigmoid per class trained with binary cross-entropy.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=5e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Layer-by-layer overview of the compiled model before training.
model.summary()

In [None]:
# Fine-tune for up to 5 epochs, holding out 30% of the samples for validation;
# the callbacks checkpoint the best weights and stop early on a plateau.
history = model.fit(
    data.train_x,
    data.train_y,
    validation_split=0.3,
    batch_size=32,
    shuffle=True,
    epochs=5,
    callbacks=[model_checkpoint_callback, early_stop_callback],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Same overview again after training (the architecture itself is unchanged).
model.summary()

In [None]:
# Copy the weights currently held by `model` to Google Drive.
# Note: this file name ('goemotion_...') differs from the ModelCheckpoint
# target ('goemotions_...'), so the two files are distinct.
model.save_weights('/content/drive/My Drive/goemotion_bert_weights_100000.h5', overwrite=True)

In [None]:
# Rebuild the architecture from scratch, then overwrite its weights with the
# fine-tuned ones saved to Drive to check that they round-trip.
model = get_model(data.max_seq_len, bert_params)
model.load_weights('/content/drive/My Drive/goemotion_bert_weights_100000.h5')

Done loading 196 BERT weights from: ./uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fbf3596d748> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


In [None]:
# Sanity check: per-class sigmoid outputs for the first 10 training rows
# (compare with the true labels shown in the next cell).
model.predict(data.train_x[0:10])

array([[2.3150997e-05, 3.3076099e-05, 9.9887902e-01, 6.3355765e-05],
       [2.3511145e-02, 1.3643566e-02, 2.2761660e-02, 3.7019197e-02],
       [9.9998462e-01, 3.8386494e-05, 4.5079380e-04, 7.9335179e-05],
       [1.0806235e-03, 7.5574510e-04, 3.7288973e-01, 1.1009743e-02],
       [1.4841853e-03, 7.0907612e-04, 5.7680482e-01, 9.2981055e-02],
       [2.1988984e-05, 1.0045726e-05, 6.1871469e-05, 9.9992609e-01],
       [2.2278580e-05, 1.0310466e-05, 5.9951872e-05, 9.9992573e-01],
       [2.9060957e-05, 1.8187677e-05, 9.9562621e-01, 7.4902920e-05],
       [5.3904983e-05, 9.9994278e-01, 6.6810178e-05, 3.7717000e-05],
       [2.2593871e-05, 3.1888558e-05, 9.9877602e-01, 5.2689793e-05]],
      dtype=float32)

In [None]:
# True labels for the same 10 rows; columns follow LABEL_COLUMNS order.
data.train_y[0:10]

array([[0, 0, 1, 0],
       [0, 0, 0, 0],
       [1, 0, 0, 0],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 1, 0, 0],
       [0, 0, 1, 0]], dtype=int32)