## Classify text with BERT 

Reading file

In [1]:
import pandas as pd
import numpy as np

import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)


C:\Users\User\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\User\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
C:\Users\User\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\.libs\libopenblas.QVLO2T66WEPI7JZ63PS3HMOHFEY472BC.gfortran-win_amd64.dll
  stacklevel=1)
 The versions of TensorFlow you are currently using is 2.6.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
sent_train = pd.read_csv(
    r"C:\Users\User\Documents\C2001\FIT3161_3162\FIT3161\dataset\sentiment\mbsa.csv",
).dropna()

sent_train.head()

Unnamed: 0,Date,text,Sentiment
0,2019-05-27,È appena uscito un nuovo video! LES CRYPTOMONN...,Positive
1,2019-05-27,Cardano: Digitize Currencies; EOS https://t.co...,Positive
2,2019-05-27,Another Test tweet that wasn't caught in the s...,Positive
3,2019-05-27,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,Positive
4,2019-05-27,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,Positive


In [3]:
sent_train['Sentiment'] = sent_train['Sentiment'].replace("Positive", 1, regex=True)
sent_train['Sentiment'] = sent_train['Sentiment'].replace("Negative", 0, regex=True)


In [4]:
sent_train.head()

Unnamed: 0,Date,text,Sentiment
0,2019-05-27,È appena uscito un nuovo video! LES CRYPTOMONN...,1
1,2019-05-27,Cardano: Digitize Currencies; EOS https://t.co...,1
2,2019-05-27,Another Test tweet that wasn't caught in the s...,1
3,2019-05-27,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...,1
4,2019-05-27,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...,1


In [5]:
sent_train = sent_train.replace(r'\n',' ', regex=True) 
sent_train = sent_train.replace(r'\t',' ', regex=True) 

In [6]:
# text = np.asarray(sent_train['text'])
sent_train = sent_train.drop(columns=['Date'])

### Split the dataframe into train, validation and test

In [7]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(sent_train, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

11809858 train examples
2952465 validation examples
3690581 test examples


### Create an input pipeline using tf.data

In [8]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('Sentiment')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [9]:
batch_size = 5
train_ds = df_to_dataset(train, batch_size=batch_size)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

### Creating a BERT Tokenizer

In [None]:
preprocessor = hub.load(
    "https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
# bert_preprocess_model = hub.KerasLayer(preprocessor)

In [None]:
encoder_train = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4",
    trainable=True)

In [None]:
text_test = [text[10]]
text_preprocessed = preprocessor(text_test)

print(f'Keys       : {list(text_preprocessed.keys())}')
print(f'Shape      : {text_preprocessed["input_word_ids"].shape}')
print(f'Word Ids   : {text_preprocessed["input_word_ids"][0, :12]}')
print(f'Input Mask : {text_preprocessed["input_mask"][0, :12]}')
print(f'Type Ids   : {text_preprocessed["input_type_ids"][0, :12]}')

### Using the BERT model

In [None]:
bert_model = hub.KerasLayer(encoder_train)

In [None]:
# define model
def build_classifier_model():
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
  preprocessing_layer = hub.KerasLayer(preprocessor, name='preprocessing')
  encoder_inputs = preprocessing_layer(text_input)
  encoder = hub.KerasLayer(encoder_train, trainable=True, name='BERT_encoder')
  outputs = encoder(encoder_inputs)
  net = outputs['pooled_output']
  net = tf.keras.layers.Dropout(0.1)(net)
  net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
  return tf.keras.Model(text_input, net)

In [None]:
classifier_model = build_classifier_model()

## TRAINING

In [None]:
# loss function
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

In [None]:
epochs = 5
steps_per_epoch = 100
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

In [None]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [None]:
text_train_preprocessed = bert_preprocess_model(text)

In [None]:
print(f'Training model with {encoder_train}')
history = classifier_model.fit(text_train_preprocessed, y,
                               epochs=epochs)