In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

import time

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import IPython.display as ipd

from scipy.io import wavfile as wav

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint,LearningRateScheduler
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.utils import pad_sequences


# --- Reproducibility: fix every random number generator used in this notebook ---
import os
import random

import numpy as np
import tensorflow as tf

SEED = 42

# Environment flag requesting deterministic cuDNN behaviour from TensorFlow.
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

# Seed, in turn, the Python, NumPy and TensorFlow generators with the same value.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

: 

# Read Train and Test data

In [None]:
# Load the financial-news data, which is already split on disk.
# Training split: features (X) and labels (y).
fnews_Xtrain = pd.read_csv('data/train/fnews_Xtrain.csv')
fnews_ytrain = pd.read_csv('data/train/fnews_ytrain.csv')
# Test split: features (X) and labels (y).
fnews_Xtest = pd.read_csv('data/test/fnews_Xtest.csv')
fnews_ytest = pd.read_csv('data/test/fnews_ytest.csv')

In [None]:
# Drop the leading column of every train/test frame (presumably the index that
# was written out with the CSV -- verify against the files).
# The column-count guard makes this cell safe to re-run: once a frame is down to
# its single data column, running the cell again no longer deletes that column.
for frame in (fnews_Xtrain, fnews_ytrain, fnews_Xtest, fnews_ytest):
    if frame.shape[1] > 1:
        frame.drop(columns=frame.columns[0], inplace=True)

In [None]:
# Sanity check: within each split the features and labels should have the same number of rows.
print('Xtrain:', fnews_Xtrain.shape, 'ytrain:', fnews_ytrain.shape)
print('Xtest:', fnews_Xtest.shape, 'ytest:', fnews_ytest.shape)

Xtrain: (3876, 1) ytrain: (3876, 1)
Xtest: (970, 1) ytest: (970, 1)


In [None]:
# Peek at the first news item as it was read from the CSV.
fnews_Xtrain.loc[0, 'News']

"['russia', 'raisio', 's', 'food', 'division', 's', 'home', 'market', 'stretch', 'way', 'vladivostok']"

In [None]:
# The token list is still stored as its string representation at this point.
type(fnews_Xtrain.loc[0, "News"])

str

In [None]:
# Preview the first five rows of the training features.
fnews_Xtrain.head(n=5)

Unnamed: 0,News
0,"['russia', 'raisio', 's', 'food', 'division', ..."
1,"['operator', 'need', 'learn', 'use', 'device',..."
2,"['company', 'expects', 'net', 'sale', 'half', ..."
3,"['bridge', 'km', 'long', 'located', 'anasmotet..."
4,"['nokia', 'capcom', 'announced', 'resident', '..."


In [None]:
import ast


def _parse_token_list(cell):
    """Turn the string form of a token list (e.g. "['a', 'b']") into a real list.

    ``ast.literal_eval`` only accepts Python literals, so -- unlike ``eval`` --
    it cannot execute arbitrary code that might be embedded in the CSV.
    Values that are not strings (already parsed) are returned unchanged, which
    keeps this cell safe to re-run.
    """
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


fnews_Xtrain['News'] = fnews_Xtrain['News'].apply(_parse_token_list)
fnews_Xtest['News'] = fnews_Xtest['News'].apply(_parse_token_list)

# BERT Model

BERT is a pre-trained Transformer encoder stack. We will use both the base and the large versions from Hugging Face and choose the best-performing model.

In [None]:
# Build the sentence list: BERT expects the special tokens [CLS] and [SEP]
# at the beginning and end of every sentence.
sentences = [["[CLS]"] + tokens + ["[SEP]"] for tokens in fnews_Xtrain['News']]

# Training labels as an array, taken straight from the label frame.
labels = fnews_ytrain.values

BERT requires specifically formatted inputs. For each tokenized input sentence, we need to create:
- input ids: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary
- segment mask: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence
- attention mask: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (we'll detail this in the next paragraph)
- labels: a single integer per sentence encoding its sentiment class. In our task 0 means "neutral", 1 means "positive" and -1 means "negative"

Although we can have variable-length input sentences, BERT does require our input arrays to be the same size. We address this by first choosing a maximum sentence length, and then padding and truncating our inputs until every input sequence is of the same length.

To "pad" our inputs in this context means that if a sentence is shorter than the maximum sentence length, we simply add 0s to the end of the sequence until it is the maximum sentence length.

If a sentence is longer than the maximum sentence length, then we simply truncate the end of the sequence, discarding anything that does not fit into our maximum sentence length.

We pad and truncate our sequences so that they all have length MAX_LEN ("post" indicates that we want to pad and truncate at the end of the sequence, as opposed to the beginning). pad_sequences is a utility function that we're borrowing from Keras. It simply handles the truncating and padding of Python lists.


In [None]:
# transformers does not export a `TFBertConfig`: BERT's configuration class is the
# framework-agnostic `BertConfig`, shared by the PyTorch and TensorFlow models.
from transformers import BertConfig, BertTokenizer, TFBertModel

# Keep the name this cell originally tried to import available as an alias.
TFBertConfig = BertConfig

In [None]:
MAX_LEN = 128


def create_attention_masks(df, max_len=MAX_LEN):
    """Build padded BERT input ids and attention masks for a set of sentences.

    Parameters
    ----------
    df : iterable of list of str
        One token list per sentence (for example ``fnews_Xtrain['News']``).
    max_len : int, default MAX_LEN
        Length every id sequence is padded or truncated to (at the end).

    Returns
    -------
    input_ids : array returned by ``pad_sequences``, one row of ``max_len`` ids per sentence
    attention_masks : list of list of float
        1.0 where the id is greater than 0, 0.0 where it is 0 (the padding value).
    """
    # BERT expects the special tokens [CLS] and [SEP] at the start and end of each sentence.
    wrapped_sentences = [["[CLS]"] + sentence + ["[SEP]"] for sentence in df]

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    # Re-tokenize according to the BERT vocabulary. This must iterate over the
    # sentences passed in -- not the notebook-level `sentences` list -- otherwise
    # every call would encode the training data regardless of its argument.
    tokenized_texts = [tokenizer.tokenize(" ".join(sent)) for sent in wrapped_sentences]
    input_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=max_len, dtype="long", truncating="post", padding="post")

    # Mask of 1s for each token followed by 0s for padding.
    attention_masks = (input_ids > 0).astype(float).tolist()

    return input_ids, attention_masks

In [None]:
# mapping the classes to a numeric representation
train_labels = fnews_ytrain['Class'].map({'neutral':0, 'positive': 1, 'negative': -1}).values
test_labels = fnews_ytest['Class'].map({'neutral':0, 'positive': 1, 'negative': -1}).values

In [None]:
train_input_ids, train_attention_masks = create_attention_masks(fnews_Xtrain['News'])
test_input_ids, test_attention_masks = create_attention_masks(fnews_Xtest['News'])

# The masks come back as nested lists; turn them into arrays so that they can be
# split alongside the ids and expose `.shape` for the report below.
train_attention_masks = np.asarray(train_attention_masks)
test_attention_masks = np.asarray(test_attention_masks)

# Hold out 20% of the training data for validation. `test_size` has to be passed
# by keyword: every positional argument of train_test_split is treated as an
# array to split. `random_state` pins the split independently of the global RNG state.
(ttrain_input_ids, val_input_ids,
 ttrain_attention_masks, val_attention_masks,
 ttrain_labels, val_labels) = train_test_split(
    train_input_ids, train_attention_masks, train_labels,
    test_size=0.2, random_state=SEED)

print('Train shape: ', ttrain_input_ids.shape, ttrain_attention_masks.shape)
print('Val shape: ', val_input_ids.shape, val_attention_masks.shape)
print('Test shape: ', test_input_ids.shape, test_attention_masks.shape)

In [None]:
# Convert every split to TensorFlow tensors (`tf.tensor` does not exist;
# `tf.convert_to_tensor` is the conversion function).
# The training inputs are taken from the split `ttrain_input_ids`, so that they
# line up with `ttrain_labels` and `ttrain_attention_masks`.
ttrain_inputs = tf.convert_to_tensor(ttrain_input_ids)
val_inputs = tf.convert_to_tensor(val_input_ids)
test_inputs = tf.convert_to_tensor(test_input_ids)
ttrain_labels = tf.convert_to_tensor(ttrain_labels)
val_labels = tf.convert_to_tensor(val_labels)
test_labels = tf.convert_to_tensor(test_labels)
ttrain_masks = tf.convert_to_tensor(ttrain_attention_masks)
val_masks = tf.convert_to_tensor(val_attention_masks)
test_masks = tf.convert_to_tensor(test_attention_masks)

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

UsageError: Line magic function `%` not found.


In [43]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32


def _to_torch(values):
    """Return `values` as a torch tensor, converting through NumPy first.

    TensorDataset only accepts torch tensors, while the previous cells produce
    NumPy arrays / TensorFlow tensors.
    """
    return torch.as_tensor(np.asarray(values))


# Wrap the data in torch DataLoaders so that training and evaluation can iterate
# over it one mini-batch at a time. Both datasets use the same field order:
# (input ids, attention masks, labels).

# Training data: the training part of the train/validation split, sampled in random order.
train_data = TensorDataset(_to_torch(ttrain_inputs), _to_torch(ttrain_masks), _to_torch(ttrain_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Test data: iterated sequentially.
test_data = TensorDataset(_to_torch(test_inputs), _to_torch(test_masks), _to_torch(test_labels))
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

Now that our input data is properly formatted, it's time to fine tune the BERT model.

For this task, we first want to modify the pre-trained BERT model to give outputs for classification, and then we want to continue training the model on our dataset until the entire model, end-to-end, is well-suited for our task. Thankfully, the Hugging Face PyTorch implementation includes a set of interfaces designed for a variety of NLP tasks. Though these interfaces are all built on top of a trained BERT model, each has different top layers and output types designed to accommodate their specific NLP task.

We'll load BertForSequenceClassification. This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier. As we feed input data, the entire pre-trained BERT model and the additional untrained classification layer is trained on our specific task.
### Structure of Fine-Tuning Model

As we've shown beforehand, the first token of every sequence is the special classification token ([CLS]). Unlike the hidden state vector corresponding to a normal word token, the hidden state corresponding to this special token is designated by the authors of BERT as an aggregate representation of the whole sentence used for classification tasks. As such, when we feed in an input sentence to our model during training, the output is the length 768 hidden state vector corresponding to this token. The additional layer that we've added on top consists of untrained linear neurons of size [hidden_state, number_of_labels], so [768, 3] for our three sentiment classes, meaning that the output of BERT plus our classification layer is a vector of three numbers representing the "scores" for "negative/neutral/positive" that are then fed into cross-entropy loss.
### The Fine-Tuning Process

Because the pre-trained BERT layers already encode a lot of information about the language, training the classifier is relatively inexpensive. Rather than training every layer in a large model from scratch, it's as if we have already trained the bottom layers 95% of where they need to be, and only really need to train the top layer, with a bit of tweaking going on in the lower levels to accommodate our task.

Sometimes practitioners will opt to "freeze" certain layers when fine-tuning, or to apply different learning rates, apply diminishing learning rates, etc., all in an effort to preserve the good quality weights in the network and speed up training (often considerably). In fact, recent research on BERT specifically has demonstrated that freezing the majority of the weights results in only minimal accuracy declines, but there are exceptions and broader rules of transfer learning that should also be considered. For example, if your task and fine-tuning dataset is very different from the dataset used to train the transfer learning model, freezing the weights may not be a good idea. We'll cover the broader scope of transfer learning in NLP in a future post.

## BERT Hyperparameter Tuning

In [None]:
# NOTE(review): this cell reads the `aclImdb` directories rather than the
# financial-news frames loaded above -- confirm that this is intended.
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

# Training part of an 80/20 split of the training directory. The validation
# call below uses the same `validation_split` and `seed`.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)

# Class names are read from the raw dataset, before it is wrapped by cache/prefetch.
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Validation part of the same split, cached and prefetched.
val_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/train',
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
).cache().prefetch(buffer_size=AUTOTUNE)

# Test data comes from its own directory, so no split arguments are passed.
test_ds = tf.keras.utils.text_dataset_from_directory(
    'aclImdb/test',
    batch_size=batch_size,
).cache().prefetch(buffer_size=AUTOTUNE)
