In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

import time

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import IPython.display as ipd

from scipy.io import wavfile as wav

from sklearn import preprocessing
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score
import tensorflow as tf
# import tensorflow_hub as hub
# import tensorflow_text as text
# from official.nlp import optimization  # to create AdamW optimizer
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau,EarlyStopping,ModelCheckpoint,LearningRateScheduler
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.utils import pad_sequences


SEED = 42

import os
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

import random 
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import tensorflow as tf
tf.random.set_seed(SEED)

# Read Train and Test data

In [4]:
#import the train and test files for financial news
fnews_Xtrain = pd.read_csv('data/train/fnews_Xtrain.csv')
fnews_Xtest = pd.read_csv('data/test/fnews_Xtest.csv')
fnews_ytrain = pd.read_csv('data/train/fnews_ytrain.csv')
fnews_ytest = pd.read_csv('data/test/fnews_ytest.csv')

In [20]:
fnews_Xtrain.shape

(3876, 1)

In [19]:
fnews_Xtest.shape

(970, 1)

In [5]:
#drop the first column for all the train and test sets
fnews_Xtrain.drop(columns=fnews_Xtrain.columns[0], axis=1, inplace=True)
fnews_ytrain.drop(columns=fnews_ytrain.columns[0], axis=1, inplace=True)
fnews_Xtest.drop(columns=fnews_Xtest.columns[0], axis=1, inplace=True)
fnews_ytest.drop(columns=fnews_ytest.columns[0], axis=1, inplace=True)

In [6]:
#check the respective shape of the train and test sets
print('Xtrain:',fnews_Xtrain.shape, 'ytrain:' ,fnews_ytrain.shape)
print('Xtest:',fnews_Xtest.shape, 'ytest:' ,fnews_ytest.shape)

Xtrain: (3876, 1) ytrain: (3876, 1)
Xtest: (970, 1) ytest: (970, 1)


In [7]:
fnews_Xtrain['News'][0]

"['russia', 'raisio', 's', 'food', 'division', 's', 'home', 'market', 'stretch', 'way', 'vladivostok']"

In [8]:
type(fnews_Xtrain["News"][0])

str

In [9]:
fnews_Xtrain.head()

Unnamed: 0,News
0,"['russia', 'raisio', 's', 'food', 'division', ..."
1,"['operator', 'need', 'learn', 'use', 'device',..."
2,"['company', 'expects', 'net', 'sale', 'half', ..."
3,"['bridge', 'km', 'long', 'located', 'anasmotet..."
4,"['nokia', 'capcom', 'announced', 'resident', '..."


In [10]:
fnews_Xtrain['News'] = fnews_Xtrain['News'].apply(eval)
fnews_Xtest['News'] = fnews_Xtest['News'].apply(eval)

# BERT Model

BERT is a trained Transformer Encoder stack. We will be using both the base and the large version from huggingface, and choose the best model.

BERT requires specifically formatted inputs. For each tokenized input sentence, we need to create:
- input ids: a sequence of integers identifying each input token to its index number in the BERT tokenizer vocabulary
- segment mask: (optional) a sequence of 1s and 0s used to identify whether the input is one sentence or two sentences long. For one sentence inputs, this is simply a sequence of 0s. For two sentence inputs, there is a 0 for each token of the first sentence, followed by a 1 for each token of the second sentence
- attention mask: (optional) a sequence of 1s and 0s, with 1s for all input tokens and 0s for all padding tokens (we'll detail this in the next paragraph)
- labels: a single value of 1 or 0. In our task 1 means "grammatical" and 0 means "ungrammatical"

Although we can have variable length input sentences, BERT does requires our input arrays to be the same size. We address this by first choosing a maximum sentence length, and then padding and truncating our inputs until every input sequence is of the same length.

To "pad" our inputs in this context means that if a sentence is shorter than the maximum sentence length, we simply add 0s to the end of the sequence until it is the maximum sentence length.

If a sentence is longer than the maximum sentence length, then we simply truncate the end of the sequence, discarding anything that does not fit into our maximum sentence length.

We pad and truncate our sequences so that they all become of length MAX_LEN ("post" indicates that we want to pad and truncate at the end of the sequence, as opposed to the beginning) pad_sequences is a utility function that we're borrowing from Keras. It simply handles the truncating and padding of Python lists.


In [13]:
from transformers import BertConfig, TFBertModel, BertTokenizer

In [21]:
MAX_LEN = 128
def create_attention_masks(df):
    # We need to add special tokens at the beginning and end of each sentence for BERT to work properly
    sentences = [["[CLS]"] + sentence + ["[SEP]"] for sentence in df]
    
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    tokenized_texts = [tokenizer.tokenize(" ".join(sent)) for sent in sentences] # retokensized it according to BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    
    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        train_seq_mask = [float(i>0) for i in seq]
        attention_masks.append(train_seq_mask)

    return input_ids, attention_masks

In [22]:
# mapping the classes to a numeric representation
train_labels = fnews_ytrain['Class'].map({'neutral':0, 'positive': 1, 'negative': -1}).values
test_labels = fnews_ytest['Class'].map({'neutral':0, 'positive': 1, 'negative': -1}).values

In [46]:
train_input_ids, train_attention_masks = create_attention_masks(fnews_Xtrain['News'])
test_input_ids, test_attention_masks = create_attention_masks(fnews_Xtest['News'])
ttrain_input_ids, val_input_ids, ttrain_attention_masks, val_attention_masks, ttrain_labels, val_labels  = train_test_split(train_input_ids, train_attention_masks, train_labels, test_size=0.2)
print('Train shape: ', len(ttrain_input_ids), len(ttrain_attention_masks), len(ttrain_labels))
print('Val shape: ', len(val_input_ids), len(val_attention_masks), len(val_labels))
print('Test shape: ', len(test_input_ids), len(test_attention_masks), len(test_labels))

Train shape:  3100 3100 3100
Val shape:  776 776 776
Test shape:  970 970 970


In [47]:
ttrain_inputs = tf.convert_to_tensor(train_input_ids, dtype=tf.float64)
val_inputs = tf.convert_to_tensor(val_input_ids, dtype=tf.float64)
test_inputs = tf.convert_to_tensor(test_input_ids, dtype=tf.float64)
ttrain_labels = tf.convert_to_tensor(ttrain_labels, dtype=tf.float64)
val_labels = tf.convert_to_tensor(val_labels, dtype=tf.float64)
test_labels = tf.convert_to_tensor(test_labels, dtype=tf.float64)
ttrain_masks = tf.convert_to_tensor(ttrain_attention_masks, dtype=tf.float64)
val_masks = tf.convert_to_tensor(val_attention_masks, dtype=tf.float64)
test_masks = tf.convert_to_tensor(test_attention_masks, dtype=tf.float64)

# First Model: BERT with 1 layer unfrozen

In [52]:
def create_model():
    input_ids = tf.keras.layers.Input((3100,), dtype=tf.float64)
    attention_masks = tf.keras.layers.Input((3100,), dtype=tf.float64)

    config = BertConfig() 
    config.output_hidden_states = False 
    bert_model = TFBertModel.from_pretrained('bert-base-uncased',
                                             config=config)

    x = tf.keras.layers.Concatenate()([input_ids, attention_masks])
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(30, activation='sigmoid')(x)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], 
                                  outputs=x)
    return model, bert_model

In [53]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
ttrain_inputs_scaled = scaler.fit_transform(ttrain_inputs)
val_inputs_scaled = scaler.transform(val_inputs)
test_inputs_scaled = scaler.transform(test_inputs)

In [54]:
epochs = 3
batch_size = 8
patients = 3
learning_rate = 3e-5
n_validate_per_epoch = 5
model, _ = create_model()
optimizer = tf.keras.optimizers.Adam(learning_rate)
model.compile(loss='binary_crossentropy', optimizer=optimizer)
init_weights = model.get_weights()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [55]:
test_preds = []
model.set_weights(init_weights)
model.fit(ttrain_inputs_scaled, ttrain_labels,
                validation_data=(val_inputs_scaled, val_labels),
                epochs=epochs,
                batch_size=batch_size,)
        
t_preds = model.predict(test_inputs, batch_size=batch_size)

ValueError: Data cardinality is ambiguous:
  x sizes: 3876
  y sizes: 3100
Make sure all arrays contain the same number of samples.