## Fine-tuning BERT base and ESG-BERT models

Note: This notebook was run on Google Colab Pro+ with a high-RAM GPU runtime. Running it therefore requires installing the packages below and mounting Google Drive (`MyDrive`).

## Set-up

In [None]:
# transformers supplies the BertModel / BertTokenizer classes imported below
!pip install transformers
# pickle5 is imported below under the name `pickle` and used to read the data splits
# NOTE(review): presumably only needed on Python versions whose built-in pickle lacks protocol 5 -- confirm
!pip3 install pickle5

### Import modules

In [None]:
import pickle
import numpy as np
import pandas as pd
import torch
import warnings
import seaborn as sns
import random
import pickle5 as pickle
from collections import defaultdict, Counter
from string import punctuation
from matplotlib import pyplot as plt
from nltk.util import bigrams
from tqdm import tqdm
from itertools import product

from sklearn.feature_extraction import _stop_words
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score
from sklearn.utils import shuffle

from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertTokenizer

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Mount Google Drive; every data/model path below lives under /content/drive/MyDrive
from google.colab import drive
drive.mount('/content/drive')

### Ensure reproducibility by setting seed

In [None]:
# One seed shared by every random number generator used in this notebook
seed = 42

# Python's built-in RNG and NumPy's global RNG
random.seed(seed)
np.random.seed(seed)

# PyTorch RNGs (CPU, plus every CUDA device when a GPU is present)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Ask cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic = True

### Get variables & data

In [None]:
# Read the pickled train / validation / test splits from the mounted Drive.
# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
with open('/content/drive/MyDrive/train.pkl', "rb") as train_file:
    train_data = pickle.load(train_file)
with open('/content/drive/MyDrive/val.pkl', "rb") as val_file:
    val_data = pickle.load(val_file)
with open('/content/drive/MyDrive/test.pkl', "rb") as test_file:
    test_data = pickle.load(test_file)

# Wrap each loaded object in a DataFrame
train, val, test = (pd.DataFrame(raw) for raw in (train_data, val_data, test_data))

In [None]:
def get_augmented_dataset(df):
    """Return ``df`` extended with its back-translated rows, in shuffled order.

    Rows of the module-level ``data_for_augmentation`` frame whose index label
    also occurs in ``df`` are appended to ``df``; the combined frame is then
    shuffled with a fixed ``random_state`` of 42.
    """
    # Index labels that exist in both the back-translation table and ``df``
    augmentation_labels = data_for_augmentation.index.to_list()
    original_labels = df.index.to_list()
    shared_labels = list(set(augmentation_labels) & set(original_labels))

    # Append the matching back-translations and shuffle the result
    extra_rows = data_for_augmentation.loc[shared_labels]
    combined = pd.concat([df, extra_rows])
    return shuffle(combined, random_state=42)

# Get translated paragraphs
translated_data = pd.read_excel('/content/drive/MyDrive/Back translation with GT.xlsx')

# Keep only the back-translations that differ from their original paragraph,
# expose them under the same 'Paragraph' column name as the training data and
# index them by 'Index' so they can be matched against the training rows.
# The steps are chained (each one returns a new frame) instead of being applied
# with inplace=True to a filtered selection, which can trigger pandas'
# SettingWithCopyWarning.
data_for_augmentation = (
    translated_data[translated_data['Exact match'] == 'No']
    .rename(columns={'Back-translation': 'Paragraph'})
    .drop(columns=['Original Paragraph', 'Exact match'])
    .set_index('Index')
)

# Create augmented train dataset
augmented_train = get_augmented_dataset(train)

### Clean text

In [None]:
# Define function to clean text
def clean(text):
    """Split ``text`` on whitespace and strip punctuation from both ends of each word.

    Words made up solely of punctuation are dropped; punctuation inside a word
    is kept. Returns the remaining words as a list.
    """
    stripped_words = (word.strip(punctuation) for word in text.split())
    return [word for word in stripped_words if word]

In [None]:
# Replace every paragraph string with its list of cleaned words, split by split
for split in (train, val, test, augmented_train):
    split['Paragraph'] = split['Paragraph'].apply(clean)

### Define classes

In [None]:
# Define dataset class
class BERTDataset(Dataset):
    """Token-id sequences and labels for fine-tuning a BERT classifier.

    Each item is a ``(token_ids, label)`` pair, where ``token_ids`` is the
    list produced by the tokenizer (truncated to at most 512 ids).
    """

    def __init__(self, data, label_col, hf_path):
        """Tokenize the 'Paragraph' column of ``data`` and store the labels.

        Args:
            data: frame with a 'Paragraph' column (a string or a list of
                words per row) and a ``label_col`` column.
            label_col: name of the column holding the labels.
            hf_path: model name/path given to ``BertTokenizer.from_pretrained``.
        """
        # Initialize tokenizer
        self.tok = BertTokenizer.from_pretrained(hf_path)

        # Truncate and encode paragraphs.
        # The paragraphs arrive as lists of words. Handing such a list straight
        # to encode() makes the tokenizer treat every word as a ready-made
        # vocabulary token (no lower-casing, no WordPiece split), so words that
        # are not in the vocabulary collapse to [UNK]. Encoding the joined text
        # lets the tokenizer do its normal preprocessing.
        self.paragraphs = [
            self.tok.encode(self._as_text(paragraph), max_length=512, truncation=True)
            for paragraph in data['Paragraph']
        ]

        # Store labels
        self.labels = list(data[label_col])

    @staticmethod
    def _as_text(paragraph):
        """Return ``paragraph`` as one string; word lists are joined with spaces."""
        if isinstance(paragraph, str):
            return paragraph
        return " ".join(paragraph)

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at position ``idx``."""
        return self.paragraphs[idx], self.labels[idx]

In [None]:
# Define BERT classifier
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a two-class linear head.

    The paragraph representation is the mean of the encoder's token outputs
    over the real (non-padding) positions only.
    NOTE: pooling used to average over padding positions as well, so a model
    trained with the earlier version sees slightly different pooled inputs.
    """

    def __init__(self, hf_path, dropout_rate=0.2):
        """Load the pretrained encoder from ``hf_path`` and build the head.

        Args:
            hf_path: model name/path given to ``BertModel.from_pretrained``.
            dropout_rate: dropout probability applied before the linear layer.
        """
        # Define network layers
        super().__init__()
        self.bert = BertModel.from_pretrained(hf_path)
        # Size the head from the encoder config instead of assuming 768
        self.linear = nn.Linear(self.bert.config.hidden_size, 2)

        # Define dropout
        self.dropout = nn.Dropout(dropout_rate)

    @staticmethod
    def _masked_mean(hidden_states, masks):
        """Average ``hidden_states`` over dim 1, counting only positions where ``masks`` is 1.

        ``hidden_states`` is (batch, length, hidden) and ``masks`` is
        (batch, length). Rows whose mask is all zero yield zeros rather than NaN.
        """
        weights = masks.unsqueeze(-1).to(hidden_states.dtype)
        summed = (hidden_states * weights).sum(dim=1)
        counts = weights.sum(dim=1).clamp(min=1.0)
        return summed / counts

    def forward(self, paragraphs, masks):
        """Return the class logits for a padded batch.

        Args:
            paragraphs: padded token-id tensor passed to the encoder.
            masks: attention mask of the same shape (1 = real token, 0 = padding).
        """
        # Define flow of tensors through network
        hidden_states = self.bert(paragraphs, attention_mask=masks)[0]
        # Padding positions still carry encoder outputs, so leave them out of
        # the average; otherwise the result depends on how much padding the
        # batch happens to contain.
        pooled = self._masked_mean(hidden_states, masks)
        return self.linear(self.dropout(pooled))

### Define functions

In [None]:
# Define collate function
def bert_collate(batch):
    """Turn a list of ``(token_ids, label)`` pairs into padded batch tensors.

    Returns three long tensors: the token ids right-padded with 0 to the
    longest sequence in the batch, the matching attention mask (1 over real
    tokens, 0 over padding) and the labels.
    """
    # Split the pairs into sequences and labels
    sequences = [item[0] for item in batch]
    labels = torch.tensor([item[1] for item in batch]).long()

    # Every row is padded to the longest sequence of this batch
    shape = (len(batch), max(map(len, sequences)))
    paragraphs_pad = torch.zeros(shape).long()
    masks_pad = torch.zeros(shape).long()

    # Copy each sequence into its row and flag the copied positions in the mask,
    # so that attention is not performed on padding
    for row, sequence in enumerate(sequences):
        length = len(sequence)
        paragraphs_pad[row, :length] = torch.tensor(sequence)
        masks_pad[row, :length] = 1

    return paragraphs_pad, masks_pad, labels

In [None]:
def _evaluate(model, loader, criterion, device):
    """Run ``model`` over ``loader`` without gradients.

    Returns ``(y_true, y_pred, mean_loss)``: the labels, the arg-max
    predictions and the average per-batch loss (0.0 for an empty loader).
    """
    model.eval()
    y_true, y_pred = [], []
    total_loss = 0.0
    with torch.no_grad():
        for batch in loader:
            paragraphs, masks, lbls = [t.to(device) for t in batch]
            output = model(paragraphs, masks)
            y_true.extend(lbls.tolist())
            y_pred.extend(output.argmax(dim=1).tolist())
            total_loss += criterion(output, lbls).item()
    return y_true, y_pred, total_loss / max(len(loader), 1)


def bert_finetuning(hf_path, train, val, test, col_name, lr=1e-5, epochs=20,
                    dropout_rate=.2, validation_run=True, batch_size=16):
    """Fine-tune a BERT classifier and evaluate it.

    The model is trained with Adam and cross-entropy loss for ``epochs``
    epochs; after every epoch the validation loss and accuracy are printed.

    Args:
        hf_path: Hugging Face model name/path for tokenizer and encoder.
        train, val, test: frames with a 'Paragraph' column and ``col_name``.
        col_name: name of the label column.
        lr: learning rate for Adam.
        epochs: number of training epochs.
        dropout_rate: dropout probability of the classifier head.
        validation_run: if true, stop after training and return the
            validation results; otherwise also evaluate on ``test``.
        batch_size: batch size of all data loaders.

    Returns:
        ``(y_true, y_pred)`` from the last validation pass when
        ``validation_run`` is true, else ``(model, y_true, y_pred)`` with the
        labels and predictions of the test set.
    """
    # Create data loaders; the test split is only tokenized when it will be used
    train_loader = DataLoader(BERTDataset(train, col_name, hf_path), batch_size=batch_size,
                              collate_fn=bert_collate, shuffle=True)
    val_loader = DataLoader(BERTDataset(val, col_name, hf_path), batch_size=batch_size,
                            collate_fn=bert_collate)
    test_loader = None
    if not validation_run:
        test_loader = DataLoader(BERTDataset(test, col_name, hf_path), batch_size=batch_size,
                                 collate_fn=bert_collate)

    # Initialize model
    model = BERTClassifier(hf_path, dropout_rate=dropout_rate)

    # Define optimizer and training objective
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Define device and move model to CUDA if available
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    # Defined up front so a run with epochs=0 still returns (empty) results
    y_true, y_pred = [], []

    # Train model
    for e in range(1, epochs + 1):

        model.train()
        training_loss = 0.0

        for b in train_loader:

            # Perform forward pass
            optimizer.zero_grad()
            paragraphs, masks, lbls = [t.to(device) for t in b]
            output = model(paragraphs, masks)
            loss = criterion(output, lbls)
            # .item() detaches the value; summing the tensors themselves would
            # keep every batch's autograd graph alive for the whole epoch
            training_loss += loss.item()

            # Perform backpropagation and update weights
            loss.backward()
            optimizer.step()

        training_loss = training_loss / len(train_loader)

        # Evaluate model on development data
        y_true, y_pred, val_loss = _evaluate(model, val_loader, criterion, device)

        print(f"Epoch {e}: Training loss {training_loss:.2f}--Val loss {val_loss:.2f}--Accuracy {accuracy_score(y_true, y_pred):.2f}")

    if validation_run:
        return y_true, y_pred # from validation set

    # Evaluate model on test data
    y_true, y_pred, _ = _evaluate(model, test_loader, criterion, device)

    print('Test accuracy: {:.2f}'.format(accuracy_score(y_true, y_pred)))

    return model, y_true, y_pred # from test set

In [None]:
def get_best_parameters(hf_path, train, val, test, col_name, lrs, dropouts, augs, epochs):
    """Grid-search learning rate, dropout and augmentation on the validation set.

    Every combination of ``lrs`` x ``dropouts`` x ``augs`` is fine-tuned with
    ``bert_finetuning(..., validation_run=True)`` and scored by macro-F1 on the
    validation predictions.

    Args:
        hf_path: Hugging Face model name/path.
        train, val, test: data frames forwarded to ``bert_finetuning``.
        col_name: name of the label column.
        lrs, dropouts, augs: candidate values of each hyperparameter.
        epochs: number of epochs per run.

    Returns:
        The ``{'lr', 'dropout', 'augmented'}`` dict with the highest macro-F1
        (the first one in grid order on ties).

    Raises:
        ValueError: if the grid is empty.
    """
    f1_results = []
    for lr, dropout, augmented in tqdm(list(product(lrs, dropouts, augs))):
        # NOTE: the augmented variant is always the module-level
        # ``augmented_train``, whatever frame was passed in as ``train``
        train_df = augmented_train if augmented else train
        y_true, y_pred = bert_finetuning(hf_path, train_df, val, test,
                                         col_name, lr=lr, epochs=epochs,
                                         dropout_rate=dropout,
                                         validation_run=True)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        f1_results.append(({'lr': lr,
                            'dropout': dropout,
                            'augmented': augmented}, macro_f1))

    if not f1_results:
        raise ValueError("lrs, dropouts and augs must each contain at least one value")

    best_params, best_score = max(f1_results, key=lambda result: result[1])
    print(f"Best score: {best_score}")
    print(f"Best params: {best_params}")
    return best_params
        
def print_classification_report_heatmap(actuals, preds):
    """Print the classification report and ROC AUC, then plot the confusion matrix.

    Args:
        actuals: true labels.
        preds: predicted labels (the ROC AUC is computed from these labels).
    """
    # Per-class metrics with four decimals
    report = classification_report(actuals, preds, digits=4)
    print(report)

    # AUROC (roc_auc_score averages with 'macro' by default)
    auc = roc_auc_score(actuals, preds)
    print(f"ROC AUC score: {auc}")

    # Confusion-matrix heatmap; the matrix is transposed before plotting so the
    # x-axis carries the true labels and the y-axis the predictions
    _, axis = plt.subplots(1, 1, figsize=(5, 5))
    matrix = confusion_matrix(actuals, preds).T
    sns.heatmap(matrix, square=True, annot=True, fmt="d", ax=axis)
    axis.set_xlabel("true labels")
    axis.set_ylabel("predicted labels")
    plt.show()

def find_and_run_best_model(which_model, col_name, train, val, test, lrs, dropouts, augs, epochs):
    """Tune hyperparameters, refit the best setting, report test results and save the model.

    Args:
        which_model: 'BERT' selects 'bert-base-uncased'; any other value
            selects 'nbroad/ESG-BERT'.
        col_name: name of the label column.
        train, val, test: data frames forwarded to the training functions.
        lrs, dropouts, augs: hyperparameter grid for ``get_best_parameters``.
        epochs: number of epochs for every run.

    Returns:
        ``(y_pred, best_params)``: the test-set predictions of the refitted
        model and the winning hyperparameter dict.
    """
    # Get hf_path (path to right Hugging Face model)
    hf_path = 'bert-base-uncased' if which_model == 'BERT' else 'nbroad/ESG-BERT'

    # Find best hyperparameters
    best_params = get_best_parameters(hf_path, train, val, test, col_name,
                                      lrs, dropouts, augs, epochs)
    lr = best_params['lr']
    dropout = best_params['dropout']
    augmented = best_params['augmented']

    # The augmented variant is always the module-level ``augmented_train``;
    # models trained on it get an '-AUG' tag in their file name
    train_df = augmented_train if augmented else train
    tag = '-AUG' if augmented else ''
    path = f'/content/drive/MyDrive/{col_name}{tag}-{which_model}.pth'

    # See performance of best model on test data
    model, y_true, y_pred = bert_finetuning(hf_path, train_df, val, test,
                                            col_name, lr=lr, epochs=epochs,
                                            dropout_rate=dropout,
                                            validation_run=False)
    print_classification_report_heatmap(y_true, y_pred)

    # Save model (the whole module object is pickled, so loading it later
    # requires the BERTClassifier class to be importable)
    torch.save(model, path)

    return y_pred, best_params

## Get best models with BERT base

In [None]:
# Collects test predictions and best hyperparameters of every BERT base run
bert_dict = dict()

### Process

In [None]:
# BERT base on the 'Process_action' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Process_action', train, val, test, **search_grid)
bert_dict['Process_action'] = {'Preds': y_pred, 'Best_params': best_params}

### Process augmented

In [None]:
# BERT base on the 'Process_action' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Process_action', train, val, test, **search_grid)
bert_dict['Process_action_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Market

In [None]:
# BERT base on the 'Market_action' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Market_action', train, val, test, **search_grid)
bert_dict['Market_action'] = {'Preds': y_pred, 'Best_params': best_params}

### Market augmented

In [None]:
# BERT base on the 'Market_action' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Market_action', train, val, test, **search_grid)
bert_dict['Market_action_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Social

In [None]:
# BERT base on the 'Social' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Social', train, val, test, **search_grid)
bert_dict['Social'] = {'Preds': y_pred, 'Best_params': best_params}

### Social augmented

In [None]:
# BERT base on the 'Social' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Social', train, val, test, **search_grid)
bert_dict['Social_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Environment

In [None]:
# BERT base on the 'Environment' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Environment', train, val, test, **search_grid)
bert_dict['Environment'] = {'Preds': y_pred, 'Best_params': best_params}

### Environment augmented

In [None]:
# BERT base on the 'Environment' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'BERT', 'Environment', train, val, test, **search_grid)
bert_dict['Environment_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Save for later

In [None]:
# Pickle the BERT base results to Drive for later use
# (the file holds binary pickle data despite its .txt extension)
bert_dict_path = "/content/drive/MyDrive/bert_dict.txt"
with open(bert_dict_path, "wb") as out_file:
    pickle.dump(bert_dict, out_file)

## Get best models with ESG-BERT

In [None]:
# Note: details on ESG-BERT here: https://huggingface.co/nbroad/ESG-BERT

In [None]:
# Collects test predictions and best hyperparameters of every ESG-BERT run
esgbert_dict = dict()

### Process

In [None]:
# ESG-BERT on the 'Process_action' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Process_action', train, val, test, **search_grid)
esgbert_dict['Process_action'] = {'Preds': y_pred, 'Best_params': best_params}

### Process augmented

In [None]:
# ESG-BERT on the 'Process_action' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Process_action', train, val, test, **search_grid)
esgbert_dict['Process_action_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Market

In [None]:
# ESG-BERT on the 'Market_action' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Market_action', train, val, test, **search_grid)
esgbert_dict['Market_action'] = {'Preds': y_pred, 'Best_params': best_params}

### Market augmented



In [None]:
# ESG-BERT on the 'Market_action' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Market_action', train, val, test, **search_grid)
esgbert_dict['Market_action_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Social

In [None]:
# ESG-BERT on the 'Social' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Social', train, val, test, **search_grid)
esgbert_dict['Social'] = {'Preds': y_pred, 'Best_params': best_params}

### Social augmented

In [None]:
# ESG-BERT on the 'Social' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Social', train, val, test, **search_grid)
esgbert_dict['Social_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Environment

In [None]:
# ESG-BERT on the 'Environment' label, original training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [False], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Environment', train, val, test, **search_grid)
esgbert_dict['Environment'] = {'Preds': y_pred, 'Best_params': best_params}

### Environment augmented

In [None]:
# ESG-BERT on the 'Environment' label, augmented training data
search_grid = {'lrs': [5e-5, 1e-5, 1e-6], 'dropouts': [0.1, 0.2], 'augs': [True], 'epochs': 3}
y_pred, best_params = find_and_run_best_model(
    'ESG-BERT', 'Environment', train, val, test, **search_grid)
esgbert_dict['Environment_AUG'] = {'Preds': y_pred, 'Best_params': best_params}

### Save for later

In [None]:
# Pickle the ESG-BERT results to Drive for later use
# (the file holds binary pickle data despite its .txt extension)
esgbert_dict_path = "/content/drive/MyDrive/esgbert_dict.txt"
with open(esgbert_dict_path, "wb") as out_file:
    pickle.dump(esgbert_dict, out_file)