## Get predictions from BERT

Note: This notebook was run on Google Colab Pro+ with a high-RAM GPU runtime. It therefore begins by installing the required packages and mounting Google Drive (MyDrive).

## Set-up

In [None]:
# Install the third-party packages imported below (transformers, pickle5)
!pip install transformers
!pip3 install pickle5

### Import modules

In [None]:
import pickle
import numpy as np
import pandas as pd
import torch
import warnings
import seaborn as sns
import random
import pickle5 as pickle
from collections import defaultdict, Counter
from string import punctuation
from matplotlib import pyplot as plt
from nltk.util import bigrams
from tqdm import tqdm
from itertools import product

from sklearn.feature_extraction import _stop_words
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, f1_score
from sklearn.utils import shuffle

from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import BertModel, BertTokenizer

warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Mount Google Drive at /content/drive; the model and data paths used in this notebook live under it
from google.colab import drive
drive.mount('/content/drive')

### Clean text

In [None]:
# Split text into words and trim punctuation from both ends of each word
def clean(text):
    """Return the words of ``text`` with surrounding punctuation removed.

    The text is split on whitespace, every piece has characters from
    ``string.punctuation`` stripped from its start and end, and pieces that
    become empty (they consisted only of punctuation) are dropped.
    Punctuation inside a word is left untouched.
    """
    trimmed = (piece.strip(punctuation) for piece in text.strip().split())
    return [word for word in trimmed if word]

### Define classes

In [None]:
# Dataset of tokenizer-encoded paragraphs
class BERTDataset(Dataset):
    """Dataset holding the tokenizer encoding of each entry in ``data['Paragraph']``.

    Args:
        data: Table whose ``'Paragraph'`` column holds the inputs to encode.
        hf_path: Name or path handed to ``BertTokenizer.from_pretrained``.
    """

    def __init__(self, data, hf_path):
        # Load the tokenizer
        self.tok = BertTokenizer.from_pretrained(hf_path)

        # Encode every paragraph once, up front, with truncation at max_length=512
        encode = self.tok.encode
        self.paragraphs = [
            encode(paragraph, max_length=512, truncation=True)
            for paragraph in data['Paragraph']
        ]

    def __len__(self):
        """Return the number of encoded paragraphs."""
        return len(self.paragraphs)

    def __getitem__(self, idx):
        """Return the encoded paragraph stored at position ``idx``."""
        return self.paragraphs[idx]

In [None]:
# BERT encoder followed by dropout and a two-output linear layer
class BERTClassifier(nn.Module):
    """Classifier that averages BERT's per-position outputs and maps them to 2 scores.

    Args:
        hf_path: Name or path handed to ``BertModel.from_pretrained``.
        dropout_rate: Probability used by the dropout layer that precedes
            the linear layer.
    """

    def __init__(self, hf_path, dropout_rate=0.2):
        super().__init__()

        # Network layers: pretrained encoder and a 768 -> 2 linear head
        self.bert = BertModel.from_pretrained(hf_path)
        self.linear = nn.Linear(768, 2)

        # Dropout applied to the averaged encoder output
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, paragraphs, masks):
        """Return the linear-layer output for a batch of padded paragraphs.

        Args:
            paragraphs: Padded token-id tensor passed to the encoder.
            masks: Attention mask passed to the encoder as ``attention_mask``.
        """
        # First element of the encoder output, averaged over axis 1.
        # NOTE(review): the average covers every position on that axis, so
        # padded positions are not excluded from it.
        encoder_states = self.bert(paragraphs, attention_mask=masks)[0]
        averaged = encoder_states.mean(axis=1)
        return self.linear(self.dropout(averaged))

### Define functions

In [None]:
# Collate encoded paragraphs into padded id and attention-mask tensors
def bert_collate(batch):
    """Pad a batch of token-id sequences to a common length.

    Args:
        batch: Sequence of token-id lists, one per paragraph.

    Returns:
        tuple: ``(paragraphs_pad, masks_pad)``, two long tensors with one row
        per paragraph and as many columns as the longest paragraph in the
        batch. ``paragraphs_pad`` holds the ids followed by zeros;
        ``masks_pad`` is 1 where an id is present and 0 on the padding, so
        attention is not performed on padding positions.
    """
    sequences = list(batch)
    lengths = [len(seq) for seq in sequences]

    # Both tensors share the shape (number of paragraphs, longest paragraph)
    shape = (len(sequences), max(lengths))
    paragraphs_pad = torch.zeros(shape, dtype=torch.long)
    masks_pad = torch.zeros(shape, dtype=torch.long)

    # Copy each paragraph into the start of its row and flag those positions
    for row, (seq, length) in enumerate(zip(sequences, lengths)):
        paragraphs_pad[row, :length] = torch.tensor(seq)
        masks_pad[row, :length] = 1

    return paragraphs_pad, masks_pad

## Get predictions

In [None]:
# For reference (how to run model on new data)

def get_predictions(data, local_path, hf_path):
    """Return the predicted class index for every paragraph in ``data``.

    Args:
        data: Table with a ``'Paragraph'`` column; passed to ``BERTDataset``.
        local_path: Path of the saved model object, loaded with ``torch.load``.
        hf_path: Tokenizer name or path forwarded to ``BERTDataset``.

    Returns:
        list: One integer per paragraph (the argmax over the model's outputs),
        in the order the paragraphs appear in ``data``.
    """
    # Choose the device before loading so the checkpoint can be mapped onto it
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # map_location lets a checkpoint saved on GPU load on a CPU-only runtime;
    # without it the 'cpu' fallback above could never be reached.
    # NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    model = torch.load(local_path, map_location=device)
    model = model.to(device)

    unseen_dataset = BERTDataset(data, hf_path)
    unseen_loader = DataLoader(unseen_dataset, batch_size=16, collate_fn=bert_collate)

    # Evaluation mode disables dropout; no_grad skips gradient bookkeeping
    model.eval()
    y_pred = []

    with torch.no_grad():
        for b in tqdm(unseen_loader):
            paragraphs, masks = (t.to(device) for t in b)
            output = model(paragraphs, masks)
            # Predicted class = index of the largest output in each row
            y_pred.extend(output.argmax(dim=1).tolist())

    return y_pred

In [None]:
# Save best model paths
market_path = '/content/drive/MyDrive/Market_action-BERT.pth'
env_path = '/content/drive/MyDrive/Environment-BERT.pth'

# Upload all paragraphs
# NOTE: unpickling can execute arbitrary code; only open files you trust
with open('/content/drive/MyDrive/paras_no_sents_df.pkl', "rb") as fh:
    data = pickle.load(fh)

# Keep only rows flagged as supply chain. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not trigger
# pandas' SettingWithCopyWarning for writing to a slice of the loaded table.
data = data[data['Supply_Chain'] == 'Yes'].copy()

# Cast paragraphs to str, then split each into punctuation-trimmed words.
# NOTE(review): after clean() each paragraph is a list of words, not a string,
# and BERTDataset passes it to the tokenizer's encode() as-is; confirm this
# matches how the saved models were trained.
data['Paragraph'] = data['Paragraph'].astype(str)
data['Paragraph'] = data['Paragraph'].apply(clean)

In [None]:
# Run each saved classifier over the paragraphs and store its predictions as a new column
data['Market_action_pred'] = get_predictions(data, market_path, 'bert-base-uncased')
data['Environment_pred'] = get_predictions(data, env_path, 'bert-base-uncased')

In [None]:
# Display summary statistics for the table
data.describe()

In [None]:
# Write the table to Google Drive as a pickle file
data.to_pickle('/content/drive/MyDrive/alt_paragraph_predictions.pkl')