In [1]:
# Mount Google Drive at /content/drive so files stored there are reachable from this Colab runtime.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers torch
!pip install sentencepiece
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification, Trainer, TrainingArguments, LongformerConfig
from pathlib import Path
from torch.utils.data import Dataset, DataLoader, Subset, random_split
import pandas as pd
import numpy as np
import torch

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


## 1. Define Dataset and DataLoaders for Claims Text

In [3]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes claim texts from a CSV for sequence classification.

    The CSV must provide a ``claims`` column (raw text) and a ``status`` column
    whose values are keys of ``label_map``.

    Args:
        csv_file: Path or buffer accepted by ``pandas.read_csv``.
        label_map: Dict mapping every raw ``status`` value to an integer class id.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_length: Length each sample is padded/truncated to
            (adjust depending on the tokenizer/model).

    Raises:
        ValueError: If a ``status`` value is missing or not a key of ``label_map``.
    """

    def __init__(self, csv_file,label_map, tokenizer, max_length=4096): # adjust max length depending on tokenizer
        self.data_frame = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = label_map

        # Convert string labels to integer class ids. Series.map yields NaN for
        # anything outside label_map; fail here instead of letting NaN be cast
        # to a meaningless long label inside __getitem__.
        mapped_labels = self.data_frame['status'].map(self.label_map)
        unmapped = mapped_labels.isna()
        if unmapped.any():
            unknown_values = sorted(
                self.data_frame.loc[unmapped, 'status'].astype(str).unique()
            )
            raise ValueError(
                f"'status' contains values missing from label_map: {unknown_values}"
            )
        self.data_frame['status'] = mapped_labels.astype('int64')

    def __len__(self):
        """Return the number of rows (samples) in the CSV."""
        return self.data_frame.shape[0]

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return ``input_ids``, ``attention_mask`` and ``labels``."""
        # Column-first access avoids materialising a whole row Series per lookup.
        text = self.data_frame['claims'].iloc[idx]
        label = int(self.data_frame['status'].iloc[idx])

        # Encode the text with the supplied tokenizer, padded/truncated to
        # max_length. Calling the tokenizer directly replaces the deprecated
        # encode_plus and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten so the DataLoader can stack samples into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [4]:
import torch

# Gradient accumulation: number of mini-batches whose gradients are summed
# before each optimizer update (backward still runs on every batch).
accumulation_steps = 4

# cuDNN settings: keep cuDNN enabled and let it benchmark kernels.
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True

# Cap this process at 90% of GPU 0's memory. Guarded because the call raises
# on a CPU-only runtime, which the later device selection explicitly allows.
if torch.cuda.is_available():
    torch.cuda.set_per_process_memory_fraction(0.9, device=0)


In [5]:
input_path = '/content/drive/MyDrive/CSCI_567/Project/text_df_medium.csv'

# Load the dataset and the tokenizer matching the pretrained checkpoint
label_map = {'ABN': 0, 'ISS': 1}
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)
dataset = TextDataset(input_path, label_map=label_map, tokenizer=tokenizer, max_length=4096)

# Total number of samples
total_size = len(dataset)

# Split proportions: 95% train, 2.5% validation, remainder test
train_size = int(0.95 * total_size)
val_size = int(0.025 * total_size)
test_size = total_size - train_size - val_size

# Seed the split so the same rows land in train/val/test on every run.
# Without this, a model reloaded in a later session could be evaluated on
# rows it was trained on.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset, test_dataset = random_split(
    dataset, [train_size, val_size, test_size], generator=split_generator
)
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

# Define dataloaders. Training batches are reshuffled every epoch;
# validation and test order is kept fixed.
BATCH_SIZE = 3
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

9440
248
249


## 2. Fine-Tune Pretrained Longformer

In [6]:
import torch
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

# Number of epochs
num_epochs = 20

# Load pretrained Longformer with a 2-class sequence-classification head
model = LongformerForSequenceClassification.from_pretrained(
    model_name,
    attention_window=1024,
    num_labels=2,
)

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Scheduler. The training loop calls scheduler.step() once per optimizer
# update, i.e. once every `accumulation_steps` batches plus once for a
# trailing partial group, not once per batch. Count updates so the linear
# decay actually reaches zero at the end of training.
updates_per_epoch = (len(train_dataloader) + accumulation_steps - 1) // accumulation_steps
total_steps = updates_per_epoch * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Select the device (GPU when available, otherwise CPU) and move the model to it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

In [7]:
# Display the configuration of the loaded model (rendered as the cell output).
model.config

LongformerConfig {
  "_name_or_path": "allenai/longformer-base-4096",
  "attention_mode": "longformer",
  "attention_probs_dropout_prob": 0.1,
  "attention_window": [
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024,
    1024
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "ignore_attention_mask": false,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 4098,
  "model_type": "longformer",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "onnx_export": false,
  "pad_token_id": 1,
  "sep_token_id": 2,
  "transformers_version": "4.35.2",
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [8]:
# Best validation accuracy seen so far; the training cell compares against it and
# updates it whenever it saves a checkpoint. Defined in a separate cell from the training loop.
best_acc = 0

In [None]:
import torch

# Fine-tune with gradient accumulation, validating after every epoch
for epoch in range(num_epochs):
    # Re-enter training mode at the start of EVERY epoch: the validation phase
    # below switches the model to eval mode, and without this call all epochs
    # after the first would train with dropout disabled.
    model.train()
    optimizer.zero_grad()  # Reset gradients at the start of the epoch
    total_correct = 0
    total_predictions = 0
    for batch_idx, batch in enumerate(train_dataloader):
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        loss = outputs.loss / accumulation_steps  # Normalize the loss

        # Calculate predictions and update accuracy counts
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        correct = (predictions == batch['labels']).sum().item()
        total_correct += correct
        total_predictions += predictions.size(0)

        # Backward pass
        loss.backward()  # Accumulate gradients

        # Optimizer and scheduler steps are performed after the specified number of
        # accumulation steps, and once more on the last batch of the epoch so a
        # trailing partial group is not dropped
        if (batch_idx + 1) % accumulation_steps == 0 or (batch_idx + 1) == len(train_dataloader):
            optimizer.step()    # Update model parameters
            scheduler.step()    # Update learning rate
            optimizer.zero_grad()  # Reset gradients

        # Print the loss of the batch that completes each accumulation group
        if (batch_idx + 1) % accumulation_steps == 0:
            print(f"Loss: {loss.item() * accumulation_steps}")  # Multiply back to get the actual loss value

    # Calculate training accuracy
    train_accuracy = total_correct / total_predictions
    print(f"Epoch {epoch}: Training Accuracy: {train_accuracy:.4f}")

    # Validation phase
    model.eval()
    total_val_correct = 0
    total_val_predictions = 0
    total_val_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_dataloader):
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}

            # Forward pass
            outputs = model(**batch)
            val_loss = outputs.loss.item()
            total_val_loss += val_loss

            # Calculate predictions and update accuracy counts
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            correct = (predictions == batch['labels']).sum().item()
            total_val_correct += correct
            total_val_predictions += predictions.size(0)

    # Calculate the validation accuracy and the mean per-batch validation loss
    val_accuracy = total_val_correct / total_val_predictions
    avg_val_loss = total_val_loss / len(val_dataloader)

    # Save a checkpoint whenever validation accuracy improves on the best so far
    if val_accuracy > best_acc:
        best_acc = val_accuracy
        torch.save(model.state_dict(), 'best_model_state_longformer_4096.pth')

    print(f"Epoch {epoch}: Validation Accuracy: {val_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}")
    print()

Loss: 0.7517245411872864
Loss: 0.8242060542106628
Loss: 0.6508587002754211
Loss: 0.8188252449035645
Loss: 0.7601408362388611
Loss: 0.6736016273498535
Loss: 0.6818929314613342
Loss: 0.5447723865509033
Loss: 0.7556043267250061
Loss: 0.5017964839935303
Loss: 0.7840456962585449
Loss: 0.4598442614078522
Loss: 0.6822554469108582
Loss: 0.6292786002159119
Loss: 0.5830971598625183
Loss: 0.6265794038772583
Loss: 0.6661720871925354
Loss: 0.3741137981414795
Loss: 0.6602022647857666
Loss: 0.8231985569000244
Loss: 0.6294004321098328
Loss: 0.8079047203063965
Loss: 0.6131155490875244
Loss: 0.5692692399024963
Loss: 0.6273713707923889
Loss: 0.6455931067466736
Loss: 0.7881519198417664
Loss: 0.877012312412262
Loss: 0.7621977925300598
Loss: 0.3777244985103607
Loss: 0.4764760434627533
Loss: 0.9539182782173157
Loss: 0.6941573619842529
Loss: 0.6403096914291382
Loss: 0.4677121639251709
Loss: 0.5974876284599304
Loss: 0.7274818420410156
Loss: 0.8131194114685059
Loss: 0.5967779159545898
Loss: 0.5882920622825623
L

## 3. Evaluate Longformer Performance on the Test Set

In [None]:
# Set model to evaluation mode
# NOTE(review): this evaluates the weights currently in memory (end of the last
# epoch), not the 'best_model_state_longformer_4096.pth' checkpoint saved during
# training. Confirm which one is intended.
model.eval()

# Collected predictions and ground-truth labels for the held-out test split
test_predictions = []
test_labels = []

# Evaluate on the test set without tracking gradients
with torch.no_grad():
    for batch in test_dataloader:
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass; only the logits are needed here
        outputs = model(**batch)
        logits = outputs.logits

        # Predicted class = index of the largest logit
        predictions = torch.argmax(logits, dim=1)
        labels = batch['labels']

        # Store as plain Python ints
        test_predictions.extend(predictions.cpu().tolist())
        test_labels.extend(labels.cpu().tolist())

In [None]:
# Use SKLearn library to predict F1, Precision, Recall
from sklearn.metrics import f1_score, precision_score, recall_score,accuracy_score

# Score the test-set predictions against the ground-truth labels.
accuracy = accuracy_score(test_labels, test_predictions)
precision = precision_score(test_labels, test_predictions)
recall = recall_score(test_labels, test_predictions)
f1 = f1_score(test_labels, test_predictions)

# Report the metrics, one per line.
print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# Plot the proportion of positive vs. negative predictions
import matplotlib.pyplot as plt

# Count predictions per class. The evaluation cell stores its results in
# `test_predictions`; no `val_predictions` is defined anywhere in the notebook,
# so using that name fails with NameError on a fresh kernel.
positive_count = sum(int(pred) == 1 for pred in test_predictions)
negative_count = sum(int(pred) == 0 for pred in test_predictions)

print(positive_count)
print(negative_count)

plt.bar(['Positive', 'Negative'], [positive_count, negative_count])
plt.show()

In [None]:
# Persist the current model weights to Google Drive.
# NOTE(review): this folder (CSCI567/project) differs from the folder the input
# CSV is read from (CSCI_567/Project), and the file name says "bigbird" although
# the weights are Longformer. Confirm the intended destination and name.
model_save_path = '/content/drive/MyDrive/CSCI567/project/bigbird_state_dict.pth'

# torch.save does not create missing folders; make sure the target directory
# exists so the save cannot fail after training has finished.
Path(model_save_path).parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_save_path)