In [None]:
import os
import random

import numpy as np
import torch

# Seed every random number generator this notebook touches so that runs repeat.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Ask cuDNN for deterministic behaviour and switch its benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
print(f"Random seed set as {seed}")

# Release any cached CUDA memory left behind by earlier runs in this kernel.
torch.cuda.empty_cache()

In [None]:
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import AutoTokenizer, LongformerForSequenceClassification
import json
import pandas as pd

In [3]:
# Directory that holds the assignment data. Set the MP3_MAIN_DIR environment
# variable to point at another checkout; the default is the original local path,
# so behaviour is unchanged when the variable is not set.
DATA_DIR = os.environ.get(
    "MP3_MAIN_DIR",
    "/Users/pakhigupta/Documents/UIUC Grad/CS410/assignments/MP3.2_private",
)

# Load the training set and preview its first rows.
df = pd.read_csv(os.path.join(DATA_DIR, "train_data.csv"))
df.head()

Unnamed: 0,document,label
0,"Economy of India From Wikipedia, the free ency...",0
1,"Silicon From Wikipedia, the free encyclopedia ...",0
2,Call Us 1 - 603 - 244 - 6292 Follow Us 1 - 603...,1
3,Skip to main content .us Hello Select your add...,0
4,"Mucus From Wikipedia, the free encyclopedia Ju...",0


In [29]:
# Pull the raw documents and their class labels out of the frame as arrays.
train_texts = df['document'].values
train_labels = df['label'].values

# The tokenizer and the two-class classifier are loaded from the same checkpoint.
checkpoint_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = LongformerForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)


config.json: 100%|██████████| 694/694 [00:00<00:00, 103kB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 9.33MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 45.4MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 22.0MB/s]
pytorch_model.bin: 100%|██████████| 597M/597M [00:14<00:00, 40.5MB/s]
Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
from tqdm import tqdm

max_seq_length = 4096

# Every document is truncated or padded to exactly max_seq_length tokens so the
# per-document encodings can later be concatenated into a single tensor.
encode_options = dict(
    truncation=True,
    padding='max_length',
    max_length=max_seq_length,
    return_tensors='pt',
)
tokenized_texts = [tokenizer(text, **encode_options) for text in tqdm(train_texts)]

100%|██████████| 1500/1500 [00:16<00:00, 88.30it/s]


In [31]:
# Stack the tokenized texts into tensors and build the train/validation data loaders
from sklearn.metrics import accuracy_score, f1_score, classification_report
import torch_optimizer as optim
from transformers import AdamW

# Concatenate the per-document encodings along dim 0 into one tensor per field.
input_ids = torch.cat([t['input_ids'] for t in tokenized_texts], dim=0)
attention_mask = torch.cat([t['attention_mask'] for t in tokenized_texts], dim=0)
labels = torch.tensor(train_labels)

# Create a dataset and hold out 20% of it for validation.
dataset = TensorDataset(input_ids, attention_mask, labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# A dedicated generator seeded with the notebook seed keeps the split identical
# every time this cell is re-run. With the global RNG, a re-run would move
# validation rows into the training set of a model that has already seen them.
split_generator = torch.Generator().manual_seed(seed)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

batch_size = 16
lr = 1e-5

# Reshuffle the training batches every epoch; validation order is irrelevant.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Handed to train_epoch/evaluate, which read the loss from the model output.
criterion = nn.CrossEntropyLoss()

In [32]:
def train_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        dataloader: Sized iterable yielding ``(input_ids, attention_mask, labels)``
            batches of tensors.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        criterion: Accepted for interface compatibility; unused, because the loss
            is taken from the model output.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0.0
    for batch in tqdm(dataloader):
        ids, mask, targets = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        batch_loss = model(ids, attention_mask=mask, labels=targets).loss
        running_loss += batch_loss.item()
        batch_loss.backward()
        optimizer.step()

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Score ``model`` on ``dataloader`` without updating its weights.

    Prints a classification report covering every sample in ``dataloader``
    (true labels first, predictions second) and returns the aggregate metrics.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``loss`` and ``logits``.
        dataloader: Sized iterable yielding ``(input_ids, attention_mask, labels)``
            batches of tensors.
        criterion: Accepted for interface compatibility; unused, because the loss
            is taken from the model output.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the summed batch losses divided by
        ``len(dataloader)``, and the fraction of correctly predicted samples.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    correct_predictions = 0
    total_samples = 0
    # Collected across all batches so the report is not limited to the last one.
    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()

            # The predicted class is the index of the largest logit in each row.
            predictions = torch.argmax(outputs.logits, dim=1)
            correct_predictions += torch.sum(predictions == labels).item()
            total_samples += labels.size(0)

            all_labels.append(labels.cpu())
            all_predictions.append(predictions.cpu())

    if total_samples == 0:
        raise ValueError("evaluate() received a dataloader with no samples")

    y_true = torch.cat(all_labels).numpy()
    y_pred = torch.cat(all_predictions).numpy()
    # classification_report expects the ground truth first, then the predictions.
    print(classification_report(y_true, y_pred))
    return total_loss / len(dataloader), correct_predictions / total_samples

In [33]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

LongformerForSequenceClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
          

In [34]:
num_epochs = 1
# Directory the checkpoints are written to. Set the MP3_MAIN_DIR environment
# variable to redirect them; the default is the original local path, so
# behaviour is unchanged when the variable is not set.
MAIN_DIR = os.environ.get(
    "MP3_MAIN_DIR",
    "/Users/pakhigupta/Documents/UIUC Grad/CS410/assignments/MP3.2_private",
)
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    val_loss, val_accuracy = evaluate(model, val_loader, criterion, device)

    print(f"Epoch {epoch + 1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.2%}")
    # Keep one checkpoint per epoch, tagged with the epoch number and learning rate.
    model.save_pretrained(f"{MAIN_DIR}/fine_tuned_longform_epoch_{epoch+1}_lr_{lr}")

# Save the fine-tuned model
model.save_pretrained(f"{MAIN_DIR}/fine_tuned_longform")

  0%|          | 0/75 [00:00<?, ?it/s]