In [1]:
from transformers import pipeline, DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from datasets import load_dataset
import os

In [2]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [3]:
# 'distilbert-base-uncased' as the model to fine-tune
model_name = 'distilbert-base-uncased'
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


Dataset Part

In [4]:
# load dataset, only focus on "abstract", "claims", "decision"
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
train_dataset = dataset_dict['train']
validation_dataset = dataset_dict['validation']
train_dataset.set_format(type="torch", columns=["abstract", "claims", "decision"])
validation_dataset.set_format(type="torch", columns=["abstract", "claims", "decision"])

Found cached dataset hupd (/home/sihanwang/.cache/huggingface/datasets/HUPD___hupd/sample-05bfac9c56e1f5e9/0.0.0/6920d2def8fd7767046c0470603357f76866e5a09c97e19571896bfdca521142)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# method to transform dataset to (texts, labels), labels are 1/0 from decision
def dataset_to_texts_and_labels(dataset):
    texts = []
    labels = []
    for data in dataset:
        text = data['abstract'] + data['claims']
        label = 1 if data['decision'] == 'ACCEPTED' else 0
        texts.append(text)
        labels.append(label)
    return texts, labels

# dataset class to fit in dataloader
class TextEncodeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [6]:
# transform dataset to (texts, labels)
train_texts, train_labels = dataset_to_texts_and_labels(train_dataset)
validation_texts, validation_labels = dataset_to_texts_and_labels(validation_dataset)

# base tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

# get encoding texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
validation_encodings = tokenizer(validation_texts, truncation=True, padding=True)

# form TextEncodeDataset dataset
train_dataset = TextEncodeDataset(train_encodings, train_labels)
validation_dataset = TextEncodeDataset(validation_encodings, validation_labels)

# form the dataloader
train_dataloader = DataLoader(train_dataset, batch_size=16)
validation_dataloader = DataLoader(validation_dataset, batch_size=32)


Training Part

In [7]:
# base model
model = DistilBertForSequenceClassification.from_pretrained(model_name)
model.to(device)
model.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [8]:
# Training
optim = AdamW(model.parameters(), lr=5e-5)
epochs = 2
for epoch in range(epochs):
    for batch in train_dataloader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)       
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

In [9]:
# save fine-tuned model
save_diirectory = "saved"
tokenizer.save_pretrained(save_diirectory)
model.save_pretrained(save_diirectory)

Validation Part

In [10]:
# load model
save_diirectory = "saved"
model = DistilBertForSequenceClassification.from_pretrained(save_diirectory)
model.to(device)
model.eval()
tokenizer = DistilBertTokenizerFast.from_pretrained(save_diirectory)

In [11]:
# compute the validation loss
validation_loss = 0
count = 0
with torch.no_grad():
    for batch in validation_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        validation_loss += loss
        count += 1
validation_loss = validation_loss.item()
print('average validation loss: %5.5f' %(validation_loss/count))

average validation loss: 0.65093
