# Setup

In [1]:
! pip install -q datasets==2.20.0 \
                 accelerate==0.33.0 \
                 evaluate==0.4.2 \
                 peft==0.12.0 \
                 adapters==1.0.0

[0m

In [2]:
import numpy as np

import evaluate

import numpy as np
import pandas as pd

from adapters import AutoAdapterModel, AdapterTrainer, LoRAConfig

from transformers import (
    AutoTokenizer, 
    TrainingArguments,
    EarlyStoppingCallback
)

from datasets import load_dataset

2024-08-29 09:42:36.085646: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-29 09:42:36.085715: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-29 09:42:36.086825: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-29 09:42:36.093182: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Method

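We fine-tune a DistilBERT classifier on patent abstracts with a LoRA adapter, a parameter-efficient fine-tuning (PEFT) method. The adapter is added with the `adapters` library and trained with its `AdapterTrainer`, configured through Hugging Face `TrainingArguments` and an early-stopping callback.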

# Data

In [3]:
# Load the "abstract" configuration of the ccdv/patent-classification dataset
# (comes with train / validation / test splits).
ds = load_dataset("ccdv/patent-classification", "abstract")

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [5]:
# Tokenizer for the same checkpoint name that is loaded as the base model.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")

In [6]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry, as passed by
            ``Dataset.map(..., batched=True)``.

    Returns:
        The output of the module-level ``tokenizer`` called with
        ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [7]:
# Tokenize every split in batches; the result keeps `text` and `label` and
# gains `input_ids` and `attention_mask` columns.
tokenized_dataset = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [8]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 25000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
})

In [9]:
# Human-readable names for the nine integer class ids in the dataset's
# `label` column (patent classification sections).
id2label = {
    0: "Human Necessities",
    1: "Performing Operations; Transporting",
    2: "Chemistry; Metallurgy",
    3: "Textiles; Paper",
    4: "Fixed Constructions",
    # Section title is "Lighting" (illumination), not "Lightning".
    5: "Mechanical Engineering; Lighting; Heating; Weapons; Blasting",
    6: "Physics",
    7: "Electricity",
    8: "General tagging of new or cross-sectional technology",
}

# Inverse lookup: class name -> integer id.
label2id = {name: idx for idx, name in id2label.items()}

# Build Model

In [10]:
# Load the pretrained DistilBERT checkpoint through the `adapters` model class
# so that a prediction head and an adapter can be attached to it afterwards.
model = AutoAdapterModel.from_pretrained("distilbert/distilbert-base-cased")

In [11]:
model

DistilBertAdapterModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlockWithAdapters(
          (attention): MultiHeadSelfAttentionWithAdapters(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (k_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (v_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (ou

In [12]:
# Attach a classification head named 'patent'. The number of classes is derived
# from id2label so the head size cannot drift from the label mapping.
model.add_classification_head('patent', num_labels=len(id2label), id2label=id2label)

In [13]:
model

DistilBertAdapterModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlockWithAdapters(
          (attention): MultiHeadSelfAttentionWithAdapters(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (k_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (v_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (ou

In [14]:
# LoRA configuration with r=8 and alpha=16; other settings are left at the
# library defaults.
lora_config = LoRAConfig(r=8, alpha=16)
# Register the adapter under the name "patent" (same name as the head) ...
model.add_adapter("patent", config=lora_config)
# ... and make it the active adapter.
model.set_active_adapters("patent")

In [15]:
model

DistilBertAdapterModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlockWithAdapters(
          (attention): MultiHeadSelfAttentionWithAdapters(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict(
                (patent): LoRA()
              )
            )
            (k_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (v_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (lo

In [16]:
# Switch the model into adapter-training mode for 'patent'; the adapter summary
# printed further down reports this adapter as trainable (Train = 1).
model.train_adapter('patent')

In [17]:
model

DistilBertAdapterModel(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlockWithAdapters(
          (attention): MultiHeadSelfAttentionWithAdapters(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict(
                (patent): LoRA()
              )
            )
            (k_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (loras): ModuleDict()
            )
            (v_lin): LoRALinearTorch(
              in_features=768, out_features=768, bias=True
              (lo

In [19]:
print(model.adapter_summary())

Name                     Architecture         #Param      %Param  Active   Train
--------------------------------------------------------------------------------
patent                   lora                147,456       0.226       1       1
--------------------------------------------------------------------------------
Full model                                65,190,912     100.000               0


# Fine-Tune

In [20]:
# Accuracy metric from the `evaluate` library; used by compute_metrics.
accuracy = evaluate.load("accuracy")

In [21]:
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the predictions are
            reduced with ``argmax`` over axis 1 to obtain class ids.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [22]:
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir="distilbert-patent",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Upper bound only: the early-stopping callback attached to the trainer
    # is expected to end training well before 50 epochs.
    num_train_epochs=50,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch (the two strategies must match
    # for load_best_model_at_end).
    save_strategy="epoch",
    eval_strategy="epoch",
    # Cap the number of checkpoints kept on disk; without a limit every epoch
    # leaves a checkpoint behind in output_dir.
    save_total_limit=2,
    load_best_model_at_end=True,
    # Spell out the selection criterion (these are the values the library
    # falls back to when load_best_model_at_end=True): lowest validation loss.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
)

In [23]:
# Wire the model, the tokenized splits, the metric function and the training
# arguments into the adapter-aware trainer.
trainer = AdapterTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    # Train on the training split; evaluate on the validation split.
    eval_dataset=tokenized_dataset["validation"],
    train_dataset=tokenized_dataset["train"],
    # Early stopping with a patience of 2.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    compute_metrics=compute_metrics,
)

In [24]:
trainer.train()

[2024-08-29 09:43:42,623] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Epoch,Training Loss,Validation Loss,Accuracy
1,1.2237,1.165578,0.5852
2,1.1475,1.105873,0.6078
3,1.1029,1.071191,0.6242
4,1.0879,1.051021,0.635
5,1.046,1.042637,0.6412
6,1.0411,1.033411,0.643


: 

# Inference

In [None]:
# Report metrics on the held-out test split. `predict` labels them with a
# "test_" prefix (instead of "eval_", which is what the validation metrics use)
# and does not go through the evaluation callbacks such as early stopping.
trainer.predict(tokenized_dataset['test']).metrics