# Fine Tuning A BERT Model With HuggingFace

## 1. Introduction

In [2]:
!pip install arxiv
!pip install evaluate

Collecting arxiv
  Downloading arxiv-2.2.0-py3-none-any.whl.metadata (6.3 kB)
Collecting feedparser~=6.0.10 (from arxiv)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting sgmllib3k (from feedparser~=6.0.10->arxiv)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading arxiv-2.2.0-py3-none-any.whl (11 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=sgmllib3k-1.0.0-py3-none-any.whl size=6046 sha256=aa47a597289f9fe27c5425d57ba0106fa4bbeade982d83a88567c268122cb8f6
  Stored in directory: /root/.cache/pip/wheels/3b/25/2a/105d6a15df6914f4d15047691c6c28f9052cc1173e40285d03
Successfully built sgmllib3k
Installing collected packag

In [7]:
import arxiv
import google.colab as colab
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Next I authenticate myself as my Google account user. This will be helpful since I will be storing the documents as JSON in [Google Cloud Storage](https://cloud.google.com/storage?hl=en), and authenticating through [colab](https://colab.research.google.com/) means there are no extra steps needed to access the data!

In [8]:
# Authenticate this Colab session as the signed-in Google account user.
colab.auth.authenticate_user()

## 2. Collecting The Data

In [19]:
client = arxiv.Client()

# Artificial intelligence abstracts: keep the entry id, the primary category
# code and the abstract text of every paper returned by the search.
ai_results = [
    dict(id=paper.entry_id, code=paper.primary_category, text=paper.summary)
    for paper in client.results(
        arxiv.Search(query="cat:cs.AI", max_results=1000)
    )
]

In [20]:
# Peek at the first record to check the id / code / text fields.
ai_results[0]

{'id': 'http://arxiv.org/abs/cs/9308101v1',
 'code': 'cs.AI',
 'text': 'Because of their occasional need to return to shallow points in a search\ntree, existing backtracking methods can sometimes erase meaningful progress\ntoward solving a search problem. In this paper, we present a method by which\nbacktrack points can be moved deeper in the search space, thereby avoiding this\ndifficulty. The technique developed is a variant of dependency-directed\nbacktracking that uses only polynomial space while still providing useful\ncontrol information and retaining the completeness guarantees provided by\nearlier approaches.'}

In [22]:
def fetch_abstracts(category: str, max_results: int) -> list:
    """Download abstracts for one arXiv category.

    Uses the notebook-level ``client`` (an ``arxiv.Client``) to run the
    search ``cat:<category>``.

    Args:
        category: arXiv category code, e.g. ``"cs.IR"``.
        max_results: Upper bound on the number of papers requested.

    Returns:
        A list of dicts with the keys ``"id"`` (entry id), ``"code"``
        (the paper's primary category) and ``"text"`` (the abstract).
    """
    search = arxiv.Search(query=f"cat:{category}", max_results=max_results)
    return [
        {
            "id": res.entry_id,
            "code": res.primary_category,
            "text": res.summary,
        }
        for res in client.results(search)
    ]


# information retrieval abstracts
ir_results = fetch_abstracts("cs.IR", max_results=1000)

# robotics abstracts (only 100 are requested, versus 1000 for the other
# two categories, so this class is much smaller)
ro_results = fetch_abstracts("cs.RO", max_results=100)

Now we combine them into a single dataframe:

In [None]:
# One row per abstract, stacking the three category result lists in order.
df = pd.DataFrame([*ai_results, *ir_results, *ro_results])

Now we take a look at the first two rows:

In [None]:
# Show the first two rows of the combined dataframe.
df.head(2)

In [None]:
# Map each category code to an integer class id and store it in a new
# "label" column; the last line displays the classes the encoder learned.
labeler = LabelEncoder()
df = df.assign(label=lambda frame: labeler.fit_transform(frame["code"]))
labeler.classes_

In [None]:
# Hold out 15% of the abstracts as the test set, stratified on the label so
# the class proportions are preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.15,
    random_state=42,
    stratify=df["label"],
)

In [None]:
# Carve a validation set (20%) out of the remaining training data.
# NOTE: this cell rebinds X_train / y_train, so running it a second time
# without re-running the previous split shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
    stratify=y_train,
)

In [None]:
# Re-assemble each split into a two-column frame: abstract text and class id.
train_df = pd.DataFrame(dict(text=X_train, label=y_train))
val_df = pd.DataFrame(dict(text=X_val, label=y_val))
test_df = pd.DataFrame(dict(text=X_test, label=y_test))

In [None]:
# (rows, columns) of each split, in train / validation / test order.
train_df.shape, val_df.shape, test_df.shape

In [None]:
# Persist each split as JSON in the Google Cloud Storage bucket; the
# fine-tuning section reads these same three paths back.
train_df.to_json("gs://harmon-arxiv/train_abstracts.json")
val_df.to_json("gs://harmon-arxiv/val_abstracts.json")
test_df.to_json("gs://harmon-arxiv/test_abstracts.json")

## 2. HuggingFace Models

Now that I have the data in Google Cloud Storage we begin the fine tuning of our model. First thing I do is import Pandas for reloading the data from cloud storage, and the necessary PyTorch and Hugging Face modules.

In [45]:
import pandas as pd

# PyTorch imports
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW

# Hugging Face imports
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset, DatasetDict
import evaluate

Now I can load the datasets from cloud storage as Pandas dataframes and then convert them to [Hugging Face Datasets](https://huggingface.co/docs/datasets/en/index) so they can be used by the Transformer model:

In [10]:
# Reload the three splits written to the bucket during data collection.
train_df, val_df, test_df = (
    pd.read_json(path)
    for path in (
        "gs://harmon-arxiv/train_abstracts.json",
        "gs://harmon-arxiv/val_abstracts.json",
        "gs://harmon-arxiv/test_abstracts.json",
    )
)

Then convert them to Dataset objects:

In [11]:
# Convert every split to a Hugging Face Dataset, dropping the pandas index
# so it does not become an extra column.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, val_df, test_df)
)

Finally I will combine them into a [DatasetDict](https://huggingface.co/docs/datasets/v4.0.0/en/package_reference/main_classes#datasets.DatasetDict) object. This is not necessary, but it is convenient since applying a transformation to the DatasetDict applies it to all the Datasets. This avoids repeating the same transformations individually across each dataset.

In [12]:
# Group the splits under their conventional names so a single transformation
# can be applied to all of them at once.
dataset_dict = DatasetDict(
    dict(
        train=train_dataset,
        validation=val_dataset,
        test=test_dataset,
    )
)

Next I download the [DistilBERT](https://huggingface.co/docs/transformers/en/model_doc/distilbert) model from [HuggingFace's Model Hub](https://huggingface.co/models) as well as its associated Tokenizer. To do so, I use the [AutoTokenizer and AutoModelForSequenceClassification classes](https://huggingface.co/docs/transformers/en/model_doc/auto) as they allow me to swap out models easily. Notice that the tokenizer has to match the model and we have to use the [from_pretrained class methods](https://www.geeksforgeeks.org/python/classmethod-in-python/) for each class. This ensures that the tokenizer and weights for the model are both initialized from the same point in pre-training. Lastly, notice that I move the model to the GPU and that I have to pass the number of classes to AutoModelForSequenceClassification during instantiation. This adds a classification head on top of the foundational model: linear layers that output one logit per class. The softmax that turns the logits into probabilities is applied separately when I compute the metric.

In [56]:
checkpoint = "distilbert/distilbert-base-uncased"
# Prefer the GPU, but fall back to the CPU when none is visible so this cell
# still runs (slowly) on a CPU-only runtime instead of failing in `model.to`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The tokenizer and the weights must come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# num_labels=3: one output per arXiv category queried when collecting the data.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
# Concrete class that AutoModelForSequenceClassification resolved the checkpoint to.
type(model)

```
transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification
```

You can confirm the last layer by printing the model architecture

```
print(model)
```
which will return,


```
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
  )
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
```

The "classifier" layer is the head that was added and we can confirm that the output is 3 classes.

Next I load the metric used to evaluate the model with the [evaluate](https://huggingface.co/docs/evaluate/en/index) library. I chose a multiclass [ROC-AUC](https://en.wikipedia.org/wiki/Receiver_operating_characteristic) metric to measure the performance of the model. This is a pretty standard metric for classification problems since it in essence measures "how well the model can separate the classes." Though it should be noted that ROC-AUC can be misleading when you have imbalanced classes, as I discussed in a [prior post](https://michael-harmon.com/blog/NLP1.html).


In [57]:
# Load the multiclass ROC-AUC metric from the `evaluate` library.
roc_auc_score = evaluate.load("roc_auc", "multiclass")

Now that I have the model, the tokenizer, and the metric, I tokenize the text in every split of the dataset:

In [58]:

def tokenize_function(example):
    """Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Truncation is enabled; no padding is requested here.
    """
    abstracts = example["text"]
    return tokenizer(abstracts, truncation=True)


# Apply the tokenizer to every split, one batch of rows at a time.
tokenized_datasets = dataset_dict.map(function=tokenize_function, batched=True)



Map:   0%|          | 0/1428 [00:00<?, ? examples/s]

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Map:   0%|          | 0/315 [00:00<?, ? examples/s]

In [59]:
# Collator built from the tokenizer; it is passed below as `collate_fn` to the
# DataLoaders and as `data_collator` to the Trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [60]:
# Drop the raw text (only the token ids are needed from here on) and rename
# the target column from "label" to "labels".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns("text")
    .rename_column("label", "labels")
)

In [61]:
# Have the datasets return PyTorch tensors when rows are accessed.
tokenized_datasets = tokenized_datasets.with_format("torch")

In [64]:
# Number of examples in each split.
tokenized_datasets.num_rows

{'train': 1428, 'validation': 357, 'test': 315}

In [65]:
# Column names and types of the test split after tokenization.
tokenized_datasets["test"].features


{'labels': Value('int64'),
 'input_ids': List(Value('int32')),
 'attention_mask': List(Value('int8'))}

In [66]:
# Batches of 8 validation examples, assembled by the padding collator.
val_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [68]:
from typing import Dict
import numpy as np

def calculate_roc_auc(
  model: "transformers.PreTrainedModel",
  loader: DataLoader
) -> Dict[str, np.float64]:
  """Compute the one-vs-rest multiclass ROC-AUC of ``model`` over ``loader``.

  The model is switched to evaluation mode (and left in it). Each batch is
  moved to the device the model lives on, the logits are turned into class
  probabilities with a softmax, and the probabilities and labels are
  accumulated in a freshly loaded ``roc_auc`` metric.

  Args:
    model: Sequence-classification model whose output exposes ``.logits``.
    loader: DataLoader yielding dict batches of tensors that include a
      ``"labels"`` entry.

  Returns:
    The dict produced by the metric's ``compute(multi_class="ovr")``.
  """
  roc_auc_score = evaluate.load("roc_auc", "multiclass")
  # Read the device from the model instead of a notebook-level global so the
  # function does not depend on hidden state.
  model_device = model.device
  model.eval()
  with torch.no_grad():
      for batch in loader:
          batch = {k: v.to(model_device) for k, v in batch.items()}
          outputs = model(**batch)
          scores = torch.nn.functional.softmax(outputs.logits, dim=-1)
          # Hand CPU tensors to the metric so nothing accumulates on the GPU.
          roc_auc_score.add_batch(references=batch["labels"].cpu(),
                                  prediction_scores=scores.cpu())

  return roc_auc_score.compute(multi_class="ovr")

In [69]:
# Validation ROC-AUC at this point in the notebook, i.e. before the
# fine-tuning section below.
calculate_roc_auc(model, val_dataloader)

{'roc_auc': np.float64(0.4807119639299569)}

{'roc_auc': np.float64(0.6150256894201531)}

## 3. Fine Tuning DistilBERT

In [24]:
# Confirm which device the model is on.
model.device

device(type='cuda', index=0)

In [70]:
def compute_metrics(eval_preds):
    """Return the one-vs-rest multiclass ROC-AUC for an evaluation pass.

    ``eval_preds`` unpacks into ``(predictions, labels)``. The predictions
    are converted to a tensor and passed through a softmax over the last
    axis before being scored against the labels.
    """
    metric = evaluate.load("roc_auc", "multiclass")
    raw_outputs, references = eval_preds
    probabilities = torch.tensor(raw_outputs).softmax(dim=-1)

    return metric.compute(
        references=references,
        prediction_scores=probabilities,
        multi_class="ovr",
    )

In [72]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # where the run's outputs are written
    output_dir="./results",
    # optimisation settings
    num_train_epochs=5,
    learning_rate=2e-4,
    weight_decay=0.01,
    # batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    # evaluate and log once per epoch; no external experiment tracker
    eval_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
)

In [75]:
from transformers import Trainer

# Wire the model, hyper-parameters, tokenized splits, collator and metric
# function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning; the value returned by `train()` is kept in `output`.
output = trainer.train()

Epoch,Training Loss,Validation Loss,Roc Auc
1,0.4213,0.461289,0.968239
2,0.2204,0.460888,0.961172
3,0.2034,0.396393,0.977058


Epoch,Training Loss,Validation Loss,Roc Auc
1,0.4213,0.461289,0.968239
2,0.2204,0.460888,0.961172
3,0.2034,0.396393,0.977058


In [33]:
# Write the fine-tuned model to the local "temp" directory.
trainer.save_model("temp")

In [None]:
# `Trainer` has no public `load` method, so the original call would raise an
# AttributeError. Reload the weights written by `trainer.save_model("temp")`
# through the model class instead.
model = AutoModelForSequenceClassification.from_pretrained("temp").to(device)

In [35]:
# Use the model held by the trainer for the evaluation below.
model = trainer.model


In [36]:
# Batches of 8 test examples, assembled by the padding collator.
test_dataloader = DataLoader(
    dataset=tokenized_datasets["test"],
    batch_size=8,
    collate_fn=data_collator,
)

In [51]:
# ROC-AUC of the model on the held-out test split.
calculate_roc_auc(model, test_dataloader)

{'roc_auc': np.float64(0.972989898989899)}

In [52]:
# Have the trainer generate a model card for this run.
trainer.create_model_card()