# Fine Tuning A BERT Model With HuggingFace

## 1. Introduction

## 2. Collecting The Data 

In [None]:
import arxiv 

# One shared API client; the later category cells reuse it.
client = arxiv.Client()

# artificial intelligence: request up to 1000 papers matching cat:cs.AI and
# keep each paper's entry id, primary category and summary text.
ai_results = [
    {"id": paper.entry_id, "code": paper.primary_category, "text": paper.summary}
    for paper in client.results(arxiv.Search(query="cat:cs.AI", max_results=1000))
]


In [None]:
# robotics: same record layout as the other categories (entry id, primary
# category, summary text), queried with cat:cs.RO.
# NOTE(review): this query is capped at 100 results while the other two
# categories request 1000 -- confirm the class imbalance is intentional.
ro_results = [
    {"id": paper.entry_id, "code": paper.primary_category, "text": paper.summary}
    for paper in client.results(arxiv.Search(query="cat:cs.RO", max_results=100))
]

In [None]:
# information retrieval: request up to 1000 papers matching cat:cs.IR and
# keep each paper's entry id, primary category and summary text.
ir_results = [
    {"id": paper.entry_id, "code": paper.primary_category, "text": paper.summary}
    for paper in client.results(arxiv.Search(query="cat:cs.IR", max_results=1000))
]

In [None]:
import pandas as pd
from datasets import Dataset

# Stack the three per-category record lists (AI, then IR, then RO) into a
# single frame with one row per paper.
df = pd.DataFrame([*ai_results, *ir_results, *ro_results])

In [1]:
# Class balance check: percentage of rows per primary-category code.
df.groupby("code").size().mul(100).div(df.shape[0])

NameError: name 'df' is not defined

In [119]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the string category codes into integer class labels.
labeler  = LabelEncoder()

In [120]:
# Add an integer "label" column by fitting the encoder on the category codes.
df = df.assign(label=lambda frame: labeler.fit_transform(frame["code"]))

In [123]:
# Category codes in encoder order; each code's position is its integer label.
labeler.classes_

array(['cs.AI', 'cs.IR', 'cs.RO'], dtype=object)

In [14]:
# Persist the full labelled frame to the gs:// bucket, authenticating with the
# token file named in storage_options.
df.to_json(
    "gs://harmon-arxiv/abstracts.json",
    storage_options={"token": "credentials.json"},
)

In [None]:
# df = pd.read_json("gs://harmon-arxiv/abstracts.json",
#             storage_options={"token": "credentials.json"})

In [51]:
# Hold out 15% of the abstracts as the test set. The split is stratified on
# the label column and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.15,
    random_state=42,
    stratify=df["label"],
)

In [52]:
# Carve a validation set out of the remaining training data: 20% of it,
# stratified by label and seeded for reproducibility.
# NOTE: this overwrites X_train / y_train, so re-running only this cell splits
# the already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
    stratify=y_train,
)

In [161]:
# Re-assemble each split into a two-column frame (text, label) so it can be
# saved and later converted to a HuggingFace Dataset.
train_df = pd.DataFrame({"text": X_train, "label": y_train})
val_df = pd.DataFrame({"text": X_val, "label": y_val})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

In [162]:
# Sanity check: (rows, columns) of the training split.
train_df.shape

(1428, 2)

In [163]:
# Sanity check: (rows, columns) of the validation split.
val_df.shape

(357, 2)

In [164]:
# Sanity check: (rows, columns) of the test split.
test_df.shape

(315, 2)

In [165]:
# Upload the three splits to the gs:// bucket as separate JSON files, each
# authenticated with the same token file.
train_df.to_json(
    "gs://harmon-arxiv/train_abstracts.json",
    storage_options={"token": "credentials.json"},
)
val_df.to_json(
    "gs://harmon-arxiv/val_abstracts.json",
    storage_options={"token": "credentials.json"},
)
test_df.to_json(
    "gs://harmon-arxiv/test_abstracts.json",
    storage_options={"token": "credentials.json"},
)

In [166]:
# Look for empty abstracts in the test split: count characters per text and
# keep only the rows whose count is zero (an empty frame means none exist).
test_df["text"].apply(len).to_frame("count").query("count == 0")

Unnamed: 0,count


## 2. HuggingFace 

In [7]:
import pandas as pd

In [8]:
# Reload the three saved splits from the gs:// bucket, each authenticated with
# the same token file.
train_df = pd.read_json(
    "gs://harmon-arxiv/train_abstracts.json",
    storage_options={"token": "credentials.json"},
)
val_df = pd.read_json(
    "gs://harmon-arxiv/val_abstracts.json",
    storage_options={"token": "credentials.json"},
)
test_df = pd.read_json(
    "gs://harmon-arxiv/test_abstracts.json",
    storage_options={"token": "credentials.json"},
)

In [10]:
from datasets import Dataset

In [9]:
from datasets import ClassLabel, DatasetDict
# class_labels = ClassLabel(names=["artificial intelligence", "information retrieval", "robotics"])

In [11]:
# Convert each split to a HuggingFace Dataset; preserve_index=False keeps the
# pandas index out of the resulting dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame, preserve_index=False)
    for frame in (train_df, val_df, test_df)
)

In [12]:
# Bundle the three splits under named keys so they can be mapped in one call.
dataset_dict = DatasetDict(
    {"train": train_dataset, "validation": val_dataset, "test": test_dataset}
)

In [14]:
# Inspect the column schema of the converted test split.
test_dataset.features

{'text': Value('string'), 'label': Value('int64')}

In [15]:
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding

Using DistilBERT Through HuggingFace Directly

In [23]:

checkpoint = "distilbert/distilbert-base-uncased"

# Prefer Apple's MPS backend (the previously hard-coded choice), but fall back
# to CUDA or the CPU so the notebook also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# A fresh classification head with three outputs is attached to the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Move the model onto the selected device.
model = model.to(device)

In [45]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch with padding and truncation.

    Args:
        example: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns when asked for
        padded, truncated PyTorch tensors (``return_tensors="pt"``).
    """
    texts = example["text"]
    return tokenizer(texts, padding=True, truncation=True, return_tensors="pt")


In [46]:
# Exploratory run: apply the tokenizer to the test split in batches.
batches = test_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 315/315 [00:00<00:00, 2767.07 examples/s]


In [50]:
# Check what kind of object Dataset.map returned.
type(batches)

datasets.arrow_dataset.Dataset

## 3. Fine Tuning DistilBERT

In [56]:
# Confirm which device the model currently lives on before training.
model.device

device(type='mps', index=0)

In [57]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch, truncating but not padding.

    No padding is requested here; the notebook builds a
    ``DataCollatorWithPadding`` alongside this function.

    Args:
        example: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for those texts.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# Tokenize every split in batches, then build the collator that pads each
# training/evaluation batch with the same tokenizer.
tokenized_datasets = dataset_dict.map(function=tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/1428 [00:00<?, ? examples/s]

Map: 100%|██████████| 1428/1428 [00:00<00:00, 5912.58 examples/s]
Map: 100%|██████████| 357/357 [00:00<00:00, 8021.55 examples/s]
Map: 100%|██████████| 315/315 [00:00<00:00, 8779.07 examples/s]


In [58]:
# class_labels = ClassLabel(names=[
#                       "artificial intelligence", 
#                       "information retrieval", 
#                       "robotics"])

In [76]:
def compute_metrics(eval_preds):
    """Compute the one-vs-rest multiclass ROC AUC for an evaluation pass.

    Args:
        eval_preds: Pair ``(preds, labels)``. ``preds`` holds the model outputs
            (softmax is applied here, so presumably raw logits) and ``labels``
            the reference class ids.

    Returns:
        The result of the ``roc_auc`` metric's ``compute`` call.
    """
    # Imported locally so the function works even when ``evaluate`` has not
    # been imported earlier in the session (otherwise this raises NameError).
    import evaluate

    roc_auc_metric = evaluate.load("roc_auc", "multiclass")
    preds, labels = eval_preds
    # Normalise the outputs across the last axis so each row sums to one.
    scores = torch.nn.functional.softmax(torch.tensor(preds), dim=-1)

    return roc_auc_metric.compute(
        prediction_scores=scores, references=labels, multi_class="ovr"
    )

In [None]:
from transformers import TrainingArguments

# Fine-tuning configuration: two epochs at batch size 8 for both training and
# evaluation, with evaluation and logging each triggered once per epoch.
# Outputs are written under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-4,
    weight_decay=0.01,
    eval_strategy="epoch",
    logging_strategy="epoch",
)

In [79]:
from transformers import Trainer

# Wire the model, training arguments, tokenized train/validation splits,
# padding collator and metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


In [80]:
# Launch the fine-tuning run with the Trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 6.02 GB, other allocations: 12.06 GB, max allowed: 18.13 GB). Tried to allocate 89.42 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Keep the object returned by Trainer.predict as-is: the following cell reads
# its ``.predictions`` field, and the returned structure is not tensor data
# that torch.tensor() can wrap.
predictions = trainer.predict(tokenized_datasets["test"])

In [None]:
# Turn the model outputs for the test split into per-class scores that sum to
# one along the last axis.
scores = torch.tensor(predictions.predictions).softmax(dim=-1)

In [None]:
# Load the metric in this cell: the earlier cells never define a
# notebook-level metric object, so calling one here would raise NameError.
import evaluate

roc_auc_metric = evaluate.load("roc_auc", "multiclass")
roc_auc_metric.compute(
    references=test_dataset["label"],
    prediction_scores=scores,
    multi_class="ovr",
)

{'roc_auc': 0.8106397306397306}