# Fine Tuning A BERT Model With HuggingFace

## 1. Introduction

In [None]:
# !pip install arxiv
# !pip install evaluate

In [None]:
import arxiv
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from google.colab import auth

Next, I authenticate as my Google account user. This is helpful because I will be storing the documents as JSON in [Google Cloud Storage](https://cloud.google.com/storage?hl=en), and authenticating through [Colab](https://colab.research.google.com/) means there are no extra steps needed to access the data!

In [None]:
# Authenticate this session through google.colab so later gs:// reads and
# writes can use the signed-in user's credentials.
auth.authenticate_user()

## 2. Collecting The Data

In [None]:
client = arxiv.Client()

# Number of abstracts requested for every category. Using one shared value
# keeps the three classes the same size.
MAX_RESULTS_PER_CATEGORY = 1000


def fetch_abstracts(category, max_results=MAX_RESULTS_PER_CATEGORY):
    """Download abstracts for one arXiv category.

    Args:
        category: arXiv category code used in the ``cat:`` query,
            e.g. ``"cs.AI"``.
        max_results: Upper bound on the number of results requested.

    Returns:
        A list of dicts with keys ``"id"`` (the entry id), ``"code"`` (the
        result's primary category) and ``"text"`` (the abstract).
    """
    search = arxiv.Search(query=f"cat:{category}", max_results=max_results)
    # NOTE(review): "code" is the paper's *primary* category, which may differ
    # from the queried category for cross-listed papers -- confirm the label
    # set really contains exactly the three expected codes.
    return [
        {
            "id": res.entry_id,
            "code": res.primary_category,
            "text": res.summary,
        }
        for res in client.results(search)
    ]


# artificial intelligence abstracts
ai_results = fetch_abstracts("cs.AI")

# information retrieval abstracts
ir_results = fetch_abstracts("cs.IR")

# robotics abstracts
ro_results = fetch_abstracts("cs.RO")

Now we combine them into a single dataframe:

In [None]:
# One row per abstract, with the three category lists stacked in order
# (AI, then IR, then robotics).
df = pd.DataFrame([*ai_results, *ir_results, *ro_results])

In [None]:
# Learn the mapping from category code to integer class id, then attach the
# encoded ids as a new "label" column.
labeler = LabelEncoder().fit(df["code"])
df = df.assign(label=labeler.transform(df["code"]))
# Show the category codes in the order of their integer ids.
labeler.classes_

In [None]:
# Persist the full labelled dataset as JSON in the harmon-arxiv bucket.
df.to_json("gs://harmon-arxiv/abstracts.json")

In [None]:
# df = pd.read_json("gs://harmon-arxiv/abstracts.json")

In [None]:
# Hold out 15% of the abstracts as the test set, stratified on the label so
# each split keeps the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.15,
    stratify=df["label"],
    random_state=42,
)

In [None]:
# Carve a validation set (20% of the remaining training data) out of the
# training split, again stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    stratify=y_train,
    random_state=42,
)

In [None]:
# Re-assemble each split into a two-column frame (text, label).
train_df = pd.DataFrame({"text": X_train, "label": y_train})
val_df = pd.DataFrame({"text": X_val, "label": y_val})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

In [None]:
# Display the (rows, columns) shape of the training split.
train_df.shape

In [None]:
# Display the (rows, columns) shape of the validation split.
val_df.shape

In [None]:
# Display the (rows, columns) shape of the test split.
test_df.shape

In [None]:
# Write each split to its own JSON file in the harmon-arxiv bucket.
for frame, destination in (
    (train_df, "gs://harmon-arxiv/train_abstracts.json"),
    (val_df, "gs://harmon-arxiv/val_abstracts.json"),
    (test_df, "gs://harmon-arxiv/test_abstracts.json"),
):
    frame.to_json(destination)

In [None]:
# Sanity check: list any test rows whose abstract is an empty string.
(
    test_df["text"]
    .apply(len)
    .to_frame("count")
    .query("count == 0")
)

## 2. HuggingFace Models

In [None]:
# Reload the three splits from the harmon-arxiv bucket.
train_df, val_df, test_df = (
    pd.read_json(path)
    for path in (
        "gs://harmon-arxiv/train_abstracts.json",
        "gs://harmon-arxiv/val_abstracts.json",
        "gs://harmon-arxiv/test_abstracts.json",
    )
)

In [None]:
import pandas as pd
from datasets import ClassLabel, Dataset, DatasetDict
import torch
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from torch.utils.data import DataLoader
import evaluate

In [None]:
# Reload the splits from the harmon-arxiv bucket. The bucket name must be part
# of the gs:// URL; without it the paths do not point at the files written
# earlier in this notebook.
train_df = pd.read_json("gs://harmon-arxiv/train_abstracts.json")
val_df = pd.read_json("gs://harmon-arxiv/val_abstracts.json")
# The test split is needed below when the Dataset objects are built.
test_df = pd.read_json("gs://harmon-arxiv/test_abstracts.json")

In [None]:
# class_labels = ClassLabel(names=["artifical intelligence", "information retrival", ""])

In [None]:
# Convert each pandas split into a Dataset; preserve_index=False keeps the
# DataFrame index from being carried over as a column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [None]:
# Bundle the three splits under the names used in the rest of the notebook.
dataset_dict = DatasetDict(
    dict(
        train=train_dataset,
        validation=val_dataset,
        test=test_dataset,
    )
)

In [None]:

checkpoint = "distilbert/distilbert-base-uncased"
# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs on a runtime without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Three output labels, one per arXiv category collected above.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
# ROC AUC metric in its multiclass configuration.
roc_auc_score = evaluate.load("roc_auc", "multiclass")


In [None]:
# Move the model's parameters onto the selected device.
model = model.to(device)

In [None]:

def tokenize_function(example):
    """Tokenize the ``"text"`` field of a batch with truncation enabled.

    Padding is not applied here; it is left to the data collator.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)


# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

In [None]:
# Drop the raw text and rename "label" to "labels" so that the batches can be
# passed to the model as keyword arguments.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns("text")
    .rename_column("label", "labels")
)

In [None]:
# Have the datasets return torch tensors when indexed.
tokenized_datasets = tokenized_datasets.with_format("torch")

In [None]:
# Batches of 8 validation examples, assembled by the padding collator.
val_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=8,
)

In [None]:
# Baseline: score the not-yet-fine-tuned model on the validation set.
model.eval()
with torch.no_grad():
    for batch in val_dataloader:
        # Move every tensor of the batch onto the model's device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # Turn logits into per-class probabilities for the ROC AUC metric.
        scores = torch.softmax(outputs.logits, dim=-1)
        roc_auc_score.add_batch(
            prediction_scores=scores,
            references=batch["labels"],
        )

In [None]:
# One-vs-rest ROC AUC over all batches accumulated with add_batch.
roc_auc_score.compute(multi_class="ovr")

## 3. Fine Tuning DistilBERT

In [None]:
# Display the device the model currently lives on.
model.device

In [None]:
def compute_metrics(eval_preds):
    """Compute one-vs-rest multiclass ROC AUC for the Trainer.

    Args:
        eval_preds: A pair ``(predictions, labels)``; the predictions are
            treated as raw logits and converted to probabilities with a
            softmax over the last axis.

    Returns:
        The result of the ``roc_auc`` metric's ``compute`` call.
    """
    logits, references = eval_preds
    probabilities = torch.softmax(torch.tensor(logits), dim=-1)
    metric = evaluate.load("roc_auc", "multiclass")
    return metric.compute(
        references=references,
        prediction_scores=probabilities,
        multi_class="ovr",
    )

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and outputs are written.
    output_dir="./results",
    # Optimisation settings.
    # NOTE(review): 2e-4 is higher than the values commonly used when
    # fine-tuning BERT-style models -- confirm this is intended.
    learning_rate=2e-4,
    weight_decay=0.01,
    num_train_epochs=1,
    # Batch sizes.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and log once per epoch; disable external reporting.
    eval_strategy="epoch",
    logging_strategy="epoch",
    report_to="none",
)

In [None]:
from transformers import Trainer

# Wire the model, data, collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning with the arguments configured above.
trainer.train()

In [None]:
# Save the fine-tuned model to the local "temp" directory.
trainer.save_model("temp")

In [None]:
# Trainer has no ``load`` method. A model written by ``save_model`` is
# reloaded with ``from_pretrained``; assigning it to ``trainer.model`` makes
# the trainer use the reloaded weights as well.
trainer.model = AutoModelForSequenceClassification.from_pretrained("temp").to(device)

In [None]:
# Fresh metric instance for the post-training evaluation.
roc_auc_score = evaluate.load("roc_auc", "multiclass")
# Evaluate the model held by the trainer.
model = trainer.model


In [None]:

# Score the fine-tuned model on the validation set. Switch to eval mode first
# (as the baseline loop does) so dropout is disabled while scoring.
model.eval()
for batch in val_dataloader:
    # Move every tensor of the batch onto the model's device.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        # Turn logits into per-class probabilities for the ROC AUC metric.
        scores = torch.nn.functional.softmax(outputs.logits, dim=-1)
        roc_auc_score.add_batch(references=batch["labels"],
                                prediction_scores=scores)

In [None]:
# One-vs-rest ROC AUC over all batches accumulated with add_batch.
roc_auc_score.compute(multi_class="ovr")

In [None]:
# trainer.predict returns a PredictionOutput, not array data, so it must not
# be wrapped in torch.tensor; the logits are in its ``predictions`` field.
# Predict on the *test* split so the scores line up with the test labels used
# as references when the metric is computed.
predictions = trainer.predict(tokenized_datasets["test"])

# Convert the logits into per-class probabilities.
scores = torch.nn.functional.softmax(
    torch.tensor(predictions.predictions), dim=-1)

In [None]:
# One-vs-rest ROC AUC against the test-split labels. The scores passed here
# must have been computed on that same split.
roc_auc_score.compute(
    prediction_scores=scores,
    references=test_dataset["label"],
    multi_class="ovr",
)