# Import

In [1]:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments


# Step 3 - Train transformer

In [2]:
df = pd.read_csv("../data/tagged_balanced.csv")

In [3]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted")
    }


In [None]:
dataset = Dataset.from_pandas(df[["text", "Tag"]])
dataset = dataset.class_encode_column("Tag")

id2label = dataset.features["Tag"].int2str
label2id = dataset.features["Tag"].str2int

labels_list = dataset.features["Tag"].names

id2label = {i: label for i, label in enumerate(labels_list)}
label2id = {label: i for i, label in enumerate(labels_list)}

dataset = dataset.train_test_split(test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True)

tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.rename_column("Tag", "labels")

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=dataset["train"].features["Tag"].num_classes,
    id2label=id2label,
    label2id=label2id
)

training_args = TrainingArguments(
    output_dir="./models/transformer_model",
    eval_strategy="epoch",
    per_device_train_batch_size=4,
    num_train_epochs=4,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
trainer.evaluate()


Casting to class labels:   0%|          | 0/348 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.820528,0.8,0.711111


In [7]:
trainer.save_model("../models/transformer_model")
tokenizer.save_pretrained("../models/transformer_model")

('../models/transformer_model/tokenizer_config.json',
 '../models/transformer_model/special_tokens_map.json',
 '../models/transformer_model/vocab.txt',
 '../models/transformer_model/added_tokens.json',
 '../models/transformer_model/tokenizer.json')