In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModelForSequenceClassification
from torchsummary import summary
import numpy as np

import evaluate
from transformers import TrainingArguments, Trainer

In [4]:
# Load the "thainq107/ntc-scv" dataset through `datasets.load_dataset`.
ds = load_dataset("thainq107/ntc-scv")
# Bare expression: lets the notebook display the available splits and columns.
ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence'],
        num_rows: 10000
    })
})

# Data preparation

## Tokenization

In [5]:
# Pretrained checkpoint shared by the tokenizer and the model
# (alternative checkpoint: bert-base-uncased).
# NOTE(review): this is an English, uncased checkpoint — confirm it suits the
# language of the dataset's text.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

# Sequences are padded/truncated to 100 tokens, or to the tokenizer's own
# maximum length when that is smaller.
max_seq_length = min(100, tokenizer.model_max_length)


def preprocess_function(examples):
    """Tokenize the ``preprocessed_sentence`` column of a batch of examples.

    Args:
        examples: Mapping of column name to values; must provide the
            ``"preprocessed_sentence"`` and ``"label"`` columns.

    Returns:
        The tokenizer output (padded/truncated to ``max_seq_length``) with the
        ``"label"`` values of the input copied into it.
    """
    # Fixed-length encoding: pad short texts and cut long ones at max_seq_length.
    tokenizer_kwargs = dict(
        padding="max_length",
        max_length=max_seq_length,
        truncation=True,
    )
    encoded = tokenizer(examples["preprocessed_sentence"], **tokenizer_kwargs)

    # Carry the targets along with the encoded inputs.
    encoded["label"] = examples["label"]
    return encoded


# Run the tokenization step over every split of `ds`. With batched=True,
# `preprocess_function` is called on batches of examples rather than on
# single rows; `desc` is the label shown on the progress bar.
processed_dataset = ds.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on dataset",
)

# Bare expression: display the processed splits and their columns.
processed_dataset

Running tokenizer on dataset: 100%|██████████| 30000/30000 [00:02<00:00, 13258.04 examples/s]
Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:00<00:00, 13497.62 examples/s]
Running tokenizer on dataset: 100%|██████████| 10000/10000 [00:00<00:00, 12616.21 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['sentence', 'label', 'preprocessed_sentence', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

# Model training

In [None]:
# Number of output classes for the classification head.
# NOTE(review): assumes the dataset's labels are binary — confirm.
num_labels = 2

# Load the checkpoint's configuration, overriding the number of labels and
# recording the fine-tuning task name.
config = AutoConfig.from_pretrained(
    model_name, num_labels=num_labels, finetuning_task="text-classification"
)
# Build the sequence-classification model from the same checkpoint using the
# configuration above.
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

# Print an overview of the model.
# NOTE(review): `summary` is called without an input size — confirm the
# installed `torchsummary` package accepts that call form.
summary(model)

In [None]:
# Accuracy metric object from `evaluate`; used by `compute_metrics` below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores (class scores on the last axis); it may also be a
            tuple whose first element holds those scores.

    Returns:
        The result of ``metric.compute`` for the predicted class ids against
        ``labels``.
    """
    predictions, labels = eval_pred

    # When the model returns more than the logits, the predictions can arrive
    # as a tuple; the class scores are expected to be its first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Predicted class = index of the highest score along the class axis.
    predicted_ids = np.argmax(predictions, axis=-1)
    return metric.compute(predictions=predicted_ids, references=labels)

In [None]:
# Training configuration: checkpoints are written to "save_model"; evaluation
# and checkpoint saving both happen once per epoch.
# NOTE(review): `load_best_model_at_end=True` is set without
# `metric_for_best_model`, so "best" presumably falls back to the library
# default (evaluation loss) rather than the accuracy reported by
# `compute_metrics` — confirm this is intended.
training_args = TrainingArguments(
    output_dir="save_model",
    learning_rate=2e-5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Train on the "train" split and evaluate on the "valid" split.
# NOTE(review): newer `transformers` releases deprecate the `tokenizer=`
# argument in favour of `processing_class=` — confirm against the installed
# version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["valid"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Start fine-tuning.
trainer.train()