## Requirements

In [None]:
# !pip install transformers datasets huggingface_hub evaluate wandb pythainlp accelerate
# !pip install sentencepiece
# !pip install thai2transformers

In [None]:
# import wandb
# wandb.login()

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
# NLP Dataset

# !curl -LO https://raw.githubusercontent.com/noppakorn/ASR-project/main/nlp-part/data.csv

## Setup

In [None]:
%env WANDB_PROJECT=asr-project-nlp-part

env: WANDB_PROJECT=asr-project-nlp-part


In [None]:
seed = 42 # The answer to everything

import torch
torch.manual_seed(seed)
torch.use_deterministic_algorithms(True)

import random
random.seed(seed)

import numpy as np
np.random.seed(seed)


def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(seed)

import transformers
transformers.enable_full_determinism(seed)

In [None]:
import re
import pandas as pd

import torchtext

from datasets import load_dataset, Dataset

import evaluate

In [None]:
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

## Datasets

In [None]:
model_checkpoint = 'airesearch/wangchanberta-base-att-spm-uncased'

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    revision='main',
    model_max_length=416
    )

tokenizer.additional_special_tokens = ['<s>NOTUSED', '</s>NOTUSED', '<_>']

In [None]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [None]:
df = pd.read_csv("data.csv")
df = df[df.noodle_type.isin({"บะหมี่", "เส้นเล็ก", "เส้นใหญ่"})]
df = df.iloc[1:].drop("id", axis=1).rename({"input": "text", "noodle_type": "label"}, axis=1)

ds = Dataset.from_pandas(df)
ds = ds.class_encode_column("label")
ds = ds.train_test_split(test_size=0.1, stratify_by_column="label")


label_id = ds["train"].features["label"].names

id2label = {i:j for i,j in enumerate(label_id)}
label2id = {j:i for i,j in id2label.items()}

ds

Casting to class labels:   0%|          | 0/113 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'soup', 'spicy', '__index_level_0__'],
        num_rows: 101
    })
    test: Dataset({
        features: ['text', 'label', 'soup', 'spicy', '__index_level_0__'],
        num_rows: 12
    })
})

In [None]:
tokenized_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

## Model

In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

In [None]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    # return accuracy.compute(predictions=predictions, references=labels)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels),
        "f1": f1.compute(predictions=predictions, references=labels, average="macro"),
    }

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, 
    num_labels=len(id2label), 
    id2label=id2label, 
    label2id=label2id,
)

Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

In [None]:
training_args = TrainingArguments(
    output_dir="asr-project-nlp-part",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=False,
    push_to_hub=False,
    report_to="wandb",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
print("Result before training")
trainer.evaluate()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Result before training


[34m[1mwandb[0m: Currently logged in as: [33mnoppakorn[0m ([33mmeen[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 1.1328198909759521,
 'eval_accuracy': {'accuracy': 0.25},
 'eval_f1': {'f1': 0.21645021645021645},
 'eval_runtime': 0.9826,
 'eval_samples_per_second': 12.213,
 'eval_steps_per_second': 1.018}

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.015677,{'accuracy': 0.5},{'f1': 0.35714285714285715}
2,No log,0.994116,{'accuracy': 0.4166666666666667},{'f1': 0.273015873015873}
3,No log,0.944472,{'accuracy': 0.5833333333333334},{'f1': 0.42735042735042733}
4,No log,0.996964,{'accuracy': 0.5},{'f1': 0.3636363636363636}
5,No log,0.923123,{'accuracy': 0.5},{'f1': 0.3666666666666667}
6,No log,0.866339,{'accuracy': 0.6666666666666666},{'f1': 0.4880952380952381}
7,No log,0.831534,{'accuracy': 0.6666666666666666},{'f1': 0.4880952380952381}
8,No log,0.852817,{'accuracy': 0.6666666666666666},{'f1': 0.4880952380952381}
9,No log,0.854753,{'accuracy': 0.6666666666666666},{'f1': 0.4880952380952381}
10,No log,0.858991,{'accuracy': 0.6666666666666666},{'f1': 0.4880952380952381}


TrainOutput(global_step=70, training_loss=0.8999687194824219, metrics={'train_runtime': 110.5901, 'train_samples_per_second': 9.133, 'train_steps_per_second': 0.633, 'total_flos': 11020443834330.0, 'train_loss': 0.8999687194824219, 'epoch': 10.0})

In [None]:
print("Result after training")
trainer.evaluate()

Result after training


{'eval_loss': 0.8589908480644226,
 'eval_accuracy': {'accuracy': 0.6666666666666666},
 'eval_f1': {'f1': 0.4880952380952381},
 'eval_runtime': 0.0704,
 'eval_samples_per_second': 170.508,
 'eval_steps_per_second': 14.209,
 'epoch': 10.0}

In [None]:
pred = trainer.predict(tokenized_ds["test"])
print(pred)

PredictionOutput(predictions=array([[ 0.8933094 ,  0.6652651 , -0.9933173 ],
       [-0.04161858, -0.15913342, -0.49688157],
       [-0.00165053,  0.22126995, -0.6481161 ],
       [-0.42424852,  0.45186907, -0.57754785],
       [ 0.50140625,  0.22173627, -0.37152675],
       [-0.6451205 ,  0.07911201,  0.00606302],
       [-0.21276471,  1.051708  , -0.87246674],
       [ 0.18750952,  0.88024133, -0.8790394 ],
       [-0.69683826,  0.12036067, -0.17606197],
       [-0.6911022 ,  0.40018544, -0.15414399],
       [-0.39716172,  0.5868643 ,  0.13704619],
       [-0.21861634,  0.14311564, -0.5874522 ]], dtype=float32), label_ids=array([0, 0, 0, 2, 0, 1, 1, 1, 2, 1, 1, 0]), metrics={'test_loss': 0.8589908480644226, 'test_accuracy': {'accuracy': 0.6666666666666666}, 'test_f1': {'f1': 0.4880952380952381}, 'test_runtime': 0.1086, 'test_samples_per_second': 110.521, 'test_steps_per_second': 9.21})


In [47]:
print("Label\t\t", tokenized_ds["test"]["label"])
print("Predicted\t", pred[0].argmax(axis=1).tolist())

Label		 [0, 0, 0, 2, 0, 1, 1, 1, 2, 1, 1, 0]
Predicted	 [0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]
