In [None]:
!pip install evaluate
!pip install -U datasets

In [None]:
import pandas as pd
from sklearn.metrics import multilabel_confusion_matrix, classification_report
import numpy as np

from datasets import Dataset, load_dataset
from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.model_selection import train_test_split
import evaluate
import uuid

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

from datasets import load_dataset, Dataset

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

### Hugging Face Hub Login
- Required to upload the fine-tuned model to the Hub after training

In [None]:
from huggingface_hub import login, create_repo, upload_folder

# Authenticate with the Hugging Face Hub; the credentials are needed by the
# create_repo / upload_folder calls made after training.
login()

### Dataset loading

In [None]:
from datasets import load_dataset

# Fetch the "train" split of the Thai register dataset from the Hub and show its summary.
ds = load_dataset(path="nnudee/Thai-Thangkarn-sentence", split="train")
ds

In [None]:
# Show the first record so the column names and value types are visible.
ds[0]

In [None]:
# Turn the string "label" column into integer class ids, then carve out a
# stratified 80/20 train/test split with a fixed seed for reproducibility.
ds = (
    ds.class_encode_column("label")
    .train_test_split(test_size=0.2, stratify_by_column="label", seed=1122)
)

In [None]:
# The encoded "label" feature owns the id <-> name mapping.
label_feature = ds["train"].features["label"]
num_labels = label_feature.num_classes

# Bound converter methods (id -> name and name -> id).
id2label = label_feature.int2str
label2id = label_feature.str2int

# Plain dict versions; these are what get passed to `from_pretrained` as
# `id2label` / `label2id`.
id2label_dict = {}
label2id_dict = {}
for class_id in range(num_labels):
    class_name = label_feature.int2str(class_id)
    id2label_dict[class_id] = class_name
    label2id_dict[class_name] = class_id

for mapping in (id2label_dict, label2id_dict):
    print(mapping)

### Tokenizer & Language Model - WangchanBERTa


In [None]:
# The tokenizer and the model weights are loaded from the same Hub repository.
model_name = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer_name = model_name

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Settings for the sequence-classification head: one output per register class,
# with the id <-> name maps stored in the model config.
classifier_kwargs = dict(
    num_labels=num_labels,
    id2label=id2label_dict,
    label2id=label2id_dict,
    problem_type="single_label_classification",
    ignore_mismatched_sizes=True,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

In [None]:
# Tokenize the first training sentence to inspect what the tokenizer returns.
first_train_row = ds["train"][0]
sample_sentence = first_train_row["output"]
tokenizer(sample_sentence)

In [None]:
def tokenize_and_encode(examples):
    """Tokenize the "output" text column of a batch of examples.

    Sequences are truncated but intentionally left unpadded. The Trainer in
    this notebook is constructed with `tokenizer=...` and no explicit
    `data_collator`, so it pads each batch to its own longest sequence at
    collation time. Padding every example to the maximum length here would
    only add padding tokens that cost memory and compute.

    Args:
        examples: a batch (dict of columns) as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["output"], truncation=True)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair as supplied by the Trainer.

    Returns:
        dict with the macro-averaged F1 ("f1_macro") and the accuracy ("accuracy").
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)

    return {
        "f1_macro": f1_score(references, predicted_ids, average="macro"),
        "accuracy": accuracy_score(references, predicted_ids),
    }

# Tokenize both splits in batched mode with the tokenizer currently in scope.
tokenized_train_ds, tokenized_test_ds = (
    ds[split_name].map(tokenize_and_encode, batched=True)
    for split_name in ("train", "test")
)

In [None]:
from transformers import TrainingArguments, Trainer

# Directory that receives the per-epoch checkpoints.
output_dir = "./results"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,            # Main parameters for finetuning
    per_device_train_batch_size=80, # Main parameters for finetuning
    per_device_eval_batch_size=80,  # Main parameters for finetuning
    learning_rate=2e-5,             # Main parameters for finetuning
    # NOTE(review): 500 warmup steps may exceed the total number of optimizer
    # steps on a small dataset - confirm against the dataset size.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log, evaluate and checkpoint once per epoch so the best epoch can be restored.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # key returned by compute_metrics
    greater_is_better=True,
    report_to="none",
    # Enable mixed precision whenever a CUDA GPU is available. The freshly loaded
    # model is still on the CPU at this point (the Trainer moves it later), so
    # inspecting `model.device` here would always disable fp16.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

# Assemble the Trainer. The held-out "test" split doubles as the evaluation set,
# so it also drives the best-checkpoint selection configured in `training_args`.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run the fine-tuning loop configured in `training_args`; the messages only
# mark the start and end of the run in the cell output.
print("\nStarting training...")
trainer.train()
print("\nTraining complete!")

In [None]:
# Training split: run inference, keep the gold labels and the arg-max class per example.
print("\nGenerating predictions for the training set...")
train_predictions = trainer.predict(tokenized_train_ds)
train_labels = train_predictions.label_ids
train_logits = train_predictions.predictions
train_preds = np.argmax(train_logits, axis=-1)

# Held-out split (called "validation" in the reports): same procedure.
print("Generating predictions for the validation set...")
val_predictions = trainer.predict(tokenized_test_ds)
val_labels = val_predictions.label_ids
val_logits = val_predictions.predictions
val_preds = np.argmax(val_logits, axis=-1)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# English display names for the Thai register labels. They are keyed by label
# name instead of by position, so a change in the label set or its order cannot
# silently attach the wrong name to a class.
label_translations = {
    'กันเอง': 'Casual',
    'กึ่งทางการ': 'Semi-formal',
    'ทางการ': 'Formal',
    'พิธีการ': 'Ceremonial',
    'ไม่เป็นทางการ': 'Informal',
}

# Class ids in model order and their display names; a label without a
# translation falls back to its original name.
label_ids = list(range(num_labels))
class_names = [label_translations.get(id2label_dict[i], id2label_dict[i]) for i in label_ids]


def report_split(split_name, header, y_true, y_pred):
    """Plot the confusion matrix and print the classification report for one split.

    Args:
        split_name: name used in the plot title, e.g. "Training".
        header: line printed before the plot.
        y_true: gold class ids.
        y_pred: predicted class ids.

    Returns:
        Tuple (confusion matrix, display object, figure, axes, report text).
    """
    print(header)
    # Passing `labels` pins the row/column order to `class_names` even when a
    # class does not occur in this split.
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(cmap=plt.cm.Blues, ax=ax, xticks_rotation='vertical')
    ax.set_title(f"{split_name} Set Confusion Matrix")
    plt.tight_layout()
    plt.show()

    report = classification_report(
        y_true,
        y_pred,
        labels=label_ids,
        target_names=class_names,
        digits=3
    )
    print(report)
    return cm, disp, fig, ax, report


# --- Training set ---
cm_train, disp_train, fig_train, ax_train, train_report = report_split(
    "Training", "\n Training Set Confusion Matrix ", train_labels, train_preds
)

# --- Validation set ---
cm_val, disp_val, fig_val, ax_val, val_report = report_split(
    "Validation", "\n Validation Set Confusion Matrix ", val_labels, val_preds
)

print("\nConfusion matrices displayed.")

### Tokenizer & Language Model - PhayaThaiBERT

In [None]:
# The tokenizer and the model weights are loaded from the same Hub repository.
model_name = "clicknext/phayathaibert"
tokenizer_name = model_name

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Settings for the sequence-classification head: one output per register class,
# with the id <-> name maps stored in the model config.
classifier_kwargs = dict(
    num_labels=num_labels,
    id2label=id2label_dict,
    label2id=label2id_dict,
    problem_type="single_label_classification",
    ignore_mismatched_sizes=True,
)
model = AutoModelForSequenceClassification.from_pretrained(model_name, **classifier_kwargs)

In [None]:
# Tokenize the first training sentence to inspect what this tokenizer returns.
first_train_row = ds["train"][0]
sample_sentence = first_train_row["output"]
tokenizer(sample_sentence)

In [None]:
def tokenize_and_encode(examples):
    """Tokenize the "output" text column of a batch of examples.

    Sequences are truncated but intentionally left unpadded. The Trainer in
    this notebook is constructed with `tokenizer=...` and no explicit
    `data_collator`, so it pads each batch to its own longest sequence at
    collation time. Padding every example to the maximum length here would
    only add padding tokens that cost memory and compute.

    Args:
        examples: a batch (dict of columns) as passed by `Dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["output"], truncation=True)

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a (logits, labels) pair as supplied by the Trainer.

    Returns:
        dict with the macro-averaged F1 ("f1_macro") and the accuracy ("accuracy").
    """
    scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_ids = np.argmax(scores, axis=-1)

    return {
        "f1_macro": f1_score(references, predicted_ids, average="macro"),
        "accuracy": accuracy_score(references, predicted_ids),
    }

# Re-tokenize both splits in batched mode with the tokenizer currently in scope.
tokenized_train_ds, tokenized_test_ds = (
    ds[split_name].map(tokenize_and_encode, batched=True)
    for split_name in ("train", "test")
)

In [None]:
from transformers import TrainingArguments, Trainer

# Dedicated checkpoint directory for this experiment, so its checkpoints do not
# mix with (or overwrite) those written to "./results" by the previous run.
output_dir = "./results_phayathaibert"
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,            # Main parameters for finetuning
    per_device_train_batch_size=40, # Main parameters for finetuning
    per_device_eval_batch_size=40,  # Main parameters for finetuning
    learning_rate=2e-5,             # Main parameters for finetuning
    # NOTE(review): 500 warmup steps may exceed the total number of optimizer
    # steps on a small dataset - confirm against the dataset size.
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    # Log, evaluate and checkpoint once per epoch so the best epoch can be restored.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # key returned by compute_metrics
    greater_is_better=True,
    report_to="none",
    # Enable mixed precision whenever a CUDA GPU is available. The freshly loaded
    # model is still on the CPU at this point (the Trainer moves it later), so
    # inspecting `model.device` here would always disable fp16.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

# Assemble the Trainer. The held-out "test" split doubles as the evaluation set,
# so it also drives the best-checkpoint selection configured in `training_args`.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Run the fine-tuning loop configured in `training_args`; the messages only
# mark the start and end of the run in the cell output.
print("\nStarting training...")
trainer.train()
print("\nTraining complete!")

In [None]:
# Training split: run inference, keep the gold labels and the arg-max class per example.
print("\nGenerating predictions for the training set...")
train_predictions = trainer.predict(tokenized_train_ds)
train_labels = train_predictions.label_ids
train_logits = train_predictions.predictions
train_preds = np.argmax(train_logits, axis=-1)

# Held-out split (called "validation" in the reports): same procedure.
print("Generating predictions for the validation set...")
val_predictions = trainer.predict(tokenized_test_ds)
val_labels = val_predictions.label_ids
val_logits = val_predictions.predictions
val_preds = np.argmax(val_logits, axis=-1)

In [None]:
# Local directory that receives the fine-tuned model and tokenizer files;
# the same folder is uploaded to the Hub afterwards.
model_path = "Thangkarn-model"

# Save the model currently held by the trainer, then the tokenizer, side by side
# so the folder can be loaded again with `from_pretrained`.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
# Target repository on the Hugging Face Hub.
repo_name = "nnudee/Thai-Thangkarn-classifier"

# exist_ok=True makes this cell safe to re-run: without it, create_repo raises
# once the repository has been created by an earlier run.
create_repo(repo_id=repo_name, private=False, exist_ok=True)

# Push the whole saved-model folder to the root of the repository.
upload_folder(
    folder_path=model_path,
    path_in_repo="",
    repo_id=repo_name
)

# The Thai message reads "Model uploaded at: <url>".
print(f"✅ โมเดลถูกอัปโหลดที่: https://huggingface.co/{repo_name}")

In [None]:
from sklearn.metrics import f1_score, accuracy_score, precision_recall_fscore_support, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns

# English display names for the Thai register labels. They are keyed by label
# name instead of by position, so a change in the label set or its order cannot
# silently attach the wrong name to a class.
label_translations = {
    'กันเอง': 'Casual',
    'กึ่งทางการ': 'Semi-formal',
    'ทางการ': 'Formal',
    'พิธีการ': 'Ceremonial',
    'ไม่เป็นทางการ': 'Informal',
}

# Class ids in model order and their display names; a label without a
# translation falls back to its original name.
label_ids = list(range(num_labels))
class_names = [label_translations.get(id2label_dict[i], id2label_dict[i]) for i in label_ids]


def report_split(split_name, header, y_true, y_pred):
    """Plot the confusion matrix and print the classification report for one split.

    Args:
        split_name: name used in the plot title, e.g. "Training".
        header: line printed before the plot.
        y_true: gold class ids.
        y_pred: predicted class ids.

    Returns:
        Tuple (confusion matrix, display object, figure, axes, report text).
    """
    print(header)
    # Passing `labels` pins the row/column order to `class_names` even when a
    # class does not occur in this split.
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

    fig, ax = plt.subplots(figsize=(8, 8))
    disp.plot(cmap=plt.cm.Blues, ax=ax, xticks_rotation='vertical')
    ax.set_title(f"{split_name} Set Confusion Matrix")
    plt.tight_layout()
    plt.show()

    report = classification_report(
        y_true,
        y_pred,
        labels=label_ids,
        target_names=class_names,
        digits=3
    )
    print(report)
    return cm, disp, fig, ax, report


# --- Training set ---
cm_train, disp_train, fig_train, ax_train, train_report = report_split(
    "Training", "\n--- Training Set Confusion Matrix ---", train_labels, train_preds
)

# --- Validation set ---
cm_val, disp_val, fig_val, ax_val, val_report = report_split(
    "Validation", "\n Validation Set Confusion Matrix ", val_labels, val_preds
)

print("\nConfusion matrices displayed.")

### Error analysis

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import numpy as np

In [None]:
# Held-out split as a DataFrame; reset_index() adds the row position as an "index" column.
ds_pd = ds["test"].to_pandas().reset_index()
ds_pd

In [None]:
# Reload the published classifier from the Hub and switch it to inference mode.
MODEL_NAME = "nnudee/Thai-Thangkarn-classifier"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).eval()

# Sentences and gold class ids of the held-out split, plus the list that the
# inference loop fills with predicted class ids.
texts, true_labels = (ds_pd[column].tolist() for column in ("output", "label"))
pred_labels = list()

In [None]:
# Start from an empty list every time this cell runs. Otherwise re-executing it
# appends a second set of predictions and `pred_labels` no longer lines up with
# `texts` / `true_labels`.
pred_labels = []

# Gradients are never needed here, so disable them once around the whole loop.
with torch.no_grad():
    for text in tqdm(texts):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        outputs = model(**inputs)
        # softmax is monotonic, so the arg-max over the raw logits is the same
        # class as the arg-max over the probabilities.
        pred = torch.argmax(outputs.logits, dim=-1).item()
        pred_labels.append(pred)

In [None]:
# One row per held-out sentence: the text, its gold class id and the predicted class id.
df_error = pd.DataFrame(
    dict(output=texts, true_label=true_labels, pred_label=pred_labels)
)


In [None]:
# Add readable label names next to the class ids when the model config provides a mapping.
id_to_name = model.config.id2label
if id_to_name:
    for id_column in ("true_label", "pred_label"):
        df_error[f"{id_column}_name"] = df_error[id_column].map(id_to_name)

# Flag correct predictions, then keep an independent copy of the misclassified rows.
df_error["is_correct"] = df_error["true_label"].eq(df_error["pred_label"])
df_wrong = df_error.loc[~df_error["is_correct"]].copy()

# Display the misclassified rows.
df_wrong

In [None]:
# Keep only the text and the readable label names. Reassigning (instead of
# inplace=True) together with errors="ignore" keeps this cell re-runnable:
# a second execution no longer raises KeyError for the already-dropped columns.
df_wrong = df_wrong.drop(columns=['true_label', 'pred_label', 'is_correct'], errors="ignore")

In [None]:
# Persist the misclassified rows (without the DataFrame index) for later inspection.
df_wrong.to_csv('Error_analyse.csv', index=False)

In [None]:
# Display the misclassified rows.
df_wrong

In [None]:
# Group the errors by (gold label, predicted label) and show each group as its own table.
pair_columns = ["true_label_name", "pred_label_name"]
grouped = df_wrong.groupby(pair_columns)

for pair, group in grouped:
    actual_name, predicted_name = pair
    # The Thai word at the end of the heading means "examples".
    heading = f"\n{actual_name} → {predicted_name} ({len(group)} ตัวอย่าง)"
    print(heading)
    display(group[["output", *pair_columns]].reset_index(drop=True))


In [None]:
import pandas as pd

# Reload the misclassified rows from the CSV file written by this notebook.
df_wrong = pd.read_csv('./Error_analyse.csv')  # try without lines=True first

In [None]:
# Plain-text listing of the errors: one heading per (gold label, predicted label)
# pair followed by the misclassified sentences.
grouped = df_wrong.groupby(["true_label_name", "pred_label_name"])

for (actual_name, predicted_name), group in grouped:
    # The Thai word at the end of the heading means "examples".
    print(f"{actual_name} → {predicted_name} ({len(group)} ตัวอย่าง)")
    for sentence in group["output"]:
        print(f"- {sentence}")