<a href="https://colab.research.google.com/github/pavaris-pm/TV-script-generation/blob/main/transformers_thaipos_%5BTorch%2BHF%5D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training Thai `Part-of-speech` tagging model utilizing Transformers
- the corpus are kept in format of `.conllu` file with all as a test set
- we will tackle this as a recommended in the repo by doing 10-fold cross validation

from total of `1000` sentences, The first 750 sentences are originally English (01). The remaining 250 sentences are originally German (02), French (03), Italian (04) or Spanish (05) and they were translated to other languages via English.

In [None]:
# clone the corpus (updated version)
!git clone https://github.com/UniversalDependencies/UD_Thai-PUD.git
!pip -q install datasets evaluate accelerate
!pip -q install transformers[sentencepiece]
!pip -q install conllu
!pip -q install lion-pytorch
!pip -q install seqeval

Cloning into 'UD_Thai-PUD'...
remote: Enumerating objects: 225, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 225 (delta 62), reused 75 (delta 32), pack-reused 120[K
Receiving objects: 100% (225/225), 2.29 MiB | 17.00 MiB/s, done.
Resolving deltas: 100% (129/129), done.
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

# Install necessary packages

In [None]:
import conllu
from conllu import parse
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    CamembertForMaskedLM,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    get_scheduler,
)
import torch
import torch.nn as nn
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch.optim import AdamW
from lion_pytorch import Lion
import torch.nn.functional as F
from typing import List, Tuple, Dict
from sklearn.model_selection import KFold, StratifiedKFold
from huggingface_hub import notebook_login, Repository, get_full_repo_name
import pandas as pd
from datasets import load_dataset
import evaluate
import numpy as np
from accelerate import Accelerator

# Huggingface Login

In [None]:
# huggingface login -> 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
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Parse the `conllu` format dataset
- note here that we will format it into form of huggingface dataset

In [None]:
# Path to your CoNLL-U file
file_path = "/content/UD_Thai-PUD/th_pud-ud-test.conllu"

# Read the content of the file
with open(file_path, "r", encoding="utf-8") as file:
    data = file.read()

# Parse the content using conllu.parse()
parsed_data = parse(data)

In [None]:
# check for total samples in the corpus
len(parsed_data)

1000

In [None]:
# get access to the text data
parsed_data[0].metadata['text']

'“แม้ว่าการเปลี่ยนไปใช้ระบบดิจิตัลเป็นสิ่งที่ไม่เคยมีมาก่อนในสหรัฐฯ การเปลี่ยนผ่านอำนาจอย่างสันตินั้นก็ไม่ใช่เรื่องใหม่” โคริ ชูลแมน ผู้ช่วยพิเศษของโอบามา เขียนลงบล็อกเมื่อวันจันทร์'

In [None]:
# visualize the data in conllu format
for sentence in parsed_data:
  print(sentence.serialize())
  break

# sent_id = n01001011
# text = “แม้ว่าการเปลี่ยนไปใช้ระบบดิจิตัลเป็นสิ่งที่ไม่เคยมีมาก่อนในสหรัฐฯ การเปลี่ยนผ่านอำนาจอย่างสันตินั้นก็ไม่ใช่เรื่องใหม่” โคริ ชูลแมน ผู้ช่วยพิเศษของโอบามา เขียนลงบล็อกเมื่อวันจันทร์
# translit = “mǽ²wá¹kárpélí¹ynpaićʰai²rabbdičitâlpensi¹ŋdʰí¹mai¹gʰéymímák¹'nnaishrâṭʰ. kárpélí¹ynpʰá¹n'ãnáčɔːyá¹ŋsântinâ²nkɔmai¹ćʰai¹rűá¹ŋhaim¹” gʰóri ćʰúlmǽn pʰú²ćʰ¹wybʰiśéškʰɔːŋ'óbámá kʰíánlŋblɔkműá¹wânčândʰr
# text_en = “While much of the digital transition is unprecedented in the United States, the peaceful transition of power is not,” Obama special assistant Kori Schulman wrote in a blog post Monday.
1	“	_	PUNCT	``	_	27	punct	_	SpaceAfter=No|Translit=“
2	แม้	_	ADP	IN	_	10	mark	_	SpaceAfter=No|Translit=mǽ²
3	ว่า	_	ADP	IN	_	2	fixed	_	SpaceAfter=No|Translit=wá¹
4	การเปลี่ยน	_	VERB	VV	_	10	csubj	_	SpaceAfter=No|Translit=kárpélí¹yn
5	ไป	_	PART	RP	_	4	compound:prt	_	SpaceAfter=No|Translit=pai
6	ใช้	_	VERB	VV	_	4	xcomp	_	SpaceAfter=No|Translit=ćʰai²
7	ระบบ	_	NOUN	NN	_	6	obj	_	Spa

In [None]:
def get_pos_tags()->Tuple[Dict, Dict]:
  # Access the sentences and their properties
  pos_tags = {} # this is to keep all unique pos tags - label2id tag
  upos_list = []

  for sentence in parsed_data:
    for token in sentence:
        # Access properties of each token in the sentence
        word = token["form"]
        upos = token["upostag"]

        if upos not in pos_tags:
          pos_tags[upos] = len(pos_tags)

  # get id2label tag
  rv_pos_tags = {v:k for k, v in pos_tags.items()}

  # Perform further processing or analysis based on the parsed data
  # return label2id, id2label
  return (pos_tags, rv_pos_tags)

In [None]:
# obtain a label
label2id, id2label = get_pos_tags()

In [None]:
# we will format it in form of huggingface dataset
text_batched, word_batched, upos_batched = [], [], []

for sentence in parsed_data:
    word_list, upos_list = [], []
    text_batched.append(sentence.metadata['text'])
    for token in sentence:
        # Access properties of each token in the sentence
        word = token["form"]
        upos = label2id[token["upostag"]] # convert from text to id instead
        word_list.append(word)
        upos_list.append(upos)
    # add into the big batch
    word_batched.append(word_list)
    upos_batched.append(upos_list)

print(len(text_batched))
print(len(word_batched))
print(len(upos_batched))

1000
1000
1000


In [None]:
conllu_format = pd.DataFrame({
    'text' : text_batched,
    'tokens' : word_batched,
    'pos_tags': upos_batched,
})

conllu_format

Unnamed: 0,text,tokens,pos_tags
0,“แม้ว่าการเปลี่ยนไปใช้ระบบดิจิตัลเป็นสิ่งที่ไม...,"[“, แม้, ว่า, การเปลี่ยน, ไป, ใช้, ระบบ, ดิจิต...","[0, 1, 1, 2, 3, 2, 4, 5, 6, 4, 7, 3, 6, 2, 6, ..."
1,สำหรับผู้ที่ติดตามการเปลี่ยนผ่านโซเชียลมีเดียใ...,"[สำหรับ, ผู้, ที่, ติดตาม, การเปลี่ยนผ่าน, โซเ...","[1, 4, 7, 2, 2, 5, 4, 1, 9, 4, 7, 6, 2, 3, 8]"
2,แต่จากวาทกรรมก่อนเมื่อไม่นานเกี่ยวกับการลดการอ...,"[แต่, จาก, วาทกรรม, ก่อน, เมื่อ, ไม่, นาน, เกี...","[10, 1, 4, 5, 1, 3, 5, 2, 1, 2, 2, 3, 4, 2, 9,..."
3,“คือผมไม่ได้ชอบกดดันพวกคุณหรอกนะ แต่ชะตากรรมขอ...,"[“, คือ, ผม, ไม่, ได้, ชอบ, กดดัน, พวก, คุณ, ห...","[0, 2, 11, 3, 6, 2, 2, 4, 11, 3, 3, 10, 4, 1, ..."
4,การใช้จ่ายครั้งใหม่นี้ได้รับการสนับสนุนโดยบัญช...,"[การใช้จ่าย, ครั้ง, ใหม่, นี้, ได้, รับ, การสน...","[2, 4, 5, 7, 6, 2, 2, 1, 4, 4, 7, 5, 1, 9]"
...,...,...,...
995,ปอมปีย์เข้าบัญชาการกองทหาร 2 กองในกาปัวและเริ่...,"[ปอมปีย์, เข้า, บัญชาการ, กอง, ทหาร, 2, กอง, ใ...","[9, 2, 2, 4, 4, 12, 4, 1, 9, 10, 2, 2, 4, 3, 6..."
996,ซีซาร์ได้รับแจ้งถึงการกระทำของปอมปีย์จากคิวริโ...,"[ซีซาร์, ได้รับ, แจ้ง, ถึง, การกระทำ, ของ, ปอม...","[9, 6, 2, 1, 2, 1, 9, 1, 9, 4, 7, 2, 4, 1, 4, 7]"
997,ในขณะเดียวกัน ตำแหน่งของเขาในองค์กรมีมาร์โค อั...,"[ใน, ขณะ, เดียว, กัน, ตำแหน่ง, ของ, เขา, ใน, อ...","[1, 4, 5, 11, 4, 1, 11, 1, 4, 2, 9, 9, 6, 7, 6..."
998,แต่เมื่อสภาตอบเขาอย่างชัดเจนโดยการไม่อนุญาตให้...,"[แต่, เมื่อ, สภา, ตอบ, เขา, อย่างชัดเจน, โดย, ...","[10, 1, 4, 2, 11, 5, 1, 3, 2, 2, 11, 2, 2, 4, ..."


In [None]:
#conllu_format.to_csv('ud_thai_pud_conllu.csv', index=False)

## define `CustomDataset` for KFold Cross Validation training

In [None]:
class ThaiPOSDataset(Dataset):
  def __init__(self,
               parse_data: conllu.models.SentenceList,
               tokenizer: AutoTokenizer,
               label2id: Dict,

               ):
    self.data = parse_data
    self.tokenizer = tokenizer
    self.label2id = label2id
    self.max_length = max_length
    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'


  def __len__(self)->int:
    length = len(self.data)
    return length


  def __getitem__(self, index: int)->Tuple[torch.tensor, torch.tensor, torch.tensor]:
    # return words and labels
    word_list, upos_list= [], []
    # get through the data index
    sentence = self.data[index] # this will get each sentence
    for token in sentence:
      # Access properties of each token in the sentence
      word = token["form"]
      upos = self.label2id[token["upostag"]] # we will convert it to id
      word_list.append(word)
      upos_list.append(upos)

    # convert to format that is simpler to be tokenized
    text = " ".join(word_list) # will group up to be string and pass to tokenizer
    raw_upos = upos_list

    # convert to tensor
    tokenized_text = tokenizer(text, return_tensors='pt')
    input_ids = tokenized_text.input_ids
    attention_mask = tokenized_text.attention_mask
    labels = torch.tensor(raw_upos) # with tensor, it is integer

    # padding label to be equal so that it can be group into batched
    # where it will be grouped to max_length so that things will be much easier
    #padded_labels = F.pad(labels, (0, max(0, self.max_length - labels.size(0))), mode='constant', value=0)
    #input_ids, attention_mask, padded_labels = input_ids.to(self.device), attention_mask.to(self.device), padded_labels.to(self.device)
    #padded_labels = padded_labels.to(self.device)

    return (input_ids, attention_mask, labels)


# Format `.csv` file into huggingface dataset

In [None]:
# # format it to huggingface dataset
# dataset = load_dataset('csv', data_files={'train': "ud_thai_pud_conllu.csv"})
# dataset

# Load `thai_pud` directly from huggingface

In [None]:
dataset = load_dataset("universal_dependencies", "th_pud")
dataset

Downloading builder script:   0%|          | 0.00/87.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.33M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/191k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/332k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['idx', 'text', 'tokens', 'lemmas', 'upos', 'xpos', 'feats', 'head', 'deprel', 'deps', 'misc'],
        num_rows: 1000
    })
})

In [None]:
pos_feature = dataset["test"].features["upos"]
label_names = pos_feature.feature.names
print(label_names)
print(len(label_names))

['NOUN', 'PUNCT', 'ADP', 'NUM', 'SYM', 'SCONJ', 'ADJ', 'PART', 'DET', 'CCONJ', 'PROPN', 'PRON', 'X', '_', 'ADV', 'INTJ', 'VERB', 'AUX']
18


In [None]:
# get label
label2id = {}
for tag in label_names:
  if tag not in label2id.keys():
    label2id[tag] = len(label2id)

id2label = {v: k for k, v in label2id.items()}

print(label2id)
print(id2label)
print(len(label2id))
print(len(id2label))

{'NOUN': 0, 'PUNCT': 1, 'ADP': 2, 'NUM': 3, 'SYM': 4, 'SCONJ': 5, 'ADJ': 6, 'PART': 7, 'DET': 8, 'CCONJ': 9, 'PROPN': 10, 'PRON': 11, 'X': 12, '_': 13, 'ADV': 14, 'INTJ': 15, 'VERB': 16, 'AUX': 17}
{0: 'NOUN', 1: 'PUNCT', 2: 'ADP', 3: 'NUM', 4: 'SYM', 5: 'SCONJ', 6: 'ADJ', 7: 'PART', 8: 'DET', 9: 'CCONJ', 10: 'PROPN', 11: 'PRON', 12: 'X', 13: '_', 14: 'ADV', 15: 'INTJ', 16: 'VERB', 17: 'AUX'}
18
18


# Load `WangchanBerta` Model for token classification
- we aims to use 2 models at this time. first is wangchanberta where second is mDeBertaV3

In [None]:
id2label, label2id

({0: 'NOUN',
  1: 'PUNCT',
  2: 'ADP',
  3: 'NUM',
  4: 'SYM',
  5: 'SCONJ',
  6: 'ADJ',
  7: 'PART',
  8: 'DET',
  9: 'CCONJ',
  10: 'PROPN',
  11: 'PRON',
  12: 'X',
  13: '_',
  14: 'ADV',
  15: 'INTJ',
  16: 'VERB',
  17: 'AUX'},
 {'NOUN': 0,
  'PUNCT': 1,
  'ADP': 2,
  'NUM': 3,
  'SYM': 4,
  'SCONJ': 5,
  'ADJ': 6,
  'PART': 7,
  'DET': 8,
  'CCONJ': 9,
  'PROPN': 10,
  'PRON': 11,
  'X': 12,
  '_': 13,
  'ADV': 14,
  'INTJ': 15,
  'VERB': 16,
  'AUX': 17})

In [None]:
# Load model directly
def init_model(model_name: str)->Tuple[AutoTokenizer, AutoModelForTokenClassification]:
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForTokenClassification.from_pretrained(
      model_name,
      num_labels=len(label2id),
      label2id=label2id,
      id2label=id2label,
  )
  return (model, tokenizer)


In [None]:
model_names = {
    'deberta': 'microsoft/mdeberta-v3-base',
    'wangchan': 'airesearch/wangchanberta-base-att-spm-uncased',
    'tuned-mdeberta': 'Pavarissy/mdeberta-v3-ud-thai-pud-upos',
}

model, tokenizer = init_model(model_names['deberta']) # we will change the decoder part
model

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DebertaV2ForTokenClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(251000, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=Tr

In [None]:
model.config

DebertaV2Config {
  "_name_or_path": "microsoft/mdeberta-v3-base",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "NOUN",
    "1": "PUNCT",
    "2": "ADP",
    "3": "NUM",
    "4": "SYM",
    "5": "SCONJ",
    "6": "ADJ",
    "7": "PART",
    "8": "DET",
    "9": "CCONJ",
    "10": "PROPN",
    "11": "PRON",
    "12": "X",
    "13": "_",
    "14": "ADV",
    "15": "INTJ",
    "16": "VERB",
    "17": "AUX"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "ADJ": 6,
    "ADP": 2,
    "ADV": 14,
    "AUX": 17,
    "CCONJ": 9,
    "DET": 8,
    "INTJ": 15,
    "NOUN": 0,
    "NUM": 3,
    "PART": 7,
    "PRON": 11,
    "PROPN": 10,
    "PUNCT": 1,
    "SCONJ": 5,
    "SYM": 4,
    "VERB": 16,
    "X": 12,
    "_": 13
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "laye

In [None]:
print(dataset['test'][0]['tokens'])
print(dataset['test'][0]['upos'])

['“', 'แม้', 'ว่า', 'การเปลี่ยน', 'ไป', 'ใช้', 'ระบบ', 'ดิจิตัล', 'เป็น', 'สิ่ง', 'ที่', 'ไม่', 'เคย', 'มี', 'มา', 'ก่อน', 'ใน', 'สหรัฐ', 'ฯ', 'การเปลี่ยนผ่าน', 'อำนาจ', 'อย่างสันติ', 'นั้น', 'ก็', 'ไม่', 'ใช่', 'เรื่อง', 'ใหม่', '”', 'โคริ', 'ชูลแมน', 'ผู้', 'ช่วย', 'พิเศษ', 'ของ', 'โอบามา', 'เขียน', 'ลง', 'บล็อก', 'เมื่อ', 'วัน', 'จันทร์']
[1, 2, 2, 16, 7, 16, 0, 6, 17, 0, 8, 7, 7, 16, 7, 14, 2, 0, 1, 16, 0, 0, 8, 14, 7, 17, 0, 6, 1, 10, 10, 0, 16, 6, 2, 10, 16, 16, 0, 2, 0, 0]


In [None]:
# visualize for words in the dataset
words = dataset["test"][0]["tokens"]
labels = dataset["test"][0]["upos"]
line1 = ""
line2 = ""
for word, label in zip(words, labels):
    full_label = label_names[label]
    max_length = max(len(word), len(full_label))
    line1 += word + " " * (max_length - len(word) + 1)
    line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

“     แม้ ว่า การเปลี่ยน ไป   ใช้  ระบบ ดิจิตัล เป็น สิ่ง ที่ ไม่  เคย  มี   มา   ก่อน ใน  สหรัฐ ฯ     การเปลี่ยนผ่าน อำนาจ อย่างสันติ นั้น ก็  ไม่  ใช่ เรื่อง ใหม่ ”     โคริ  ชูลแมน ผู้  ช่วย พิเศษ ของ โอบามา เขียน ลง   บล็อก เมื่อ วัน  จันทร์ 
PUNCT ADP ADP VERB       PART VERB NOUN ADJ     AUX  NOUN DET PART PART VERB PART ADV  ADP NOUN  PUNCT VERB           NOUN  NOUN       DET  ADV PART AUX NOUN   ADJ  PUNCT PROPN PROPN  NOUN VERB ADJ   ADP PROPN  VERB  VERB NOUN  ADP   NOUN NOUN   


In [None]:
# tokenized input and include special tokens
inputs = tokenizer(dataset["test"][0]["tokens"], is_split_into_words=True)
print(inputs.tokens())

['[CLS]', '▁“', '▁', 'แม้', '▁', 'ว่า', '▁การ', 'เปลี่ยน', '▁', 'ไป', '▁ใช้', '▁ระบบ', '▁', 'ดิ', 'จิต', 'ัล', '▁เป็น', '▁', 'สิ่ง', '▁ที่', '▁ไม่', '▁', 'เคย', '▁มี', '▁', 'มา', '▁ก่อน', '▁ใน', '▁', 'สหรัฐ', '▁', 'ฯ', '▁การ', 'เปลี่ยน', 'ผ่าน', '▁', 'อํานาจ', '▁', 'อย่าง', 'สันติ', '▁', 'นั้น', '▁ก็', '▁ไม่', '▁', 'ใช่', '▁เรื่อง', '▁', 'ใหม่', '▁', '”', '▁', 'โค', 'ริ', '▁', 'ชู', 'ล', 'แมน', '▁ผู้', '▁', 'ช่วย', '▁', 'พิเศษ', '▁', 'ของ', '▁โอ', 'บา', 'มา', '▁', 'เขียน', '▁', 'ลง', '▁', 'บล็อก', '▁เมื่อ', '▁วัน', '▁', 'จันทร์', '[SEP]']


In [None]:
print(inputs.word_ids())

[None, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 7, 8, 9, 9, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 29, 30, 30, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 38, 38, 39, 40, 41, 41, None]


## Aligning tokens

In [None]:
## this is for another language, seems not to be worked with Thai language
# def align_labels_with_tokens(labels, word_ids):
#     new_labels = []
#     current_word = None
#     for word_id in word_ids:
#         if word_id != current_word:
#             # Start of a new word!
#             current_word = word_id
#             label = -100 if word_id is None else labels[word_id]
#             new_labels.append(label)
#         elif word_id is None:
#             # Special token
#             new_labels.append(-100)
#         else:
#             # Same word as previous token
#             label = labels[word_id]
#             # If the label is B-XXX we change it to I-XXX
#             if label % 2 == 1:
#                 label += 1
#             new_labels.append(label)

#     return new_labels

## Align tokens in Thai dataset
- in nlp task, result after tokenization need to be treated

In [None]:
example_dataset = dataset['test'][0]
example_dataset
tokenized_input = tokenizer(example_dataset['tokens'], is_split_into_words=True)
print(tokenized_input.word_ids()) # where None indicated start and stop position
# to show that the length after tokenize compared with the label are not the same
# with that we need to re-align label to prevent model from misunderstand
print(len(tokenized_input['input_ids']), len(example_dataset['upos']))

[None, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 6, 7, 7, 7, 7, 8, 9, 9, 10, 11, 12, 12, 13, 14, 14, 15, 16, 17, 17, 18, 18, 19, 19, 19, 20, 20, 21, 21, 21, 22, 22, 23, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 29, 30, 30, 30, 30, 31, 32, 32, 33, 33, 34, 34, 35, 35, 35, 36, 36, 37, 37, 38, 38, 39, 40, 41, 41, None]
79 42


In [None]:
# with this, we need to re-align the label so that it matched with the result after tokenization
tokenized_input = tokenizer(example_dataset['tokens'], is_split_into_words=True)
# align the label to token after tokenization,
# we will map the position (word) id in the same id position with pos tag
aligned_labels = [-100 if id is None else example_dataset['upos'][id] for id in tokenized_input.word_ids()]

# show that aligning label would help, show that both label and input token after tokenization are equal
print(len(aligned_labels), len(tokenized_input['input_ids']))
print('input tokens:', tokenized_input['input_ids'])
print('aligned label:', aligned_labels)

79 79
input tokens: [1, 360, 260, 118789, 260, 6583, 12594, 48775, 260, 5270, 116650, 113309, 260, 81126, 97151, 94450, 21192, 260, 35831, 17637, 24947, 260, 57938, 13597, 260, 4729, 136532, 21638, 260, 191093, 260, 26037, 12594, 48775, 23044, 260, 147288, 260, 11859, 236015, 260, 10908, 42278, 24947, 260, 99756, 92693, 260, 11868, 260, 366, 260, 69013, 49377, 260, 148313, 7911, 104661, 70547, 260, 31851, 260, 44701, 260, 1882, 136006, 64470, 4729, 260, 53720, 260, 19896, 260, 182680, 21943, 22889, 260, 144526, 2]
aligned label: [-100, 1, 2, 2, 2, 2, 16, 16, 7, 7, 16, 0, 6, 6, 6, 6, 17, 0, 0, 8, 7, 7, 7, 16, 7, 7, 14, 2, 0, 0, 1, 1, 16, 16, 16, 0, 0, 0, 0, 0, 8, 8, 14, 7, 17, 17, 0, 6, 6, 1, 1, 10, 10, 10, 10, 10, 10, 10, 0, 16, 16, 6, 6, 2, 2, 10, 10, 10, 16, 16, 16, 16, 0, 0, 2, 0, 0, 0, -100]


In [None]:
# this is already adjusted for Thai dataaset
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    label_all_tokens = True
    labels = []

    for i, label in enumerate(examples[f"upos"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None: # case that it is a special token
                label_ids.append(-100)
            elif word_idx != previous_word_idx: # case that it is a new word
                label_ids.append(label[word_idx])
            else: # this is the case that it still the same word
                label_ids.append(label[word_idx] if label_all_tokens else -100)

            # set the current word to be previous word before going to the next word
            # so that there has a thing to compare
            previous_word_idx = word_idx

        labels.append(label_ids)

    # this will re-align the label
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["test"].column_names, # remove the former column in the dataset to be actual features
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

## load `DataCollator`

In [None]:
# get a data collator model
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
batch = data_collator([tokenized_datasets["test"][i] for i in range(2)])
batch["labels"]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[-100,    1,    2,    2,    2,    2,   16,   16,    7,    7,   16,    0,
            6,    6,    6,    6,   17,    0,    0,    8,    7,    7,    7,   16,
            7,    7,   14,    2,    0,    0,    1,    1,   16,   16,   16,    0,
            0,    0,    0,    0,    8,    8,   14,    7,   17,   17,    0,    6,
            6,    1,    1,   10,   10,   10,   10,   10,   10,   10,    0,   16,
           16,    6,    6,    2,    2,   10,   10,   10,   16,   16,   16,   16,
            0,    0,    2,    0,    0,    0, -100],
        [-100,    2,    0,    8,   16,   16,   16,   16,   16,    6,    6,    6,
            6,    0,    0,    2,   10,   10,   10,   10,   10,   10,   10,    0,
            8,    8,   16,   16,   16,    7,    7,   14,   14, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, 

In [None]:
for i in range(2):
    print(tokenized_datasets["test"][i]["labels"])

[-100, 1, 2, 2, 2, 2, 16, 16, 7, 7, 16, 0, 6, 6, 6, 6, 17, 0, 0, 8, 7, 7, 7, 16, 7, 7, 14, 2, 0, 0, 1, 1, 16, 16, 16, 0, 0, 0, 0, 0, 8, 8, 14, 7, 17, 17, 0, 6, 6, 1, 1, 10, 10, 10, 10, 10, 10, 10, 0, 16, 16, 6, 6, 2, 2, 10, 10, 10, 16, 16, 16, 16, 0, 0, 2, 0, 0, 0, -100]
[-100, 2, 0, 8, 16, 16, 16, 16, 16, 6, 6, 6, 6, 0, 0, 2, 10, 10, 10, 10, 10, 10, 10, 0, 8, 8, 16, 16, 16, 7, 7, 14, 14, -100]


## define `metrics` using `seqeval`

In [None]:
metric = evaluate.load("poseval")
metric

Downloading builder script:   0%|          | 0.00/4.46k [00:00<?, ?B/s]

EvaluationModule(name: "poseval", module_type: "metric", features: {'predictions': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='label'), length=-1, id='sequence')}, usage: """
Computes the poseval metric.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    zero_division: Which value to substitute as a metric value when encountering zero division. Should be on of 0, 1,

Returns:
    'scores': dict. Summary of the scores for overall and per type
        Overall (weighted and macro avg):
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': F1 score, also known as balanced F-score or F-measure,
        Per type:
            'precision': precision,
            'recall': recall,
            'f1': F1 

In [None]:
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "macro avg precision": all_metrics["macro avg"]['precision'],
        "macro avg recall": all_metrics["macro avg"]['recall'],
        "macro avg f1": all_metrics["macro avg"]['f1-score'],
        "weighted avg precision": all_metrics["weighted avg"]['precision'],
        "weighted avg recall": all_metrics["weighted avg"]['recall'],
        "weighted avg f1": all_metrics["weighted avg"]['f1-score'],
        "accuracy": all_metrics["accuracy"],
    }

In [None]:
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [None]:
# divide dataset into split
#tokenized_datasets = tokenized_datasets['test'].train_test_split(test_size=0.2, seed=42)
#tokenized_datasets

## training using `Huggingface Trainer`

In [None]:
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/content/udthaipos_model_weights",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    metric_for_best_model='accuracy',
    load_best_model_at_end=True,
    push_to_hub=True,
    auto_find_batch_size=True,
    gradient_checkpointing=True,
    seed=42,
    hub_model_id="Pavarissy/wangchanberta-ud-thai-pud-upos",
)

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["test"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer.train()



Epoch,Training Loss,Validation Loss,Macro avg precision,Macro avg recall,Macro avg f1,Weighted avg precision,Weighted avg recall,Weighted avg f1,Accuracy
1,No log,0.5563,0.810297,0.723472,0.755155,0.857422,0.852248,0.849477,0.852248
2,No log,0.231587,0.870088,0.845994,0.856367,0.932048,0.931495,0.931029,0.931495
3,No log,0.163496,0.89028,0.872892,0.880862,0.951127,0.951058,0.950758,0.951058
4,0.578200,0.111216,0.903654,0.896411,0.899814,0.968672,0.96851,0.968462,0.96851
5,0.578200,0.085976,0.91101,0.904961,0.90789,0.975223,0.975197,0.975149,0.975197
6,0.578200,0.067523,0.916046,0.91035,0.913077,0.981495,0.981433,0.981407,0.981433
7,0.578200,0.05883,0.918866,0.913847,0.916294,0.983938,0.983923,0.983902,0.983923
8,0.107300,0.051437,0.921435,0.915484,0.918399,0.985848,0.985844,0.985821,0.985844
9,0.107300,0.046271,0.922462,0.917073,0.91971,0.987655,0.987646,0.987627,0.987646
10,0.107300,0.044153,0.922102,0.917778,0.919898,0.988341,0.988333,0.988318,0.988333


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

TrainOutput(global_step=1250, training_loss=0.2883079788208008, metrics={'train_runtime': 341.4145, 'train_samples_per_second': 29.29, 'train_steps_per_second': 3.661, 'total_flos': 360825436239840.0, 'train_loss': 0.2883079788208008, 'epoch': 10.0})

In [None]:
# evaluate test data
metrics = trainer.evaluate(tokenized_datasets['test'])
metrics

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.04415310174226761,
 'eval_macro avg precision': 0.9221015664202458,
 'eval_macro avg recall': 0.9177777813021438,
 'eval_macro avg f1': 0.9198982212872856,
 'eval_weighted avg precision': 0.9883405622265768,
 'eval_weighted avg recall': 0.9883334914161055,
 'eval_weighted avg f1': 0.9883176187725993,
 'eval_accuracy': 0.9883334914161055,
 'eval_runtime': 5.4534,
 'eval_samples_per_second': 183.373,
 'eval_steps_per_second': 22.922,
 'epoch': 10.0}

In [None]:
trainer.push_to_hub("Pavarissy/wangchanberta-ud-thai-pud-upos") # this part is for wanngchanberta
#trainer.push_to_hub("Pavarissy/mdeberta-v3-ud-thai-pud-upos")

'https://huggingface.co/Pavarissy/wangchanberta-ud-thai-pud-upos/tree/main/'

# Test Model Pipeline

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, TokenClassificationPipeline

model_name = "Pavarissy/wangchanberta-ud-thai-pud-upos"
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, grouped_entities=True)
outputs = pipeline("ประเทศไทย อยู่ใน ทวีป เอเชีย")
print(outputs)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/419M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.18M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/365 [00:00<?, ?B/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'NOUN', 'score': 0.419697, 'word': '', 'start': 0, 'end': 1}, {'entity_group': 'PROPN', 'score': 0.8809489, 'word': 'ประเทศไทย', 'start': 0, 'end': 9}, {'entity_group': 'VERB', 'score': 0.7754166, 'word': 'อยู่ใน', 'start': 9, 'end': 16}, {'entity_group': 'NOUN', 'score': 0.9976932, 'word': 'ทวีป', 'start': 16, 'end': 21}, {'entity_group': 'PROPN', 'score': 0.97770107, 'word': 'เอเชีย', 'start': 21, 'end': 28}]


# Intensive Training using `PyTorch` 🔥

# Define `DataLoader` for construct a training loop

In [None]:
# check for labels again (must be 18)
model.config.num_labels

18

In [None]:
# this is just for kfold cross validation purposes
train_data = ThaiPOSDataset(
               parse_data=parsed_data,
               tokenizer=tokenizer,
               label2id=label2id,
            )
train_data

<__main__.ThaiPOSDataset at 0x7c5f69047910>

In [None]:
BATCH_SIZE = 16
EPOCHS = 30
# K-Fold Cross Validation
K_FOLD = 10


kfold = KFold(n_splits=K_FOLD, shuffle=True)


train_dataloader = DataLoader(
    tokenized_datasets["test"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
)

In [None]:
# define another objective component
optimizer = Lion(model.parameters(), lr=2e-5)
loss_fn = nn.NLLLoss() #nn.CrossEntropyLoss()

num_train_epochs = EPOCHS
num_update_steps_per_epoch = len(train_dataloader) # iterations (total batches until complete 1 epoch)
num_training_steps = num_train_epochs * num_update_steps_per_epoch # total steps in all epochs

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess(predictions, labels):
    # format it into form of seqeval
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

# Construct torch training loop
- this is for testing and debugging the pipeline of torch

In [None]:
# this is a debugging loop (we will fix it later)
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(device)

model.train()
for batch_data in train_dataloader:
  #print(batch_data) # can be unpacked
  # forward pass
  outputs = model(**batch_data.to(device))
  predictions = outputs.logits.argmax(dim=-1)
  labels = batch['labels']
  true_predictions, true_labels = postprocess(predictions, labels)
  print(true_predictions)
  print("shape after predictions: ", true_predictions.shape, true_labels.shape)
  # loss calculation
  loss = outputs.loss
  print('loss from model:', loss)
  loss_2 = loss_fn(true_predictions, true_labels)
  print('loss from loss function:', loss_2)
  # logits shape : [batch_size, total_words, possible_tags_probs]
  loss.backward() # compute gradient
  optimizer.step()
  lr_scheduler.step()
  optimizer.zero_grad()
  break

model.eval()
for batch_data in train_dataloader:
  with torch.inference_mode():
    outputs = model(**batch_data.to(device))

    predictions = outputs.logits.argmax(dim=-1)
    labels = batch["labels"]

  # compute the score
  true_predictions, true_labels = postprocess(predictions, labels)
  metric.add_batch(predictions=true_predictions, references=true_labels)
  break
results = metric.compute()
print(results)

cuda:0
[['PUNCT', 'ADP', 'ADP', 'ADP', 'ADP', 'VERB', 'VERB', 'PART', 'PART', 'VERB', 'NOUN', 'ADJ', 'ADJ', 'ADJ', 'ADJ', 'AUX', 'NOUN', 'NOUN', 'DET', 'PART', 'PART', 'PART', 'VERB', 'PART', 'PART', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT', 'PUNCT', 'VERB', 'VERB', 'VERB', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'DET', 'DET', 'ADV', 'PART', 'AUX', 'AUX', 'NOUN', 'ADJ', 'ADJ', 'PUNCT', 'PUNCT', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'NOUN', 'VERB', 'VERB', 'ADJ', 'ADJ', 'ADP', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'VERB', 'VERB', 'VERB', 'VERB', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'NOUN'], ['ADP', 'NOUN', 'DET', 'VERB', 'VERB', 'VERB', 'VERB', 'VERB', 'ADJ', 'ADJ', 'ADJ', 'ADJ', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'PROPN', 'NOUN', 'DET', 'DET', 'VERB', 'VERB', 'VERB', 'PART', 'PART', 'ADV', 'ADV']]


AttributeError: ignored

# [START] Training on `KFold` manner

# `K-Fold` Cross Validation training
- In order to check robustness of the model across wide range of datasets
- Lion optimizer (experiment with nlp task)
- gradient accumulation, aimed at `batch size = 64`, note here that we will cover the case where the last batch is not a complete desired batch size
- the model that held the best macro avg score will be considerated to be a great model

In [None]:
!nvidia-smi

Mon Nov  6 02:20:40 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
model.config._name_or_path

'microsoft/mdeberta-v3-base'

In [None]:
def setup_training_component(model_name: str,
                             train_idx,
                             val_idx,
                            ) -> Tuple:

  model, tokenizer = init_model(model_name)

  # define another objective component
  optimizer = Lion(model.parameters(), lr=2e-5)
  #loss_fn = nn.NLLLoss() #nn.CrossEntropyLoss()


  # Create DataLoader for training and validation
  train_dataloader = DataLoader(tokenized_datasets["test"],
                                collate_fn=data_collator,
                                batch_size=BATCH_SIZE,
                                sampler=SubsetRandomSampler(train_idx),
  )

  val_dataloader = DataLoader(tokenized_datasets["test"],
                                collate_fn=data_collator,
                                batch_size=BATCH_SIZE,
                                sampler=SubsetRandomSampler(val_idx),
  )

  return (
      model,
      tokenizer,
      optimizer,
      train_dataloader,
      val_dataloader
  )

In [None]:
# custom training loop in pytorch
BATCH_SIZE = 16
EPOCHS = 10
# K-Fold Cross Validation
K_FOLD = 10
output_dir = 'model_weights'

# this part is for keeping the training stats


device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
print(f" --- Begin ThaiPOS training on {device} --- ")

# this is for kfold cross validation
kfold = KFold(n_splits=K_FOLD, shuffle=True)


for fold, (train_idx, val_idx) in enumerate(kfold.split(train_data)):
  print(f"Fold {fold + 1}")

  # setting up training argument (include push it into device)
  model, tokenizer, optimizer, train_dataloader, val_dataloader = setup_training_component(
    model_names['tuned-mdeberta'], # we will test it with the tuned version using kfold cross validation
    train_idx=train_idx,
    val_idx=val_idx,
  )

  model = model.to(device)


  # this is for learning rate scheduler
  num_train_epochs = EPOCHS
  num_update_steps_per_epoch = len(train_dataloader) # iterations (total batches until complete 1 epoch)
  num_training_steps = num_train_epochs * num_update_steps_per_epoch # total steps in all epochs

  # init a scheduler
  lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
  )

  train_loss = 0
  for epoch in tqdm(range(EPOCHS), desc=f'Training on fold {fold+1} of {model.config._name_or_path} ...'):
      # training loop
      model.train()
      for batch_idx, train_batch in enumerate(train_dataloader):
        outputs = model(**train_batch.to(device))
        loss = outputs.loss
        train_loss += loss.item()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()


      # evaluation loop
      model.eval()
      for batch_idx, val_batch in enumerate(val_dataloader):
          with torch.no_grad():
              outputs = model(**val_batch.to(device))

          predictions = outputs.logits.argmax(dim=-1)
          labels = val_batch["labels"]

          # compute poseval for pos tags
          true_predictions, true_labels = postprocess(predictions, labels)
          metric.add_batch(predictions=true_predictions, references=true_labels)

      # compute and evaluation score in each fold
      results = metric.compute(predictions=true_predictions,
                               references=true_labels,
                               zero_division=1,
                               )

  # training stats in every fold
  print(
      # we should write a supported code to handle with saving the metrics
      f"| train_loss: {train_loss/len(train_dataloader)} | acc: {results['accuracy']} | precision: {results['macro avg']['precision']}| recall: {results['macro avg']['recall']} | f1: {results['macro avg']['f1-score']}",
  )
  print('-'*50) # to draw and underline when display training stats

  # this is a part to save the best model
  # [WIP]


  model.save_pretrained(f'{output_dir}/fold{fold}/')
  tokenizer.save_pretrained(f'{output_dir}/fold{fold}/')
  # this will stop kfold loop
  # break

 --- Begin ThaiPOS training on cuda:0 --- 
Fold 1


Training on fold 1 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.45394315260152024 | acc: 0.9883668903803132 | precision: 0.9811677405922073| recall: 0.983413439313428 | f1: 0.9822114964936166
Fold 2


Training on fold 2 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.4996157239207573 | acc: 0.9941467807294012 | precision: 0.9939432421850387| recall: 0.9865171073963264 | f1: 0.9900773783795823
Fold 3


Training on fold 3 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.47534629901903763 | acc: 0.9924188580904999 | precision: 0.9843024192900275| recall: 0.9868125447633148 | f1: 0.985400615266019
Fold 4


Training on fold 4 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.4905782475936831 | acc: 0.9927388245972317 | precision: 0.9855624287627454| recall: 0.9886121202155836 | f1: 0.9868707299272963
Fold 5


Training on fold 5 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.48611264744479404 | acc: 0.9948186528497409 | precision: 0.9895259451514183| recall: 0.9913497527877507 | f1: 0.9903684342122265
Fold 6


Training on fold 6 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.49484766235477046 | acc: 0.9954400364797081 | precision: 0.9959454842570432| recall: 0.9928867456275906 | f1: 0.9943761171557194
Fold 7


Training on fold 7 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.497591396957113 | acc: 0.9964285714285714 | precision: 0.9918034151415039| recall: 0.9921880541121345 | f1: 0.9919255845793514
Fold 8


Training on fold 8 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.48851557530201317 | acc: 0.9948922940262047 | precision: 0.991096283656462| recall: 0.9920923232897555 | f1: 0.9914756102148574
Fold 9


Training on fold 9 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.48745406077553827 | acc: 0.9918845807033363 | precision: 0.9924803534189687| recall: 0.990615410868813 | f1: 0.9914758462628492
Fold 10


Training on fold 10 of Pavarissy/mdeberta-v3-ud-thai-pud-upos ...:   0%|          | 0/10 [00:00<?, ?it/s]

| train_loss: 0.491603070543262 | acc: 0.9950225171841669 | precision: 0.9186465031748293| recall: 0.99071394143533 | f1: 0.9210407429584838


In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, TokenClassificationPipeline

model_name = "/content/model_weights/fold7" # get the best model at specified fold
model = AutoModelForTokenClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer, grouped_entities=True)
outputs = pipeline("ประเทศไทย อยู่ใน ทวีป เอเชีย")
print(outputs)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity_group': 'PROPN', 'score': 0.9946701, 'word': 'ประเทศไทย', 'start': 0, 'end': 9}, {'entity_group': 'VERB', 'score': 0.85809743, 'word': 'อยู่ใน', 'start': 9, 'end': 16}, {'entity_group': 'NOUN', 'score': 0.99632, 'word': 'ทวีป', 'start': 16, 'end': 21}, {'entity_group': 'PROPN', 'score': 0.9961184, 'word': 'เอเชีย', 'start': 21, 'end': 28}]


In [None]:
import glob
for modelname in glob.glob('/content/model_weights/*'):
  print(modelname)

/content/model_weights/fold7
/content/model_weights/fold0
/content/model_weights/fold3
/content/model_weights/fold6
/content/model_weights/fold4
/content/model_weights/fold1
/content/model_weights/fold2
/content/model_weights/fold5
/content/model_weights/fold8
/content/model_weights/fold9


In [None]:
from transformers import Trainer
import glob


def finding_best_model():

  for modelname in glob.glob('/content/model_weights/*'):
    model_name = modelname # get the best model at specified fold
    model = AutoModelForTokenClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)


    trainer = Trainer(
        model=model,
        #train_dataset=tokenized_datasets["test"],
        # eval_dataset=tokenized_datasets["test"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )

    print(f'Evaluation of {model_name}')
    result = trainer.evaluate(tokenized_datasets['test'])
    print(result)

In [None]:
finding_best_model()

Evaluation of /content/model_weights/fold7


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 11.2972, 'eval_samples_per_second': 88.518, 'eval_steps_per_second': 11.065}
Evaluation of /content/model_weights/fold0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 7.3618, 'eval_samples_per_second': 135.835, 'eval_steps_per_second': 16.979}
Evaluation of /content/model_weights/fold3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 6.8046, 'eval_samples_per_second': 146.96, 'eval_steps_per_second': 18.37}
Evaluation of /content/model_weights/fold6


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 6.4718, 'eval_samples_per_second': 154.518, 'eval_steps_per_second': 19.315}
Evaluation of /content/model_weights/fold4


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 6.415, 'eval_samples_per_second': 155.884, 'eval_steps_per_second': 19.486}
Evaluation of /content/model_weights/fold1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 6.4384, 'eval_samples_per_second': 155.319, 'eval_steps_per_second': 19.415}
Evaluation of /content/model_weights/fold2


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 6.4651, 'eval_samples_per_second': 154.676, 'eval_steps_per_second': 19.334}
Evaluation of /content/model_weights/fold5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 7.0149, 'eval_samples_per_second': 142.554, 'eval_steps_per_second': 17.819}
Evaluation of /content/model_weights/fold8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 6.3968, 'eval_samples_per_second': 156.328, 'eval_steps_per_second': 19.541}
Evaluation of /content/model_weights/fold9


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.030260879546403885, 'eval_macro avg precision': 0.9235214138459755, 'eval_macro avg recall': 0.9227542302815904, 'eval_macro avg f1': 0.9231208236282316, 'eval_weighted avg precision': 0.9934730389849609, 'eval_weighted avg recall': 0.9934846474601972, 'eval_weighted avg f1': 0.9934714912122602, 'eval_accuracy': 0.9934846474601972, 'eval_runtime': 6.3875, 'eval_samples_per_second': 156.556, 'eval_steps_per_second': 19.569}


  _warn_prf(average, modifier, msg_start, len(result))
