In [1]:
# !wget https://github.com/aisingapore/seacorenlp-data/raw/main/id/constituency/train.txt
# !wget https://github.com/aisingapore/seacorenlp-data/raw/main/id/constituency/test.txt

In [2]:
import os

# Restrict CUDA to the physical GPU with index 1. This is set before torch is
# imported in the next cell; the selected device then shows up as 'cuda:0'.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
import numpy as np
import torch
import torch.nn as nn
from malaya.function.constituency import evaluate, trees_newline as trees
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5Constituency
from tqdm import tqdm

2023-10-06 12:34:02.360731: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-06 12:34:02.440985: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-06 12:34:02.904010: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-10-06 12:34:02.904065: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [4]:
# Lookup table from escaped bracket tokens and typographic quote/dash
# characters to plain ASCII replacements. Written as ordered pairs so the
# insertion order of the keys is explicit.
BERT_TOKEN_MAPPING = dict([
    # escaped brackets
    ("-LRB-", "("),
    ("-RRB-", ")"),
    ("-LCB-", "{"),
    ("-RCB-", "}"),
    ("-LSB-", "["),
    ("-RSB-", "]"),
    # quote variants
    ("``", '"'),
    ("''", '"'),
    ("`", "'"),
    ('«', '"'),
    ('»', '"'),
    ('‘', "'"),
    ('’', "'"),
    ('“', '"'),
    ('”', '"'),
    ('„', '"'),
    ('‹', "'"),
    ('›', "'"),
    # en dash / em dash
    ("\u2013", "--"),
    ("\u2014", "--"),
])

def process_word(word, cleaned_words=None):
    """Undo treebank-style escaping on a single token.

    Replaces escaped slashes/asterisks and the ``-LSB-``/``-RSB-``/``-LRB-``/
    ``-RRB-`` placeholders with the literal characters.

    Args:
        word: the raw token text.
        cleaned_words: optional list of the already-cleaned tokens that precede
            ``word`` in the sentence. When it is non-empty and ``word`` is
            ``"n't"``, the trailing ``n`` is moved onto the previous token
            (the list is modified in place) and ``"'t"`` is returned.
            Previously this name was read without being a parameter or a
            global, so an ``"n't"`` token raised ``NameError``.

    Returns:
        The cleaned token. Callers that pass only ``word`` get ``"n't"`` back
        unchanged.
    """
    word = word.replace('\\/', '/').replace('\\*', '*')
    word = word.replace('-LSB-', '[').replace('-RSB-', ']')
    word = word.replace('-LRB-', '(').replace('-RRB-', ')')
    if word == "n't" and cleaned_words:
        cleaned_words[-1] = cleaned_words[-1] + "n"
        word = "'t"
    return word

In [5]:
import collections

class Vocabulary(object):
    """Two-way table between values and integer indices, with usage counts.

    While the vocabulary is unfrozen, ``index`` hands out the next free
    integer to every unseen value. After ``freeze`` the table is read-only
    and unseen values are rejected.
    """

    def __init__(self):
        self.frozen = False
        # index -> value
        self.values = []
        # value -> index
        self.indices = {}
        # value -> number of index() calls made for it while unfrozen
        self.counts = collections.defaultdict(int)

    @property
    def size(self):
        """Number of distinct values registered so far."""
        return len(self.values)

    def value(self, index):
        """Return the value stored at ``index``; asserts the index is in range."""
        assert 0 <= index < len(self.values)
        return self.values[index]

    def index(self, value):
        """Return the index of ``value``, registering it first if still allowed.

        Raises:
            ValueError: if the vocabulary is frozen and ``value`` is unknown.
        """
        growable = not self.frozen
        if growable:
            # Counting stops once the vocabulary is frozen.
            self.counts[value] += 1

        if value in self.indices:
            return self.indices[value]

        if growable:
            new_index = len(self.values)
            self.values.append(value)
            self.indices[value] = new_index
            return new_index

        raise ValueError("Unknown value: {}".format(value))

    def index_or_unk(self, value, unk_value):
        """Frozen-only lookup that falls back to the index of ``unk_value``."""
        assert self.frozen
        return self.indices[value] if value in self.indices else self.indices[unk_value]

    def count(self, value):
        """Return how many times ``value`` was indexed before freezing."""
        return self.counts[value]

    def freeze(self):
        """Stop accepting new values and stop counting."""
        self.frozen = True


In [6]:
# Load the training treebank from train.txt (the file fetched by the
# commented-out wget in the first cell), then call .convert() on every tree.
# The converted trees are what the vocabulary pass and the training loop walk.
train_treebank = trees.load_trees('train.txt')
train_parse = [tree.convert() for tree in train_treebank]

In [7]:
# test.txt serves as the dev set for early stopping. These trees are kept
# unconverted; evaluate.evalb is later given them directly as the gold side.
dev_treebank = trees.load_trees('test.txt')

In [8]:
# Tokenizer of the pretrained checkpoint that is fine-tuned below; the same
# checkpoint name is used for the config and the model weights.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Label vocabulary; the empty tuple is registered first so it receives index 0.
label_vocab = Vocabulary()
label_vocab.index(())

# Tag vocabulary, seeded with the sentence-boundary markers and the
# unknown-tag placeholder so they take indices 0, 1 and 2 respectively.
tag_vocab = Vocabulary()
START = '<s>'
STOP = '</s>'
# NOTE(review): UNK is not referenced by any other cell visible in this notebook.
UNK = tokenizer.unk_token
TAG_UNK = "UNK"
tag_vocab.index(START)
tag_vocab.index(STOP)
tag_vocab.index(TAG_UNK)

# Walk every training tree with an explicit stack, registering each internal
# node's label and each leaf's tag. Index assignment follows visiting order.
for tree in train_parse:
    nodes = [tree]
    while nodes:
        node = nodes.pop()
        if isinstance(node, trees.InternalParseNode):
            label_vocab.index(node.label)
            # Children are pushed reversed so pop() visits them left to right.
            nodes.extend(reversed(node.children))
        else:
            tag_vocab.index(node.tag)
            
# After freezing, index() raises on unseen values instead of adding them, and
# index_or_unk() becomes usable.
tag_vocab.freeze()
label_vocab.freeze()

In [10]:
# Start from the pretrained checkpoint's configuration and attach the extra
# fields set below: vocabulary sizes, tag-loss scale and the two vocab mappings.
config = T5Config.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')
config.num_labels = label_vocab.size
config.num_tags = tag_vocab.size
config.tag_loss_scale = 1.0
# Label keys are passed through str(); presumably so the mapping survives JSON
# serialisation of the config — TODO confirm.
config.label_vocab = {str(k): v for k, v in label_vocab.indices.items()}
config.tag_vocab = tag_vocab.indices

Downloading (…)lve/main/config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

In [11]:
# Load the pretrained checkpoint into T5Constituency using the extended config.
model = T5Constituency.from_pretrained('mesolitica/t5-small-standard-bahasa-cased', config = config)
# Move the model to the GPU; binding the result to `_` keeps the cell from
# echoing the module repr.
_ = model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Some weights of T5Constituency were not initialized from the model checkpoint at mesolitica/t5-small-standard-bahasa-cased and are newly initialized: ['f_tag.3.weight', 'f_tag.1.a_2', 'encoder_encoder.attn_3.w_ks2', 'encoder_encoder.ff_4.w_2p.bias', 'encoder_encoder.ff_7.layer_norm.a_2', 'encoder_encoder.ff_5.w_1c.bias', 'encoder_encoder.attn_1.w_qs1', 'encoder_encoder.ff_4.w_1p.bias', 'encoder_encoder.attn_2.layer_norm.b_2', 'encoder_encoder.ff_1.w_2c.weight', 'encoder_encoder.attn_5.w_qs1', 'f_label.1.a_2', 'encoder_encoder.ff_7.w_2c.bias', 'encoder_encoder.ff_2.layer_norm.a_2', 'f_tag.0.bias', 'encoder_encoder.ff_6.w_1p.bias', 'encoder_encoder.ff_1.layer_norm.b_2', 'encoder_encoder.ff_7.layer_norm.b_2', 'encoder_encoder.ff_2.w_2c.weight', 'encoder_encoder.attn_5.w_vs1', 'encoder_encoder.ff_6.layer_norm.a_2', 'encoder_encoder.attn_7.w_qs1', 'encoder_encoder.ff_0.w_2p.bias', 'encoder_encoder.attn_4.w_qs1', 'encoder_encoder.ff_6.layer_norm.b_2', 'encoder_encoder.attn_0.layer_norm.b_2',

In [12]:
# Parameters that require gradients; this list is shared by the optimizer and
# by gradient clipping in the training loop.
trainable_parameters = [param for param in model.parameters() if param.requires_grad]

In [13]:
# AdamW optimizer over the trainable parameters with a fixed learning rate of 2e-4.
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [14]:
class BatchIndices:
    """Bookkeeping for a packed batch described by a per-position sentence id.

    ``batch_idxs_np`` is a 1-D integer array in which every position holds the
    id of the sentence it belongs to. From it this class derives the batch
    size, the run boundaries, the per-sentence lengths and the longest length.
    """

    def __init__(self, batch_idxs_np):
        self.batch_idxs_np = batch_idxs_np
        self.batch_idxs_torch = torch.from_numpy(batch_idxs_np)
        self.batch_size = int(1 + np.max(batch_idxs_np))

        # Surround the ids with -1 sentinels so the first and last runs also
        # produce a change point.
        with_sentinels = np.concatenate([[-1], batch_idxs_np, [-1]])
        changed = with_sentinels[1:] != with_sentinels[:-1]
        self.boundaries_np = np.nonzero(changed)[0]

        run_starts = self.boundaries_np[:-1]
        run_ends = self.boundaries_np[1:]
        self.seq_lens_np = run_ends - run_starts
        # One run per sentence id is expected.
        assert len(self.seq_lens_np) == self.batch_size
        self.max_len = int(np.max(self.seq_lens_np))

In [15]:
def split_batch(sentences, golds, subbatch_max_tokens=3000):
    """Yield ``(sentences, golds)`` sub-batches that respect a token budget.

    Each sentence is measured as its tokenized length plus 2. Sentences are
    taken shortest first; a sub-batch keeps growing while
    ``size * length_of_next_sentence`` stays within ``subbatch_max_tokens``.
    A sub-batch always holds at least one sentence, even if that sentence
    alone exceeds the budget.
    """
    token_counts = np.asarray(
        [
            len(tokenizer.tokenize(' '.join(word for (_, word) in sentence))) + 2
            for sentence in sentences
        ],
        dtype=int,
    )
    # Sentence positions ordered from shortest to longest.
    pending = np.argsort(token_counts).tolist()

    while pending:
        take = 1
        # Grow until the queue is exhausted or adding the next (longer)
        # sentence would exceed the budget.
        while take < len(pending) and take * token_counts[pending[take]] <= subbatch_max_tokens:
            take += 1
        chosen, pending = pending[:take], pending[take:]
        yield [sentences[i] for i in chosen], [golds[i] for i in chosen]
            
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every list in ``sentence_batch`` to the longest length.

    Args:
        sentence_batch: list of lists.
        pad_int: filler value appended to the shorter lists.

    Returns:
        ``(padded_seqs, seq_lens)``: the padded copies and the original lengths.
    """
    width = max(len(sentence) for sentence in sentence_batch)
    padded_seqs = [
        sentence + [pad_int] * (width - len(sentence))
        for sentence in sentence_batch
    ]
    seq_lens = [len(sentence) for sentence in sentence_batch]
    return padded_seqs, seq_lens

In [16]:
def process(sentences, golds = None):
    """Turn a batch of tagged sentences into the keyword arguments for the model.

    Args:
        sentences: list of sentences, each a list of ``(tag, word)`` pairs.
        golds: optional gold trees aligned with ``sentences``. When given, tag
            indices are looked up in ``tag_vocab`` (falling back to
            ``TAG_UNK``) and the trees are passed through under ``'golds'``.
            When ``None`` every tag index is 0.

    Returns:
        The object returned by ``tokenizer.pad`` for the batch's ``input_ids``,
        extended with ``'sentences'``, ``'all_word_start_mask'``,
        ``'all_word_end_mask'``, ``'batch_idxs'`` (a ``BatchIndices``),
        ``'tag_idxs'`` and, when ``golds`` is given, ``'gold_tag_idxs'`` and
        ``'golds'``. Tensors are moved to the GPU when CUDA is available.
    """
    all_input_ids = []
    all_word_start_mask = []
    all_word_end_mask = []

    for sentence in sentences:
        # START and STOP each count as a one-piece word of their own.
        tokens = [START]
        word_start_mask = [1]
        word_end_mask = [1]

        cleaned_words = []
        for _, word in sentence:
            cleaned_words.append(process_word(word))

        for word in cleaned_words:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # A word that tokenizes to nothing would make the start-mask
                # index below run one past the end (IndexError) and would
                # break the one-word-per-tag alignment; represent it with the
                # tokenizer's unknown token instead.
                word_tokens = [tokenizer.unk_token]
            for _ in range(len(word_tokens)):
                word_start_mask.append(0)
                word_end_mask.append(0)
            # `tokens` has not been extended yet, so len(tokens) is the
            # position of this word's first piece; -1 is its last piece.
            word_start_mask[len(tokens)] = 1
            word_end_mask[-1] = 1
            tokens.extend(word_tokens)
        tokens.append(STOP)
        word_start_mask.append(1)
        word_end_mask.append(1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        all_input_ids.append(input_ids)
        all_word_start_mask.append(word_start_mask)
        all_word_end_mask.append(word_end_mask)

    padded = tokenizer.pad({
        'input_ids': all_input_ids,
    }, return_tensors = 'pt')

    # Pad the per-sentence masks with 0 to a common length.
    all_word_start_mask = torch.from_numpy(np.array(pad_sentence_batch(all_word_start_mask, 0)[0]))
    all_word_end_mask = torch.from_numpy(np.array(pad_sentence_batch(all_word_end_mask, 0)[0]))

    padded['sentences'] = sentences
    padded['all_word_start_mask'] = all_word_start_mask
    padded['all_word_end_mask'] = all_word_end_mask

    # Word-level packed arrays: every sentence contributes its words plus the
    # START and STOP markers, laid out back to back.
    packed_len = sum([(len(sentence) + 2) for sentence in sentences])
    i = 0
    tag_idxs = np.zeros(packed_len, dtype=int)
    batch_idxs = np.zeros(packed_len, dtype=int)
    for snum, sentence in enumerate(sentences):
        for (tag, word) in [(START, START)] + sentence + [(STOP, STOP)]:
            if golds is not None:
                tag_idxs[i] = tag_vocab.index_or_unk(tag, TAG_UNK)
            else:
                tag_idxs[i] = 0
            batch_idxs[i] = snum
            i += 1

    batch_idxs = BatchIndices(batch_idxs)
    padded['batch_idxs'] = batch_idxs
    tag_idxs = torch.from_numpy(tag_idxs)
    padded['tag_idxs'] = tag_idxs

    if golds is not None:
        gold_tag_idxs = tag_idxs
        padded['gold_tag_idxs'] = gold_tag_idxs
        padded['golds'] = golds

    if torch.cuda.is_available():
        # Only existing keys are reassigned, so iterating over keys() is safe.
        for k in padded.keys():
            if isinstance(padded[k], torch.Tensor):
                padded[k] = padded[k].cuda()

        # BatchIndices is not a tensor; move the tensor it carries explicitly.
        padded['batch_idxs'].batch_idxs_torch = padded['batch_idxs'].batch_idxs_torch.cuda()

    return padded

In [17]:
# Smoke test: push the first four training trees through split_batch ->
# process -> model once and print the returned pair of loss tensors.
start_index = 0
batch_size = 4
batch_trees = train_parse[start_index:start_index + batch_size]
# Each sentence is the list of (tag, word) pairs read off the tree's leaves.
batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees]
for subbatch_sentences, subbatch_trees in split_batch(batch_sentences, batch_trees):
    print(model(**process(subbatch_sentences, subbatch_trees)))
    # Only the first sub-batch is needed for the check.
    break

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(tensor(300.6927, device='cuda:0', grad_fn=<AddBackward0>), tensor(280.9204, device='cuda:0', grad_fn=<MulBackward0>))


In [18]:
# model(**process(subbatch_sentences)), model(**process(subbatch_sentences, subbatch_trees))

In [19]:
# Training with early stopping on dev F-score.
batch_size = 16
epoch = 100  # maximum number of epochs

best_dev_fscore = -np.inf
patient = 10  # epochs without a new best dev F-score tolerated before stopping
current_patient = 0

# NOTE(review): this loop never calls model.train() / model.eval(); confirm
# whether T5Constituency switches mode internally depending on `golds`.
for e in range(epoch):
    pbar = tqdm(range(0, len(train_parse), batch_size))
    losses = []
    for start_index in pbar:
        trainer.zero_grad()
        batch_loss_value = 0.0
        batch_trees = train_parse[start_index:start_index + batch_size]
        batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees]
        batch_num_tokens = sum(len(sentence) for sentence in batch_sentences)

        # Gradients from all sub-batches accumulate into one optimizer step.
        for subbatch_sentences, subbatch_trees in split_batch(batch_sentences, batch_trees):
            loss, tag_loss =  model(**process(subbatch_sentences, subbatch_trees))
            loss = tag_loss / len(subbatch_sentences) + loss / batch_num_tokens
            loss_value = loss.item()
            batch_loss_value += loss_value
            if loss_value > 0:
                loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(batch_loss_value)

    # Dev-set inference does not need autograd; disabling it avoids building
    # graphs for every dev batch.
    dev_predicted = []
    with torch.no_grad():
        for dev_start_index in range(0, len(dev_treebank), batch_size):
            subbatch_trees = dev_treebank[dev_start_index:dev_start_index+batch_size]
            subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees]
            predicted, _ =  model(**process(subbatch_sentences))
            dev_predicted.extend([p.convert() for p in predicted])

    dev_fscore = evaluate.evalb('deprecated/EVALB', dev_treebank, dev_predicted)

    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_fscore: {dev_fscore}')

    # Ties count as an improvement, so the most recent best checkpoint is kept.
    if dev_fscore.fscore >= best_dev_fscore:
        best_dev_fscore = dev_fscore.fscore
        current_patient = 0
        model.save_pretrained('base')
    else:
        current_patient += 1

    if current_patient >= patient:
        break

100%|█████████████████████████████████████████| 500/500 [00:49<00:00, 10.18it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 0, loss: 7.743378249883651, dev_fscore: (Recall=69.78, Precision=73.57, FScore=71.63, CompleteMatch=7.50, TaggingAccuracy=93.89)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.90it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 1, loss: 3.680590521335602, dev_fscore: (Recall=75.00, Precision=75.88, FScore=75.44, CompleteMatch=12.30, TaggingAccuracy=94.42)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.98it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 2, loss: 2.4897709555625918, dev_fscore: (Recall=75.35, Precision=78.94, FScore=77.10, CompleteMatch=13.60, TaggingAccuracy=94.68)


100%|█████████████████████████████████████████| 500/500 [00:47<00:00, 10.55it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 3, loss: 1.8179046350121497, dev_fscore: (Recall=76.88, Precision=79.49, FScore=78.17, CompleteMatch=16.50, TaggingAccuracy=95.00)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.96it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 4, loss: 1.3630381355285643, dev_fscore: (Recall=78.06, Precision=79.38, FScore=78.71, CompleteMatch=17.70, TaggingAccuracy=94.79)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.95it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 5, loss: 1.146106514930725, dev_fscore: (Recall=79.12, Precision=80.29, FScore=79.70, CompleteMatch=19.00, TaggingAccuracy=94.98)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.94it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 6, loss: 0.9282996150851249, dev_fscore: (Recall=80.48, Precision=80.38, FScore=80.43, CompleteMatch=19.90, TaggingAccuracy=94.89)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.91it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 7, loss: 0.8417778767347336, dev_fscore: (Recall=82.12, Precision=80.10, FScore=81.10, CompleteMatch=20.00, TaggingAccuracy=94.85)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.96it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 8, loss: 0.7515054059028625, dev_fscore: (Recall=81.64, Precision=80.08, FScore=80.85, CompleteMatch=20.30, TaggingAccuracy=94.86)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.96it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 9, loss: 0.6541142633259296, dev_fscore: (Recall=81.56, Precision=80.82, FScore=81.19, CompleteMatch=20.30, TaggingAccuracy=94.76)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.95it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 10, loss: 0.6230107491910457, dev_fscore: (Recall=81.99, Precision=80.74, FScore=81.36, CompleteMatch=20.30, TaggingAccuracy=94.85)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.69it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 11, loss: 0.5766208143681287, dev_fscore: (Recall=81.39, Precision=81.05, FScore=81.22, CompleteMatch=20.60, TaggingAccuracy=94.94)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.82it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 12, loss: 0.5240447754412889, dev_fscore: (Recall=82.35, Precision=81.29, FScore=81.82, CompleteMatch=21.40, TaggingAccuracy=94.86)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.88it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 13, loss: 0.5033269971311093, dev_fscore: (Recall=82.55, Precision=81.00, FScore=81.77, CompleteMatch=21.70, TaggingAccuracy=94.84)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.80it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 14, loss: 0.46285473348200323, dev_fscore: (Recall=82.52, Precision=81.03, FScore=81.77, CompleteMatch=22.90, TaggingAccuracy=94.92)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.74it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 15, loss: 0.43112155309319494, dev_fscore: (Recall=81.68, Precision=81.94, FScore=81.81, CompleteMatch=21.30, TaggingAccuracy=95.10)


100%|█████████████████████████████████████████| 500/500 [00:47<00:00, 10.61it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 16, loss: 0.4138983658850193, dev_fscore: (Recall=82.31, Precision=82.01, FScore=82.16, CompleteMatch=24.00, TaggingAccuracy=94.93)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.65it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 17, loss: 0.36190101253986356, dev_fscore: (Recall=82.29, Precision=81.79, FScore=82.04, CompleteMatch=21.50, TaggingAccuracy=94.83)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.87it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 18, loss: 0.33861574751883744, dev_fscore: (Recall=82.53, Precision=81.68, FScore=82.11, CompleteMatch=22.40, TaggingAccuracy=95.15)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.82it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 19, loss: 0.3314727769270539, dev_fscore: (Recall=82.09, Precision=82.19, FScore=82.14, CompleteMatch=21.80, TaggingAccuracy=94.81)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.72it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 20, loss: 0.3376833411976695, dev_fscore: (Recall=81.72, Precision=82.26, FScore=81.99, CompleteMatch=21.70, TaggingAccuracy=94.79)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.88it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 21, loss: 0.3095257109254599, dev_fscore: (Recall=82.18, Precision=82.42, FScore=82.30, CompleteMatch=23.00, TaggingAccuracy=95.14)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.77it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 22, loss: 0.30419243048131467, dev_fscore: (Recall=82.59, Precision=81.32, FScore=81.95, CompleteMatch=21.90, TaggingAccuracy=95.01)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.82it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 23, loss: 0.2654794286713004, dev_fscore: (Recall=82.46, Precision=81.77, FScore=82.11, CompleteMatch=23.60, TaggingAccuracy=94.92)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.77it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 24, loss: 0.2652436782196164, dev_fscore: (Recall=81.62, Precision=83.32, FScore=82.46, CompleteMatch=22.40, TaggingAccuracy=94.95)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.86it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 25, loss: 0.26486815672367814, dev_fscore: (Recall=81.70, Precision=82.50, FScore=82.10, CompleteMatch=22.60, TaggingAccuracy=94.91)


100%|█████████████████████████████████████████| 500/500 [00:45<00:00, 10.87it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 26, loss: 0.2550786470863968, dev_fscore: (Recall=81.19, Precision=82.79, FScore=81.98, CompleteMatch=22.40, TaggingAccuracy=95.09)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.78it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 27, loss: 0.2116141620799899, dev_fscore: (Recall=81.62, Precision=82.33, FScore=81.98, CompleteMatch=21.10, TaggingAccuracy=94.80)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.77it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 28, loss: 0.2125208693742752, dev_fscore: (Recall=81.75, Precision=82.02, FScore=81.88, CompleteMatch=20.70, TaggingAccuracy=94.93)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.84it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 29, loss: 0.21173147058114408, dev_fscore: (Recall=82.16, Precision=81.92, FScore=82.04, CompleteMatch=22.20, TaggingAccuracy=95.04)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.78it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 30, loss: 0.21903378688916564, dev_fscore: (Recall=82.03, Precision=82.18, FScore=82.11, CompleteMatch=22.40, TaggingAccuracy=95.13)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.77it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 31, loss: 0.19343078238144518, dev_fscore: (Recall=81.69, Precision=82.68, FScore=82.18, CompleteMatch=22.60, TaggingAccuracy=94.96)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.85it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 32, loss: 0.2054460514318198, dev_fscore: (Recall=81.78, Precision=82.78, FScore=82.28, CompleteMatch=24.00, TaggingAccuracy=95.19)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.71it/s]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


epoch: 33, loss: 0.19851240930706263, dev_fscore: (Recall=82.08, Precision=82.48, FScore=82.28, CompleteMatch=22.50, TaggingAccuracy=95.19)


100%|█████████████████████████████████████████| 500/500 [00:46<00:00, 10.79it/s]


epoch: 34, loss: 0.17575171417556704, dev_fscore: (Recall=81.71, Precision=82.67, FScore=82.18, CompleteMatch=23.10, TaggingAccuracy=95.14)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
# Best dev F-score reached during training; the checkpoint saved under 'base'
# corresponds to this score.
best_dev_fscore

82.46

In [20]:
# Reload the best checkpoint written by model.save_pretrained('base') in the
# training loop, rather than using the in-memory model from the last epoch.
model_ = T5Constituency.from_pretrained('./base')

In [21]:
# Upload the reloaded best checkpoint to the Hugging Face Hub in safetensors format.
model_.push_to_hub('mesolitica/constituency-parsing-t5-small-standard-bahasa-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/247M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/constituency-parsing-t5-small-standard-bahasa-cased/commit/6e5992ef042814a673d44df53222c598aad6f766', commit_message='Upload T5Constituency', commit_description='', oid='6e5992ef042814a673d44df53222c598aad6f766', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Upload the tokenizer to the same Hub repository as the model.
tokenizer.push_to_hub('mesolitica/constituency-parsing-t5-small-standard-bahasa-cased', safe_serialization = True)

spiece.model:   0%|          | 0.00/803k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/constituency-parsing-t5-small-standard-bahasa-cased/commit/e1e0c23a17cf24d67fca055959c028af6de6d5f9', commit_message='Upload tokenizer', commit_description='', oid='e1e0c23a17cf24d67fca055959c028af6de6d5f9', pr_url=None, pr_revision=None, pr_num=None)