In [1]:
# !wget https://github.com/aisingapore/seacorenlp-data/raw/main/id/constituency/train.txt
# !wget https://github.com/aisingapore/seacorenlp-data/raw/main/id/constituency/test.txt

In [2]:
import os

# Expose only GPU 0 to this process. This is set before the cell that imports
# torch so the restriction is already in the environment when CUDA is initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
import numpy as np
import torch
import torch.nn as nn
from malaya.function.constituency import evaluate, trees_newline as trees
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5Constituency
from tqdm import tqdm

2023-09-21 10:10:36.031542: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-21 10:10:36.108454: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-21 10:10:36.554157: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-21 10:10:36.554188: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [4]:
# Maps treebank escape tokens and typographic punctuation to plain ASCII text.
BERT_TOKEN_MAPPING = {
    # Treebank bracket escapes.
    "-LRB-": "(", "-RRB-": ")",
    "-LCB-": "{", "-RCB-": "}",
    "-LSB-": "[", "-RSB-": "]",
    # ASCII quoting conventions.
    "``": '"', "''": '"', "`": "'",
    # Typographic quotation marks.
    '«': '"', '»': '"',
    '‘': "'", '’': "'",
    '“': '"', '”': '"',
    '„': '"',
    '‹': "'", '›': "'",
    # En dash and em dash.
    "\u2013": "--", "\u2014": "--",
}

def process_word(word, cleaned_words=None):
    """Undo treebank escaping on a single token.

    Replaces the escaped forms ``\\/`` and ``\\*`` and the bracket placeholders
    ``-LSB-``, ``-RSB-``, ``-LRB-``, ``-RRB-`` with their literal characters.

    Args:
        word: the raw token text.
        cleaned_words: optional list of the already-cleaned tokens that precede
            ``word`` in the sentence. When it is given and non-empty and the
            cleaned token is ``"n't"``, the last entry of the list gets an
            ``"n"`` appended IN PLACE and ``"'t"`` is returned instead.

    Returns:
        The cleaned token text.
    """
    word = word.replace('\\/', '/').replace('\\*', '*')
    word = word.replace('-LSB-', '[').replace('-RSB-', ']')
    word = word.replace('-LRB-', '(').replace('-RRB-', ')')
    # The preceding-token context is an explicit, optional argument: the
    # function previously read an undefined global and raised NameError on "n't".
    if word == "n't" and cleaned_words:
        cleaned_words[-1] = cleaned_words[-1] + "n"
        word = "'t"
    return word

In [5]:
import collections

class Vocabulary(object):
    """Growable two-way mapping between hashable values and dense integer ids.

    Ids are assigned in order of first registration, starting at 0. While the
    vocabulary is not frozen, every ``index`` call also increments a per-value
    counter. After ``freeze`` no new values are accepted and counters stop
    changing.
    """

    def __init__(self):
        self.frozen = False
        # id -> value
        self.values = []
        # value -> id
        self.indices = {}
        # value -> number of index() calls made before freezing
        self.counts = collections.defaultdict(int)

    @property
    def size(self):
        """Number of distinct values registered so far."""
        return len(self.values)

    def value(self, index):
        """Return the value stored under ``index`` (asserts it is in range)."""
        assert 0 <= index < len(self.values)
        return self.values[index]

    def index(self, value):
        """Return the id of ``value``, registering it when not frozen.

        Raises:
            ValueError: if the vocabulary is frozen and ``value`` is unknown.
        """
        if not self.frozen:
            self.counts[value] += 1

        if value in self.indices:
            return self.indices[value]

        elif not self.frozen:
            self.values.append(value)
            self.indices[value] = len(self.values) - 1
            return self.indices[value]

        else:
            raise ValueError("Unknown value: {}".format(value))

    def index_or_unk(self, value, unk_value):
        """Return the id of ``value``, or the id of ``unk_value`` if unknown.

        Only valid on a frozen vocabulary (asserted). ``unk_value`` is looked
        up only when ``value`` is missing.
        """
        assert self.frozen
        if value in self.indices:
            return self.indices[value]
        else:
            return self.indices[unk_value]

    def count(self, value):
        """Return how often ``value`` was indexed before freezing (0 if never)."""
        # .get() keeps this a pure read: subscripting the defaultdict would
        # insert a zero entry for every unseen value that is queried.
        return self.counts.get(value, 0)

    def freeze(self):
        """Stop accepting new values and stop updating counters."""
        self.frozen = True


In [6]:
# Read the training trees from disk, then call convert() on each one.
# train_parse (the converted trees) is what the vocabulary and training cells iterate over.
train_treebank = trees.load_trees('train.txt')
train_parse = [tree.convert() for tree in train_treebank]

In [7]:
# Trees scored after every epoch in the training cell; left unconverted here.
# NOTE(review): this is test.txt, so early stopping and checkpoint selection
# are driven by the test split — confirm that this is intended.
dev_treebank = trees.load_trees('test.txt')

In [8]:
# Sub-word tokenizer of the same checkpoint the model is later initialised from.
tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased')

In [9]:
# Special symbols used by the tag vocabulary and the input pipeline.
START = '<s>'
STOP = '</s>'
UNK = tokenizer.unk_token
TAG_UNK = "UNK"

# Label vocabulary: the empty tuple is registered first, so it gets id 0.
label_vocab = Vocabulary()
label_vocab.index(())

# Tag vocabulary: ids 0, 1, 2 go to START, STOP and TAG_UNK, in that order.
tag_vocab = Vocabulary()
tag_vocab.index(START)
tag_vocab.index(STOP)
tag_vocab.index(TAG_UNK)

# Walk every training tree with an explicit stack. Children are pushed in
# reverse so they are popped left to right; ids are therefore assigned in
# pre-order of first appearance.
for root in train_parse:
    stack = [root]
    while stack:
        current = stack.pop()
        if not isinstance(current, trees.InternalParseNode):
            # Non-internal nodes contribute their tag.
            tag_vocab.index(current.tag)
            continue
        label_vocab.index(current.label)
        stack.extend(reversed(current.children))

# No further entries may be added from here on.
tag_vocab.freeze()
label_vocab.freeze()

In [10]:
# Start from the checkpoint's T5 configuration and attach the parser-specific fields.
config = T5Config.from_pretrained('mesolitica/nanot5-base-malaysian-cased')
# Output sizes are taken from the vocabularies built from the training trees.
config.num_labels = label_vocab.size
config.num_tags = tag_vocab.size
# NOTE(review): presumably the weight applied to the tagging loss — confirm in T5Constituency.
config.tag_loss_scale = 1.0
# Label keys are passed through str() before being stored.
# NOTE(review): presumably so the config can be serialised — confirm.
config.label_vocab = {str(k): v for k, v in label_vocab.indices.items()}
# Stores the vocabulary's own dict object; no copy is made.
config.tag_vocab = tag_vocab.indices

In [11]:
# Initialise the parser from the pretrained checkpoint with the task-specific config.
model = T5Constituency.from_pretrained('mesolitica/nanot5-base-malaysian-cased', config = config)
# Move the model to the GPU only when CUDA is usable. `process` applies the
# same check before moving its tensors, so model and inputs stay on the same
# device. The result is bound to `_` so the cell does not echo the model repr.
if torch.cuda.is_available():
    _ = model.cuda()

Some weights of T5Constituency were not initialized from the model checkpoint at mesolitica/nanot5-base-malaysian-cased and are newly initialized: ['encoder_encoder.attn_1.w_qs1', 'encoder_encoder.ff_1.w_2c.bias', 'encoder_encoder.attn_5.proj1.weight', 'encoder_encoder.ff_5.layer_norm.b_2', 'encoder_encoder.attn_7.w_qs2', 'encoder_encoder.ff_6.w_2p.weight', 'encoder_encoder.attn_2.w_vs1', 'encoder_encoder.attn_2.proj2.weight', 'encoder_encoder.ff_4.w_2c.bias', 'f_label.3.weight', 'encoder_encoder.ff_7.w_2p.bias', 'f_tag.0.bias', 'encoder_encoder.ff_1.w_1p.bias', 'encoder_encoder.ff_6.layer_norm.b_2', 'encoder_encoder.ff_0.w_2p.weight', 'encoder_encoder.ff_7.w_1c.bias', 'encoder_encoder.attn_0.w_vs1', 'encoder_encoder.attn_1.layer_norm.b_2', 'embedding.position_table', 'encoder_encoder.attn_5.w_vs1', 'encoder_encoder.ff_0.w_2p.bias', 'encoder_encoder.attn_4.layer_norm.a_2', 'encoder_encoder.ff_7.w_2p.weight', 'encoder_encoder.ff_0.w_2c.bias', 'encoder_encoder.attn_2.layer_norm.a_2', 'en

In [12]:
# Parameters with requires_grad set; this list is handed to the optimizer and
# later to gradient clipping in the training cell.
trainable_parameters = [param for param in model.parameters() if param.requires_grad]

In [13]:
# AdamW over the trainable parameters with a fixed learning rate; no scheduler is attached here.
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [14]:
class BatchIndices:
    """Bookkeeping for a batch stored as one flat array of sentence ids.

    ``batch_idxs_np[i]`` is the sentence number of packed position ``i``.
    From it the constructor derives the number of sentences, the positions
    where the sentence id changes, each run length, and the longest run.
    """

    def __init__(self, batch_idxs_np):
        self.batch_idxs_np = batch_idxs_np
        self.batch_idxs_torch = torch.from_numpy(batch_idxs_np)
        # Sentence ids start at 0, so the count is the largest id plus one.
        self.batch_size = int(1 + np.max(batch_idxs_np))

        # A -1 sentinel on both sides turns the first and last positions into
        # boundaries as well.
        sentinel = np.array([-1])
        with_sentinels = np.concatenate([sentinel, batch_idxs_np, sentinel])
        id_changes = with_sentinels[1:] != with_sentinels[:-1]
        self.boundaries_np = np.flatnonzero(id_changes)

        # Distance between consecutive boundaries is the length of each run.
        self.seq_lens_np = np.diff(self.boundaries_np)
        assert len(self.seq_lens_np) == self.batch_size
        self.max_len = int(np.max(self.seq_lens_np))

In [15]:
def split_batch(sentences, golds, subbatch_max_tokens=3000):
    """Yield (sentences, golds) sub-batches, grouped by ascending token count.

    Each sentence is a sequence of ``(tag, word)`` pairs. Its cost is the
    number of sub-word tokens the module-level ``tokenizer`` produces for the
    space-joined words, plus 2. Sentences are taken in ascending cost order;
    a sub-batch keeps growing while
    ``(current size) * (cost of the next sentence)`` does not exceed
    ``subbatch_max_tokens``. A sub-batch always holds at least one sentence.
    """
    token_counts = np.asarray(
        [
            len(tokenizer.tokenize(' '.join([word for (_, word) in sentence]))) + 2
            for sentence in sentences
        ],
        dtype=int,
    )
    order = np.argsort(token_counts).tolist()

    total = len(order)
    begin = 0
    while begin < total:
        size = 1
        # Extend the group until the remaining sentences run out or adding
        # the next one would break the budget.
        while (
            begin + size < total
            and size * token_counts[order[begin + size]] <= subbatch_max_tokens
        ):
            size += 1
        chosen = order[begin:begin + size]
        yield [sentences[i] for i in chosen], [golds[i] for i in chosen]
        begin += size
            
def pad_sentence_batch(sentence_batch, pad_int):
    """Right-pad every list in ``sentence_batch`` to the longest length.

    Args:
        sentence_batch: iterable of lists (consumed once).
        pad_int: value appended to the shorter lists.

    Returns:
        ``(padded_seqs, seq_lens)``: new padded lists (inputs are not
        modified) and the original length of each list. An empty batch
        yields ``([], [])``.
    """
    # Materialise once so a one-shot iterable is not exhausted by the first pass.
    sentence_batch = list(sentence_batch)
    seq_lens = [len(sentence) for sentence in sentence_batch]
    # default=0 keeps an empty batch from raising ValueError in max().
    max_sentence_len = max(seq_lens, default=0)
    padded_seqs = [
        sentence + [pad_int] * (max_sentence_len - length)
        for sentence, length in zip(sentence_batch, seq_lens)
    ]
    return padded_seqs, seq_lens

In [16]:
def process(sentences, golds = None):
    """Turn a batch of tagged sentences into the keyword arguments for the model.

    Args:
        sentences: list of sentences, each a list of ``(tag, word)`` pairs.
        golds: optional gold trees, one per sentence. When given, tag ids are
            looked up in ``tag_vocab`` and the trees are passed through;
            otherwise all tag ids are 0.

    Returns:
        The object returned by ``tokenizer.pad`` for the ``input_ids``, with
        these keys added: ``sentences``, ``all_word_start_mask``,
        ``all_word_end_mask``, ``batch_idxs`` (a ``BatchIndices``),
        ``tag_idxs``, and — only when ``golds`` is given — ``gold_tag_idxs``
        and ``golds``. Tensor values are moved to the GPU when CUDA is
        available.
    """
    all_input_ids = []
    all_word_start_mask = []
    all_word_end_mask = []

    for sentence in sentences:
        # Every sequence is wrapped in START/STOP; both count as their own
        # "word", so both masks are 1 there.
        tokens = [START]
        word_start_mask = [1]
        word_end_mask = [1]

        cleaned_words = [process_word(word) for _, word in sentence]

        for word in cleaned_words:
            word_tokens = tokenizer.tokenize(word)
            if not word_tokens:
                # A word that tokenizes to nothing used to raise IndexError
                # when its start position was marked. Keep one placeholder
                # sub-token so every word still has a start and an end.
                word_tokens = [tokenizer.unk_token]
            pieces = len(word_tokens)
            # 1 on the first sub-token of the word / on the last sub-token.
            word_start_mask.extend([1] + [0] * (pieces - 1))
            word_end_mask.extend([0] * (pieces - 1) + [1])
            tokens.extend(word_tokens)

        tokens.append(STOP)
        word_start_mask.append(1)
        word_end_mask.append(1)

        all_input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
        all_word_start_mask.append(word_start_mask)
        all_word_end_mask.append(word_end_mask)

    padded = tokenizer.pad({
        'input_ids': all_input_ids,
    }, return_tensors = 'pt')

    # The masks are padded with 0 to the longest token sequence in the batch.
    all_word_start_mask = torch.from_numpy(np.array(pad_sentence_batch(all_word_start_mask, 0)[0]))
    all_word_end_mask = torch.from_numpy(np.array(pad_sentence_batch(all_word_end_mask, 0)[0]))

    padded['sentences'] = sentences
    padded['all_word_start_mask'] = all_word_start_mask
    padded['all_word_end_mask'] = all_word_end_mask

    # Word-level arrays are packed: all sentences are laid end to end, each
    # with two extra slots for START and STOP.
    packed_len = sum([(len(sentence) + 2) for sentence in sentences])
    tag_idxs = np.zeros(packed_len, dtype=int)
    batch_idxs = np.zeros(packed_len, dtype=int)
    i = 0
    for snum, sentence in enumerate(sentences):
        for tag in [START] + [tag for tag, _ in sentence] + [STOP]:
            if golds is not None:
                tag_idxs[i] = tag_vocab.index_or_unk(tag, TAG_UNK)
            # Without golds the entry keeps its initial 0.
            batch_idxs[i] = snum
            i += 1

    batch_idxs = BatchIndices(batch_idxs)
    padded['batch_idxs'] = batch_idxs
    tag_idxs = torch.from_numpy(tag_idxs)
    padded['tag_idxs'] = tag_idxs

    if golds is not None:
        padded['gold_tag_idxs'] = tag_idxs
        padded['golds'] = golds

    if torch.cuda.is_available():
        for k in list(padded.keys()):
            if isinstance(padded[k], torch.Tensor):
                padded[k] = padded[k].cuda()

        # BatchIndices is not a tensor, so its torch copy is moved explicitly.
        padded['batch_idxs'].batch_idxs_torch = padded['batch_idxs'].batch_idxs_torch.cuda()

    return padded

In [17]:
# Smoke test: push the first few training trees through the model once, with
# gold trees supplied, and print whatever the model returns.
# Note: this sets batch_size to 4; the training cell assigns it again.
start_index = 0
batch_size = 4
batch_trees = train_parse[start_index:start_index + batch_size]
# Each sentence becomes a list of (tag, word) pairs read off the tree leaves.
batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees]
for subbatch_sentences, subbatch_trees in split_batch(batch_sentences, batch_trees):
    print(model(**process(subbatch_sentences, subbatch_trees)))
    # Only the first sub-batch is needed for the check.
    break

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


(tensor(268.9514, device='cuda:0', grad_fn=<AddBackward0>), tensor(278.9067, device='cuda:0', grad_fn=<MulBackward0>))


In [18]:
# model(**process(subbatch_sentences)), model(**process(subbatch_sentences, subbatch_trees))

In [19]:
# Training configuration.
batch_size = 16
epoch = 100  # maximum number of passes over the training trees

# Early-stopping state: stop after `patient` consecutive epochs in which the
# dev F-score fell below the best one seen so far.
best_dev_fscore = -np.inf
patient = 10
current_patient = 0

# NOTE(review): model.train() / model.eval() are never called in this cell —
# confirm whether T5Constituency switches modes itself based on `golds`.
for e in range(epoch):
    pbar = tqdm(range(0, len(train_parse), batch_size))
    losses = []
    for start_index in pbar:
        trainer.zero_grad()
        batch_loss_value = 0.0
        batch_trees = train_parse[start_index:start_index + batch_size]
        batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees]
        batch_num_tokens = sum(len(sentence) for sentence in batch_sentences)

        # Gradients are accumulated over the sub-batches; one optimizer step
        # is taken per batch.
        for subbatch_sentences, subbatch_trees in split_batch(batch_sentences, batch_trees):
            loss, tag_loss = model(**process(subbatch_sentences, subbatch_trees))
            # NOTE(review): tag_loss is divided by the SUB-batch sentence count
            # and loss by the whole-batch word count — confirm this is the
            # intended normalisation for each term.
            loss = tag_loss / len(subbatch_sentences) + loss / batch_num_tokens
            loss_value = loss.item()
            batch_loss_value += loss_value
            if loss_value > 0:
                loss.backward()

        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(batch_loss_value)

    # Evaluation needs no gradients, so autograd is switched off to avoid
    # building graphs for the forward passes.
    # NOTE(review): dev_treebank is loaded from test.txt, so the test split
    # drives early stopping and checkpoint selection.
    dev_predicted = []
    with torch.no_grad():
        for dev_start_index in range(0, len(dev_treebank), batch_size):
            subbatch_trees = dev_treebank[dev_start_index:dev_start_index+batch_size]
            subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees]
            # Without golds the first returned element holds the predictions.
            predicted, _ = model(**process(subbatch_sentences))
            dev_predicted.extend([p.convert() for p in predicted])

    dev_fscore = evaluate.evalb('deprecated/EVALB', dev_treebank, dev_predicted)

    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_fscore: {dev_fscore}')

    # Ties count as an improvement: the checkpoint is overwritten and the
    # patience counter is reset.
    if dev_fscore.fscore >= best_dev_fscore:
        best_dev_fscore = dev_fscore.fscore
        current_patient = 0
        model.save_pretrained('base')
    else:
        current_patient += 1

    if current_patient >= patient:
        break

100%|█████████████████████████████████████████| 500/500 [01:07<00:00,  7.38it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 0, loss: 8.819639983177185, dev_fscore: (Recall=66.51, Precision=70.94, FScore=68.65, CompleteMatch=6.90, TaggingAccuracy=93.01)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.63it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 1, loss: 4.456185948133468, dev_fscore: (Recall=70.55, Precision=71.62, FScore=71.08, CompleteMatch=10.20, TaggingAccuracy=93.63)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.60it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 2, loss: 3.1180720081329345, dev_fscore: (Recall=73.64, Precision=75.03, FScore=74.33, CompleteMatch=12.40, TaggingAccuracy=93.80)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.63it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 3, loss: 2.430565314412117, dev_fscore: (Recall=74.54, Precision=73.88, FScore=74.21, CompleteMatch=14.00, TaggingAccuracy=93.50)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.62it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 4, loss: 1.9220690823793412, dev_fscore: (Recall=76.17, Precision=76.18, FScore=76.17, CompleteMatch=16.30, TaggingAccuracy=93.71)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 5, loss: 1.5730757367014885, dev_fscore: (Recall=77.61, Precision=77.25, FScore=77.43, CompleteMatch=14.70, TaggingAccuracy=93.87)


100%|█████████████████████████████████████████| 500/500 [01:06<00:00,  7.56it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 6, loss: 1.3969583784341812, dev_fscore: (Recall=76.75, Precision=77.46, FScore=77.10, CompleteMatch=16.30, TaggingAccuracy=93.85)


100%|█████████████████████████████████████████| 500/500 [01:04<00:00,  7.69it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 7, loss: 1.207198276937008, dev_fscore: (Recall=77.30, Precision=78.27, FScore=77.78, CompleteMatch=16.20, TaggingAccuracy=93.81)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 8, loss: 1.1719542775154115, dev_fscore: (Recall=78.59, Precision=78.16, FScore=78.37, CompleteMatch=17.60, TaggingAccuracy=93.93)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 9, loss: 1.0005309571921825, dev_fscore: (Recall=78.70, Precision=76.81, FScore=77.75, CompleteMatch=16.50, TaggingAccuracy=93.78)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 10, loss: 0.9286423883438111, dev_fscore: (Recall=79.51, Precision=78.38, FScore=78.94, CompleteMatch=18.60, TaggingAccuracy=93.97)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.63it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 11, loss: 0.8747475933432579, dev_fscore: (Recall=79.34, Precision=78.36, FScore=78.84, CompleteMatch=18.60, TaggingAccuracy=94.04)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.65it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 12, loss: 0.8335441668629646, dev_fscore: (Recall=79.42, Precision=78.98, FScore=79.20, CompleteMatch=18.80, TaggingAccuracy=94.09)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 13, loss: 0.8101441066265106, dev_fscore: (Recall=78.95, Precision=78.89, FScore=78.92, CompleteMatch=18.30, TaggingAccuracy=93.90)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 14, loss: 0.7101925003826618, dev_fscore: (Recall=79.36, Precision=79.43, FScore=79.39, CompleteMatch=19.10, TaggingAccuracy=94.27)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.62it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 15, loss: 0.675981358319521, dev_fscore: (Recall=80.10, Precision=79.61, FScore=79.85, CompleteMatch=20.70, TaggingAccuracy=94.30)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.65it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 16, loss: 0.6253926932662726, dev_fscore: (Recall=79.74, Precision=78.88, FScore=79.31, CompleteMatch=18.70, TaggingAccuracy=94.02)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.68it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 17, loss: 0.6020984571874142, dev_fscore: (Recall=81.04, Precision=79.03, FScore=80.02, CompleteMatch=20.80, TaggingAccuracy=94.28)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.62it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 18, loss: 0.5756492719501257, dev_fscore: (Recall=79.87, Precision=79.83, FScore=79.85, CompleteMatch=20.50, TaggingAccuracy=94.24)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.64it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 19, loss: 0.5746373542696237, dev_fscore: (Recall=79.38, Precision=79.91, FScore=79.65, CompleteMatch=22.40, TaggingAccuracy=94.27)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.61it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 20, loss: 0.5201877975389362, dev_fscore: (Recall=79.36, Precision=79.84, FScore=79.60, CompleteMatch=19.50, TaggingAccuracy=93.86)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.65it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 21, loss: 0.4912329556792974, dev_fscore: (Recall=80.15, Precision=80.23, FScore=80.19, CompleteMatch=21.40, TaggingAccuracy=94.26)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 22, loss: 0.4710603418499231, dev_fscore: (Recall=80.03, Precision=79.97, FScore=80.00, CompleteMatch=21.40, TaggingAccuracy=94.09)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.68it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 23, loss: 0.4637663812190294, dev_fscore: (Recall=80.32, Precision=79.65, FScore=79.99, CompleteMatch=19.10, TaggingAccuracy=94.23)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 24, loss: 0.46933632941544057, dev_fscore: (Recall=80.85, Precision=79.59, FScore=80.21, CompleteMatch=20.60, TaggingAccuracy=94.34)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.64it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 25, loss: 0.5320658619701862, dev_fscore: (Recall=80.34, Precision=79.97, FScore=80.15, CompleteMatch=20.70, TaggingAccuracy=94.38)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 26, loss: 0.4144169696420431, dev_fscore: (Recall=79.75, Precision=81.32, FScore=80.53, CompleteMatch=21.40, TaggingAccuracy=94.35)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.68it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 27, loss: 0.38793145314604044, dev_fscore: (Recall=80.74, Precision=81.15, FScore=80.94, CompleteMatch=22.50, TaggingAccuracy=94.13)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.65it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 28, loss: 0.4557119156047702, dev_fscore: (Recall=80.03, Precision=80.47, FScore=80.25, CompleteMatch=22.60, TaggingAccuracy=94.27)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.64it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 29, loss: 0.3504318243339658, dev_fscore: (Recall=80.55, Precision=80.12, FScore=80.33, CompleteMatch=22.50, TaggingAccuracy=94.02)


100%|█████████████████████████████████████████| 500/500 [01:06<00:00,  7.57it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 30, loss: 0.39788834031671283, dev_fscore: (Recall=80.85, Precision=80.93, FScore=80.89, CompleteMatch=22.10, TaggingAccuracy=94.35)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.69it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 31, loss: 0.37517670648172496, dev_fscore: (Recall=81.00, Precision=79.60, FScore=80.29, CompleteMatch=21.40, TaggingAccuracy=94.36)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.65it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 32, loss: 0.3500702136158943, dev_fscore: (Recall=80.75, Precision=80.45, FScore=80.60, CompleteMatch=22.50, TaggingAccuracy=94.40)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 33, loss: 0.3293591633290052, dev_fscore: (Recall=80.96, Precision=80.93, FScore=80.95, CompleteMatch=21.70, TaggingAccuracy=94.41)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 34, loss: 0.3103742697276175, dev_fscore: (Recall=80.54, Precision=80.38, FScore=80.46, CompleteMatch=22.30, TaggingAccuracy=94.27)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.59it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 35, loss: 0.3065598714016378, dev_fscore: (Recall=81.03, Precision=80.54, FScore=80.79, CompleteMatch=22.80, TaggingAccuracy=94.51)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 36, loss: 0.3278788373600692, dev_fscore: (Recall=80.10, Precision=80.81, FScore=80.45, CompleteMatch=21.20, TaggingAccuracy=94.30)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 37, loss: 0.3105921440348029, dev_fscore: (Recall=80.75, Precision=80.88, FScore=80.81, CompleteMatch=23.00, TaggingAccuracy=94.14)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 38, loss: 0.2826378598809242, dev_fscore: (Recall=79.74, Precision=80.76, FScore=80.25, CompleteMatch=21.90, TaggingAccuracy=94.06)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.69it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 39, loss: 0.3209829128049314, dev_fscore: (Recall=80.94, Precision=80.96, FScore=80.95, CompleteMatch=23.10, TaggingAccuracy=94.43)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.64it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 40, loss: 0.2583822923637927, dev_fscore: (Recall=80.89, Precision=81.87, FScore=81.38, CompleteMatch=23.90, TaggingAccuracy=94.45)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 41, loss: 0.2601054233200848, dev_fscore: (Recall=80.11, Precision=81.34, FScore=80.72, CompleteMatch=22.40, TaggingAccuracy=94.30)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 42, loss: 0.24275643637776376, dev_fscore: (Recall=79.67, Precision=81.57, FScore=80.61, CompleteMatch=21.90, TaggingAccuracy=94.35)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.64it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 43, loss: 0.24849758695811033, dev_fscore: (Recall=80.16, Precision=81.45, FScore=80.80, CompleteMatch=21.60, TaggingAccuracy=94.36)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 44, loss: 0.23292735051363705, dev_fscore: (Recall=80.08, Precision=80.11, FScore=80.10, CompleteMatch=20.20, TaggingAccuracy=94.15)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 45, loss: 0.2826069640889764, dev_fscore: (Recall=80.98, Precision=81.30, FScore=81.14, CompleteMatch=24.00, TaggingAccuracy=94.29)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.64it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 46, loss: 0.2309159503504634, dev_fscore: (Recall=79.95, Precision=81.88, FScore=80.90, CompleteMatch=22.40, TaggingAccuracy=94.32)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 47, loss: 0.19940490192826837, dev_fscore: (Recall=79.70, Precision=82.10, FScore=80.89, CompleteMatch=24.00, TaggingAccuracy=94.53)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.64it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 48, loss: 0.2133789891451597, dev_fscore: (Recall=78.65, Precision=82.14, FScore=80.35, CompleteMatch=22.50, TaggingAccuracy=94.29)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.67it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 49, loss: 0.23868915298767387, dev_fscore: (Recall=80.15, Precision=80.80, FScore=80.47, CompleteMatch=20.80, TaggingAccuracy=94.41)


100%|█████████████████████████████████████████| 500/500 [01:05<00:00,  7.66it/s]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch: 50, loss: 0.2080194965712726, dev_fscore: (Recall=79.93, Precision=81.58, FScore=80.75, CompleteMatch=22.40, TaggingAccuracy=94.43)


In [20]:
# Reload the checkpoint stored in the local ./base directory into a fresh
# T5Constituency instance; the next cell uploads this object.
# NOTE(review): ./base is presumably written by the training loop earlier in
# the notebook — confirm it holds the checkpoint you intend to publish.
model_ = T5Constituency.from_pretrained(
    './base',
)

In [21]:
# Publish the reloaded model to the Hugging Face Hub. With
# safe_serialization=True the weights are uploaded as model.safetensors.
# The call is left as the cell's last expression so the returned commit info
# is displayed.
model_.push_to_hub(
    'mesolitica/constituency-parsing-nanot5-base-malaysian-cased',
    safe_serialization=True,
)

model.safetensors:   0%|          | 0.00/545M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/constituency-parsing-nanot5-base-malaysian-cased/commit/c3d2dff23d539dad56f6cd12202d7c711782c236', commit_message='Upload T5Constituency', commit_description='', oid='c3d2dff23d539dad56f6cd12202d7c711782c236', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
# Publish the tokenizer to the same Hub repository as the model so the two
# can be loaded together from one repo id.
# NOTE(review): safe_serialization concerns model weight files; it is likely
# a no-op for a tokenizer upload — confirm before removing.
tokenizer.push_to_hub(
    'mesolitica/constituency-parsing-nanot5-base-malaysian-cased',
    safe_serialization=True,
)

CommitInfo(commit_url='https://huggingface.co/mesolitica/constituency-parsing-nanot5-base-malaysian-cased/commit/bbcfb2dd265b33505c80f5fc12f005b7c3a70ace', commit_message='Upload tokenizer', commit_description='', oid='bbcfb2dd265b33505c80f5fc12f005b7c3a70ace', pr_url=None, pr_revision=None, pr_num=None)