In [2]:
import logging

# Configure the root logger at DEBUG level. This is why the cells below show
# `DEBUG:...` records from tensorflow, h5py, git and urllib3 in their output.
logging.basicConfig(level=logging.DEBUG)

In [3]:
import os

# Set to an empty string before `import malaya` runs in the next cell.
# NOTE(review): an empty CUDA_VISIBLE_DEVICES is presumably meant to hide all
# GPUs so the notebook runs on CPU -- confirm this is still wanted.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [4]:
%%time

import malaya

DEBUG:tensorflow:Falling back to TensorFlow client; we recommended you install the Cloud TPU client directly with pip install cloud-tpu-client.
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7
DEBUG:h5py._conv:Creating converter from 7 to 5
DEBUG:h5py._conv:Creating converter from 5 to 7
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/husein/dev/malaya, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/husein/dev/malaya, universal_newlines=False, shell=None, istream=None)


CPU times: user 3.28 s, sys: 3.46 s, total: 6.75 s
Wall time: 2.43 s


In [5]:
from malaya.torch_model.t5 import T5Diaparser
from transformers import T5Tokenizer, T5Config

In [6]:
# Dependency-relation label -> integer id. The id is the label's position in
# the tuple below, so 'PAD' is 0, 'X' is 1 and 'csubj:pass' is 32.
# The order is significant: do not sort or reorder these labels.
tag2idx = {
    label: position
    for position, label in enumerate((
        'PAD', 'X', 'nsubj', 'cop', 'det', 'root', 'nsubj:pass', 'acl',
        'case', 'obl', 'flat', 'punct', 'appos', 'amod', 'compound',
        'advmod', 'cc', 'obj', 'conj', 'mark', 'advcl', 'nmod', 'nummod',
        'dep', 'xcomp', 'ccomp', 'parataxis', 'compound:plur', 'fixed',
        'aux', 'csubj', 'iobj', 'csubj:pass',
    ))
}

In [7]:
# Load the T5 configuration of the pretrained checkpoint, then set the number
# of labels to the size of `tag2idx` (one per dependency relation, including
# the 'PAD' and 'X' entries).
config = T5Config.from_pretrained('mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased')
config.num_labels = len(tag2idx)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased/resolve/main/config.json HTTP/1.1" 200 0


In [8]:
# Build the dependency parser from the pretrained checkpoint using the config
# prepared above. The warning printed by this cell lists the mlp_arc_*,
# mlp_rel_*, arc_attn and rel_attn weights as newly initialised, so the model
# has to be trained before its predictions mean anything.
model = T5Diaparser.from_pretrained(
    'mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased',
    config=config,
)

Some weights of T5Diaparser were not initialized from the model checkpoint at mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased and are newly initialized: ['mlp_arc_d.linear.bias', 'mlp_rel_d.linear.bias', 'rel_attn.weight', 'mlp_arc_h.linear.bias', 'mlp_rel_h.linear.weight', 'mlp_rel_h.linear.bias', 'mlp_arc_h.linear.weight', 'mlp_rel_d.linear.weight', 'mlp_arc_d.linear.weight', 'arc_attn.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Tokenizer from the same checkpoint as the model, so subword ids match the
# pretrained weights.
tokenizer = T5Tokenizer.from_pretrained('mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased')

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased/resolve/main/spiece.model HTTP/1.1" 302 0


In [10]:
# from malaya.parser.conll import CoNLL

In [11]:
# conll = CoNLL()
# conll('gsd-ud-dev.conllu.txt')
# conll.load()

In [12]:
# !wget https://raw.githubusercontent.com/huseinzol05/malay-dataset/master/parsing/dependency/gsd-ud-dev.conllu.txt
# Split the CoNLL-U file into `groups`: one list of stripped lines per
# sentence, where sentences are separated by blank lines.
#
# The first two lines of every sentence block are dropped.
# NOTE(review): presumably these are the `# sent_id` / `# text` comment lines;
# confirm against the file, since a block with a different number of comment
# lines would lose a token or keep a comment.
groups, temp = [], []
# CoNLL-U files are UTF-8; do not rely on the platform default encoding.
with open('gsd-ud-dev.conllu.txt', encoding='utf-8') as fopen:
    for line in fopen:
        line = line.strip()
        if line:
            temp.append(line)
        elif temp:
            # A blank line closes the current sentence. The `temp` guard keeps
            # leading or repeated blank lines from adding empty groups.
            groups.append(temp[2:])
            temp = []

# Flush the last sentence when the file does not end with a blank line;
# without this the final sentence would be silently discarded.
if temp:
    groups.append(temp[2:])
    temp = []

In [13]:
def get_train(group):
    """Extract the training columns from one sentence's token lines.

    Each item of ``group`` is a tab-separated line. Column 1 is the word
    form, column 6 the head index, column 7 the relation label (looked up in
    the module-level ``tag2idx``) and column 0 the token index.

    Returns a 4-tuple of parallel lists: ``(texts, arcs, tags, indices)``.
    Raises ``KeyError`` for a relation label missing from ``tag2idx`` and
    ``ValueError`` when the head or token index is not an integer.
    """
    rows = []
    for line in group:
        columns = line.split('\t')
        rows.append((
            columns[1],
            int(columns[6]),
            tag2idx[columns[7]],
            int(columns[0]),
        ))

    # zip(*rows) yields nothing for an empty sentence, so handle it explicitly.
    if not rows:
        return [], [], [], []

    texts, arcs, tags, indices = (list(column) for column in zip(*rows))
    return texts, arcs, tags, indices

In [14]:
def _build_feature(group):
    """Convert one sentence (its token lines) into subword-aligned inputs.

    Every word is tokenized without special tokens, and its head index,
    relation id and 1-based word position are repeated once per subword so
    that all returned lists have the same length.

    Position 0 of every list is an artificial root slot: token id 1, head 0,
    relation id 0 (the 'PAD' entry of ``tag2idx``) and word position 0.
    NOTE(review): token id 1 is presumably the tokenizer's end-of-sequence
    id -- confirm against the vocabulary.

    Returns a dict with keys ``input_ids``, ``attention_mask``, ``labels``
    (relation ids), ``labels_arc`` (head indices) and ``indices`` (word
    position of each subword).
    """
    # The parsed token indices are not needed: word positions are regenerated
    # below as 1..n, in the order the words appear.
    words, heads, relations, _ = get_train(group)

    input_ids, labels_arc, labels, word_positions = [1], [0], [0], [0]
    for position, (word, head, relation) in enumerate(
        zip(words, heads, relations), start=1
    ):
        pieces = tokenizer.encode(word, add_special_tokens=False)
        input_ids.extend(pieces)
        labels_arc.extend([head] * len(pieces))
        labels.extend([relation] * len(pieces))
        word_positions.extend([position] * len(pieces))

    return {
        'input_ids': input_ids,
        'attention_mask': [1] * len(input_ids),
        'labels': labels,
        'labels_arc': labels_arc,
        'indices': word_positions,
    }


# Only the first four sentences are used: this notebook is a small
# overfitting smoke test, not a full training run.
features = [_build_feature(g) for g in groups[:4]]

In [15]:
# Pad the per-token columns of every feature to the longest sequence in the
# batch, in place, so they can later be stacked into rectangular tensors.
#   - 'labels' and 'labels_arc' are padded with `label_pad_token_id`
#     (-100; presumably the value the loss ignores -- confirm in the model).
#   - 'indices' is padded with 0.
# `padding_side` chooses whether the filler goes after ('right') or before
# (anything else) the real values.
label_pad_token_id = -100
padding_side = 'right'
labels = [feature["labels"] for feature in features]
# default=0 keeps an empty batch from raising ValueError in max().
max_label_length = max((len(label) for label in labels), default=0)

for feature in features:
    # All three columns have the same length, so one pad count serves them all.
    n_missing = max_label_length - len(feature["labels"])
    for key, pad_value in (
        ("labels", label_pad_token_id),
        ("labels_arc", label_pad_token_id),
        ("indices", 0),
    ):
        filler = [pad_value] * n_missing
        # Each column is padded from itself. The previous left-padding branch
        # read a nonexistent "labels_tag" key and padded "indices" with -100.
        feature[key] = (
            feature[key] + filler if padding_side == "right" else filler + feature[key]
        )

In [16]:
# Collate the feature dicts into one batch of PyTorch tensors
# (return_tensors='pt'), padding to the longest sequence in the batch.
# The 'labels', 'labels_arc' and 'indices' columns were already padded to a
# common length by the previous cell.
# NOTE(review): presumably tokenizer.pad only pads the tokenizer's own input
# fields, which is why the label columns are padded by hand -- confirm.
padded = tokenizer.pad(
    features,
    padding=True,
    max_length=None,
    pad_to_multiple_of=None,
    return_tensors='pt',
)

In [17]:
# Sanity check: the attention mask should be (number of sentences, padded length).
padded['attention_mask'].shape

torch.Size([4, 78])

In [18]:
# Sanity check: input_ids must have the same shape as the attention mask.
padded['input_ids'].shape

torch.Size([4, 78])

In [19]:
# Word position of every subword token. Subwords of the same word share a
# value; 0 marks both the leading root slot and the right padding.
padded['indices']

tensor([[ 0,  1,  2,  2,  2,  3,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 11, 12,
         13, 13, 14, 15, 15, 16, 17, 18, 19, 20, 21, 22, 23, 23,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0],
        [ 0,  1,  2,  2,  3,  4,  5,  6,  6,  6,  7,  7,  8,  9, 10, 11, 12, 12,
         13, 14, 14, 15, 15, 15, 16, 17, 18, 18, 18, 19, 20, 20, 21, 22, 23, 24,
         25, 26, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 35,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 10, 11, 12, 12, 13, 13, 13,
         13, 13, 14, 14, 14, 14, 15, 16, 17, 18, 19, 20, 20, 21, 21, 22, 23, 24,
         24, 24, 25, 26, 27, 28, 29, 30, 31, 31, 32, 33, 33, 33, 33, 34, 35, 36,
         36, 37, 38, 39, 39, 39, 39, 39, 

In [21]:
# Forward pass of the still-untrained parser on the whole padded batch.
# The batch includes 'labels' and 'labels_arc'; the output exposes `loss`,
# `s_arc` and (used further down) `s_rel`.
o = model(**padded)

In [22]:
# Highest-scoring head position per token before any training (baseline to
# compare with the predictions after the training loop).
o.s_arc.argmax(axis = -1)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1

In [23]:
# Loss of the untrained model on this batch; the first training step below
# should report the same value.
o.loss

tensor(7.3922, grad_fn=<AddBackward0>)

In [24]:
from torch.optim import Adam

In [25]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

In [27]:
# Overfit the four-sentence batch for 50 full-batch steps as a smoke test:
# the printed loss should fall steadily towards zero.
# NOTE(review): Hugging Face documents that from_pretrained() returns the
# model in eval mode, and nothing visible here calls model.train(), so
# dropout is presumably disabled during these updates -- confirm intended.
for step in range(50):
    optimizer.zero_grad()
    outputs = model(**padded, return_dict=True)
    loss = outputs.loss
    print(step, loss)
    loss.backward()
    optimizer.step()

0 tensor(7.3922, grad_fn=<AddBackward0>)
1 tensor(7.1098, grad_fn=<AddBackward0>)
2 tensor(6.7844, grad_fn=<AddBackward0>)
3 tensor(6.4093, grad_fn=<AddBackward0>)
4 tensor(5.9998, grad_fn=<AddBackward0>)
5 tensor(5.5765, grad_fn=<AddBackward0>)
6 tensor(5.1111, grad_fn=<AddBackward0>)
7 tensor(4.7369, grad_fn=<AddBackward0>)
8 tensor(4.0739, grad_fn=<AddBackward0>)
9 tensor(3.6403, grad_fn=<AddBackward0>)
10 tensor(3.1981, grad_fn=<AddBackward0>)
11 tensor(2.7378, grad_fn=<AddBackward0>)
12 tensor(2.2673, grad_fn=<AddBackward0>)
13 tensor(1.8679, grad_fn=<AddBackward0>)
14 tensor(1.5290, grad_fn=<AddBackward0>)
15 tensor(1.4472, grad_fn=<AddBackward0>)
16 tensor(1.0049, grad_fn=<AddBackward0>)
17 tensor(0.8490, grad_fn=<AddBackward0>)
18 tensor(0.6342, grad_fn=<AddBackward0>)
19 tensor(0.4554, grad_fn=<AddBackward0>)
20 tensor(0.3242, grad_fn=<AddBackward0>)
21 tensor(0.2385, grad_fn=<AddBackward0>)
22 tensor(0.1764, grad_fn=<AddBackward0>)
23 tensor(0.1269, grad_fn=<AddBackward0>)
24

In [30]:
# Forward pass after training, without the label tensors: only the token
# ids, word positions and attention mask are passed.
o = model(
    input_ids=padded['input_ids'],
    indices=padded['indices'],
    attention_mask=padded['attention_mask'],
)

In [31]:
# Predicted head position per token after training; compare the non-padded
# positions with padded['labels_arc'] shown in the next cell.
o.s_arc.argmax(axis = -1)

tensor([[ 0,  4,  1,  1,  1,  2,  0,  0,  0,  4,  7,  4,  7, 10,  4, 12, 12, 10,
         14, 14, 10, 17, 17, 17, 10, 17, 20, 17, 20, 21,  4,  4,  4,  4,  7,  7,
          7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
          7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,
          7,  7,  7,  7,  7,  7],
        [ 0,  3,  1,  1,  0,  5,  3,  5,  5,  5, 28, 28,  9,  3, 11,  9, 11, 11,
         12, 28, 28,  3,  3,  3, 17, 15, 17, 17, 17, 18, 18, 18,  3, 23, 21, 23,
         24, 28, 28, 28,  3, 28, 31, 29, 31, 32, 33,  3,  3,  3, 31, 31, 31, 31,
          3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
          3,  3,  3,  3,  3,  3],
        [ 0,  4,  1,  4,  0,  7,  7,  4,  7,  7,  4,  4, 13, 13, 13,  4,  4,  4,
          4,  4, 13, 13, 13, 13, 14, 17, 13, 17, 18, 13, 13,  4,  4, 33, 22, 23,
         23, 23, 22, 28, 28, 22, 30, 28, 33, 33, 33,  4,  4,  4,  4, 33, 36, 34,
         34, 38, 34, 38, 38, 38, 38, 38, 

In [32]:
# Gold head indices; -100 (label_pad_token_id) marks padded positions.
padded['labels_arc']

tensor([[   0,    4,    1,    1,    1,    2,    0,    0,    0,    4,    7,    4,
            7,   10,    4,   12,   12,   10,   14,   14,   10,   17,   17,   17,
           10,   17,   20,   17,   20,   21,    4,    4, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100],
        [   0,    3,    1,    1,    0,    5,    3,    5,    5,    5,   28,   28,
            9,    3,   11,    9,   11,   11,   12,   28,   28,    3,    3,    3,
           17,   15,   17,   17,   17,   18,   18,   18,    3,   23,   21,   23,
           24,   28,   28,   28,    3,   28,   31,   29,   31,   32,   33,    3,
            3, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, 

In [34]:
# Greedy decoding.
# 1. For every token take the highest-scoring head position.
arc_preds = o.s_arc.argmax(-1)
# 2. Take the best relation id for every (token, candidate head) pair, then
#    keep only the entry that belongs to the token's predicted head.
rel_preds = (
    o.s_rel.argmax(-1)
    .gather(-1, arc_preds.unsqueeze(-1))
    .squeeze(-1)
)

In [36]:
# Gold relation ids (values of tag2idx); -100 marks padded positions.
padded['labels']

tensor([[   0,    2,   14,   14,   14,   14,    5,    5,    5,   17,    8,    9,
           13,    8,    9,   11,   11,   18,   11,   11,   18,   11,   11,   16,
           18,   13,    2,    7,   17,   14,   11,   11, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100],
        [   0,    2,   10,   10,    5,    8,    9,   13,   13,   13,   11,   11,
           15,   13,    8,   21,   10,   10,   10,   11,   11,   13,   13,   13,
            8,   21,   10,   10,   10,   10,   11,   11,   13,    8,   21,   10,
           10,   11,   11,   16,   18,   15,    8,   21,   10,   10,   10,   11,
           11, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, 

In [35]:
# Predicted relation ids; only positions that are not -100 in
# padded['labels'] are meaningful, the rest are predictions on padding.
rel_preds

tensor([[ 0,  2, 14, 14, 14, 14,  5,  5,  5, 17,  8,  9, 13,  8,  9, 11, 11, 18,
         11, 11, 18, 11, 11, 16, 18, 13,  2,  7, 17, 14, 11, 11, 17, 17,  8,  8,
          8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
          8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
          8,  8,  8,  8,  8,  8],
        [ 0,  2, 10, 10,  5,  8,  9, 13, 13, 13, 11, 11, 15, 13,  8, 21, 10, 10,
         10, 11, 11, 13, 13, 13,  8, 21, 10, 10, 10, 10, 11, 11, 13,  8, 21, 10,
         10, 11, 11, 16, 18, 15,  8, 21, 10, 10, 10, 11, 11, 11, 10, 10, 21, 21,
         13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
         13, 13, 13, 13, 13, 13],
        [ 0,  6,  4, 15,  5,  8,  4,  9, 14,  4, 11, 11,  8, 11, 11,  9,  9,  9,
          9,  9, 10, 10, 10, 10, 10, 16, 18, 10, 10, 11, 11, 11, 11,  2, 10, 10,
         10, 10,  8,  6, 15,  7,  8,  9, 11, 11, 15, 23, 23, 23, 23, 17,  8, 21,
         21, 16, 18, 14, 14, 14, 14, 14, 