In [1]:
import sys
sys.path.append('./src')


from datasets import DatasetDict
from src.data import prepare_dataset_mlm, create_vocab, create_gloss_vocab
from src.encoder import CustomEncoder
import random


from src.data import load_data_file

# Load the train and dev splits from the local data directory.
train = load_data_file('./data/usp-train-track2-uncovered')
dev = load_data_file('./data/usp-dev-track2-uncovered')

# Build the morpheme vocabulary from the training lines only, then wrap it in the encoder.
# NOTE(review): the meaning of `threshold=1` is defined in `src.data.create_vocab` — confirm there.
train_vocab = create_vocab([line.morphemes() for line in train], threshold=1)
encoder = CustomEncoder(train_vocab)

# Encode the training morphemes for masked-language-model training.
# NOTE(review): `mlm_probability=0.3` is presumably the fraction of tokens masked — confirm in `prepare_dataset_mlm`.
train_dataset = prepare_dataset_mlm(data=[line.morphemes() for line in train], encoder=encoder, model_input_length=64, mlm_probability=0.3, device='cpu')

# Display the first encoded example as a sanity check.
train_dataset[0]

  0%|          | 0/9774 [00:00<?, ?ex/s]

[3, 1, 3, 3, 3334, 3, 1, 2601, 1210]
[2870, 1, 2141, 1, 747, 1, 3271, 1, 2603, 1, 2601, 1105]
[2490, 3, 2603, 1, 3, 3177, 1, 1220, 1, 3]
[1385, 3, 1, 3, 1, 2577]
[3350, 3, 1322]
[2383, 3, 1069, 1089, 3, 2601, 1105, 3, 3, 1322]
[2599, 2917, 1, 1322, 1, 2594, 1, 3]
[3271, 1, 1215, 1792, 438]
[3237, 438]
[1322, 1, 2594, 1, 2630]
[3, 1, 2784, 1, 3, 686, 438]
[1728, 1, 2162, 1, 3, 3, 960, 1, 2628, 1, 3, 3, 3, 3, 3]
[979, 1, 3, 1, 3, 3024, 3, 2601, 1211]
[1642, 1, 3, 2556, 1, 2859, 3264, 1, 2919]
[1088, 1, 1642, 1, 1322]
[3, 1, 2594, 1, 2601, 916, 438]
[1088, 3, 3334, 960, 3, 2919, 3, 1979, 1, 1635, 1, 3, 1, 1215, 722, 1215, 1, 3, 1, 2784, 1, 3321]
[1289, 1, 2564, 722, 1215, 1, 1970, 1, 1635, 1, 1872]
[2171, 2071, 1, 2859, 3264, 1, 1835, 1, 3, 1105]
[3, 1, 2870, 1, 1418, 1, 2594, 1, 2628]
[2956, 3, 3, 3, 2276]
[715]
[3, 1, 3, 1, 3, 918]
[3]
[715]
[3350, 1, 722, 1215, 1, 3, 1, 3, 3, 1635]
[1088, 1, 2784, 1, 3321, 3, 3, 102, 408, 2377, 3, 2784, 3]
[1728, 1, 1642, 3, 3, 2738, 438, 1, 3, 2601, 9

{'strs': ["o'", '[SEP]', 'sea', '[SEP]', 'x', 'tok', '[SEP]', 'r', 'ixóqiil'],
 'input_ids': [3,
  1,
  3,
  3,
  3334,
  3,
  1,
  2601,
  1210,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': [2246,
  -100,
  2729,
  1,
  -100,
  3024,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,


In [14]:

# Simulate a low-resource setting: keep a random 500-line subsample of the training data.
# NOTE(review): this rebinds `train`, so re-running the cell resamples from the previous
# sample, and no random seed is set, so the reported numbers vary between runs.
train = random.sample(train, 500)

# Vocabulary built from the subsample only.
train_vocab = create_vocab([line.morphemes() for line in train], threshold=1)

# Count dev-set morphemes (excluding the "[SEP]" word separator) that are missing
# from the subsampled training vocabulary.
unknown = 0
total = 0  # named `total` rather than `all` so the builtin `all` is not shadowed in the kernel
unknown_morphemes = set()
for line in dev:
    for morpheme in line.morphemes():
        if morpheme == "[SEP]":
            continue
        if morpheme not in train_vocab:
            unknown += 1
            unknown_morphemes.add(morpheme)
        total += 1

# Guard the ratio so an empty dev set reports 0.0 instead of raising ZeroDivisionError.
unknown_ratio = unknown / total if total else 0.0
print(f"{unknown}/{total} = {unknown_ratio} morphemes unknown")
unknown_morphemes

194/1271 = 0.15263571990558616 morphemes unknown


{'aaj',
 'almaj',
 'amig',
 'ants',
 "b'a'",
 'benad',
 'bolber',
 "ch'ab'e",
 'chajaa',
 'cheje',
 'chek',
 'chep',
 'dyunabes',
 'entons',
 'fabor',
 "je'",
 "jich'",
 'jwelaa',
 'jwelagran:',
 "k'an",
 "k'echelaaj",
 "k'i",
 "k'it",
 "k'itj",
 'ka',
 'kamnaq',
 'kas',
 'kasar',
 'kask',
 "kib'",
 "kik'",
 'kistyan',
 'kitz',
 "ko'",
 'komo',
 'koop',
 'koopaa',
 'kristyan',
 'kwatro',
 'ljori',
 'maj',
 'maña',
 'medya',
 'mentiras',
 'mism',
 'miyer',
 'mod',
 'montañ',
 'montaña',
 'montañas',
 'moor',
 'morr',
 'morral',
 'nii',
 'noche',
 'ntons',
 'oq',
 'oqta',
 'orasyon',
 'pas',
 'paso',
 'pen',
 'plomas',
 'pogam',
 'pongamos',
 'pongams',
 'potrer',
 'potrero',
 'primer',
 'put',
 'puta',
 'qul',
 'rekomendar',
 'rifl',
 'rifle',
 'rii',
 's',
 'salbar',
 'seguir',
 'sep',
 'seraa',
 'sin',
 'sip',
 'sipa',
 'sách',
 'tal',
 'tamiyen',
 'tawen',
 "te'",
 'tres',
 'tronar',
 "uk'iil",
 'uq',
 'wij',
 'wákix',
 "ák'il"}

In [21]:
# Inspect the gloss list of the first line in `train`, requested with `segmented=True`.
train[0].gloss_list(segmented=True)

['NEG',
 '[SEP]',
 'ADJ',
 '[SEP]',
 'INC',
 'E3',
 'VT',
 '[SEP]',
 'AFI',
 '[SEP]',
 'DEM']

In [1]:
from transformers import AutoModelForMaskedLM

# Load the masked-LM checkpoint by name.
# NOTE(review): `use_auth_token=True` presumably relies on a locally stored Hugging Face
# token; newer transformers releases prefer `token=` — confirm against the installed version.
model = AutoModelForMaskedLM.from_pretrained("michaelginn/uspanteko-masked-lm", use_auth_token=True)

In [6]:
# Save only `model.base_model` (shown as a RobertaModel in the next cell's output) to a local directory.
model.base_model.save_pretrained('./uspanteko-roberta-base')

In [10]:
# Display the architecture of the base model.
model.base_model

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(3522, 100, padding_idx=2)
    (position_embeddings): Embedding(64, 100, padding_idx=2)
    (token_type_embeddings): Embedding(2, 100)
    (LayerNorm): LayerNorm((100,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=100, out_features=100, bias=True)
            (key): Linear(in_features=100, out_features=100, bias=True)
            (value): Linear(in_features=100, out_features=100, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=100, out_features=100, bias=True)
            (LayerNorm): LayerNorm((100,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropou

In [11]:
from transformers import AutoModelForTokenClassification

# Load the base checkpoint with a token-classification head on top; the recorded output
# reports that the classifier weights are newly initialized.
# NOTE(review): 66 presumably equals the gloss vocabulary size (`len(glosses)` is 66 in the
# final cell) — confirm, and prefer deriving it from the vocabulary over hard-coding.
flat = AutoModelForTokenClassification.from_pretrained("michaelginn/uspanteko-roberta-base", num_labels=66)

Downloading config.json:   0%|          | 0.00/681 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/8.92M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at michaelginn/uspanteko-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Display the architecture of the token-classification model.
flat

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(3522, 100, padding_idx=2)
      (position_embeddings): Embedding(64, 100, padding_idx=2)
      (token_type_embeddings): Embedding(2, 100)
      (LayerNorm): LayerNorm((100,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=100, out_features=100, bias=True)
              (key): Linear(in_features=100, out_features=100, bias=True)
              (value): Linear(in_features=100, out_features=100, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=100, out_features=100, bias=True)
              (LayerNorm):

In [12]:
from src.data import prepare_dataset
# Encode the data for the labeling model with inputs padded to length 512.
# NOTE(review): `data` is not defined in any visible cell, so this only works with leftover
# kernel state; the progress bar reports 9774 examples, the same count as the full training
# set above — presumably `data` was the full `train` list. Confirm and define it explicitly.
dataset = prepare_dataset(data=data, encoder=encoder,
                                           model_input_length=512, mask_tokens_proportion=0.1, device='cpu')

  0%|          | 0/9774 [00:00<?, ?ex/s]

In [13]:
# Print the first prepared example; the output is long because `input_ids` is padded to the full model length.
print(dataset[0])

{'transcription': 'o sey xtok rixoqiil', 'translation': 'O sea busca esposa.', 'glosses': ['CONJ', '[SEP]', 'ADV', '[SEP]', 'COM', 'VT', '[SEP]', 'E3S', 'S'], 'segmentation': "o' sea x-tok r-ixóqiil", 'morphemes': ["o'", '[SEP]', 'sea', '[SEP]', 'x', 'tok', '[SEP]', 'r', 'ixóqiil'], 'input_ids': [0, 0, 0, 1, 0, 3030, 0, 0, 1216, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [13]:
# Show the encoder's padding id (2 in the recorded output, matching the padded `input_ids`).
encoder.PAD_ID

2

In [11]:
from src.taxonomic_loss_model import create_model
from transformers import BertConfig
from src.uspanteko_morphology import morphology

# Build a BERT configuration sized to the encoder's vocabulary.
# NOTE(review): `config` is not passed to `create_model` below and is not used anywhere
# else in this cell — confirm whether it is still needed.
config = BertConfig(
        vocab_size=encoder.vocab_size(),
        max_position_embeddings=512,
        pad_token_id=encoder.PAD_ID,
        num_labels=len(encoder.vocabularies[1])
    )
# Rebinds `model` (previously the masked-LM checkpoint) to the model returned by `create_model`.
# NOTE(review): 512 presumably is the model input length, matching `prepare_dataset` — confirm.
model = create_model(encoder, 512)

Creating model...
1 []
before group [0. 0. 0. 0. 0.] 2
before group [0. 0. 0. 0. 0.] 2
before group [0. 0. 0. 0. 0.] 2
before group [0. 0. 0. 0. 0.] 2
2 [2.0, 2.0, 2.0, 2.0]
3 [2.0, 2.0, 2.0, 2.0]
before group [0. 0. 0. 0. 0.] 4
4 [2.0, 2.0, 2.0, 3.0]
5 [2.0, 2.0, 2.0, 3.0]
before group [0. 0. 0. 0. 0.] 6
before group [0. 0. 0. 0. 0.] 6
6 [2.0, 2.0, 3.0, 4.0]
7 [2.0, 2.0, 3.0, 4.0]
8 [2.0, 2.0, 3.0, 4.0]
before group [0. 0. 0. 0. 0.] 9
9 [2.0, 2.0, 3.0, 5.0]
10 [2.0, 2.0, 3.0, 5.0]
11 [2.0, 2.0, 3.0, 5.0]
before group [0. 0. 0. 0. 0.] 12
12 [2.0, 2.0, 3.0, 6.0]
13 [2.0, 2.0, 3.0, 6.0]
14 [2.0, 2.0, 3.0, 6.0]
before group [0. 0. 0. 0. 0.] 15
15 [2.0, 3.0]
16 [2.0, 3.0]
17 [2.0, 3.0]
before group [0. 0. 0. 0. 0.] 18
18 [2.0, 4.0]
19 [2.0, 4.0]
20 [2.0, 4.0]
21 [2.0, 4.0]
22 [2.0, 4.0]
23 [2.0, 4.0]
24 [2.0, 4.0]
before group [0. 0. 0. 0. 0.] 25
before group [0. 0. 0. 0. 0.] 25
25 [2.0, 5.0, 14.0]
26 [2.0, 5.0, 14.0]
27 [2.0, 5.0, 14.0]
28 [2.0, 5.0, 14.0]
before group [0. 0. 0. 0. 0.] 29

In [12]:
import torch
# Sanity-check a forward pass on the first two examples, passing labels so the output
# also carries a loss. The module is called directly (not via `.forward`) so that any
# registered hooks run as well; the slice is taken once and reused for all three tensors.
sample_batch = dataset[:2]
o = model(input_ids=torch.LongTensor(sample_batch['input_ids']),
          attention_mask=torch.LongTensor(sample_batch['attention_mask']),
          labels=torch.LongTensor(sample_batch['labels']))

torch.Size([2, 512, 66])


In [None]:
# Size of the logits after flattening every leading dimension into rows of 66 class scores.
o.logits.view(-1, 66).size()

In [14]:
# Loss returned by the forward pass above.
o.loss

tensor(20.3891, grad_fn=<AddBackward0>)

In [None]:
# Raw logits returned by the forward pass above.
o.logits

In [54]:
from transformers import Trainer, TrainingArguments
import numpy as np

def eval_accuracy(pred, gold) -> dict:
    """Compute position-wise labeling accuracy of `pred` against `gold`.

    A predicted token counts as correct only when it equals the gold token at the
    same index and is neither '[UNK]' nor '[SEP]'. '[SEP]' gold tokens are excluded
    from every denominator, so the per-entry and overall figures are measured on
    the same set of tokens and a perfect prediction scores exactly 1.0.

    Args:
        pred: Sequence of predicted token lists, one per entry.
        gold: Sequence of gold token lists, aligned entry-by-entry with `pred`.
            A gold entry with no corresponding prediction is scored as all wrong.

    Returns:
        A dict with:
            'average_accuracy': mean of the per-entry accuracies, taken over the
                entries that contain at least one non-'[SEP]' gold token.
            'accuracy': correct predictions divided by non-'[SEP]' gold tokens,
                pooled over all entries.
        Both values are 0.0 when there is nothing to score.
    """
    never_correct = ('[UNK]', '[SEP]')
    pred = list(pred)

    total_correct_predictions = 0
    total_tokens = 0
    summed_accuracies = 0.0
    scored_entries = 0

    for entry_index, entry_gold in enumerate(gold):
        entry_pred = pred[entry_index] if entry_index < len(pred) else []

        # Separators are structural markers, not labels: leave them out of the denominator.
        entry_tokens = sum(1 for token in entry_gold if token != '[SEP]')
        if entry_tokens == 0:
            # Nothing to score in this entry; skipping it also avoids dividing by zero.
            continue

        # zip() stops at the shorter list, so gold positions without a prediction count as wrong.
        entry_correct_predictions = sum(
            1
            for predicted, expected in zip(entry_pred, entry_gold)
            if predicted == expected and predicted not in never_correct
        )

        summed_accuracies += entry_correct_predictions / entry_tokens
        scored_entries += 1

        total_correct_predictions += entry_correct_predictions
        total_tokens += entry_tokens

    average_accuracy = summed_accuracies / scored_entries if scored_entries else 0.0
    overall_accuracy = total_correct_predictions / total_tokens if total_tokens else 0.0
    return {'average_accuracy': average_accuracy, 'accuracy': overall_accuracy}

def compute_metrics(eval_preds):
    """Decode predicted and gold label ids and score them with `eval_accuracy`.

    Args:
        eval_preds: A `(predictions, labels)` pair. When `predictions` is a tuple,
            only its first element is used.

    Returns:
        The dict produced by `eval_accuracy` for the decoded sequences.
    """
    predicted_ids, gold_ids = eval_preds
    predicted_ids = predicted_ids[0] if isinstance(predicted_ids, tuple) else predicted_ids

    # Predictions: raw ids first, then the first decoded sequence for inspection.
    print(predicted_ids)
    predicted_glosses = encoder.batch_decode(predicted_ids, from_vocabulary_index=1)
    print(predicted_glosses[0:1])

    # Gold labels: positions holding -100 are swapped for the pad id before decoding.
    print(gold_ids)
    gold_ids = np.where(gold_ids != -100, gold_ids, encoder.PAD_ID)
    gold_glosses = encoder.batch_decode(gold_ids, from_vocabulary_index=1)
    print(gold_glosses[0:1])

    return eval_accuracy(predicted_glosses, gold_glosses)


def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to class indices by taking the argmax over dimension 2.

    `labels` is accepted to match the callback signature but is not used.
    """
    predicted_ids = logits.argmax(2)
    return predicted_ids


# Training configuration.
# NOTE(review): the `f` prefix on `output_dir` has no placeholders, and the epoch count and
# warmup steps are unexplained magic numbers — consider naming them as constants.
args = TrainingArguments(
    output_dir=f"../training-checkpoints",
    num_train_epochs=1000,
    per_device_train_batch_size=1, # set batch size to 1
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# NOTE(review): `eval_dataset` is the same object as `train_dataset`, so any reported
# metrics are measured on the training data — confirm this is intentional (e.g. an
# overfitting sanity check) rather than a missing dev dataset.
trainer = Trainer(
    model,
    args,
    train_dataset=dataset,
    eval_dataset=dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics
)

# NOTE(review): the recorded output of this cell ends in an AttributeError raised by a
# `JupyterNotebookCallback`; presumably an integration/version mismatch in the
# environment — confirm before relying on this cell.
trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


AttributeError: 'JupyterNotebookCallback' object has no attribute 'on_init_end'

In [53]:
# Run prediction on a one-element list holding the first prepared example.
trainer.predict(test_dataset=[dataset[0]])

***** Running Prediction *****
  Num examples = 1
  Batch size = 8
The following columns in the test set don't have a corresponding argument in `HierarchicalMorphemeLabelingModel.forward` and have been ignored: glosses, morphemes, segmentation, translation, transcription. If glosses, morphemes, segmentation, translation, transcription are not expected by `HierarchicalMorphemeLabelingModel.forward`,  you can safely ignore this message.


PredictionOutput(predictions=array([[[3.0975988e-14, 1.0000000e+00, 6.6980982e-01, ...,
         5.4001689e-01, 1.3051839e-01, 6.7604112e-14],
        [3.0975927e-14, 1.0000000e+00, 6.6980982e-01, ...,
         5.4001689e-01, 1.3051839e-01, 6.7604112e-14],
        [3.0975927e-14, 1.0000000e+00, 6.6980982e-01, ...,
         5.4001689e-01, 1.3051839e-01, 6.7604112e-14],
        ...,
        [3.0975870e-14, 1.0000000e+00, 6.6980982e-01, ...,
         5.4001689e-01, 1.3051839e-01, 6.7603983e-14],
        [3.0975927e-14, 1.0000000e+00, 6.6980982e-01, ...,
         5.4001689e-01, 1.3051839e-01, 6.7604112e-14],
        [3.0975870e-14, 1.0000000e+00, 6.6980982e-01, ...,
         5.4001689e-01, 1.3051839e-01, 6.7603854e-14]]], dtype=float32), label_ids=array([[  61,    1,   54,    1,   25,   39,    1,   14,   43, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100

In [34]:
# Display every field of the first prepared example.
dataset[0]

{'transcription': 'o sey xtok rixoqiil',
 'translation': 'O sea busca esposa.',
 'glosses': ['CONJ',
  '[SEP]',
  'ADV',
  '[SEP]',
  'COM',
  'VT',
  '[SEP]',
  'E3S',
  'S'],
 'segmentation': "o' sea x-tok r-ixóqiil",
 'morphemes': ["o'",
  '[SEP]',
  'sea',
  '[SEP]',
  'x',
  'tok',
  '[SEP]',
  'r',
  'ixóqiil'],
 'input_ids': [2252,
  1,
  2735,
  1,
  3340,
  3030,
  1,
  2607,
  1216,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,

In [38]:
import pandas as pd

# Wrap `m` in a DataFrame for inspection and grouping.
# NOTE(review): `m` is not defined in any visible cell; the recorded output (5 columns)
# looks like a matrix from `get_hierarchy_matrix` — confirm and define it explicitly here.
df = pd.DataFrame(m)
df

Unnamed: 0,0,1,2,3,4
0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0
2,2.0,2.0,2.0,2.0,2.0
3,2.0,2.0,2.0,2.0,3.0
4,2.0,2.0,2.0,3.0,4.0
...,...,...,...,...,...
61,7.0,19.0,46.0,53.0,61.0
62,8.0,20.0,47.0,54.0,62.0
63,8.0,21.0,48.0,55.0,63.0
64,8.0,22.0,49.0,56.0,64.0


In [78]:
# Attempt to index the logits directly with the per-group row indices of `df` (grouped on column 1).
# NOTE(review): the recorded output shows this raises IndexError ("too many indices for
# tensor of dimension 3"); the following cell flattens the logits and indexes columns instead.
o.logits[list(df.groupby(by=1).groups.values())]

IndexError: too many indices for tensor of dimension 3

In [96]:
# For every group of rows in `df` sharing the same value in column 1, sum the flattened
# logits over that group's column indices, then stack the per-group sums and transpose so
# that rows are token positions and columns are groups (recorded size: [1024, 24]).
# NOTE(review): 66 is hard-coded as the number of classes — presumably the gloss vocabulary size.

torch.transpose(torch.stack([o.logits.view(-1, 66)[:,group].sum(axis=1) for group in list(df.groupby(by=1).groups.values())]), 0, 1).size()

torch.Size([1024, 24])

In [97]:
# NOTE(review): the recorded output shows this raises AttributeError — the
# `TokenClassifierOutput` returned by the forward pass has no `labels` attribute.
o.labels

AttributeError: 'TokenClassifierOutput' object has no attribute 'labels'

In [111]:
# Flatten the labels of the first two examples into one vector.
labels = torch.LongTensor(dataset[:2]['labels']).view(-1)
# Redirect the -100 labels to index 66, which is meant to select the extra row of -100
# appended to `df` below.
# NOTE(review): this assumes `df` has exactly 66 rows so the appended row sits at index 66,
# and the `5` assumes 5 columns (`df.shape[1]`) — confirm both.
labels[labels == -100] = 66
# Look up column 1 of the extended matrix for every label position.
np.vstack([df, [-100] * 5])[labels, 1].shape

(1024,)

In [112]:
# Number of columns in `df` (5 in the recorded output).
df.shape[1]

5

In [1]:
def get_hierarchy_matrix(self, hierarchy_tree, num_tags, max_depth):
    """Build a `(num_tags, max_depth)` matrix describing where each tag sits in the tree.

    Entry `[i, j]` is the value assigned to tag `i` at hierarchy level `j`.

    The tree is walked depth-first. The very first top-level item is skipped, so
    row 0 stays all zeros. Tuple items are treated as groups whose children are
    read from `item[1]`; any other item is a leaf that fills the next row.

    Args:
        self: Unused; kept so the signature matches its use as a method.
        hierarchy_tree: Nested iterable of leaves and group tuples.
        num_tags: Number of rows in the matrix.
        max_depth: Number of columns in the matrix.

    Returns:
        The filled numpy array of shape `(num_tags, max_depth)`.
    """
    matrix = np.zeros((num_tags, max_depth))

    def fill_rows(nodes, prefix, next_row):
        """Fill rows for `nodes` starting at `next_row`; return the next free row index."""
        depth = len(prefix)
        for node in nodes:
            if next_row == 0:
                # Skip the first item of the whole tree: row 0 is left as zeros.
                next_row = 1
                continue

            if not isinstance(node, tuple):
                # Leaf: start from the previous row plus one, then pin the columns
                # covered by the enclosing groups to those groups' values.
                matrix[next_row] = matrix[next_row - 1] + 1
                matrix[next_row, :depth] = prefix
                next_row += 1
                continue

            # Group: its value at this depth is one more than the previous row's value there.
            child_prefix = prefix + [matrix[next_row - 1][depth] + 1]
            next_row = fill_rows(node[1], child_prefix, next_row)
        return next_row

    fill_rows(hierarchy_tree, [], 0)
    return matrix



In [3]:
import torch
import numpy as np

# Check that the legacy `torch.LongTensor` constructor accepts a float numpy array together
# with `device='cpu'`; the recorded output is an integer tensor of ones.
# NOTE(review): `torch.tensor(..., dtype=torch.long, device=...)` is the usual modern form.
torch.LongTensor(np.ones((1,2)), device='cpu')

tensor([[1, 1]])

In [3]:
from src.uspanteko_morphology import morphology

def create_gloss_vocab(morphology_tree=None):
    """Flatten a morphology tree into an ordered list of its leaf glosses.

    NOTE(review): this definition shadows the `create_gloss_vocab` imported from
    `src.data` in the first cell — confirm which one later cells should use.

    Args:
        morphology_tree: Iterable whose items are either leaf glosses or tuples
            whose second element is a nested iterable of the same form. When
            omitted, the module-level `morphology` is used, as before.

    Returns:
        A list of the leaves in depth-first, left-to-right order.
    """
    def parse_tree(morphology_subtree):
        """Collect the leaves below `morphology_subtree`."""
        all_glosses = []
        for item in morphology_subtree:
            if isinstance(item, tuple):
                # A group: only its children (item[1]) contribute glosses.
                all_glosses.extend(parse_tree(item[1]))
            else:
                all_glosses.append(item)
        return all_glosses

    if morphology_tree is None:
        morphology_tree = morphology
    return parse_tree(morphology_tree)

# Flatten the imported `morphology` tree into the list of glosses.
glosses = create_gloss_vocab()


In [4]:
# Number of glosses (66 in the recorded output, the same value hard-coded as the class count in earlier cells).
len(glosses)

66