In [1]:
import sys
sys.path.append('./src')

from src.data import prepare_dataset, create_vocab, create_gloss_vocab, special_chars, load_data_file, prepare_multitask_dataset
import random
from src.tokenizer import WordLevelTokenizer
from src.uspanteko_morphology import morphology
from datasets import DatasetDict

# Load data
train_data = load_data_file(f"./data/usp-train-track2-uncovered")
dev_data = load_data_file(f"./data/usp-dev-track2-uncovered")

train_vocab = create_vocab([line.morphemes() for line in train_data], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=64)

glosses = create_gloss_vocab(morphology)

dataset = DatasetDict()
dataset['train'] = prepare_multitask_dataset(data=train_data, tokenizer=tokenizer, labels=glosses, device='cpu')
dataset['dev'] = prepare_multitask_dataset(data=dev_data, tokenizer=tokenizer, labels=glosses, device='cpu')

dataset['train'][0]

  0%|          | 0/9774 [00:00<?, ?ex/s]

  0%|          | 0/232 [00:00<?, ?ex/s]

{'transcription': 'o sey xtok rixoqiil',
 'translation': 'O sea busca esposa.',
 'glosses': ['CONJ',
  '[SEP]',
  'ADV',
  '[SEP]',
  'COM',
  'VT',
  '[SEP]',
  'E3S',
  'S'],
 'segmentation': "o' sea x-tok r-ixóqiil",
 'morphemes': ["o'",
  '[SEP]',
  'sea',
  '[SEP]',
  'x',
  'tok',
  '[SEP]',
  'r',
  'ixóqiil'],
 'input_ids': [2250,
  1,
  2733,
  1,
  3338,
  3028,
  1,
  2605,
  1214,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [2]:
from transformers import AutoModelForTokenClassification
from src.finetune_token_classifier import create_trainer

model = AutoModelForTokenClassification.from_pretrained(f"./models/10-flat-42-linear", num_labels=len(glosses))
trainer = create_trainer(model, dataset=dataset, tokenizer=tokenizer, labels=glosses, batch_size=64, max_epochs=30)

Creating trainer...


In [3]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: segmentation, transcription, glosses, morphemes, translation. If segmentation, transcription, glosses, morphemes, translation are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 232
  Batch size = 64


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Preds:	 ['VI', '[SEP]', 'INC', 'PREP', 'VI', '[SEP]', 'S', '[SEP]', 'S']
Labels:	 ['PRON', '[SEP]', 'INC', 'E3S', 'VT', '[SEP]', 'VT', '[SEP]', 'S']


[34m[1mwandb[0m: Currently logged in as: [33mmichael-ginn[0m ([33mseminal-2023-legalner[0m). Use [1m`wandb login --relogin`[0m to force relogin


{'eval_loss': 1.6223783493041992,
 'eval_average_accuracy': 0.3624345541910198,
 'eval_accuracy': 0.44453186467348543,
 'eval_runtime': 7.6109,
 'eval_samples_per_second': 30.483,
 'eval_steps_per_second': 0.526}

In [14]:
preds = trainer.predict(dataset['dev'])

The following columns in the test set don't have a corresponding argument in `RobertaForTokenClassification.forward` and have been ignored: segmentation, transcription, glosses, morphemes, translation. If segmentation, transcription, glosses, morphemes, translation are not expected by `RobertaForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 232
  Batch size = 64


Preds:	 ['VI', '[SEP]', 'INC', 'PREP', 'VI', '[SEP]', 'S', '[SEP]', 'S']
Labels:	 ['PRON', '[SEP]', 'INC', 'E3S', 'VT', '[SEP]', 'VT', '[SEP]', 'S']


In [41]:
random.seed(42)
train_data = random.sample(train_data, 10)

seen_morphemes = set()
seen_morpheme_labels = set()
all_seen = set()

for row in train_data:
    for morpheme in row.morphemes():
        seen_morphemes.add(morpheme)
    for label in row.gloss_list(segmented=True):
        seen_morpheme_labels.add(label)
    for morpheme, label in zip(row.morphemes(), row.gloss_list(segmented=True)):
        all_seen.add((morpheme, label))

print(f"Train data includes {len(seen_morpheme_labels)}/{len(glosses)} of possible labels")

Train data includes 28/66 of possible labels


In [9]:
# What proportion of morphemes in the dev set are seen?
dev_seen_morphemes = 0
dev_total_morphemes = 0
dev_seen_labels = 0
dev_total_labels = 0
dev_morpheme_distribution = dict()

for row in dataset['dev']:
    for morpheme in row['morphemes']:
        if morpheme == '[SEP]':
            continue
        if morpheme in seen_morphemes:
            dev_seen_morphemes += 1
        dev_total_morphemes += 1
        dev_morpheme_distribution[morpheme] = dev_morpheme_distribution.get(morpheme, 0) + 1
    for gloss in row['glosses']:
        if gloss == '[SEP]':
            continue
        if gloss in seen_morpheme_labels:
            dev_seen_labels += 1
        dev_total_labels += 1

print(f"Seen {dev_seen_morphemes}/{dev_total_morphemes}={dev_seen_morphemes/dev_total_morphemes} morphemes in dev set")
print(f"Seen {dev_seen_labels}/{dev_total_labels} glosses in dev set")

Seen 367/1271=0.2887490165224233 morphemes in dev set
Seen 1114/1271 glosses in dev set


In [12]:
sorted(dev_morpheme_distribution.items(), key=lambda x:x[1])

[("ka'n", 1),
 ('montañas', 1),
 ('me', 1),
 ("ch'", 1),
 ("tz'i'", 1),
 ('nii', 1),
 ('kask', 1),
 ("chee'", 1),
 ('oqta', 1),
 ('chap', 1),
 ('mod', 1),
 ('yuq', 1),
 ('morr', 1),
 ("yu'", 1),
 ("at'", 1),
 ('miyer', 1),
 ("ak'l", 1),
 ('sera', 1),
 ('kasar', 1),
 ('tal', 1),
 ("q'iij", 1),
 ('oj', 1),
 ('ik', 1),
 ('qas', 1),
 ('aaj', 1),
 ('morral', 1),
 ('juntiir', 1),
 ('moor', 1),
 ('ni', 1),
 ("kita'", 1),
 ('maña', 1),
 ('pogam', 1),
 ('pores', 1),
 ('ants', 1),
 ('primer', 1),
 ('de', 1),
 ("uxib'", 1),
 ('kirtyaan', 1),
 ('rekomendar', 1),
 ('jaan', 1),
 ('qa', 1),
 ('ech', 1),
 ('etaam', 1),
 ('naa', 1),
 ('komo', 1),
 ('tres', 1),
 ("o'", 1),
 ('kwatro', 1),
 ('koopaa', 1),
 ('qej', 1),
 ('qul', 1),
 ('jwelagran:', 1),
 ('ok', 1),
 ('pen', 1),
 ("k'is", 1),
 ("tzik'", 1),
 ('pobr', 1),
 ('ntons', 1),
 ("k'itj", 1),
 ("k'it", 1),
 ('dyunabes', 1),
 ("xek'", 1),
 ('sip', 1),
 ('sep', 1),
 ('paso', 1),
 ('pongams', 1),
 ('kitz', 1),
 ('ichijil', 1),
 ("uk'iil", 1),
 ('montañ'

In [13]:
train_data

[Trnsc:	Rik'u li tijb'iij taq
 Segm:	ri-k'u li ti-j-b'ij taq
 Gloss:	E3-PART PREP INC-E3S-VT PL
 Trnsl:	Eso es lo que decían.
 ,
 Trnsc:	Iwir xojyolow ák'l ljori xojyolow chik juntir ák'l.
 Segm:	iwir x-oj-yol-ow a-k'l ljori x-oj-yol-ow chik juntiir a-k'l
 Gloss:	ADV COM-A1P-VT-AP E2S-SREL ADV COM-A1P-VT-AP PART ADV E2S-SREL
 Trnsl:	Ayer platicamos con usted y ahora palaticamos otraves con usted.
 ,
 Trnsc:	kwando, kwando xk'uli' jun xib'aal,
 Segm:	kwando kwando x-k'ul-i' jun xib'aal
 Gloss:	ADV ADV COM-VI-ENF NUM S
 Trnsl:	Cuando cuando se cazó un su hermano.
 ,
 Trnsc:	Baa de teraasa juntir.
 Segm:	baa de teraasa juntiir
 Gloss:	VOC PREP S ADV
 Trnsl:	Va de terraza todo.
 ,
 Trnsc:	neen chek tpee sub'laj q'etub'.
 Segm:	neen chek t-pee sub'-laj q'etub'
 Gloss:	INT ADV INC-VI ITS-??? S
 Trnsl:	Porque vienen muchos golpes con leña.
 ,
 Trnsc:	Xok chij chomorsajk
 Segm:	x-ok chi-j chomorsa-j-ik
 Gloss:	COM-VI PREP-E3S VT-SC-SC
 Trnsl:	Empezo a pensar.
 ,
 Trnsc:	Tb'e taq li tikij ab'ii

In [45]:
# For the correct predictions, how many of them were for seen morphemes/glosses?
dev_correct_seen_morphemes = 0
dev_incorrect_seen_morphemes = 0
dev_correct_unseen_morphemes = 0
dev_incorrect_unseen_morphemes = 0

dev_correct_seen_labels = 0
dev_incorrect_seen_labels = 0
dev_correct_unseen_labels = 0
dev_incorrect_unseen_labels = 0

dev_unseen_correct = dict()

dev_correct_unseen_morpheme_seen_gloss = 0
dev_incorrect_unseen_morpheme_seen_gloss = 0

for row_index in range(len(dataset['dev'])):
    row = dataset['dev'][row_index]
    pred_row = preds[0][row_index]

    for token_index in range(len(row['morphemes'])):
        morpheme = row['morphemes'][token_index]
        correct_label_index = row['labels'][token_index]
        predicted_label_index = pred_row[token_index]

        # Skip pad tokens
        if correct_label_index == 1:
            continue

        if correct_label_index == predicted_label_index:
            # Correct prediction
            if morpheme in seen_morphemes:
                dev_correct_seen_morphemes += 1
            else:
                dev_correct_unseen_morphemes += 1
                dev_unseen_correct[(morpheme, glosses[correct_label_index])] = dev_unseen_correct.get((morpheme, glosses[correct_label_index]), 0) + 1
                if glosses[correct_label_index] in seen_morpheme_labels:
                    dev_correct_unseen_morpheme_seen_gloss += 1
            if glosses[correct_label_index] in seen_morpheme_labels:
                dev_correct_seen_labels += 1
            else:
                dev_correct_unseen_labels += 1
        else:
            # Incorrect prediction
            if morpheme in seen_morphemes:
                dev_incorrect_seen_morphemes += 1
            else:
                dev_incorrect_unseen_morphemes += 1
                if glosses[correct_label_index] in seen_morpheme_labels:
                    dev_incorrect_unseen_morpheme_seen_gloss += 1
            if glosses[correct_label_index] in seen_morpheme_labels:
                dev_incorrect_seen_labels += 1
            else:
                dev_incorrect_unseen_labels += 1


print(f"Of the seen morphemes, instances in the dev set were labelled correctly {dev_correct_seen_morphemes} times and incorrectly {dev_incorrect_seen_morphemes} times")
print(f"Of the unseen morphemes (not in training data), instances in the dev set were labelled correctly {dev_correct_unseen_morphemes} times and incorrectly {dev_incorrect_unseen_morphemes} times")

print(f"Of the seen labels, instances in the dev set were labelled correctly {dev_correct_seen_labels} times and incorrectly {dev_incorrect_seen_labels} times")
print(f"Of the unseen labels (not in training data), instances in the dev set were labelled correctly {dev_correct_unseen_labels} times and incorrectly {dev_incorrect_unseen_labels} times")

print(f"Of the unseen morphemes with seen labels, {dev_correct_unseen_morpheme_seen_gloss} were labelled correctly and {dev_incorrect_unseen_morpheme_seen_gloss} were labelled incorrectly.")

Of the seen morphemes, instances in the dev set were labelled correctly 308 times and incorrectly 59 times
Of the unseen morphemes (not in training data), instances in the dev set were labelled correctly 257 times and incorrectly 647 times
Of the seen labels, instances in the dev set were labelled correctly 565 times and incorrectly 549 times
Of the unseen labels (not in training data), instances in the dev set were labelled correctly 0 times and incorrectly 157 times
Of the unseen morphemes with seen labels, 257 were labelled correctly and 520 were labelled incorrectly.


{("ka'n", 'S'): 1,
 ('despwes', 'ADV'): 17,
 ("b'ee", 'VI'): 2,
 ('montañas', 'S'): 1,
 ("tz'i'", 'S'): 1,
 ("k'am", 'VT'): 6,
 ('rifle', 'S'): 4,
 ('kond', 'ADV'): 6,
 ('kwand', 'ADV'): 2,
 ('pet', 'VI'): 6,
 ('tok', 'VT'): 2,
 ('benad', 'S'): 5,
 ("chee'", 'S'): 1,
 ('montaña', 'S'): 6,
 ("te'", 'VT'): 1,
 ('chap', 'VT'): 1,
 ('koop', 'S'): 9,
 ("kik'", 'S'): 4,
 ('tij', 'VT'): 11,
 ('sol', 'ADV'): 1,
 ('yuq', 'VI'): 1,
 ('kristyan', 'S'): 4,
 ("yu'", 'VI'): 1,
 ('saber', 'ADV'): 10,
 ('wunaq', 'S'): 6,
 ('moo', 'ADV'): 4,
 ("cha'", 'VI'): 17,
 ("b'an", 'VT'): 6,
 ('kom', 'ADV'): 1,
 ("ák'il", 'S'): 1,
 ('tawen', 'ADV'): 2,
 ('pwes', 'ADV'): 3,
 ("q'iij", 'S'): 1,
 ('pe', 'VI'): 16,
 ('qas', 'VT'): 1,
 ('antons', 'ADV'): 3,
 ('nada', 'ADV'): 1,
 ('koj', 'S'): 7,
 ('chep', 'VT'): 1,
 ('si', 'ADV'): 2,
 ('entons', 'ADV'): 2,
 ('pores', 'ADV'): 1,
 ('ta', 'INC'): 2,
 ('por', 'ADV'): 1,
 ('kirtyaan', 'S'): 1,
 ('mas', 'ADV'): 1,
 ('bolber', 'VI'): 1,
 ('koopaa', 'S'): 1,
 ('qej', 'VI'): 

In [42]:
all_seen

{("(ch'ek')", '???'),
 ('[SEP]', '[SEP]'),
 ('a', 'CONJ'),
 ('a', 'E2S'),
 ("ab'iix", 'S'),
 ("b'e", 'VI'),
 ("b'ij", 'VT'),
 ('baa', 'VOC'),
 ('ch', 'INC'),
 ('chaak', 'S'),
 ("che'", 'VI'),
 ('chek', 'ADV'),
 ('chi', 'PREP'),
 ('chik', 'PART'),
 ('chomorsa', 'VT'),
 ('de', 'PREP'),
 ("i'", 'ENF'),
 ("i'n", 'ENF'),
 ('ij', 'ITR'),
 ('ik', 'SC'),
 ('in', 'PRON'),
 ('iwir', 'ADV'),
 ('j', 'E3S'),
 ('j', 'SC'),
 ('jun', 'NUM'),
 ('juntiir', 'ADV'),
 ("k'l", 'SREL'),
 ("k'u", 'PART'),
 ("k'ul", 'VI'),
 ('kwando', 'ADV'),
 ('laj', '???'),
 ('li', 'DEM'),
 ('li', 'PREP'),
 ('ljori', 'ADV'),
 ('miij', 'S'),
 ('neen', 'INT'),
 ('oj', 'A1P'),
 ('ok', 'VI'),
 ('ow', 'AP'),
 ('pee', 'VI'),
 ("q'etub'", 'S'),
 ('qaaj', 'S'),
 ('ri', 'DEM'),
 ('ri', 'E3'),
 ("sub'", 'ITS'),
 ('t', 'INC'),
 ('taq', 'PL'),
 ('teraasa', 'S'),
 ('ti', 'INC'),
 ('tik', 'VT'),
 ('un', 'AP'),
 ('we', 'COND'),
 ('x', 'COM'),
 ("xib'aal", 'S'),
 ('yol', 'VT')}

In [28]:
len(dataset['dev'])

232

In [33]:
seen_morpheme_labels

{'???',
 'A1P',
 'ADV',
 'AP',
 'COM',
 'COND',
 'CONJ',
 'DEM',
 'E2S',
 'E3',
 'E3S',
 'ENF',
 'INC',
 'INT',
 'ITR',
 'ITS',
 'NUM',
 'PART',
 'PL',
 'PREP',
 'PRON',
 'S',
 'SC',
 'SREL',
 'VI',
 'VOC',
 'VT',
 '[SEP]'}

In [47]:
for row in dataset['train']:
    print(row['morphemes'])

["o'", '[SEP]', 'sea', '[SEP]', 'x', 'tok', '[SEP]', 'r', 'ixóqiil']
["ta'", '[SEP]', 'nada', '[SEP]', 'chi', '[SEP]', "wi'", '[SEP]', 'ra', '[SEP]', 'r', 'ichooch']
['pwees', '[SEP]', 'ra', '[SEP]', 'jup', 'ul', '[SEP]', 'jaa', '[SEP]', 'qe']
["k'ark'", 'aq', '[SEP]', 'jaa', '[SEP]', 'qe']
["xe'", '[SEP]', 'juntiir']
['rechaq', '[SEP]', 'galaan', "i'", '[SEP]', 'r', 'ichooch', 'aq', '[SEP]', 'juntiir']
['qus', "i'", '[SEP]', 'juntiir', '[SEP]', 'qleen', '[SEP]', 'rechaq']
["wi'", '[SEP]', 'j', "kwa'y", 'aq']
['wakix', 'aq']
['juntiir', '[SEP]', 'qleen', '[SEP]', 'rechaq']
['i', '[SEP]', "sik'", '[SEP]', 'r', "ch'elxik", 'aq']
['koom', '[SEP]', 'neen', '[SEP]', 't', 'r', 'en', '[SEP]', 're', '[SEP]', 'j', 'pobre', 'iil', '[SEP]', 'jun']
['entoons', '[SEP]', 're', '[SEP]', 'x', 'tok', '[SEP]', 'r', 'ixóqil']
["kita'", '[SEP]', 'j', "q'unik", '[SEP]', 't', 'wer', '[SEP]', 'taq']
['i', '[SEP]', "kita'", '[SEP]', 'juntiir']
["kita'", '[SEP]', 'qleen', '[SEP]', 'r', 'ech', 'aq']
['i', '[SEP

In [26]:
from src.multitask_model import MultitaskModel

model = MultitaskModel.from_pretrained("michaelginn/uspanteko-mlm-large", classifier_head_sizes=[66, 21, 19, 10])

Some weights of the model checkpoint at michaelginn/uspanteko-mlm-large were not used when initializing MultitaskModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing MultitaskModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MultitaskModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MultitaskModel were not initialized from the model checkpoint at michaelginn/uspanteko-mlm-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
model(input_ids=dataset['train'][0]['input_ids'], attention_mask=dataset['train'][0]['attention_mask'], labels=dataset['train'][0]['labels'])

AttributeError: 'list' object has no attribute 'size'

In [32]:
dataset['train'][0]

{'transcription': 'o sey xtok rixoqiil',
 'translation': 'O sea busca esposa.',
 'glosses': ['CONJ',
  '[SEP]',
  'ADV',
  '[SEP]',
  'COM',
  'VT',
  '[SEP]',
  'E3S',
  'S'],
 'segmentation': "o' sea x-tok r-ixóqiil",
 'morphemes': ["o'",
  '[SEP]',
  'sea',
  '[SEP]',
  'x',
  'tok',
  '[SEP]',
  'r',
  'ixóqiil'],
 'input_ids': [2250,
  1,
  2733,
  1,
  3338,
  3028,
  1,
  2605,
  1214,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
