In [1]:
import sys
sys.path.append('./src')

from src.data import prepare_dataset, create_vocab, create_gloss_vocab, special_chars, load_data_file
import random
from src.tokenizer import WordLevelTokenizer
from src.uspanteko_morphology import morphology
from datasets import DatasetDict

# Read the raw train/dev splits from disk.
train_data = load_data_file("./data/usp-train-track2-uncovered")
dev_data = load_data_file("./data/usp-dev-track2-uncovered")

# The input vocabulary is built from the training morphemes only; the tokenizer
# is then constructed on top of that vocabulary.
train_morphemes = [entry.morphemes() for entry in train_data]
train_vocab = create_vocab(train_morphemes, threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=64)

# The label inventory is derived from the morphology definition rather than
# from the data.
glosses = create_gloss_vocab(morphology)

# Encode both splits with the same tokenizer and label set (train first, then dev).
dataset = DatasetDict({
    'train': prepare_dataset(data=train_data, tokenizer=tokenizer, labels=glosses, device='cpu'),
    'dev': prepare_dataset(data=dev_data, tokenizer=tokenizer, labels=glosses, device='cpu'),
})

  0%|          | 0/9774 [00:00<?, ?ex/s]

  0%|          | 0/232 [00:00<?, ?ex/s]

In [2]:
from transformers import TrainingArguments, Trainer, AutoModelForTokenClassification
from src.taxonomic_loss_model import TaxonomicLossModel
from IPython.display import clear_output
import pandas as pd

def compute_metrics(eval_preds):
    """Compute top-1 and top-k accuracy over the scored gold labels.

    Args:
        eval_preds: A ``(preds, gold_labels)`` pair. ``preds`` is indexed as
            ``preds[sequence][token]`` and yields the ranked candidate label ids
            for that token (best first), as produced by
            ``preprocess_logits_for_metrics``. If ``preds`` is a tuple, only its
            first element is used. ``gold_labels`` is indexed as
            ``gold_labels[sequence][token]`` and yields the gold label id.

    Returns:
        A dict with ``'accuracy'`` (gold id equals the first candidate) and
        ``'topkaccuracy'`` (gold id appears anywhere among the candidates).
        Both are ``0.0`` when no token is scored, instead of raising
        ``ZeroDivisionError``.
    """
    preds, gold_labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    num_labels = len(glosses)
    total_morphemes = 0
    total_correct = 0
    total_topk_correct = 0

    for pred_seq, gold_seq in zip(preds, gold_labels):
        for candidates, gold_id in zip(pred_seq, gold_seq):
            # Only ids inside the gloss vocabulary are scored, which drops
            # negative ignore markers. Id 1 is skipped as well -- presumably a
            # padding/special label; TODO confirm against create_gloss_vocab.
            if not (0 <= gold_id < num_labels) or gold_id == 1:
                continue
            total_morphemes += 1
            if gold_id in candidates:
                total_topk_correct += 1
            if gold_id == candidates[0]:
                total_correct += 1

    if total_morphemes == 0:
        # Nothing to score (e.g. every label was filtered out).
        return {'accuracy': 0.0, 'topkaccuracy': 0.0}

    return {
        'accuracy': total_correct / total_morphemes,
        'topkaccuracy': total_topk_correct / total_morphemes,
    }

def preprocess_logits_for_metrics(topk):
    """Build a ``Trainer`` logits hook that keeps only the top-``topk`` label ids.

    Args:
        topk: Number of highest-scoring label ids to keep per token.

    Returns:
        A ``(logits, labels) -> indices`` callable. The returned tensor holds,
        along dimension 2, the ``topk`` label indices ordered from highest to
        lowest logit. ``labels`` is accepted for signature compatibility and
        not used.
    """
    # Imported here because the notebook never imports torch at the top level;
    # without this the hook raises NameError on a fresh kernel.
    import torch

    def _preprocess_logits_for_metrics(logits, labels):
        # Mirror the unwrapping done in compute_metrics: if the model hands back
        # a tuple of outputs, the logits are taken to be its first element.
        if isinstance(logits, tuple):
            logits = logits[0]
        # dim=2 is the label dimension -- assumes logits are
        # (batch, sequence, num_labels); TODO confirm for custom models.
        return torch.topk(logits, topk, dim=2).indices
    return _preprocess_logits_for_metrics


# Shared TrainingArguments handed to every Trainer built in this notebook.
args = TrainingArguments(
    # Where checkpoints go and how many are kept.
    output_dir="../training-checkpoints",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    # Evaluation cadence and batch sizes.
    evaluation_strategy="epoch",
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    # Optimisation settings.
    num_train_epochs=100,
    gradient_accumulation_steps=3,
    weight_decay=0.01,
    # Experiment tracking backend.
    report_to="wandb",
)

def _load_taxonomic_model(path):
    """Load a TaxonomicLossModel checkpoint from ``path`` and attach the morphology tree."""
    model = TaxonomicLossModel.from_pretrained(path, num_labels=len(glosses))
    # The second argument is passed through unchanged from the original code;
    # presumably the number of tree levels -- TODO confirm in TaxonomicLossModel.
    model.use_morphology_tree(morphology, 5)
    return model


def _dev_topk_accuracy(model, k):
    """Evaluate ``model`` on ``dataset['dev']`` and return its ``eval_topkaccuracy``."""
    trainer = Trainer(
        model,
        args,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics(k),
    )
    return trainer.evaluate(dataset['dev'])['eval_topkaccuracy']


def eval_topk(size, k, seeds=range(42, 52)):
    """Average dev-set top-``k`` accuracy of the three model variants over several seeds.

    For every seed, the checkpoints ``./models/{size}-flat-{seed}``,
    ``./models/{size}-tax-{seed}`` and ``./models/{size}-tax-{seed}-harmonic``
    are loaded and evaluated on ``dataset['dev']``. Each model is loaded and
    evaluated in turn so that only one needs to be alive at a time.

    Args:
        size: Value substituted into the checkpoint directory names.
        k: Number of ranked predictions considered per token.
        seeds: Iterable of seeds to average over. Defaults to 42..51, the range
            that was previously hard-coded.

    Returns:
        A ``(flat, tax, harmonic)`` tuple of mean top-``k`` accuracies.

    Raises:
        ValueError: If ``seeds`` is empty (there would be nothing to average).
    """
    seeds = list(seeds)
    if not seeds:
        raise ValueError("eval_topk needs at least one seed to average over")

    tax_accs = []
    harmonic_accs = []
    flat_accs = []

    for seed in seeds:
        flat_model = AutoModelForTokenClassification.from_pretrained(
            f"./models/{size}-flat-{seed}", num_labels=len(glosses)
        )
        flat_accs.append(_dev_topk_accuracy(flat_model, k))

        tax_model = _load_taxonomic_model(f"./models/{size}-tax-{seed}")
        tax_accs.append(_dev_topk_accuracy(tax_model, k))

        harmonic_tax_model = _load_taxonomic_model(f"./models/{size}-tax-{seed}-harmonic")
        harmonic_accs.append(_dev_topk_accuracy(harmonic_tax_model, k))

        # Keep the cell output from growing with every seed's loading/eval logs.
        clear_output(wait=True)

    return sum(flat_accs) / len(flat_accs), sum(tax_accs) / len(tax_accs), sum(harmonic_accs) / len(harmonic_accs)

# Values swept below: one table row per model size, one column per top-k cutoff.
SIZES = [10, 100, 500, 1000]
TOP_KS = [1, 2, 3, 4, 5, 6]

# all_results[i][j] is the (flat, tax, harmonic) accuracy triple for
# SIZES[i] and TOP_KS[j].
all_results = []

# `size` and `k` are intentionally left as plain module-level loop variables
# (ending at 1000 and 6) so that any other cell reading them keeps working.
for size in SIZES:
    size_results = []
    for k in TOP_KS:
        flat_score, tax_score, harmonic_score = eval_topk(size, k)
        size_results.append((flat_score, tax_score, harmonic_score))
    all_results.append(size_results)

# Label rows and columns so the table can be read without the loop above.
pd.DataFrame(
    all_results,
    index=pd.Index(SIZES, name='size'),
    columns=pd.Index(TOP_KS, name='k'),
)

Unnamed: 0,0,1,2,3,4,5
0,"(0.26302124311565694, 0.24130605822187254, 0.2...","(0.38874901652242333, 0.3643587726199843, 0.35...","(0.47033831628638867, 0.42446892210857595, 0.4...","(0.5184893784421715, 0.4749016522423289, 0.468...","(0.5533438237608183, 0.5130605822187253, 0.504...","(0.5837922895357986, 0.5429583005507475, 0.536..."
1,"(0.29960660896931557, 0.3286388670338317, 0.34...","(0.46388670338316296, 0.48080251770259635, 0.5...","(0.5402832415420928, 0.552006294256491, 0.5818...","(0.598426435877262, 0.5944138473642802, 0.6394...","(0.6424075531077891, 0.6276947285601888, 0.675...","(0.6745082612116444, 0.6549173878835562, 0.704..."
2,"(0.47285601888276946, 0.45719905586152637, 0.4...","(0.6333595594020458, 0.6586939417781276, 0.635...","(0.7128245476003147, 0.7382376081825335, 0.734...","(0.7664044059795436, 0.7843430369787567, 0.779...","(0.8041699449252556, 0.8167584579071597, 0.814...","(0.8332022029897719, 0.8374508261211645, 0.839..."
3,"(0.6808025177025964, 0.6530291109362707, 0.677...","(0.8243902439024391, 0.8114083398898506, 0.818...","(0.8697088906372933, 0.8595594020456334, 0.866...","(0.8948072383949646, 0.8845003933910307, 0.897...","(0.9101494885916601, 0.9035405192761606, 0.915...","(0.9230527143981118, 0.9201416207710466, 0.928..."


In [10]:
# Same table as above, with every (flat, tax, harmonic) triple rounded to three
# decimals for display.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in
# favour of DataFrame.map; kept here to stay compatible with the pinned version.
df = pd.DataFrame(all_results)
df.applymap(lambda scores: tuple(round(score, 3) for score in (scores[0], scores[1], scores[2])))

Unnamed: 0,0,1,2,3,4,5
0,"(0.263, 0.241, 0.248)","(0.389, 0.364, 0.355)","(0.47, 0.424, 0.427)","(0.518, 0.475, 0.468)","(0.553, 0.513, 0.504)","(0.584, 0.543, 0.536)"
1,"(0.3, 0.329, 0.348)","(0.464, 0.481, 0.503)","(0.54, 0.552, 0.582)","(0.598, 0.594, 0.639)","(0.642, 0.628, 0.675)","(0.675, 0.655, 0.704)"
2,"(0.473, 0.457, 0.431)","(0.633, 0.659, 0.635)","(0.713, 0.738, 0.734)","(0.766, 0.784, 0.779)","(0.804, 0.817, 0.815)","(0.833, 0.837, 0.839)"
3,"(0.681, 0.653, 0.677)","(0.824, 0.811, 0.818)","(0.87, 0.86, 0.867)","(0.895, 0.885, 0.898)","(0.91, 0.904, 0.915)","(0.923, 0.92, 0.928)"


In [13]:
import pandas as pd

# Explore some of the wrong predictions made by our models.

# Number of ranked predictions kept per token. This used to be read from the
# loop variable `k` left behind by the top-k sweep (final value 6); pinning it
# here removes the dependency on that hidden kernel state.
ERROR_ANALYSIS_TOP_K = 6

incorrect = []

tax_model = TaxonomicLossModel.from_pretrained(f"./models/1000-tax-42", num_labels=len(glosses))
tax_model.use_morphology_tree(morphology, 5)

harmonic_tax_model = TaxonomicLossModel.from_pretrained(f"./models/1000-tax-42-harmonic", num_labels=len(glosses))
harmonic_tax_model.use_morphology_tree(morphology, 5)

flat_model = AutoModelForTokenClassification.from_pretrained(f"./models/1000-flat-42", num_labels=len(glosses))


def _make_trainer(model, top_k=ERROR_ANALYSIS_TOP_K):
    """Wrap ``model`` in a Trainer whose predictions are the top-``top_k`` label ids."""
    return Trainer(
        model,
        args,
        compute_metrics=compute_metrics,
        preprocess_logits_for_metrics=preprocess_logits_for_metrics(top_k),
    )


tax_trainer = _make_trainer(tax_model)
# NOTE(review): the harmonic model/trainer are built but no harmonic predictions
# are computed in this cell; kept so the names stay available to other cells.
harmonic_trainer = _make_trainer(harmonic_tax_model)
flat_trainer = _make_trainer(flat_model)

flat_preds = flat_trainer.predict(dataset['dev'])
tax_preds = tax_trainer.predict(dataset['dev'])

dev_set = dataset['dev']
for seq_index, gold_seq in enumerate(flat_preds.label_ids):
    seq_morphemes = None  # fetched lazily, at most once per sentence
    for token_index, correct_token_id in enumerate(gold_seq):
        # Same filter as compute_metrics: only in-vocabulary gold ids other than 1.
        if not (0 <= correct_token_id < len(glosses)) or correct_token_id == 1:
            continue

        flat_ranked = flat_preds.predictions[seq_index][token_index]
        tax_ranked = tax_preds.predictions[seq_index][token_index]

        # Record the token when at least one of the two models misses at rank 1.
        if correct_token_id == flat_ranked[0] and correct_token_id == tax_ranked[0]:
            continue

        if seq_morphemes is None:
            seq_morphemes = dev_set[seq_index]['morphemes']

        incorrect.append({
            'sentence': seq_index,
            'morpheme': seq_morphemes[token_index],
            'correct_token': glosses[correct_token_id],
            'flat_preds': [glosses[token_id] for token_id in flat_ranked],
            'tax_preds': [glosses[token_id] for token_id in tax_ranked]
        })



loading configuration file ./models/1000-tax-42/config.json
Model config RobertaConfig {
  "_name_or_path": "michaelginn/uspanteko-masked-lm",
  "architectures": [
    "TaxonomicLossModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 100,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28

The following columns in the test set don't have a corresponding argument in `TaxonomicLossModel.forward` and have been ignored: glosses, translation, segmentation, morphemes, transcription. If glosses, translation, segmentation, morphemes, transcription are not expected by `TaxonomicLossModel.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 232
  Batch size = 16


LEVEL 0 tensor(0.4146)
LEVEL 1 tensor(0.6322)
LEVEL 2 tensor(1.4254)
LEVEL 3 tensor(1.7305)
LEVEL 4 tensor(1.9446)


LEVEL 0 tensor(0.3269)
LEVEL 1 tensor(0.6287)
LEVEL 2 tensor(1.3686)
LEVEL 3 tensor(1.6942)
LEVEL 4 tensor(1.9460)
LEVEL 0 tensor(0.5675)
LEVEL 1 tensor(0.7645)
LEVEL 2 tensor(1.7684)
LEVEL 3 tensor(1.9330)
LEVEL 4 tensor(2.1195)
LEVEL 0 tensor(0.3524)
LEVEL 1 tensor(0.6436)
LEVEL 2 tensor(1.5724)
LEVEL 3 tensor(1.7886)
LEVEL 4 tensor(2.0093)
LEVEL 0 tensor(0.3385)
LEVEL 1 tensor(0.5933)
LEVEL 2 tensor(1.4456)
LEVEL 3 tensor(1.7077)
LEVEL 4 tensor(1.9664)
LEVEL 0 tensor(0.2986)
LEVEL 1 tensor(0.5916)
LEVEL 2 tensor(1.4352)
LEVEL 3 tensor(1.6942)
LEVEL 4 tensor(1.8781)
LEVEL 0 tensor(0.3214)
LEVEL 1 tensor(0.7107)
LEVEL 2 tensor(1.4681)
LEVEL 3 tensor(1.7258)
LEVEL 4 tensor(1.8728)
LEVEL 0 tensor(0.4715)
LEVEL 1 tensor(0.6778)
LEVEL 2 tensor(1.4500)
LEVEL 3 tensor(1.7203)
LEVEL 4 tensor(1.9083)
LEVEL 0 tensor(0.6914)
LEVEL 1 tensor(0.8779)
LEVEL 2 tensor(1.6297)
LEVEL 3 tensor(1.8415)
LEVEL 4 tensor(2.0168)
LEVEL 0 tensor(0.3581)
LEVEL 1 tensor(0.6959)
LEVEL 2 tensor(1.7130)
LEVEL 3 ten

In [26]:
# Turn the collected error records into a table and show only the rows whose
# gold gloss is E1S.
incorrect = pd.DataFrame(incorrect)
incorrect.loc[incorrect['correct_token'].eq('E1S')]

Unnamed: 0,sentence,morpheme,correct_token,flat_preds,tax_preds
90,46,w,E1S,"[E1S, [SEP], PART, INC, PRON, ENF]","[E3S, PART, E1S, SC, E2S, E1P]"
113,59,w,E1S,"[E1S, [SEP], PART, PRON, INC, ENF]","[PART, E3S, E1S, SC, E2S, DEM]"
114,59,in,E1S,"[E1S, A1S, INC, PRON, [SEP], E1P]","[E3S, E1S, E1P, A2S, E2S, A1S]"
137,67,in,E1S,"[E1S, A1S, PRON, INC, E1P, E2S]","[E3S, E1S, A2S, E1P, A1S, E2S]"
213,115,in,E1S,"[E1S, A1S, INC, PRON, E2S, E1P]","[E3S, E1S, E1P, A2S, E2S, SC]"
252,133,in,E1S,"[E1S, A1S, PRON, INC, [SEP], VT]","[E3S, E1S, E1P, A2S, E2S, A1S]"
319,157,in,E1S,"[E1S, A1S, INC, PRON, VT, [SEP]]","[E3S, E1S, A2S, E1P, E2S, A1S]"
350,173,in,E1S,"[E1S, A1S, PRON, INC, VT, E1P]","[E3S, E1S, A2S, E1P, E2S, A1S]"
351,173,in,E1S,"[E1S, A1S, PRON, INC, VT, [SEP]]","[E3S, E1S, E1P, A2S, E2S, SC]"
355,177,in,E1S,"[E1S, A1S, INC, PRON, [SEP], VT]","[E3S, E1S, E1P, SC, A2S, E2S]"
