In [2]:
import random

from data_handling import load_data_file, create_vocab, prepare_dataset, create_gloss_vocab
from uspanteko_morphology import morphology
from tokenizer import WordLevelTokenizer
from datasets import DatasetDict

# --- Configuration -------------------------------------------------------
# Seed Python's global RNG so that anything relying on `random` (e.g. a
# random.choice-based baseline) produces the same numbers on every
# Restart & Run All.
SEED = 0
random.seed(SEED)

MODEL_INPUT_LENGTH = 64
# NOTE(review): hard-coded device string handed to prepare_dataset;
# presumably this must be changed when running on other hardware.
device = 'mps'

# --- Load the GenBench splits --------------------------------------------
train_data = load_data_file("../data/GenBench/train.txt")
eval_ood = load_data_file("../data/GenBench/eval_ood.txt")
eval_id = load_data_file("../data/GenBench/eval_id.txt")
test_ood = load_data_file("../data/GenBench/test_ood.txt")

# --- Vocabulary / tokenizer (built from the training split only) ---------
train_vocab = create_vocab([line.morphemes() for line in train_data], threshold=1)
tokenizer = WordLevelTokenizer(vocab=train_vocab, model_max_length=MODEL_INPUT_LENGTH)

# Label inventory derived from the imported `morphology` object.
glosses = create_gloss_vocab(morphology)

# --- Encode every split with the same tokenizer and label set ------------
dataset = DatasetDict()

dataset['train'] = prepare_dataset(data=train_data, tokenizer=tokenizer, labels=glosses, device=device)
dataset['eval_OOD'] = prepare_dataset(data=eval_ood, tokenizer=tokenizer, labels=glosses, device=device)
dataset['eval_ID'] = prepare_dataset(data=eval_id, tokenizer=tokenizer, labels=glosses, device=device)
dataset['test_OOD'] = prepare_dataset(data=test_ood, tokenizer=tokenizer, labels=glosses, device=device)

Map:   0%|          | 0/5049 [00:00<?, ? examples/s]

Map:   0%|          | 0/2128 [00:00<?, ? examples/s]

Map:   0%|          | 0/2128 [00:00<?, ? examples/s]

Map:   0%|          | 0/2128 [00:00<?, ? examples/s]

In [10]:
from collections import defaultdict

def learn_glosses(dataset):
    """Tally how often each gloss was observed for each morpheme.

    Args:
        dataset: iterable of rows; each row is indexed with 'morphemes'
            and 'glosses', which are walked in parallel (zip stops at
            the shorter of the two).

    Returns:
        A nested defaultdict mapping morpheme -> gloss -> count. Unseen
        glosses default to a count of 0.
    """
    counts_by_morpheme = defaultdict(lambda: defaultdict(lambda: 0))
    for example in dataset:
        aligned_pairs = zip(example['morphemes'], example['glosses'])
        for morph, label in aligned_pairs:
            per_gloss = counts_by_morpheme[morph]
            per_gloss[label] = per_gloss[label] + 1
    return counts_by_morpheme

# Sanity check: show the gloss counts learned from the training split for the '[SEP]' token.
learn_glosses(dataset['train'])['[SEP]']

defaultdict(<function __main__.learn_glosses.<locals>.<lambda>.<locals>.<lambda>()>,
            {'[SEP]': 15542})

In [19]:
import random 

def gloss_with_top_gloss(gloss_dict):
    """Return the gloss with the highest count in ``gloss_dict``.

    Ties go to the gloss that appears first in the dict's iteration
    order. An empty dict raises ValueError (from ``max``).
    """
    best_gloss, _ = max(gloss_dict.items(), key=lambda pair: pair[1])
    return best_gloss

def gloss_with_random_gloss(gloss_dict):
    """Pick one of the glosses in ``gloss_dict`` uniformly at random.

    The counts are ignored: every distinct gloss is equally likely.
    """
    candidates = [gloss for gloss in gloss_dict.keys()]
    return random.choice(candidates)

def make_preds(train_dataset, prediction_dataset, pred_strategy):
    """Predict a gloss for every morpheme using counts learned from training data.

    Args:
        train_dataset: rows passed to ``learn_glosses`` to build the
            morpheme -> gloss-count table.
        prediction_dataset: iterable of rows; only ``row['morphemes']``
            is read.
        pred_strategy: callable that receives the gloss-count dict of a
            known morpheme and returns the predicted gloss.

    Returns:
        A list with one list of predicted glosses per row. Morphemes
        absent from the training table are predicted as "???".
    """
    gloss_counts = learn_glosses(train_dataset)

    def predict_one(morpheme):
        # Membership test first: indexing the table directly would insert
        # an empty entry for an unseen morpheme.
        if morpheme in gloss_counts:
            return pred_strategy(gloss_counts[morpheme])
        return "???"

    return [
        [predict_one(morpheme) for morpheme in row['morphemes']]
        for row in prediction_dataset
    ]

# Smoke test: predict glosses for the eval_ID split with the random strategy and show the first row.
make_preds(dataset['train'], dataset['eval_ID'], gloss_with_random_gloss)[0]

['E1S', '[SEP]', 'VOC', '[SEP]', 'E1S', '[SEP]', 'VOC']

In [23]:
from eval import eval_accuracy

def eval_data(dataset, splits=('eval_ID', 'eval_OOD', 'test_OOD')):
    """Print baseline accuracy for each evaluation split.

    For every split, predictions are made with the random-gloss and the
    top-gloss strategies (both learned from ``dataset['train']``), scored
    with ``eval_accuracy``, and printed one line per strategy.

    Args:
        dataset: mapping of split name to rows; must contain 'train' and
            every name in ``splits``. Each evaluation row is indexed
            with 'glosses' for the gold labels.
        splits: names of the splits to evaluate. Defaults to the three
            splits this function previously hard-coded.

    Returns:
        None. Results are printed.
    """
    # (label used in the printed line, prediction strategy); the order
    # fixes the order of the output lines.
    strategies = (
        ('random', gloss_with_random_gloss),
        ('top', gloss_with_top_gloss),
    )
    for dataset_id in splits:
        eval_labels = [row['glosses'] for row in dataset[dataset_id]]
        for strategy_name, strategy in strategies:
            preds = make_preds(dataset['train'], dataset[dataset_id], strategy)
            perf = eval_accuracy(preds, eval_labels)
            print(f"{dataset_id} / {strategy_name}: {perf['accuracy']}")
    

# Print the accuracy of both baseline strategies on each evaluation split.
eval_data(dataset)

eval_ID / random: 0.4436551501453019
eval_ID / top: 0.8496932515337423
eval_OOD / random: 0.40628166160081053
eval_OOD / top: 0.7416413373860182
test_OOD / random: 0.4020343384792959
test_OOD / top: 0.7485211369210792
