## Basic Model Training
Let's get our data into a format that we can use to train transformer-style models.

We will start with the Korean dataset, as it is well-annotated.

In [1]:
!pip install xigt



In [2]:
from xigt.codecs import xigtxml
corpus = xigtxml.load(open('../data/kor.xml'))

In [3]:
class MissingValueError(Exception):
    pass

# From a single line of IGT, extracts the features which are allowed in this shared task:
# 1. Transcribed words (not segmented)
# 2. Translation (not aligned)
# 3. Glosses
def extract_igt(igt):
    if not igt.get('w'):
        raise MissingValueError("words")
    if not igt.get('tw'):
        raise MissingValueError("translation")
    if not igt.get('gw'):
        raise MissingValueError("glosses")
        
    words = [word.value() for word in igt['w'].items]
    glosses = [gloss.value() for gloss in igt['gw'].items]
    
    translation = [item.value() for item in igt['tw']]
    return [words, translation, glosses]
    
extract_igt(corpus[1])

[['ani', ',', 'caki-ka', 'cikcep', 'o-ess-ta', '.'],
 ['No', ',', 'self', 'came', 'in', 'person', '.'],
 ['no', 'self-NOM', 'in-person', 'come-PAST-DEC']]

In [4]:
corpus_data = []

missing_words_count = 0
missing_translation_count = 0
missing_gloss_count = 0
all_good_count = 0

for i, igt in enumerate(corpus):
    try:
        igt_data = extract_igt(igt)
        corpus_data.append(igt_data)
        all_good_count += 1
    except MissingValueError as v:
        match str(v):
            case 'words': missing_words_count += 1
            case 'translation': missing_translation_count += 1
            case 'glosses': missing_gloss_count += 1
            case 'alignments': missing_aligments_count += 1
    
print(f"Parsed corpus, with \n\t{all_good_count} good rows\n\t{missing_words_count} rows missing words\
        \n\t{missing_translation_count} missing translations\n\t{missing_gloss_count} missing glosses")

Parsed corpus, with 
	4839 good rows
	73 rows missing words        
	471 missing translations
	0 missing glosses


In [5]:
corpus_data[4]

[['Chelsu-nun', 'pam-ul', 'kuw-e', 'mek-ess-ta', '.'],
 ['Chelsu', 'broiled', 'and', 'ate', 'the', 'chestnut'],
 ['Top', 'chestnut-Acc', 'broil-Inf', 'eat-Past-Dec']]

In [6]:
# Let's remove the dashes from the input, to simulate the case where we don't have segmentation
for item in corpus_data:
    for i, word in enumerate(item[0]):
        item[0][i] = word.replace('-', '')
        
corpus_data[4]

[['Chelsunun', 'pamul', 'kuwe', 'mekessta', '.'],
 ['Chelsu', 'broiled', 'and', 'ate', 'the', 'chestnut'],
 ['Top', 'chestnut-Acc', 'broil-Inf', 'eat-Past-Dec']]

In [7]:
# Let's also split the output by dashes
for item in corpus_data:
    glosses = []
    for i, word in enumerate(item[2]):
        word_glosses = word.split("-")
        glosses.append(word_glosses[0])
        glosses += ["-" + gloss for gloss in word_glosses[1:]]
    item[2] = glosses

corpus_data[4]

[['Chelsunun', 'pamul', 'kuwe', 'mekessta', '.'],
 ['Chelsu', 'broiled', 'and', 'ate', 'the', 'chestnut'],
 ['Top', 'chestnut', '-Acc', 'broil', '-Inf', 'eat', '-Past', '-Dec']]

Notes:
- We originally tried to align words and glosses, but it turns out a huge number of rows are either missing alignments, or have completely wrong alignments. Rather than mess up our model with incorrect data, we will simply provide unaligned glosses.
- There's a lot of messy unnecessary data. We will have to count on the transformer to deal with those.

# Encoding
Input: transcription + translation

Output: glosses (stems and grams)

We need to encode all of our items, input and output, as integers.

In [28]:
from typing import List

special_chars = ["[UNK]", "[SEP]", "[PAD]", "[MASK]", "[BOS]", "[EOS]"]

def create_vocab(sentences: List[List[str]], threshold=2):
    all_words = dict()
    for sentence in sentences:
        for word in sentence:
            all_words[word.lower()] = all_words.get(word.lower(), 0) + 1

    all_words_list = []
    for word, count in all_words.items():
        if count >= threshold:
            all_words_list.append(word)

    return sorted(all_words_list)

source_vocab = create_vocab([item[0] for item in corpus_data])
len(source_vocab)

3169

In [11]:
# Also create a list for the target and gloss words
target_and_gloss_vocab = create_vocab([item[1] for item in corpus_data] + [item[2] for item in corpus_data])
print(len(target_and_gloss_vocab))

3357


In [18]:
def encode_word(word, vocab='source'):
    word = word.lower()
    
    if word in special_chars:
        return special_chars.index(word)
    if vocab=='source':
        if word in source_vocab:
            return source_vocab.index(word) + len(special_chars)
        else:
            return 0
    else:
        if word in target_and_gloss_vocab:
            return target_and_gloss_vocab.index(word) + len(special_chars) + len(source_vocab)
        else:
            return 0

encode_word('', vocab='transl')

3175

In [13]:
import torch

MODEL_INPUT_LENGTH = 512

PAD_ID = special_chars.index("[PAD]")
SEP_ID = special_chars.index("[SEP]")

# Encodes a sentence as integers, and pads it
def encode(sentence: List[str], vocab='source') -> List[int]:
    return [encode_word(word, vocab=vocab) for word in sentence]
            
    
def encode_item(item) -> List[int]:
    """Encodes an item from the data set by combining the source and target"""
    source_enc = encode(item[0])
    target_enc = encode(item[1], vocab='transl')
    combined_enc = source_enc + [SEP_ID] + target_enc
    
    # Pad
    initial_length = len(combined_enc)
    combined_enc += [PAD_ID] * (MODEL_INPUT_LENGTH - initial_length)
    
    # Create attention mask
    attention_mask = [1] * initial_length + [0] * (MODEL_INPUT_LENGTH - initial_length)
    return {'input_ids': torch.tensor(combined_enc), 'attention_mask': torch.tensor(attention_mask)}
    
encode_item(corpus_data[4])

{'input_ids': tensor([ 481, 2143, 1532, 1721,   57,    1, 4259, 4160, 3917, 3976, 6189, 4264,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2, 

In [37]:
def encode_batch(batch: List[List[List[str]]]):
    """Input should be a list of lists, where the first item is the source and the second is the translation"""
    all_encoded = {'input_ids': [], 'attention_mask': []}
    
    for item in batch:
        encoded = encode_item(item)
        all_encoded['input_ids'].append(encoded['input_ids'])
        all_encoded['attention_mask'].append(encoded['attention_mask'])
    
    input_ids = torch.stack(all_encoded['input_ids'])
    attention_mask = torch.stack(all_encoded['attention_mask'])
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

encode_batch(corpus_data[:5])

{'input_ids': tensor([[  17,   79,   20,  ...,    2,    2,    2],
         [ 242,   55,  339,  ...,    2,    2,    2],
         [ 154,   57,  481,  ...,    2,    2,    2],
         [ 295,   57, 1400,  ...,    2,    2,    2],
         [ 481, 2143, 1532,  ...,    2,    2,    2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [44]:
BOS_ID = special_chars.index("[BOS]")
EOS_ID = special_chars.index("[EOS]")

def encode_gloss_batch(batch: List[List[str]]):
    """Encodes an output batch. Each item should be a list of output gloss words."""
    all_encoded = {'input_ids': [], 'attention_mask': []}
    for item in batch:
        enc = encode(item, vocab='transl')
        enc = [BOS_ID] + enc + [EOS_ID]
        initial_length = len(enc)
        enc += [PAD_ID] * (MODEL_INPUT_LENGTH - initial_length)
    
        # Create attention mask
        attention_mask = [1] * initial_length + [0] * (MODEL_INPUT_LENGTH - initial_length)
        all_encoded['input_ids'].append(torch.tensor(enc))
        all_encoded['attention_mask'].append(torch.tensor(attention_mask))
        
    return {'input_ids': torch.stack(all_encoded['input_ids']), 'attention_mask': torch.stack(all_encoded['attention_mask'])}

print(encode_gloss_batch([item[2] for item in corpus_data[:5]]))

{'input_ids': tensor([[   4, 3175, 3525,  ...,    2,    2,    2],
        [   4, 5421, 5862,  ...,    2,    2,    2],
        [   4, 4259, 3683,  ...,    2,    2,    2],
        [   4, 4705, 3525,  ...,    2,    2,    2],
        [   4, 6265, 4264,  ...,    2,    2,    2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [42]:
def chunk(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]
        
batch_size = 64

train_input_batches = [encode_batch(b) for b in chunk(corpus_data, batch_size)]
train_input_batches[0]

{'input_ids': tensor([[  17,   79,   20,  ...,    2,    2,    2],
         [ 242,   55,  339,  ...,    2,    2,    2],
         [ 154,   57,  481,  ...,    2,    2,    2],
         ...,
         [1169,    0,    1,  ...,    2,    2,    2],
         [ 140, 2358, 1154,  ...,    2,    2,    2],
         [1676,  140, 2358,  ...,    2,    2,    2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [45]:
# Same for output
train_output_batches = [encode_gloss_batch(b) for b in chunk([item[2] for item in corpus_data], batch_size)]
train_output_batches[0]

{'input_ids': tensor([[   4, 3175, 3525,  ...,    2,    2,    2],
         [   4, 5421, 5862,  ...,    2,    2,    2],
         [   4, 4259, 3683,  ...,    2,    2,    2],
         ...,
         [   4, 5077, 3425,  ...,    2,    2,    2],
         [   4, 5077, 3525,  ...,    2,    2,    2],
         [   4, 5278, 3525,  ...,    2,    2,    2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}