## Basic Model Training
Let's get our data into a format that we can use to train transformer-style models.

We will start with the Korean dataset, as it is well-annotated.

In [1]:
!pip install xigt



In [2]:
from xigt.codecs import xigtxml
corpus = xigtxml.load(open('../data/kor.xml'))

In [70]:
class MissingValueError(Exception):
    pass

# From a single line of IGT, extracts the features which are allowed in this shared task:
# 1. Transcribed words (not segmented)
# 2. Translation (not aligned)
# 3. Glosses
def extract_igt(igt):
    if not igt.get('w'):
        raise MissingValueError("words")
    if not igt.get('tw'):
        raise MissingValueError("translation")
    if not igt.get('gw'):
        raise MissingValueError("glosses")
        
    words = [word.value() for word in igt['w'].items]
    glosses = [gloss.value() for gloss in igt['gw'].items]
    
    translation = [item.value() for item in igt['tw']]
    return {'words': words, 'translation': translation, 'glosses': glosses}
    
extract_igt(corpus[1])

{'words': ['ani', ',', 'caki-ka', 'cikcep', 'o-ess-ta', '.'],
 'translation': ['No', ',', 'self', 'came', 'in', 'person', '.'],
 'glosses': ['no', 'self-NOM', 'in-person', 'come-PAST-DEC']}

In [71]:
corpus_data = []

missing_words_count = 0
missing_translation_count = 0
missing_gloss_count = 0
all_good_count = 0

for i, igt in enumerate(corpus):
    try:
        igt_data = extract_igt(igt)
        corpus_data.append(igt_data)
        all_good_count += 1
    except MissingValueError as v:
        match str(v):
            case 'words': missing_words_count += 1
            case 'translation': missing_translation_count += 1
            case 'glosses': missing_gloss_count += 1
            case 'alignments': missing_aligments_count += 1
    
print(f"Parsed corpus, with \n\t{all_good_count} good rows\n\t{missing_words_count} rows missing words\
        \n\t{missing_translation_count} missing translations\n\t{missing_gloss_count} missing glosses")

Parsed corpus, with 
	4839 good rows
	73 rows missing words        
	471 missing translations
	0 missing glosses


In [72]:
corpus_data[4]

{'words': ['Chelsu-nun', 'pam-ul', 'kuw-e', 'mek-ess-ta', '.'],
 'translation': ['Chelsu', 'broiled', 'and', 'ate', 'the', 'chestnut'],
 'glosses': ['Top', 'chestnut-Acc', 'broil-Inf', 'eat-Past-Dec']}

In [74]:
# Let's remove the dashes from the input, to simulate the case where we don't have segmentation
for item in corpus_data:
    for i, word in enumerate(item['words']):
        item['words'][i] = word.replace('-', '')
        
corpus_data[4]

{'words': ['Chelsunun', 'pamul', 'kuwe', 'mekessta', '.'],
 'translation': ['Chelsu', 'broiled', 'and', 'ate', 'the', 'chestnut'],
 'glosses': ['Top', 'chestnut-Acc', 'broil-Inf', 'eat-Past-Dec']}

In [75]:
# Let's also split the output by dashes
for item in corpus_data:
    glosses = []
    for i, word in enumerate(item['glosses']):
        word_glosses = word.split("-")
        glosses.append(word_glosses[0])
        glosses += ["-" + gloss for gloss in word_glosses[1:]]
    item['glosses'] = glosses

corpus_data[4]

{'words': ['Chelsunun', 'pamul', 'kuwe', 'mekessta', '.'],
 'translation': ['Chelsu', 'broiled', 'and', 'ate', 'the', 'chestnut'],
 'glosses': ['Top',
  'chestnut',
  '-Acc',
  'broil',
  '-Inf',
  'eat',
  '-Past',
  '-Dec']}

Notes:
- We originally tried to align words and glosses, but it turns out a huge number of rows are either missing alignments, or have completely wrong alignments. Rather than mess up our model with incorrect data, we will simply provide unaligned glosses.
- There's a lot of messy unnecessary data. We will have to count on the transformer to deal with those.

# Encoding
Input: transcription + translation

Output: glosses (stems and grams)

We need to encode all of our items, input and output, as integers.

In [76]:
from typing import List

special_chars = ["[UNK]", "[SEP]", "[PAD]", "[MASK]", "[BOS]", "[EOS]"]

def create_vocab(sentences: List[List[str]], threshold=2):
    all_words = dict()
    for sentence in sentences:
        for word in sentence:
            all_words[word.lower()] = all_words.get(word.lower(), 0) + 1

    all_words_list = []
    for word, count in all_words.items():
        if count >= threshold:
            all_words_list.append(word)

    return sorted(all_words_list)

source_vocab = create_vocab([item['words'] for item in corpus_data])
len(source_vocab)

3169

In [77]:
# Also create a list for the target and gloss words
target_and_gloss_vocab = create_vocab([item['translation'] for item in corpus_data] + [item['glosses'] for item in corpus_data])
print(len(target_and_gloss_vocab))

3357


In [78]:
def encode_word(word, vocab='source'):
    word = word.lower()
    
    if word in special_chars:
        return special_chars.index(word)
    if vocab=='source':
        if word in source_vocab:
            return source_vocab.index(word) + len(special_chars)
        else:
            return 0
    else:
        if word in target_and_gloss_vocab:
            return target_and_gloss_vocab.index(word) + len(special_chars) + len(source_vocab)
        else:
            return 0

encode_word('', vocab='transl')

3175

In [80]:
import torch

MODEL_INPUT_LENGTH = 512

PAD_ID = special_chars.index("[PAD]")
SEP_ID = special_chars.index("[SEP]")

# Encodes a sentence as integers, and pads it
def encode(sentence: List[str], vocab='source') -> List[int]:
    return [encode_word(word, vocab=vocab) for word in sentence]
            
encode(corpus_data[4]['words']) 

[481, 2143, 1532, 1721, 57]

Now let's divide our data and turn it into the Dataset format.

In [81]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(corpus_data, test_size=0.3)
test, dev = train_test_split(test, test_size=0.5)

print(f"Train: {len(train)}")
print(f"Dev: {len(dev)}")
print(f"Test: {len(test)}")

Train: 3387
Dev: 726
Test: 726


In [84]:
print(train[2])

{'words': ['ce', 'seykayuy', 'cakun', 'inhyeng'], 'translation': ['Those', 'three', 'little', 'dolls'], 'glosses': ['DEM', 'three', '-CL', '-GEN', 'little', 'doll']}


In [86]:
from datasets import Dataset, DatasetDict

raw_dataset = DatasetDict()
raw_dataset['train'] = Dataset.from_list(train)
raw_dataset['validation'] = Dataset.from_list(dev)
raw_dataset['test'] = Dataset.from_list(test)

raw_dataset

DatasetDict({
    train: Dataset({
        features: ['words', 'translation', 'glosses'],
        num_rows: 3387
    })
    validation: Dataset({
        features: ['words', 'translation', 'glosses'],
        num_rows: 726
    })
    test: Dataset({
        features: ['words', 'translation', 'glosses'],
        num_rows: 726
    })
})

In [91]:
def preprocess(row):
    """Preprocesses each row in the dataset
    1. Combines the source and translation into a single list, and encodes
    2. Pads the combined input and output sequences
    3. Creates attention mask
    """
    source_enc = encode(row['words'])
    transl_enc = encode(row['translation'], vocab='transl')
    combined_enc = source_enc + [SEP_ID] + transl_enc
    
    # Pad
    initial_length = len(combined_enc)
    combined_enc += [PAD_ID] * (MODEL_INPUT_LENGTH - initial_length)
    
    # Create attention mask
    attention_mask = [1] * initial_length + [0] * (MODEL_INPUT_LENGTH - initial_length)
    
    # Encode the output
    output_enc = encode(row['glosses'], vocab='transl')
    output_enc = [BOS_ID] + output_enc + [EOS_ID]
    initial_length = len(output_enc)
    output_enc += [PAD_ID] * (MODEL_INPUT_LENGTH - initial_length)
    
    return {'input_ids': torch.tensor(combined_enc), 'attention_mask': torch.tensor(attention_mask), 'decoder_input_ids': torch.tensor(output_enc)}
    
preprocess(raw_dataset['train'][1])

{'input_ids': tensor([1878,  766,  140, 2024, 2970, 2092, 3091,  145,  540, 2978, 1301,  135,
            1, 3216, 4939, 3944, 6510, 4793, 6241, 5170, 5000, 3185, 4875, 3198,
         4324, 4602, 3781,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,    2,
            2, 

In [94]:
# Map to all datasets
dataset = DatasetDict()
dataset['train'] = raw_dataset['train'].map(preprocess)
dataset['validation'] = raw_dataset['validation'].map(preprocess)
dataset['test'] = raw_dataset['train'].map(preprocess)

# preprocess_all()
# raw_dataset['train'][:10]


  0%|          | 0/3387 [00:00<?, ?ex/s]

Dataset({
    features: ['words', 'translation', 'glosses', 'input_ids', 'attention_mask', 'decoder_input_ids'],
    num_rows: 3387
})

In [37]:
def encode_batch(batch: List[List[List[str]]]):
    """Input should be a list of lists, where the first item is the source and the second is the translation"""
    all_encoded = {'input_ids': [], 'attention_mask': []}
    
    for item in batch:
        encoded = encode_item(item)
        all_encoded['input_ids'].append(encoded['input_ids'])
        all_encoded['attention_mask'].append(encoded['attention_mask'])
    
    input_ids = torch.stack(all_encoded['input_ids'])
    attention_mask = torch.stack(all_encoded['attention_mask'])
    return {'input_ids': input_ids, 'attention_mask': attention_mask}

encode_batch(corpus_data[:5])

{'input_ids': tensor([[  17,   79,   20,  ...,    2,    2,    2],
         [ 242,   55,  339,  ...,    2,    2,    2],
         [ 154,   57,  481,  ...,    2,    2,    2],
         [ 295,   57, 1400,  ...,    2,    2,    2],
         [ 481, 2143, 1532,  ...,    2,    2,    2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [44]:
BOS_ID = special_chars.index("[BOS]")
EOS_ID = special_chars.index("[EOS]")

def encode_gloss_batch(batch: List[List[str]]):
    """Encodes an output batch. Each item should be a list of output gloss words."""
    all_encoded = {'input_ids': [], 'attention_mask': []}
    for item in batch:
        enc = encode(item, vocab='transl')
        enc = [BOS_ID] + enc + [EOS_ID]
        initial_length = len(enc)
        enc += [PAD_ID] * (MODEL_INPUT_LENGTH - initial_length)
    
        # Create attention mask
        attention_mask = [1] * initial_length + [0] * (MODEL_INPUT_LENGTH - initial_length)
        all_encoded['input_ids'].append(torch.tensor(enc))
        all_encoded['attention_mask'].append(torch.tensor(attention_mask))
        
    return {'input_ids': torch.stack(all_encoded['input_ids']), 'attention_mask': torch.stack(all_encoded['attention_mask'])}

print(encode_gloss_batch([item[2] for item in corpus_data[:5]]))

{'input_ids': tensor([[   4, 3175, 3525,  ...,    2,    2,    2],
        [   4, 5421, 5862,  ...,    2,    2,    2],
        [   4, 4259, 3683,  ...,    2,    2,    2],
        [   4, 4705, 3525,  ...,    2,    2,    2],
        [   4, 6265, 4264,  ...,    2,    2,    2]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [55]:
def chunk(lst, n):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]
        
batch_size = 64

train_input_batches = [encode_batch(b) for b in chunk(corpus_data, batch_size)]
train_input_batches[0]

{'input_ids': tensor([[  17,   79,   20,  ...,    2,    2,    2],
         [ 242,   55,  339,  ...,    2,    2,    2],
         [ 154,   57,  481,  ...,    2,    2,    2],
         ...,
         [1169,    0,    1,  ...,    2,    2,    2],
         [ 140, 2358, 1154,  ...,    2,    2,    2],
         [1676,  140, 2358,  ...,    2,    2,    2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [45]:
# Same for output
train_output_batches = [encode_gloss_batch(b) for b in chunk([item[2] for item in corpus_data], batch_size)]
train_output_batches[0]

{'input_ids': tensor([[   4, 3175, 3525,  ...,    2,    2,    2],
         [   4, 5421, 5862,  ...,    2,    2,    2],
         [   4, 4259, 3683,  ...,    2,    2,    2],
         ...,
         [   4, 5077, 3425,  ...,    2,    2,    2],
         [   4, 5077, 3525,  ...,    2,    2,    2],
         [   4, 5278, 3525,  ...,    2,    2,    2]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

## Model Creation

In [48]:
from transformers import BartConfig, BartForConditionalGeneration

config = BartConfig(
    vocab_size=len(source_vocab) + len(target_and_gloss_vocab),
    max_position_embeddings=512,
    pad_token_id=PAD_ID,
    bos_token_id=BOS_ID,
    eos_token_id=EOS_ID,
    decoder_start_token_id=BOS_ID,
    forced_eos_token_id=EOS_ID
)

model = BartForConditionalGeneration(config)
model.config

BartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "attention_dropout": 0.0,
  "bos_token_id": 4,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 4,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 5,
  "forced_eos_token_id": 5,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "bart",
  "num_hidden_layers": 12,
  "pad_token_id": 2,
  "scale_embedding": false,
  "transformers_version": "4.21.3",
  "use_cache": true,
  "vocab_size": 6526
}

In [50]:
# Confirm our model works with our data
# model(input_ids=train_input_batches[0]['input_ids'],
#       attention_mask=train_input_batches[0]['attention_mask'],
#       decoder_input_ids=train_output_batches[0]['input_ids'],
#       decoder_attention_mask=train_output_batches[0]['attention_mask'])

## Training

In [52]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

args = Seq2SeqTrainingArguments(
    f"igt-word-level",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    predict_with_generate=True,
    # fp16=True,
)

In [None]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)