# GPU stuff

In [1]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


# Packages and imports

In [3]:
!pip3 install torchmetrics==0.4.1
!pip3 install transformers==4.8.2
!pip3 install pytorch_lightning==1.3.8
!pip3 install Levenshtein

import json
import pandas as pd
import numpy as np
import random
from pytorch_lightning.utilities.apply_func import move_data_to_device
import Levenshtein as Lev
from tqdm.auto import tqdm

# Rasmus imports
from typing import List
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import T_co
from transformers import AutoTokenizer, T5ForConditionalGeneration

Collecting torchmetrics==0.4.1
  Downloading torchmetrics-0.4.1-py3-none-any.whl (234 kB)
[?25l[K     |█▍                              | 10 kB 26.9 MB/s eta 0:00:01[K     |██▉                             | 20 kB 19.9 MB/s eta 0:00:01[K     |████▏                           | 30 kB 10.4 MB/s eta 0:00:01[K     |█████▋                          | 40 kB 8.6 MB/s eta 0:00:01[K     |███████                         | 51 kB 5.4 MB/s eta 0:00:01[K     |████████▍                       | 61 kB 5.5 MB/s eta 0:00:01[K     |█████████▊                      | 71 kB 5.5 MB/s eta 0:00:01[K     |███████████▏                    | 81 kB 6.1 MB/s eta 0:00:01[K     |████████████▋                   | 92 kB 6.3 MB/s eta 0:00:01[K     |██████████████                  | 102 kB 5.3 MB/s eta 0:00:01[K     |███████████████▍                | 112 kB 5.3 MB/s eta 0:00:01[K     |████████████████▊               | 122 kB 5.3 MB/s eta 0:00:01[K     |██████████████████▏             | 133 kB 5.3 MB/

# Load Data

Mount drive to load files faster

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
class MultiPlexDataset(Dataset):

    def __init__(self,
                 path_to_files: List[str],
                 only_include_corrections: bool = False):
        """

        :param path_to_files: List of paths to the files with data
        :param only_include_corrections: Whether to only include samples where there are corrections
        """

        self.only_include_corrections = only_include_corrections
        self.dataset_counter = 0
        self.data = {}

        print("Loading data...")
        for path in path_to_files:
            with open(path, "r", encoding="utf-8") as f:
                data = f.read().split("\n")

            current_norm_words = []
            current_ref_words = []

            for line in data:
                if not line:
                    self.create_samples(current_norm_words, current_ref_words)
                    current_ref_words = []
                    current_norm_words = []
                else:
                    norm, ref = line.split("\t")
                    current_norm_words.append(norm)
                    current_ref_words.append(ref)

        print("Dataset initialized...")

    def create_samples(self, norms, refs):
        if norms and refs:
            for i, word in enumerate(norms):

                if self.only_include_corrections and word == refs[i]:
                    continue

                if i == 0:
                    sample_input = "<extra_id_0>" + word + "<extra_id_1> " + " ".join(norms[i + 1:])
                elif i == len(norms) - 1:
                    sample_input = " ".join(norms[:i]) + " <extra_id_0>" + word + "<extra_id_1>"
                else:
                    sample_input = " ".join(norms[:i]) + " <extra_id_0>" + word + "<extra_id_1> " + " ".join(
                        norms[i + 1:])

                self.data[self.dataset_counter] = {"input_sample": sample_input, "expected_output": refs[i]}
                self.dataset_counter += 1

    def __getitem__(self, index) -> T_co:
        return self.data[index]

    def __len__(self):
        return len(self.data.keys())


class CollateFunctor:
    def __init__(self, tokenizer, encoder_max_length=320, decoder_max_length=32):
        self.tokenizer = tokenizer
        self.encoder_max_length = encoder_max_length
        self.decoder_max_length = decoder_max_length

    def __call__(self, samples):
        inputs = list(map(lambda x: x["input_sample"], samples))

        inputs = self.tokenizer(
            inputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.encoder_max_length, return_attention_mask=True, return_tensors='pt'
        )

        outputs = list(map(lambda x: x["expected_output"], samples))

        outputs = self.tokenizer(
            outputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.decoder_max_length, return_attention_mask=True, return_tensors='pt'
        )

        batch = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids,
            "decoder_attention_mask": outputs.attention_mask
        }
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100  # used to mask the loss in T5
        return batch

In [6]:
dataset = MultiPlexDataset(path_to_files=["/content/drive/MyDrive/text/noisy_danish_data.txt"], only_include_corrections=True)

Loading data...
Dataset initialized...


# Train test split and shorten data

In [7]:
N = 36010
dataset_short = random.sample(dataset.data.items(), N)

In [8]:
input_samples = list(map(lambda x: x[1]["input_sample"], dataset_short))
expected_outputs = list(map(lambda x: x[1]["expected_output"], dataset_short))

In [9]:
pc = 0.1
from sklearn.model_selection import train_test_split
input_samples_train, input_samples_test, expected_outputs_train, expected_outputs_test = train_test_split(input_samples, expected_outputs, test_size=pc)

In [10]:
print("Total number of train inputs", len(input_samples_train))
print("Total number of train outputs", len(expected_outputs_train))
print("Total number of test inputs", len(input_samples_test))
print("Total number of test outputs", len(expected_outputs_test))

Total number of train inputs 32409
Total number of train outputs 32409
Total number of test inputs 3601
Total number of test outputs 3601


# Baseline WER

In [11]:
def calculate_wer(s1, s2):
    b = set(s1.split() + s2.split())
    word2char = dict(zip(b, range(len(b))))
    w1 = [chr(word2char[w]) for w in s1.split()]
    w2 = [chr(word2char[w]) for w in s2.split()]
    return Lev.distance(''.join(w1), ''.join(w2))

In [12]:
total_wer_baseline = 0
for i, input in enumerate(input_samples_test):
  idx1 = input.find('<') + 12
  input = input[idx1:]
  idx2 = input.find('<')
  input = input[:idx2]
  
  wer_baseline =  calculate_wer(expected_outputs_test[i],input)
  total_wer_baseline= total_wer_baseline + wer_baseline

avg_err_baseline = total_wer_baseline/len(input_samples_test)

print("Baseline word error rate is", avg_err_baseline)

Baseline word error rate is 1.00944182171619


# Untrained MLN baseline

In [13]:
tokenizer = AutoTokenizer.from_pretrained("ufal/byt5-small-multilexnorm2021-da")
model = T5ForConditionalGeneration.from_pretrained("ufal/byt5-small-multilexnorm2021-da")

Downloading:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/706 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

In [14]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(384, 1472)
  (encoder): T5Stack(
    (embed_tokens): Embedding(384, 1472)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=1472, out_features=384, bias=False)
              (k): Linear(in_features=1472, out_features=384, bias=False)
              (v): Linear(in_features=1472, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=1472, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedGeluDense(
              (wi_0): Linear(in_features=1472, out_features=3584, bias=False)
              (wi_1): Linear(in_features=1472, out_features=3584, bias=False)
              (

In [15]:
total_wer_untrained= 0
output_print=[]

progress_bar = tqdm(range(len(input_samples_test)))

model.eval()
for i, input in enumerate(input_samples_test):
  input_batch = tokenizer(input, padding=True, truncation=False, pad_to_multiple_of=8,
                        return_attention_mask=True, return_tensors='pt')
  input_batch = move_data_to_device(input_batch, device)
  output= model.generate(**input_batch)
  decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
  decoded_output= " ".join(decoded_output)

  output_print.append(decoded_output)

  wer_untrained=  calculate_wer(expected_outputs_test[i],decoded_output)
  total_wer_untrained= total_wer_untrained +wer_untrained

  progress_bar.update(1)


avg_err_untrained= total_wer_untrained/len(input_samples_test)
output_untrained = output_print


print("Average word error rate(Untrained MLN) is", avg_err_untrained)

  0%|          | 0/3601 [00:00<?, ?it/s]

Average word error rate(Untrained MLN) is 1.0011108025548459


In [27]:
 incorrect_predictions_untrained = len(expected_outputs_test) - len([i for i, j in zip(output_untrained, expected_outputs_test) if i == j])
 print('incorrect predictions: ', incorrect_predictions_untrained, ' out of ', len(expected_outputs_test))

incorrect predictions:  3571  out of  3601


# Trained MLN

In [16]:
class MultilexnormDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __getitem__(self, index):
        return self.inputs[index], self.outputs[index]

    def __len__(self):
        return len(self.inputs)

In [17]:
class CollateFunctor:
    def __init__(self, tokenizer, encoder_max_length=320, decoder_max_length=32):
        self.tokenizer = tokenizer
        self.encoder_max_length = encoder_max_length
        self.decoder_max_length = decoder_max_length

    def __call__(self, samples):

        # should be list of input and list of output
        inputs, outputs = map(list, zip(*samples))

        inputs = self.tokenizer(
            inputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.encoder_max_length, return_attention_mask=True, return_tensors='pt'
        )

        outputs = self.tokenizer(
            outputs, padding=True, truncation=True, pad_to_multiple_of=8,
            max_length=self.decoder_max_length, return_attention_mask=True, return_tensors='pt'
        )

        batch = {
            "input_ids": inputs.input_ids,
            "attention_mask": inputs.attention_mask,
            "labels": outputs.input_ids,
            "decoder_attention_mask": outputs.attention_mask
        }
        batch["labels"][batch["labels"] == self.tokenizer.pad_token_id] = -100  # used to mask the loss in T5
        return batch

In [18]:
dataset_mln = MultilexnormDataset(input_samples_train, expected_outputs_train)

In [19]:
dataloader = DataLoader(dataset_mln, batch_size=4, collate_fn = CollateFunctor(tokenizer))

In [20]:
from transformers import get_scheduler,AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [21]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))
total_loss = 0

model.train()
for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
      batch = move_data_to_device(batch, device)
      outputs = model(**batch)
      loss = outputs.loss
      total_loss = total_loss + loss.item()
      loss.backward()

      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)

print(total_loss/len(batch))

  0%|          | 0/8103 [00:00<?, ?it/s]

1060.5532449658494


In [22]:
print((total_loss/len(batch))/len(dataloader))

0.13088402381412434


In [23]:
total_wer = 0


progress_bar = tqdm(range(len(input_samples_test)))
output_print=[]

model.eval()
for i, input in enumerate(input_samples_test):
  input_batch = tokenizer(input, padding=True, truncation=False, pad_to_multiple_of=8,
                        return_attention_mask=True, return_tensors='pt')
  input_batch = move_data_to_device(input_batch, device)
  output= model.generate(**input_batch)
  decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
  decoded_output= " ".join(decoded_output)

  output_print.append(decoded_output)
  wer=  calculate_wer(expected_outputs_test[i],decoded_output)
  total_wer= total_wer +wer

  progress_bar.update(1)

output_trained = output_print

  0%|          | 0/3601 [00:00<?, ?it/s]

In [24]:
avg_err_trained= total_wer/len(input_samples_test)
print(avg_err_trained)

0.34768119966675926


In [25]:
 incorrect_predictions_trained = len(expected_outputs_test) - len([i for i, j in zip(output_print, expected_outputs_test) if i == j])
 print('incorrect predictions: ', incorrect_predictions_trained, ' out of ', len(expected_outputs_test))

incorrect predictions:  1208  out of  3601


# Comparison

In [38]:
print("mean error (baseline):  ", avg_err_baseline)
print("mean error (untrained): ", avg_err_untrained)
print('mean error (trained):   ', avg_err_trained)
print('incorrect predictions (baseline):  ', 'all')
print('incorrect predictions (untrained): ', incorrect_predictions_untrained, ' out of ', len(expected_outputs_test))
print('incorrect predictions (trained):   ', incorrect_predictions_trained, ' out of ', len(expected_outputs_test))

mean error (baseline):   1.00944182171619
mean error (untrained):  1.0011108025548459
mean error (trained):    0.34768119966675926
incorrect predictions (baseline):   all
incorrect predictions (untrained):  3571  out of  3601
incorrect predictions (trained):    1208  out of  3601


In [41]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               skuret måler om ring tre gange to meter har krydsfiner <extra_id_0>vægge<extra_id_1> grus bund og et bliktag
expected output:     
output (untrained):  vægge
output (trained):    


input:               de to andre står pænt i kø på sengen <extra_id_0>i<extra_id_1> mands punktum
expected output:     imens
output (untrained):  i
output (trained):    imands


input:               desuden kan man lave top <extra_id_0>tekster<extra_id_1> og bund tekster indsætte grafik og beder udskrive kuverter flette
expected output:     
output (untrained):  tekster
output (trained):    


input:               det er jo tør være næsten <extra_id_0>hver<extra_id_1> dag
expected output:     vejr
output (untrained):  hver
output (trained):    vejr


input:               det er tanke bevægende at de i tredje klasse er <extra_id_0>motiveret<extra_id_1> for at læse men at de herefter taber interessen for læsning
expected output:     motiverede
output (untrained):  motiveret
output (traine

In [42]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               han har vejen indsat skaffe sig <extra_id_0>et<extra_id_1> elevplads
expected output:     en
output (untrained):  et
output (trained):    elevpladsen


input:               byen var domineret af grå betonblokke og alenlange køer der ventede tålmodigt på de rustne bybusser der osede af <extra_id_0>elle<extra_id_1>
expected output:     ælde
output (untrained):  eller
output (trained):    ælde


input:               det er også på <extra_id_0>tyde<extra_id_1>
expected output:     tide
output (untrained):  tyde
output (trained):    tydelse


input:               de <extra_id_0>roede<extra_id_1> unge til at holde sig væk punktum
expected output:     rådede
output (untrained):  roede
output (trained):    roede


input:               det er menneskeligt at fejle og selv en <extra_id_0>gastronomer<extra_id_1> kan tage fejl
expected output:     gastronom
output (untrained):  gastronomer
output (trained):    


input:               udskriv dokumentet til <extra_id_0>fax<extr

In [43]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               vis <extra_id_0>hjælpe<extra_id_1> inat
expected output:     hjælpelineal
output (untrained):  hjælpe
output (trained):    hjælpesnat


input:               den gamle <extra_id_0>bronze<extra_id_1> statue foran rådhuset at pille ned lagt i en kiste med skumplast og befordret til hovedstaden
expected output:     bronzestatue
output (untrained):  bronze
output (trained):    bronzestatue


input:               fire ni seks otte <extra_id_0>otte<extra_id_1> og tyve to hundrede og sytten
expected output:     
output (untrained):  otte
output (trained):    otteogtyve


input:               hun har ikke tænkt sig lige nu at lave flere detaljerede <extra_id_0>mindste<extra_id_1> krav til kommunernes ældrepleje børnepasning eller aktivering af bistandsmodtagere
expected output:     mindstekrav
output (untrained):  mindste
output (trained):    ministerkrav


input:               tre seks nul seks syv <extra_id_0>og<extra_id_1> halvfems et hundrede og sytten
expected output: 

In [44]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               har man en <extra_id_0>vindues<extra_id_1> niche kan der for os endnu en dimension til den ethan kan udnyttes
expected output:     vinduesniche
output (untrained):  vindues
output (trained):    vinduesniche


input:               indsæt <extra_id_0>s<extra_id_1> bet i det markerede felt
expected output:     tabel
output (untrained):  så
output (trained):    søbet


input:               <extra_id_0>ved<extra_id_1> hvad du skal bruge den fodnote til
expected output:     vid
output (untrained):  ved
output (trained):    vi


input:               det er altså en <extra_id_0>teater<extra_id_1> krig mellem en ase og en ossi
expected output:     teaterkrig
output (untrained):  teater
output (trained):    teaterkrig


input:               byen var domineret af grå betonblokke <extra_id_0>alen<extra_id_1> lange køer der ventede tålmodigt på de rustne bybusser der osede af elle
expected output:     og
output (untrained):  alen
output (trained):    


input:               han

In [45]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               københavnerne skal ikke nøjes med at sætte deres skriftlige aftryk på kommunens fremtid på en stemme <extra_id_0>så<extra_id_1> hvert fjerde år punktum
expected output:     
output (untrained):  så
output (trained):    så


input:               slet kommandoer og <extra_id_0>skrevet<extra_id_1> på dansk
expected output:     
output (untrained):  skrevet
output (trained):    skrevet


input:               men lykkelig det bliver du ikke siger den <extra_id_0>tredive<extra_id_1> årige læge der ikke selv har det alt for godt med livskvaliteten i disse dage
expected output:     trediveårige
output (untrained):  tredive
output (trained):    trediveårige


input:               de spiller på mandag ud med et tilbud komma der ifølge selskabet selv gør det billigere end nogen <extra_id_0>sinde<extra_id_1> tidligere at tale i mobiltelefon punktum
expected output:     
output (untrained):  sinde
output (trained):    


input:               evolutionen vil helt sikkert bringe 

In [46]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               det vil næppe få det danske samfund eller <extra_id_0>nogle<extra_id_1> af de andre lande der overholder reglerne til at bryde sammen
expected output:     nogen
output (untrained):  nogle
output (trained):    nogen


input:               det giver en utilfredsstillende konkurrence <extra_id_0>for<extra_id_1> bredning siger han
expected output:     
output (untrained):  for
output (trained):    for


input:               der skal også mere end man vi <extra_id_0>er<extra_id_1> i stand til at holde punktum
expected output:     være
output (untrained):  er
output (trained):    er


input:               der er et udvalg <extra_id_0>arbejde<extra_id_1> med disse ting og hvis det er et udvalg siger at der skal bruges flere penge vil vi se på sagen
expected output:     der arbejder
output (untrained):  arbejde
output (trained):    


input:               han var iført en mørk <extra_id_0>neglen<extra_id_1> jakke med et rødligt mærke ved venstre bryst forvaskede lyseblå ko

In [47]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               seks fire syv en syv <extra_id_0>og<extra_id_1> tres seks hundrede og syvogtyve
expected output:     
output (untrained):  og
output (trained):    


input:               det er en utrolig arrogance <extra_id_0>fodbold<extra_id_1> organisationerne udviser over for almindelige mennesker
expected output:     fodboldorganisationerne
output (untrained):  fodbold
output (trained):    fodboldorganisation


input:               kan man leve af <extra_id_0>en<extra_id_1> lille galleri
expected output:     et
output (untrained):  en
output (trained):    en


input:               jeg tror oven <extra_id_0>i<extra_id_1> købet at den er sund
expected output:     
output (untrained):  i
output (trained):    


input:               da nat <extra_id_0>dykning<extra_id_1> sætter store krav til planlægning bruger vi ventetiden inden solen går ned til at gennemgå hånd signaler og andre praktiske foranstaltninger
expected output:     
output (untrained):  dykning
output (trained):   

In [48]:
n = 50
idx = np.random.choice(np.arange(len(expected_outputs_test)), n, replace=False)
for i in range(n):
  print('input:              ', input_samples_test[idx[i]])
  print('expected output:    ', expected_outputs_test[idx[i]])
  print('output (untrained): ', output_untrained[idx[i]])
  print('output (trained):   ', output_trained[idx[i]])
  print('\n')

input:               det er langt <extra_id_0>fra<extra_id_1> alt man holder af der er sundt
expected output:     
output (untrained):  fra
output (trained):    


input:               nu kan man endelig få øve <extra_id_0>lokalets<extra_id_1> hemmelighed er at se på en scenen
expected output:     
output (untrained):  lokalets
output (trained):    lokalets


input:               vælg <extra_id_0>auto<extra_id_1> korrektur i minu
expected output:     autokorrektur
output (untrained):  auto
output (trained):    autokorrektur


input:               opret dokument oversigt på et drev <extra_id_0>se<extra_id_1>
expected output:     c
output (untrained):  se
output (trained):    


input:               de unge der takker <extra_id_0>hver<extra_id_1> familiens indsats har fået job og indflydelse skal give familien noget igen
expected output:     være
output (untrained):  hver
output (trained):    hvert familiens


input:               <extra_id_0>benet<extra_id_1> fed skrift i dokumentet
exp