## Data Preprocessing

<br>

### Development Envrionment

In [416]:
import re
import tqdm
import glob
import json
from datasets import load_dataset
import xml.etree.ElementTree as ET

### Huggingface Data Format

In [289]:
dataset = load_dataset('iwslt2017')
dataset
for split, split_dataset in dataset.items():
    split_dataset.to_json(f"data/iwslt17.de.en.huggingface/{split}-de-en.json", force_ascii=False)

DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 206112
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 888
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 8079
    })
})

In [77]:
dataset['train'][0]

{'translation': {'de': 'Vielen Dank, Chris.',
  'en': 'Thank you so much, Chris.'}}

### TGZ to TXT & XML

[IWSLT 2017-01](https://wit3.fbk.eu/2017-01)

In [None]:
!tar -xzf ./data/iwslt17.de.en.orig/tgz/2017-01-trnmted.tgz -C ./data/iwslt17.de.en.orig

In [None]:
!tar -xzf ./data/iwslt17.de.en.orig/2017-01-trnmted/texts/DeEnItNlRo/DeEnItNlRo/DeEnItNlRo-DeEnItNlRo.tgz -C ./data/iwslt17.de.en.orig

In [292]:
!cp ./data/iwslt17.de.en.orig/DeEnItNlRo-DeEnItNlRo/train.tags.de-en.de ./data/iwslt17.de.en.orig/xml/train/train.tags.de-en.de.txt

In [293]:
!cp ./data/iwslt17.de.en.orig/DeEnItNlRo-DeEnItNlRo/train.tags.de-en.en ./data/iwslt17.de.en.orig/xml/train/train.tags.de-en.en.txt

In [21]:
!grep -lir 'dev2010' ./data/iwslt17.de.en.orig/DeEnItNlRo-DeEnItNlRo/* | xargs mv -t ./data/iwslt17.de.en.orig/xml/dev

[IWSLT 2017-01-B](https://wit3.fbk.eu/2017-01-b)

In [None]:
!tar -xzf ./data/iwslt17.de.en.orig/tgz/2017-01-mted-test.tgz -C ./data/iwslt17.de.en.orig

In [52]:
!tar -xzf ./data/iwslt17.de.en.orig/2017-01-mted-test/texts/de/en/de-en.tgz -C ./data/iwslt17.de.en.orig/xml/test

In [53]:
!tar -xzf ./data/iwslt17.de.en.orig/2017-01-mted-test/texts/en/de/en-de.tgz -C ./data/iwslt17.de.en.orig/xml/test

In [54]:
!mv ./data/iwslt17.de.en.orig/xml/test/de-en/IWSLT17.TED.tst2017.mltlng.de-en.de.xml ./data/iwslt17.de.en.orig/xml/test/IWSLT17.TED.tst2017.mltlng.de-en.de.xml

In [55]:
!mv ./data/iwslt17.de.en.orig/xml/test/en-de/IWSLT17.TED.tst2017.mltlng.en-de.en.xml ./data/iwslt17.de.en.orig/xml/test/IWSLT17.TED.tst2017.mltlng.en-de.en.xml

In [56]:
!rm -r ./data/iwslt17.de.en.orig/xml/test/de-en
!rm -r ./data/iwslt17.de.en.orig/xml/test/en-de

### TXT to XML

In [314]:
train_txt_paths = glob.glob("./data/iwslt17.de.en.orig/xml/**/*.txt")

In [320]:
def divide_source_file_list(l, n): 
    
  for i in range(0, len(l), n): 
    yield l[i:i + n] 

In [418]:
for path in train_txt_paths:

    xml_text = ET.Element('xml')
    xml_text.set('version', 'Accepted')
    xml_text.set('encoding', 'UTF-8')

    num = 0; num_list = []
    with open(path, 'r') as f:
        lines = f.read().splitlines() 
        for line in lines:
            num += 1
            if '</description>' in line:
                num_list.append(num)
            elif '</reviewer>' in line:
                num_list.append(num)

    num_list = list(divide_source_file_list(num_list, 2))

    text = ""; num = 0; del_num = 0
    with open(path, 'r') as f:
        lines = f.read().splitlines() 
        with tqdm.tqdm(lines) as pbar:
            for line in pbar:
                num += 1
                for idx, value in enumerate(num_list):
                    if num > value[0] and num < value[1]:
                        seg_num += 1
                        seg_name = "seg" + str(seg_num)
                        globals()[seg_name] = ET.SubElement(xml_text, "seg")
                        globals()[seg_name].set('type', str(seg_num))
                        globals()[seg_name].text = line
                    if num == value[1]:
                        del num_list[idx - del_num]
                        del_num += 1
                        seg_num = 0
        pbar.close()

    xml_text = ET.tostring(xml_text)
    xml_path = path.replace("txt", "xml")
    with open(xml_path, "wb") as f:
        f.write(xml_text)

100%|██████████| 223164/223164 [00:24<00:00, 9272.51it/s] 
100%|██████████| 223163/223163 [00:23<00:00, 9353.23it/s] 


'\n    xml_path = path.replace("txt", "xml")\n    with open(xml_path, \'w+\') as f:\n        text = text.split("\n")\n        for line in text:\n            f.write(line + "\n")\n'

### XML to JSON

In [419]:
train_xml_paths = []; dev_xml_paths = []; test_xml_paths = []

xml_paths = glob.glob("./data/iwslt17.de.en.orig/xml/**/*.xml")

for i in xml_paths:
    if all(j in i for j in ['train', 'de-en']):
        train_xml_paths.append(i)
    elif all(j in i for j in ['dev', 'de-en']):
        dev_xml_paths.append(i)
    elif all(j in i for j in ['test']):
        test_xml_paths.append(i)

In [429]:
tree = ET.parse(train_xml_paths[0])

In [435]:
temp_paths = [train_xml_paths[0], dev_xml_paths[0], test_xml_paths[0]]

for path in temp_paths :
    tree = ET.parse(path)

    root = tree.getroot()
    # ET.dump(tree)

    elemList = [elem.tag for elem in tree.iter()]
    elemList = list(set(elemList))
    if 'train' in path: print("Train Data: "); print(str(elemList) + "\n")
    elif 'dev' in path: print("Validation & Evaluation Data: "); print(elemList) 

Train Data: 
['seg', 'xml']

Validation & Evaluation Data: 
['doc', 'title', 'translator', 'reviewer', 'mteval', 'talkid', 'seg', 'description', 'url', 'keywords', 'refset']


In [453]:
def xml_to_json(xml_paths):
    de_texts = []; en_texts = []
    for path in xml_paths:
        tree = ET.parse(path)
        for seg in tree.iter('seg'):
            sample_text = seg.text
            
        if 'sample_text' in locals():
            del sample_text
            for seg in tree.iter('seg'):
                text = seg.text.strip()
                
                if path.split('.')[-2] == 'de':  de_texts.append(text)
                elif path.split('.')[-2] == 'en': en_texts.append(text)
                
        else:
            for refset in tree.iter('refset'):
                for seg in refset.iter('seg'):
                    text = seg.text.strip()
                    if path.split('.')[-2] == 'de':  de_texts.append(text)
                    elif path.split('.')[-2] == 'en': en_texts.append(text)
    
    texts = []
    for de, en in zip(de_texts, en_texts):
        pair_dict = {}; pair_dict["de"] = de; pair_dict["en"] = en
        trans_dict = {}; trans_dict["translation"] = pair_dict
        texts.append(trans_dict)
    
    if 'train' in xml_paths[0]: split = 'train'; 
    elif 'dev' in xml_paths[0]: split = 'validation'
    elif 'test' in xml_paths[0]: split = 'test'

    save_json_path = './datasets/iwslt17.de.en/' + split + '-de-en.json'

    with open(save_json_path, 'w') as fp:
        fp.write('[' + ',\n'.join(json.dumps(i, ensure_ascii=False) for i in texts) + ']\n')

In [454]:
xml_paths = [train_xml_paths, dev_xml_paths, test_xml_paths]

for xml_path in xml_paths:
    xml_to_json(xml_path)

### Huggingface Dataset

In [455]:
dataset = load_dataset('./data/iwslt17.de.en')
dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 116654 examples [00:00, 259924.33 examples/s]
Generating validation split: 888 examples [00:00, 307966.09 examples/s]
Generating test split: 1138 examples [00:00, 439918.71 examples/s]


DatasetDict({
    train: Dataset({
        features: ['translation'],
        num_rows: 116654
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 888
    })
    test: Dataset({
        features: ['translation'],
        num_rows: 1138
    })
})

In [456]:
dataset['train'][0]

{'translation': {'de': 'Vielen Dank, Chris.',
  'en': 'Thank you so much, Chris.'}}

## Reference

<b>Paper</b>
<br>[Attention is all you need](https://arxiv.org/abs/1706.03762)

<br><b>Data</b>
<br>[IWSLT 2017-01](https://wit3.fbk.eu/2017-01)
<br>[IWSLT 2017-01-B](https://wit3.fbk.eu/2017-01-b)

<br><b>GeekforGeeks</b>
<br>[Reading and Writing XML Files in Python](https://www.geeksforgeeks.org/reading-and-writing-xml-files-in-python/)