# Get Data for Assignment

Copyright (c) 2022 Herman Kamper, MIT License

## Preliminaries

In [24]:
import numpy as np
import re
from pathlib import Path
from tqdm import tqdm

## Data

In [25]:
data_dir = Path("data/language_model_data/data/")
langs = ["af", "xh", "zu"]

original_data = {}
for lang in langs:
    line_count_ignored = 0
    lang_fn = data_dir/f"cleaned_wikipedia.{lang}.txt"
    with open(str(lang_fn)) as f:
        lines = f.readlines()
    new_lines = []
    
    for line in tqdm(lines):
        if (len(line) > 20):
            new_lines.append(line)
        else:
            line_count_ignored += 1
    original_data[lang] = new_lines
    print(line_count_ignored)

100%|███████████████████████████████████████████████████████████| 1305990/1305990 [00:01<00:00, 766610.87it/s]


799087


100%|███████████████████████████████████████████████████████████████| 22035/22035 [00:00<00:00, 491290.07it/s]


11599


100%|█████████████████████████████████████████████████████████████| 100281/100281 [00:00<00:00, 608550.78it/s]

69360





## Subsets

In [26]:
n_train = 10000
n_val = 5000
n_test = 1000

train_data = {}
val_data = {}
test_data = {}

for lang in original_data:
    train_data[lang] = original_data[lang][:n_train]
    val_data[lang] = original_data[lang][n_train:n_train + n_val]
    test_data[lang] = original_data[lang][n_train + n_val:n_train + n_val + n_test]

In [27]:
# Write data
output_dir = Path("data/language_model_data/")
for lang in train_data:
    print(f"Language: {lang}")
    lang_fn = output_dir/f"train.{lang}.txt"
    print(f"Writing: {lang_fn}")
    with open(lang_fn, "w") as f:
        for line in train_data[lang]:
            f.write(line)
    print()

Language: af
Writing: data/language_model_data/train.af.txt

Language: xh
Writing: data/language_model_data/train.xh.txt

Language: zu
Writing: data/language_model_data/train.zu.txt



## Normalize validation and testing data

In [28]:
from utils import remove_special_characters

def preprocess_line(line):
#     line = line.lower()

#     # Remove accents
#     line = re.sub(u"[àáâãäå]", "a", line)
#     line = re.sub(u"[èéêë]", "e", line)
#     line = re.sub(u"[ìíîï]", "i", line)
#     line = re.sub(u"[òóôõö]", "o", line)
#     line = re.sub(u"[ùúûü]", "u", line)
#     line = re.sub(u"[ýÿ]", "y", line)
#     line = re.sub(u"[ß]", "ss", line)
#     line = re.sub(u"[ñ]", "n", line)

#     # Map all digits to 0
#     line = re.sub(r"[0-9]", "0", line)
#     line = re.sub(r"([0]+)", " \g<1> ", line)  # add space before and after 0
    
#     # Add space before and after . or ,
#     line = re.sub(r"([\.\,])", " \g<1> ", line)

#     # Remove all redundant characteres
# #     line = re.sub(r"[^a-z0\.\ \,]", "", line)
# #     line = re.sub(r"[^a-z\ ]", "", line)
#     line = re.sub(r"[^a-z0\ ]", "", line)
    
#     # Eliminate duplicate whitespaces
#     line = re.sub(r"\s+", " ", line)
    return remove_special_characters(line)

In [29]:
# Validation
val_data_normalized = {}
for lang in val_data:
# for lang in ["zu", ]:
    val_data_normalized[lang] = []
    for line in val_data[lang]:
        split_lines = re.findall(r".*?[.?!][.?!\s]+", line)
#         print(split_lines)
#         print(line)
#         assert False
        val_data_normalized[lang] += [preprocess_line(i) for i in split_lines]
    print(f"Language: {lang}")
    print(f"No. lines: {len(val_data_normalized[lang])}")
    val_data_normalized[lang] = list(set(val_data_normalized[lang]))
    print(f"No. unique lines: {len(val_data_normalized[lang])}")
    val_data_normalized[lang] = val_data_normalized[lang][:n_val]
    print(f"No. val lines: {len(val_data_normalized[lang])}")
    print()

Language: af
No. lines: 14901
No. unique lines: 14776
No. val lines: 5000

Language: xh
No. lines: 902
No. unique lines: 901
No. val lines: 901

Language: zu
No. lines: 7310
No. unique lines: 3010
No. val lines: 3010



In [30]:
# Write data
for lang in val_data:
    print(f"Language: {lang}")
    lang_fn = output_dir/f"val.{lang}.txt"
    print(f"Writing: {lang_fn}")
    with open(lang_fn, "w") as f:
        for line in val_data_normalized[lang]:
            f.write(line + "\n")
    print()

Language: af
Writing: data/language_model_data/val.af.txt

Language: xh
Writing: data/language_model_data/val.xh.txt

Language: zu
Writing: data/language_model_data/val.zu.txt



In [31]:
# Test
test_data_normalized = []
test_labels = []
for lang in test_data:
    print(f"Language: {lang}")
    test_lines_lang = []
    for line in test_data[lang]:
        split_lines = re.findall(r".*?[.?!][.?!\s]+", line)
        test_lines_lang += [preprocess_line(i) for i in split_lines]
    print(f"No. lines: {len(test_lines_lang)}")
    test_lines_lang = list(set(test_lines_lang))
    print(f"No. unique lines: {len(test_lines_lang)}")
    test_lines_lang = test_lines_lang[:n_test]
    print(f"No. test lines: {len(test_lines_lang)}")
    test_data_normalized += test_lines_lang
    test_labels += [lang]*n_test
    print()

Language: af
No. lines: 2562
No. unique lines: 2560
No. test lines: 1000

Language: xh
No. lines: 0
No. unique lines: 0
No. test lines: 0

Language: zu
No. lines: 1414
No. unique lines: 501
No. test lines: 501



In [32]:
# Shuffle test data
import random
random.seed(1)

temp = list(zip(test_data_normalized, test_labels))
random.shuffle(temp)
res1, res2 = zip(*temp)
test_data_normalized, test_labels = list(res1), list(res2)

print(f"Total no. test items:  {len(test_data_normalized)}")
print(f"Total no. test labels: {len(test_labels)}")

Total no. test items:  1501
Total no. test labels: 1501


In [33]:
for i in range(10):
    print(f"{test_labels[i]} {test_data_normalized[i]}")

af selfs al glo baie dat die beginsels waarop die dvoraksleutelbord gebaseer is beter is as die ouer qwerty het pogings om wêreldwyd na die dvorakuitleg oor te skakel baie weerstand ondervind 
xh iroox yidolobha elikwisifundazwe se mudug esomaliya

af kommissies bestaande uit lede van die sweedse akademie vir wetenskap (vir ekonomie natuur en skeikunde) en die koninklike karolingiese instituut vir geneeskunde (vir geneeskunde fisiologie) stuur in die europese herfs tussen 2 000 en 3000 vertroulike briewe uit waarin die name van moontlike kandidate aangevra word 

xh loluhlelo lusebenzisa amakhodi ukuchaza izifunda izifundazwe kanye namazwe

af slegs 10 van bophuthatswana se totale grondoppervlakte was bewerkbaar en meeste daarvan was met veldstruike bedek

xh leli dolobha layisiqongo sesifunda se meru north esaqedwa ngonyaka wezi2009

af baie huldig die teorie dat waterstof sulke brandstowwe in die toekoms mag vervang 
af turbulensie in die atmosfeer veroorsaak ook vonkeling (ŉ gedurig

In [34]:
# Write data
fn = output_dir/f"test.lid.txt"
print(f"Writing: {fn}")
with open(fn, "w") as f:
    for lang, line in zip(test_labels, test_data_normalized):
        f.write(f"{lang} {line}\n")

Writing: data/language_model_data/test.lid.txt
