In [1]:
import os
import pickle
from itertools import combinations, chain
from collections import Counter

import datasets

from nltk import sent_tokenize
from datasets import load_dataset, concatenate_datasets, load_from_disk, Dataset

# Update language-specific wiki40b datasets

In [None]:
language = "en"

dataset_wb = load_dataset("wiki40b", language)
dataset_wb = concatenate_datasets([dataset_wb[split] for split in ('train', 'test', 'validation')])
dataset_wb = dataset_wb.remove_columns("version_id")

Downloading and preparing dataset wiki40b/en (download: Unknown size, generated: 9.75 GiB, post-processed: Unknown size, total: 9.75 GiB) to C:\Users\onurg\.cache\huggingface\datasets\wiki40b\en\1.1.0\d15702fbf830e65fd775c50946364ff0c02fd3089b31887fabf97c2dad970760...


Downloading: 100%|██████████| 1.38k/1.38k [00:00<00:00, 343kB/s]
Downloading:  39%|███▉      | 3.67G/9.42G [09:21<13:16, 7.23MB/s]

In [87]:
language = "de"

dataset_wb = load_dataset("csv", data_files=f"data/{language}_raw.csv", split="train")
dataset_wb = dataset_wb.remove_columns("version_id")

Using custom data configuration default-730e2fce18f82c32
Reusing dataset csv (C:\Users\onurg\.cache\huggingface\datasets\csv\default-730e2fce18f82c32\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


In [None]:
dataset_wb[7]

"""
https://de.wikipedia.org/wiki/National_Park_(Dorf)

'\n_START_ARTICLE_\nNational Park (Dorf)\n_START_SECTION_\nNamensherkunft\n_START_PARAGRAPH_\nDer Name des Dorfes bezieht sich auf den Tongariro National Park,....'
"""

{'text': '\n_START_ARTICLE_\nNational Park (Dorf)\n_START_SECTION_\nNamensherkunft\n_START_PARAGRAPH_\nDer Name des Dorfes bezieht sich auf den Tongariro National Park, Neuseelands ersten Nationalpark, der 1894 eingerichtet wurde und an dessen westlicher Grenze sich das Dorf befindet.\n_START_SECTION_\nGeographie\n_START_PARAGRAPH_\nDas Dorf befindet sich rund 18\xa0km nordwestlich des Gipfels des 2797\xa0m hohen aktiven Vulkans Ruapehu und damit an seinen nordwestlichen Ausläufern.\n_START_SECTION_\nBevölkerung\n_START_PARAGRAPH_\nZum Zensus des Jahres 2013 zählte das Dorf 174\xa0Einwohner, 27,5\xa0% weniger als zur Volkszählung im Jahr 2006.\n_START_SECTION_\nTourismus\n_START_PARAGRAPH_\nDominanter Wirtschaftszweig des Dorfes ist der Tourismus. Zahlreiche Unternehmungen haben sich hier niedergelassen, um Ausrüstung, Wandertouren oder Beherbergung für Touristen bereitzustellen. Das Dorf ist Ausgangspunkt für zahlreiche geführte Touren sowie von Shuttlebussen zum 15 km entfernten größ

In [None]:
def process(example):
    example["text"] = example["text"].replace("_START_ARTICLE_", "")
    example["text"] = example["text"].replace("_START_PARAGRAPH_", "")
    
    example["text"] = example["text"].replace("_NEWLINE_", " ")
    example["text"] = example["text"].replace("_START_SECTION_", " ")

    _, _, example['title'], *text_lst  = example['text'].split("\n")
    example['text'] = " ".join(text_lst)
    example["text"] = example["text"].replace("\xa0", " ")

    return example

In [None]:
updated_wb = dataset_wb.map(process, num_proc=4) 

In [None]:
updated_wb[107]

{'text': ' Brand Hogefeld (* in Wismar; † 1496 in Lübeck) war ein deutscher Kaufmann und Ratsherr der Hansestadt Lübeck.   Leben  Brand Hogefeld war Ältermann der Bergenfahrer in Lübeck. Er vertrat die Bergenfahrer 1478 (gemeinsam mit dem Sekretär des Hansekontors in Bergen Theodericus Brandes) bei König Christian I. von Dänemark in Kopenhagen und 1479 auf dem Tag der Wendischen Städte in Lübeck. Er wurde 1479 in den Lübecker Rat erwählt. Er wurde vom Lübecker Rat nach Bryggen in Bergen gesandt. 1484 verhandelte er erneut in Kopenhagen wegen der Privilegien der Hanse in Dänemark und in Norwegen. Beim Hansetag 1487 in Lübeck erhielt er den Auftrag zwischen den Abgesandten der Hansestädte Deventer und Kampen einen Vergleich zu finden. Er vermittelte auch zwischen den Bergenfahrern und dem Hansekontor in Brügge. Hogefeld wohnte in Lübeck in der Beckergrube 12.',
 'wikidata_id': 'Q23061875',
 'title': 'Brand Hogefeld'}

In [None]:
root_dir = r"C:\Users\onurg\.cache\huggingface\datasets"
data_dir = "updated_wiki40b"

path = os.path.join(root_dir, data_dir, language)
updated_wb.save_to_disk(path)

# Process Updated Datasets

In [3]:
root_dir = r"C:\Users\onurg\.cache\huggingface\datasets"
data_dir = "updated_wiki40b"
languages = ('fr', 'it', 'de', 'en')

In [3]:
path = os.path.join(root_dir, data_dir)
list_updated = [load_from_disk(os.path.join(root_dir, data_dir, language)) for language in languages]

In [4]:
for language, dataset in zip(languages, list_updated):
    print(f"{language} dataset has {len(dataset)} instances.")

fr dataset has 1363865 instances.
it dataset has 813736 instances.
de dataset has 1727572 instances.
en dataset has 3252407 instances.


### Set Operations

In [5]:
# list of IDs of article in wikipedia corpora
id_list = [lng['wikidata_id'] for lng in list_updated]
ids = list(chain.from_iterable(id_list))

# Unique IDs
id_set = set(ids)

# IDs that exist at least in two corpora, so that we can use them to create article pairs
common_ids = {id for id, cnt in  Counter(ids).items()  if cnt >= 2}


print(f"number of unique articles: {len(id_set)}")
print(f"number of articles that appear at least in two languages: {len(common_ids)}")

number of unique articles: 5170691
number of articles that appear at least in two languages: 1274091


### Combining

In [8]:
# Update "other" language datasets: changing text field, removing title
list_updated = [dataset.rename_column("text", f"text_{language}") for dataset, language in zip(list_updated, languages)]
list_updated = [dataset.remove_columns("title") for dataset in list_updated]

In [9]:
temp_directory = "../data/"

# Mapping of ID's to articles in different Wikipedia Corpora
def mapping_closure(dset: datasets.Dataset, language: str) -> dict:
    dic = {}

    def get_mapping(example, language: str):
        dic[example["wikidata_id"]] =  example[f"text_{language}"]

    dset.map(get_mapping, fn_kwargs={"language": language})
    return dic


#list_mapping = [mapping_closure(dset, language) for dset, language  in zip(list_updated, languages)]
for dset, language  in zip(list_updated, languages):
    temp_dic = mapping_closure(dset, language)
    with open(os.path.join(temp_directory, f"temp_dic_{language}.pkl"), 'wb') as f:
        pickle.dump(temp_dic, f)


100%|██████████| 1363865/1363865 [02:37<00:00, 8673.38ex/s]
100%|██████████| 813736/813736 [01:54<00:00, 7127.44ex/s]
100%|██████████| 1727572/1727572 [04:07<00:00, 6992.89ex/s]
100%|██████████| 3252407/3252407 [08:28<00:00, 6395.58ex/s]


In [None]:
# Create a new datasets.Dataset which contains the "common IDs"
filtered_dataset = Dataset.from_dict({"wikidata_id": list(common_ids)})
path = os.path.join(root_dir, data_dir, "only_ids")
filtered_dataset.save_to_disk(path)

## Load Again

In [6]:
temp_directory = "../data/"
root_dir = r"C:\Users\onurg\.cache\huggingface\datasets"
data_dir = "updated_wiki40b"
languages = ('fr', 'it', 'de', 'en')


path = os.path.join(root_dir, data_dir, "only_ids")
filtered_dataset = load_from_disk(path)

In [None]:
# TODO: refactor!! MemoryError

def merge_language(example, mapping: dict, language: str):
    example[f"text_{language}"] = mapping.get(example["wikidata_id"])
    return example

for language in languages:
    with open(os.path.join(temp_directory, f"temp_dic_{language}.pkl"), 'rb') as f:
        print(f"Start loading dictionary for {language}")
        loaded_dict = pickle.load(f)
        print(f"Done loading dictionary for {language}")
        filtered_dataset = filtered_dataset.map(merge_language, fn_kwargs={"mapping": loaded_dict, "language": language}, num_proc=4)  
        print(f"Done mapping instances from {language}")
#merged_dataset = filtered_dataset.map(merge_language, fn_kwargs={"list_mapping": list_mapping, "languages": languages}, num_proc=4)  

In [None]:
filtered_dataset[11]

In [15]:
path = os.path.join(root_dir, data_dir, "filtered_small_updated")
filtered_dataset.save_to_disk(path)

In [16]:
def filter_short(example, languages, min_sentences: int):
    for language in languages:
        if example[f"text_{language}"] and len(sent_tokenize(example[f"text_{language}"])) <= min_sentences:
            return False
    return True

# Filter the articles 
min_sentences = 5

filtered_dataset = filtered_dataset.filter(filter_short, fn_kwargs={"languages": languages, "min_sentences": min_sentences})
print(f"Number of articles in the dataset: {len(filtered_dataset)}")

path = os.path.join(root_dir, data_dir, "filtered_small_dataset")
filtered_dataset.save_to_disk(path)

# Altering the Dataset

In [None]:
# TODO: Creating a long dataset from the already existing wide dataset. For each Wikipedia ID, all available pair combinations are

def make_long(example, languages: tuple, min_sentences: int):
    available_list = list()
    new_example = dict()

    for language in languages:
        # TODO: check
        #if example[f"text_{language}"][0]:
        if example[f"text_{language}"][0] and len(sent_tokenize(example[f"text_{language}"])) >= min_sentences:
            available_list.append(language)

    pairs = list(combinations(available_list, 2))

    article1 = list()
    article2 = list()

    for lang1, lang2 in pairs:
        article1.append(*example[f"text_{lang1}"])
        article2.append(*example[f"text_{lang2}"])


    new_example["wikidata_id"] = example["wikidata_id"] * len(pairs)  
    new_example["pair"] = [f"{lang1}_{lang2}" for lang1, lang2 in pairs]
    new_example["article_1"] = article1
    new_example["article_2"] = article2

    return new_example

# Filter the articles 
min_sentences = 5

long_dataset = filtered_dataset.map(make_long, fn_kwargs={"languages": languages, "min_sentences": min_sentences}, 
                                    remove_columns=filtered_dataset.column_names, 
                                    batched=True, 
                                    batch_size=1)
print(f"Number of articles in the dataset: {len(long_dataset)}")

In [183]:
path = os.path.join(root_dir, data_dir, "long_small_dataset")
long_dataset.save_to_disk(path)

In [184]:
long_dataset[11]

{'article_1': '  The sources  A tablet recovered in Nippur lists grain rations given to the messenger of a certain Šubši-mašrâ-Šakkan during Nazi-Marrutaš’ fourth year (1304 BC). There is a court order found in Ur, dated to the sixteenth year of Nazi-Maruttaš (1292 BC), in which Šubši-mašrâ-šakkan is given the title šakin māti, lúGAR KUR, “governor of the country.” It is an injunction forbidding harvesting reeds from a certain river or canal. The poetic work, Ludlul bēl nēmeqi, describes how the fortunes of Šubši-mašrâ-Šakkan, a rich man of high rank, turned one day. When beset by ominous signs, he incurred the wrath of the king, and seven courtiers plotted every kind of mischief against him. This resulted in him losing his property, “they have divided all my possessions among foreign riffraff,” friends, “my city frowns on me as an enemy; indeed my land is savage and hostile,” physical strength, “my flesh is flaccid, and my blood has ebbed away,” and health, as he relates that he “wall