In [37]:
import os
import sys
from dataclasses import dataclass
from itertools import combinations, chain
from collections import Counter

import datasets

from nltk import sent_tokenize
from datasets import load_dataset, concatenate_datasets, load_from_disk, Dataset
from tokenizers import Tokenizer
from transformers import BertTokenizerFast

# Update language-specific wiki40b datasets

In [2]:
language = "en"

dataset_wb = load_dataset("wiki40b", language)
dataset_wb = concatenate_datasets([dataset_wb[split] for split in ('train', 'test', 'validation')])
dataset_wb = dataset_wb.remove_columns("version_id")

Downloading and preparing dataset wiki40b/en (download: Unknown size, generated: 9.75 GiB, post-processed: Unknown size, total: 9.75 GiB) to C:\Users\onurg\.cache\huggingface\datasets\wiki40b\en\1.1.0\d15702fbf830e65fd775c50946364ff0c02fd3089b31887fabf97c2dad970760...


Downloading: 100%|██████████| 1.38k/1.38k [00:00<00:00, 343kB/s]
Downloading:  39%|███▉      | 3.67G/9.42G [09:21<13:16, 7.23MB/s]

In [87]:
language = "de"

dataset_wb = load_dataset("csv", data_files=f"data/{language}_raw.csv", split="train")
dataset_wb = dataset_wb.remove_columns("version_id")

Using custom data configuration default-730e2fce18f82c32
Reusing dataset csv (C:\Users\onurg\.cache\huggingface\datasets\csv\default-730e2fce18f82c32\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e)


In [None]:
dataset_wb[7]

"""
https://de.wikipedia.org/wiki/National_Park_(Dorf)

'\n_START_ARTICLE_\nNational Park (Dorf)\n_START_SECTION_\nNamensherkunft\n_START_PARAGRAPH_\nDer Name des Dorfes bezieht sich auf den Tongariro National Park,....'
"""

{'text': '\n_START_ARTICLE_\nNational Park (Dorf)\n_START_SECTION_\nNamensherkunft\n_START_PARAGRAPH_\nDer Name des Dorfes bezieht sich auf den Tongariro National Park, Neuseelands ersten Nationalpark, der 1894 eingerichtet wurde und an dessen westlicher Grenze sich das Dorf befindet.\n_START_SECTION_\nGeographie\n_START_PARAGRAPH_\nDas Dorf befindet sich rund 18\xa0km nordwestlich des Gipfels des 2797\xa0m hohen aktiven Vulkans Ruapehu und damit an seinen nordwestlichen Ausläufern.\n_START_SECTION_\nBevölkerung\n_START_PARAGRAPH_\nZum Zensus des Jahres 2013 zählte das Dorf 174\xa0Einwohner, 27,5\xa0% weniger als zur Volkszählung im Jahr 2006.\n_START_SECTION_\nTourismus\n_START_PARAGRAPH_\nDominanter Wirtschaftszweig des Dorfes ist der Tourismus. Zahlreiche Unternehmungen haben sich hier niedergelassen, um Ausrüstung, Wandertouren oder Beherbergung für Touristen bereitzustellen. Das Dorf ist Ausgangspunkt für zahlreiche geführte Touren sowie von Shuttlebussen zum 15 km entfernten größ

In [None]:
def process(example):
    example["text"] = example["text"].replace("_START_ARTICLE_", "")
    example["text"] = example["text"].replace("_START_PARAGRAPH_", "")
    
    example["text"] = example["text"].replace("_NEWLINE_", " ")
    example["text"] = example["text"].replace("_START_SECTION_", " ")

    _, _, example['title'], *text_lst  = example['text'].split("\n")
    example['text'] = " ".join(text_lst)
    example["text"] = example["text"].replace("\xa0", " ")

    return example

In [None]:
updated_wb = dataset_wb.map(process, num_proc=4) 

In [None]:
updated_wb[107]

{'text': ' Brand Hogefeld (* in Wismar; † 1496 in Lübeck) war ein deutscher Kaufmann und Ratsherr der Hansestadt Lübeck.   Leben  Brand Hogefeld war Ältermann der Bergenfahrer in Lübeck. Er vertrat die Bergenfahrer 1478 (gemeinsam mit dem Sekretär des Hansekontors in Bergen Theodericus Brandes) bei König Christian I. von Dänemark in Kopenhagen und 1479 auf dem Tag der Wendischen Städte in Lübeck. Er wurde 1479 in den Lübecker Rat erwählt. Er wurde vom Lübecker Rat nach Bryggen in Bergen gesandt. 1484 verhandelte er erneut in Kopenhagen wegen der Privilegien der Hanse in Dänemark und in Norwegen. Beim Hansetag 1487 in Lübeck erhielt er den Auftrag zwischen den Abgesandten der Hansestädte Deventer und Kampen einen Vergleich zu finden. Er vermittelte auch zwischen den Bergenfahrern und dem Hansekontor in Brügge. Hogefeld wohnte in Lübeck in der Beckergrube 12.',
 'wikidata_id': 'Q23061875',
 'title': 'Brand Hogefeld'}

In [None]:
root_dir = r"C:\Users\onurg\.cache\huggingface\datasets"
data_dir = "updated_wiki40b"

path = os.path.join(root_dir, data_dir, language)
updated_wb.save_to_disk(path)

# Process Updated Datasets

In [5]:
root_dir = r"C:\Users\onurg\.cache\huggingface\datasets"
data_dir = "updated_wiki40b"
languages = ('fr', 'it', 'de', 'en')

In [6]:
path = os.path.join(root_dir, data_dir)
list_updated = [load_from_disk(os.path.join(root_dir, data_dir, language)) for language in languages]

### Set Operations

In [32]:
# list of IDs of article in wikipedia corpora
id_list = [lng['wikidata_id'] for lng in list_updated]
ids = list(chain.from_iterable(id_list))

# Unique IDs
id_set = set(ids)

# IDs that exist at least in two corpora, so that we can use them to create article pairs
common_ids = {id for id, cnt in  Counter(ids).items()  if cnt >= 2}


print(f"number of unique articles: {len(id_set)}")
print(f"number of articles that appear at least in two languages: {len(common_ids)}")

5170691
1274091


### Combining

In [33]:
# Update "other" language datasets: changing text field, removing title
list_updated = [dataset.rename_column("text", f"text_{language}") for dataset, language in zip(list_updated, languages)]
list_updated = [dataset.remove_columns("title") for dataset in list_updated]

# Create a new datasets.Dataset which contains the "common IDs"
filtered_dataset = Dataset.from_dict({"wikidata_id": list(common_ids)})

In [40]:
# Mapping of ID's to articles in different Wikipedia Corpora
def mapping_closure(dset: datasets.Dataset, language: str) -> dict:
    dic = {}

    def get_mapping(example, language: str):
        dic[example["wikidata_id"]] =  example[f"text_{language}"]

    dset.map(get_mapping, fn_kwargs={"language": language})
    return dic


list_mapping = [mapping_closure(dset, language) for dset, language  in zip(list_updated, languages)]

100%|██████████| 1363865/1363865 [03:03<00:00, 7427.78ex/s]
100%|██████████| 813736/813736 [01:51<00:00, 7329.27ex/s]
100%|██████████| 1727572/1727572 [04:06<00:00, 7002.91ex/s]
100%|██████████| 3252407/3252407 [07:52<00:00, 6879.37ex/s]


In [41]:
def merge_language(example, list_mapping, languages):
    for mapping, language in zip(list_mapping, languages):
        example[f"text_{language}"] = mapping.get(example["wikidata_id"])

    return example

merged_dataset = filtered_dataset.map(merge_language, fn_kwargs={"list_mapping": list_mapping, "languages": languages})  

MemoryError: 

In [None]:
merged_dataset[11]

{'wikidata_id': 'Q278549',
 'text_en': '  Applications  An important use for \\Gamma-convergence is in homogenization theory. It can also be used to rigorously justify the passage from discrete to continuum theories for materials, for example, in elasticity theory.',
 'title': 'Γ-convergence',
 'text_fr': None,
 'text_it': None,
 'text_de': ' In der Variationsrechnung bezeichnet Γ-Konvergenz (Gamma-Konvergenz) eine spezielle Konvergenzart für Funktionale. Sie wurde von Ennio de Giorgi eingeführt. Ursprünglich wurde sie als G-Konvergenz bezeichnet, da sie für greensche Funktionale entwickelt wurde. Der Begriff Γ-Konvergenz entstand durch die Verallgemeinerung dieses Konvergenzbegriffes.   Anwendungen  Eine wichtige Anwendung findet die Γ-Konvergenz in der Homogenisierungstheorie und der Dimensionsreduktion. Sie kann auch benutzt werden, um eine rigorose Begründung für den Übergang von diskreten zu kontinuierlichen Modellen zu liefern, beispielsweise bei der Elastizitätstheorie. Weitere 

In [15]:
root_dir = r"C:\Users\onurg\.cache\huggingface\datasets"
data_dir = "updated_wiki40b"

path = os.path.join(root_dir, data_dir, "merged_small")
merged_dataset.save_to_disk(path)

In [16]:
def filter_short(example, languages, min_sentences):
    # TODO: change
    # if len(sent_tokenize(example["text_en"])) <= min_sentences:
    #     return False
    for language in languages:
        if example[f"text_{language}"] and len(sent_tokenize(example[f"text_{language}"])) <= min_sentences:
            return False
    return True

In [17]:
min_sentences = 5

filtered_dataset = merged_dataset.filter(filter_short, fn_kwargs={"languages": languages, "min_sentences": min_sentences})

100%|██████████| 1078/1078 [58:45<00:00,  3.27s/ba] 


In [18]:
filtered_dataset

Dataset({
    features: ['wikidata_id', 'text_en', 'title', 'text_fr', 'text_it', 'text_de'],
    num_rows: 639046
})

In [19]:
filtered_dataset[5]

{'wikidata_id': 'Q388957',
 'text_en': "  Profesional career  Has transferred to Fenerbahçe from Beyoğluspor in 1955. He was one of the fan favourites when he was playing. He played for Fenerbahçe between 1955–69, scoring 168 goals. He won the Turkish League 4 times and the Istanbul League title twice.   International career  Has played 37 times for Turkey, starting as captain 10 times.   Personal life  Has' brother, Mehmet Ali Has, was also a Turkish professional footballer.",
 'title': 'Şeref Has',
 'text_fr': "  Carrière de joueur  Avec le club de Fenerbahçe, il remporte notamment quatre championnats de Turquie, une Coupe de Turquie et une Coupe des Balkans. Il joue huit matchs en Coupe d'Europe des clubs champions, inscrivant trois buts dans cette compétition. Il dispute un total de 323 matchs en première division turque, pour 80 buts marqués. Il réalise sa meilleure performance lors de la saison 1958-1959, où il inscrit 14 buts en championnat.   Carrière internationale  Şeref Has 

In [22]:
path = os.path.join(root_dir, data_dir, "filtered_small")
filtered_dataset.save_to_disk(path)

Loading cached processed dataset at C:/Users/onurg/.cache/huggingface/datasets/updated_wiki40b/en\cache-cea87e24ce22810a.arrow


# Altering the Dataset

In [45]:
#languages = ("en", "fr", "it", "de")

In [47]:
# TODO: comment

def make_long(example, languages):
    available_list = list()
    new_example = dict()

    for language in languages:
        # TODO: check
        if example[f"text_{language}"][0]:
            available_list.append(language)

    pairs = list(combinations(available_list, 2))

    article1 = list()
    article2 = list()

    for lang1, lang2 in pairs:
        article1.append(*example[f"text_{lang1}"])
        article2.append(*example[f"text_{lang2}"])


    new_example["wikidata_id"] = example["wikidata_id"] * len(pairs)  
    new_example["pair"] = [f"{lang1}_{lang2}" for lang1, lang2 in pairs]
    new_example["article_1"] = article1
    new_example["article_2"] = article2

    return new_example

In [48]:
long_dataset = filtered_dataset.map(make_long, fn_kwargs={"languages": languages}, remove_columns=filtered_dataset.column_names, batched=True, batch_size=1)

100%|██████████| 639046/639046 [09:57<00:00, 1069.16ba/s]


In [180]:
long_dataset

Dataset({
    features: ['article_1', 'article_2', 'pair', 'wikidata_id'],
    num_rows: 1558009
})

In [183]:
path = os.path.join(root_dir, data_dir, "long_small_dataset")
long_dataset.save_to_disk(path)

In [184]:
long_dataset[11]

{'article_1': '  The sources  A tablet recovered in Nippur lists grain rations given to the messenger of a certain Šubši-mašrâ-Šakkan during Nazi-Marrutaš’ fourth year (1304 BC). There is a court order found in Ur, dated to the sixteenth year of Nazi-Maruttaš (1292 BC), in which Šubši-mašrâ-šakkan is given the title šakin māti, lúGAR KUR, “governor of the country.” It is an injunction forbidding harvesting reeds from a certain river or canal. The poetic work, Ludlul bēl nēmeqi, describes how the fortunes of Šubši-mašrâ-Šakkan, a rich man of high rank, turned one day. When beset by ominous signs, he incurred the wrath of the king, and seven courtiers plotted every kind of mischief against him. This resulted in him losing his property, “they have divided all my possessions among foreign riffraff,” friends, “my city frowns on me as an enemy; indeed my land is savage and hostile,” physical strength, “my flesh is flaccid, and my blood has ebbed away,” and health, as he relates that he “wall

# Offline Tokenization

In [176]:
@dataclass
class Config:
    def __init__(self, max_sentence_len, max_doc_len):
      self.max_sentence_len = max_sentence_len
      self.max_doc_len = max_doc_len

args = Config(128,32)

In [56]:
file_name = "tokenizers/tokenizer_2_1000.json"
base_tokenizer = Tokenizer.from_file(file_name)
tokenizer = BertTokenizerFast(tokenizer_object=base_tokenizer)

In [52]:
small_dataset = long_dataset.select(range(100))

In [167]:
def tokenize(example, tokenizer, args):
    # https://github.com/castorini/hedwig/blob/master/datasets/bert_processors/abstract_processor.py
    # https://github.com/abhishekkrthakur/bert-entity-extraction/blob/master/src/dataset.py

    def tokenize_helper(article, tokenizer, args):
        # TODO: comment
        sentences = [tokenizer.encode(sentence, add_special_tokens=False) for sentence in sent_tokenize(article)] 
        sentences = [sentence[:args.max_sentence_len - 2] for sentence in sentences]
        sentences = [[tokenizer.convert_tokens_to_ids("[CLS]")] + sentence + [tokenizer.convert_tokens_to_ids("[SEP]")] for sentence in sentences]

        sentence_lengths = [len(sentence) for sentence in sentences]
        mask = [[1]*sen_len for sen_len in sentence_lengths]

        return sentences, mask
 
    for i in range(1, 3):
        
        # example[f"article_{i}"] = [tokenizer.encode(sentence, 
        #                                             truncation=True,
        #                                             add_special_tokens=True,                                            
        #                                             max_length=args.max_sentence_len) for sentence in sent_tokenize(example[f"article_{i}"])]
                                                
        #print(example)

        example[f"article_{i}"], example[f"mask_{i}"] = tokenize_helper(example[f"article_{i}"], tokenizer, args)
        
    return example

In [168]:
#new_dataset = small_dataset.map(tokenize, batched=True, fn_kwargs={"tokenizer": tokenizer, "args": args})
new_dataset = small_dataset.map(tokenize, fn_kwargs={"tokenizer": tokenizer, "args": args})

100%|██████████| 100/100 [00:02<00:00, 40.98ex/s]
