In [1]:
import datasets
from datasets import load_dataset, concatenate_datasets
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Look up the `data_id` of any corpus in the collection: edit the language and
# year in the filter below to search for a different corpus.
# `trust_remote_code=True` is needed because this dataset is served through a
# builder script that is downloaded and executed locally.
links = load_dataset(
    "imvladikon/leipzig_corpora_collection",
    "links",
    split="train",
    trust_remote_code=True,
)

# The year is stored as a string in this table, hence "2019" rather than 2019.
ds = links.filter(lambda row: (row["language"], row["year"]) == ("Swedish", "2019"))

# One line per matching corpus; the `data_id` field is the config name to load.
for sample in ds:
    print(sample)

Downloading builder script: 3.89kB [00:00, 9.77MB/s]
Downloading readme: 7.84kB [00:00, 14.9MB/s]
Generating train split: 5780 examples [00:00, 63773.91 examples/s]
Filter: 100%|██████████| 5780/5780 [00:00<00:00, 225206.94 examples/s]

{'id': '5198', 'data_id': 'swe_news_2019_10K', 'url': 'https://downloads.wortschatz-leipzig.de/corpora/swe_news_2019_10K.tar.gz', 'language': 'Swedish', 'language_short': 'swe', 'year': '2019', 'size': '10K'}
{'id': '5199', 'data_id': 'swe_news_2019_30K', 'url': 'https://downloads.wortschatz-leipzig.de/corpora/swe_news_2019_30K.tar.gz', 'language': 'Swedish', 'language_short': 'swe', 'year': '2019', 'size': '30K'}
{'id': '5200', 'data_id': 'swe_news_2019_100K', 'url': 'https://downloads.wortschatz-leipzig.de/corpora/swe_news_2019_100K.tar.gz', 'language': 'Swedish', 'language_short': 'swe', 'year': '2019', 'size': '100K'}
{'id': '5201', 'data_id': 'swe_news_2019_300K', 'url': 'https://downloads.wortschatz-leipzig.de/corpora/swe_news_2019_300K.tar.gz', 'language': 'Swedish', 'language_short': 'swe', 'year': '2019', 'size': '300K'}
{'id': '5202', 'data_id': 'swe_news_2019_1M', 'url': 'https://downloads.wortschatz-leipzig.de/corpora/swe_news_2019_1M.tar.gz', 'language': 'Swedish', 'langua




In [2]:
# Show which version of the `datasets` library this kernel is using.
print(datasets.__version__)

2.19.1


In [4]:
# Corpus ids to load, one per language (21 in total).
# Most corpora come in the sizes 10K, 30K, 100K, 300K and 1M, but not every
# size exists for every corpus.
# All languages except Slovak were available for 2019, so Slovak uses 2020.
# Change the ids below to try different sizes or years.
data_ids = [
    "bul_news_2019_10K",
    "ces_news_2019_10K",
    "dan_news_2019_10K",
    "deu_news_2019_10K",
    "ell_news_2019_10K",
    "eng_news_2019_10K",
    "est_news_2019_10K",
    "fin_news_2019_10K",
    "fra_news_2019_10K",
    "hun_news_2019_10K",
    "ita_news_2019_10K",
    "lav_news_2019_10K",
    "lit_news_2019_10K",
    "nld_news_2019_10K",
    "pol_news_2019_10K",
    "por_news_2019_10K",
    "ron_news_2019_10K",
    "slk_news_2020_10K",
    "slv_news_2019_10K",
    "spa_news_2019_10K",
    "swe_news_2019_10K",
]

# Maps the language code at the start of each data_id (the text before the
# first underscore) to the human-readable name used as the class label.
language_map = {
    "bul": "Bulgarian",
    "ces": "Czech",
    "dan": "Danish",
    "deu": "German",
    "ell": "Greek",
    "eng": "English",
    "est": "Estonian",
    "fin": "Finnish",
    "fra": "French",
    "hun": "Hungarian",
    "ita": "Italian",
    "lav": "Latvian",
    "lit": "Lithuanian",
    "nld": "Dutch",
    "pol": "Polish",
    "por": "Portuguese",
    "ron": "Romanian",
    "slk": "Slovak",
    "slv": "Slovenian",
    "spa": "Spanish",
    "swe": "Swedish",
}

# The two tables above are edited by hand, so check that they still agree
# before anything is downloaded: a typo here should fail immediately rather
# than part-way through loading the corpora.
assert len(set(data_ids)) == len(data_ids), "data_ids contains duplicate entries"
assert all(data_id.split("_")[0] in language_map for data_id in data_ids), (
    "every data_id must start with a language code listed in language_map"
)


In [13]:
# The author had to pin the library with `pip install datasets==2.19.1` for
# this notebook to work; print the version in use to confirm the pin is active.
# NOTE(review): presumably other releases break the script-based loading used
# here (trust_remote_code=True) — TODO confirm which versions are affected.
print("datasets version: ", datasets.__version__)

datasets version:  2.19.1


In [5]:
def get_multi_language_dataset(data_ids):
    """Load several Leipzig corpora and merge them into one labelled dataset.

    Each corpus is loaded from "imvladikon/leipzig_corpora_collection"
    (split "train"), given a constant "label" column holding the language
    name looked up in the module-level ``language_map``, and the pieces are
    concatenated in the order given by ``data_ids``.

    Args:
        data_ids: Iterable of corpus config names such as
            "swe_news_2019_10K". The text before the first underscore is
            used as the language code.

    Returns:
        A single dataset containing the rows of every requested corpus plus
        the added "label" column.

    Raises:
        ValueError: If ``data_ids`` is empty.
        KeyError: If a language code has no entry in ``language_map``.
    """
    # Materialise once so generators work and the emptiness check is reliable.
    data_ids = list(data_ids)
    if not data_ids:
        raise ValueError("data_ids must contain at least one corpus id")

    # Deliberately not called `datasets`: that name would shadow the imported
    # `datasets` module inside this function.
    labelled_parts = []

    for data_id in data_ids:
        # Resolve the label before downloading so an unknown language code
        # fails immediately instead of after fetching the corpus.
        language_code = data_id.split("_")[0]
        label = language_map[language_code]

        ds = load_dataset(
            "imvladikon/leipzig_corpora_collection",
            data_id,
            split="train",
            trust_remote_code=True,
        )
        # Every row of one corpus shares the same language label.
        ds = ds.add_column("label", [label] * len(ds))
        labelled_parts.append(ds)

    return concatenate_datasets(labelled_parts)

# Build the combined, labelled dataset from every corpus listed in `data_ids`.
# Each corpus is downloaded the first time this runs (see the progress output).
full_ds = get_multi_language_dataset(data_ids)


Downloading data: 100%|██████████| 2.16M/2.16M [00:00<00:00, 2.45MB/s]
Generating train split: 8807 examples [00:00, 114442.68 examples/s]
Downloading data: 100%|██████████| 2.20M/2.20M [00:00<00:00, 2.85MB/s]
Generating train split: 7691 examples [00:00, 108078.45 examples/s]
Downloading data: 100%|██████████| 2.22M/2.22M [00:01<00:00, 1.54MB/s]
Generating train split: 9140 examples [00:00, 122872.78 examples/s]
Downloading data: 100%|██████████| 2.31M/2.31M [00:01<00:00, 2.19MB/s]
Generating train split: 9494 examples [00:00, 68452.59 examples/s]
Downloading data: 100%|██████████| 2.74M/2.74M [00:00<00:00, 3.63MB/s]
Generating train split: 9139 examples [00:00, 111320.76 examples/s]
Downloading data: 100%|██████████| 2.76M/2.76M [00:00<00:00, 3.12MB/s]
Generating train split: 8455 examples [00:00, 115047.20 examples/s]
Downloading data: 100%|██████████| 2.12M/2.12M [00:01<00:00, 1.67MB/s]
Generating train split: 8144 examples [00:00, 115901.63 examples/s]
Downloading data: 100%|█████

In [6]:
# Summary of the combined dataset: its column names and total number of rows.
print("Dataset info: ", full_ds)

Dataset info:  Dataset({
    features: ['id', 'sentence', 'label'],
    num_rows: 184591
})


In [7]:
# Shape of the combined dataset as a (rows, columns) tuple.
print("Dataset shape: ", full_ds.shape)

Dataset shape:  (184591, 3)


In [8]:
# Distinct language labels present in the combined dataset.
# `Dataset.unique` deduplicates the column before converting it to Python
# objects, instead of turning every row's label into a string first; wrapping
# the result in `set` keeps the printed form the same as before.
print("Labels: ", set(full_ds.unique("label")))

Labels:  {'Polish', 'Dutch', 'Hungarian', 'Danish', 'Portuguese', 'Slovak', 'Finnish', 'Latvian', 'Romanian', 'German', 'Estonian', 'Swedish', 'English', 'Bulgarian', 'Slovenian', 'Spanish', 'French', 'Greek', 'Lithuanian', 'Czech', 'Italian'}


In [9]:
# Print five randomly chosen rows as a quick sanity check of text and labels.
# A seeded generator is used so the same rows are shown on every
# Restart & Run All; change the seed to look at a different sample.
rng = np.random.default_rng(seed=0)

print("Examples:")
for j in rng.integers(0, len(full_ds), size=5):
    j = int(j)  # plain Python int for indexing and for a clean printed index
    print(j, full_ds[j])

Examples:
133497 {'id': '1016', 'sentence': 'A mulher sofreu ferimentos leves, chegou a ser atendida e foi levada ao hotel.', 'label': 'Portuguese'}
113745 {'id': '9299', 'sentence': 'Vakcina nuo meningokokinės B tipo infekcijos buvo Europoje užregistruota 2013-aisiais, o Lietuvoje ja pasiskiepyti galima jau nuo 2014-ųjų.', 'label': 'Lithuanian'}
62208 {'id': '1339', 'sentence': 'Hän antoi Ruotsi-ottelun voitto-osumastaan varauksettomasti tunnustusta hienon syötön antaneelle Teemu Turuselle.', 'label': 'Finnish'}
82878 {'id': '4002', 'sentence': 'Az izraeli és az egyiptomi hírszerzés jelentései szerint a Hamasz támadásokat tervez Törökország területéről.', 'label': 'Hungarian'}
11633 {'id': '3654', 'sentence': 'Michel je teprve třetím stálým předsedou rady a po Hermanu van Rompuyovi druhým Belgičanem v této funkci.', 'label': 'Czech'}
