# Short (sampled) multilingual dataset creation

Concatenation of the monolingual subsampled datasets into one.

In [2]:
!pip install datasets==2.11.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Data Download

In [3]:
from datasets import load_dataset, interleave_datasets

We use our own paraphrase datasets, one per language.

In [4]:
# One paraphrase dataset per language, loaded in the order
# English, German, Czech, Slovene.
raw_ds_en, raw_ds_de, raw_ds_cz, raw_ds_sl = (
    load_dataset(repo_id)
    for repo_id in (
        'yawnick/para_crawl_enen',
        'yawnick/para_crawl_dede',
        'yawnick/para_crawl_cscs',
        'yawnick/para_crawl_slsl',
    )
)



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]



  0%|          | 0/3 [00:00<?, ?it/s]

Let's store the splits separately.

In [5]:
# Pull the train / validation / test splits out of every language's dataset
# so that each split can be processed on its own in the following cells.
raw_ds_en_train, raw_ds_en_val, raw_ds_en_test = (
    raw_ds_en[split_name] for split_name in ('train', 'validation', 'test')
)
raw_ds_de_train, raw_ds_de_val, raw_ds_de_test = (
    raw_ds_de[split_name] for split_name in ('train', 'validation', 'test')
)
raw_ds_cz_train, raw_ds_cz_val, raw_ds_cz_test = (
    raw_ds_cz[split_name] for split_name in ('train', 'validation', 'test')
)
raw_ds_sl_train, raw_ds_sl_val, raw_ds_sl_test = (
    raw_ds_sl[split_name] for split_name in ('train', 'validation', 'test')
)

In [6]:
# Show the three English splits (features and row counts).
raw_ds_en_train, raw_ds_en_val, raw_ds_en_test

(Dataset({
     features: ['Original', 'Paraphrase'],
     num_rows: 55544
 }),
 Dataset({
     features: ['Original', 'Paraphrase'],
     num_rows: 9803
 }),
 Dataset({
     features: ['Original', 'Paraphrase'],
     num_rows: 11532
 }))

First, we shuffle all datasets. The seed is important if we later want to continue training and use the same training and validation data.

In [7]:
# Seed handed to every `shuffle()` call below, so the shuffled order can be reproduced.
seed = 3435

In [8]:
# Shuffle every split of every language with the same fixed seed.
raw_ds_en_train, raw_ds_en_val, raw_ds_en_test = (
    split.shuffle(seed=seed)
    for split in (raw_ds_en_train, raw_ds_en_val, raw_ds_en_test)
)

raw_ds_de_train, raw_ds_de_val, raw_ds_de_test = (
    split.shuffle(seed=seed)
    for split in (raw_ds_de_train, raw_ds_de_val, raw_ds_de_test)
)

raw_ds_cz_train, raw_ds_cz_val, raw_ds_cz_test = (
    split.shuffle(seed=seed)
    for split in (raw_ds_cz_train, raw_ds_cz_val, raw_ds_cz_test)
)

raw_ds_sl_train, raw_ds_sl_val, raw_ds_sl_test = (
    split.shuffle(seed=seed)
    for split in (raw_ds_sl_train, raw_ds_sl_val, raw_ds_sl_test)
)



`flatten_indices()` is used to resolve/remove the index mapping created by `shuffle()`.

In [9]:
# Materialise the shuffled order of every split by resolving its index mapping.
raw_ds_en_train, raw_ds_en_val, raw_ds_en_test = (
    split.flatten_indices()
    for split in (raw_ds_en_train, raw_ds_en_val, raw_ds_en_test)
)

raw_ds_de_train, raw_ds_de_val, raw_ds_de_test = (
    split.flatten_indices()
    for split in (raw_ds_de_train, raw_ds_de_val, raw_ds_de_test)
)

raw_ds_cz_train, raw_ds_cz_val, raw_ds_cz_test = (
    split.flatten_indices()
    for split in (raw_ds_cz_train, raw_ds_cz_val, raw_ds_cz_test)
)

raw_ds_sl_train, raw_ds_sl_val, raw_ds_sl_test = (
    split.flatten_indices()
    for split in (raw_ds_sl_train, raw_ds_sl_val, raw_ds_sl_test)
)



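We keep one fourth of each dataset. Every language takes a different quarter of its shuffled splits, so the combined dataset ends up about as large as a single monolingual one: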

In [10]:
# Each language keeps num_rows // SUBSAMPLING_CONSTANT rows of every split.
# NOTE(review): the slicing cell below addresses four consecutive windows of
# that size, so this presumably has to stay >= 4 — confirm before changing it.
SUBSAMPLING_CONSTANT = 4

In [11]:
# Every language must provide equally sized splits; the English sizes are
# used below as the reference for all languages.
assert len({ds.num_rows for ds in (raw_ds_en_train, raw_ds_de_train, raw_ds_cz_train, raw_ds_sl_train)}) == 1
assert len({ds.num_rows for ds in (raw_ds_en_val, raw_ds_de_val, raw_ds_cz_val, raw_ds_sl_val)}) == 1
assert len({ds.num_rows for ds in (raw_ds_en_test, raw_ds_de_test, raw_ds_cz_test, raw_ds_sl_test)}) == 1

# Number of rows each language keeps per split (integer division, so a
# remainder of up to SUBSAMPLING_CONSTANT - 1 rows is dropped).
length_per_ds_train, length_per_ds_val, length_per_ds_test = (
    split.num_rows // SUBSAMPLING_CONSTANT
    for split in (raw_ds_en_train, raw_ds_en_val, raw_ds_en_test)
)

print("PER LANGUAGE:")
print(f"  {raw_ds_en_train.num_rows=} -> {length_per_ds_train=}")
print(f"  {raw_ds_en_val.num_rows=} -> {length_per_ds_val=}")
print(f"  {raw_ds_en_test.num_rows=} -> {length_per_ds_test=}")

# Row totals over all three splits, before and after subsampling.
total_before = sum(
    split.num_rows for split in (raw_ds_en_train, raw_ds_en_val, raw_ds_en_test)
)
total_after = sum((length_per_ds_train, length_per_ds_val, length_per_ds_test))
print(f"  {total_before=} {total_after=}")

PER LANGUAGE:
  raw_ds_en_train.num_rows=55544 -> length_per_ds_train=13886
  raw_ds_en_val.num_rows=9803 -> length_per_ds_val=2450
  raw_ds_en_test.num_rows=11532 -> length_per_ds_test=2883
  total_before=76879 total_after=19219


In [12]:
def _select_window(split, slot, length):
    """Return rows ``[slot * length, (slot + 1) * length)`` of ``split``.

    Args:
        split: Dataset split to take the rows from.
        slot: Zero-based position of the window (0 = the first ``length`` rows).
        length: Number of rows in the window.

    Returns:
        The result of ``split.select`` for the window's row range.

    Raises:
        IndexError: If the window reaches past the last row of ``split``.
    """
    start = slot * length
    stop = start + length
    if stop > split.num_rows:
        raise IndexError(
            f"rows [{start}, {stop}) requested from a split with only {split.num_rows} rows; "
            "SUBSAMPLING_CONSTANT must be at least the number of languages, and this cell "
            "must not be re-run on splits that were already subsampled"
        )
    return split.select(range(start, stop))


# Language k keeps the k-th window of every split (English 0, German 1,
# Czech 2, Slovene 3), so no two languages take the same row positions.
# All twelve windows are built before any name is rebound: if one of them
# does not fit, the error is raised while every split is still untouched.
(
    raw_ds_en_train, raw_ds_en_val, raw_ds_en_test,
    raw_ds_de_train, raw_ds_de_val, raw_ds_de_test,
    raw_ds_cz_train, raw_ds_cz_val, raw_ds_cz_test,
    raw_ds_sl_train, raw_ds_sl_val, raw_ds_sl_test,
) = [
    _select_window(split, slot, length)
    for slot, language_splits in enumerate((
        (raw_ds_en_train, raw_ds_en_val, raw_ds_en_test),
        (raw_ds_de_train, raw_ds_de_val, raw_ds_de_test),
        (raw_ds_cz_train, raw_ds_cz_val, raw_ds_cz_test),
        (raw_ds_sl_train, raw_ds_sl_val, raw_ds_sl_test),
    ))
    for split, length in zip(
        language_splits,
        (length_per_ds_train, length_per_ds_val, length_per_ds_test),
    )
]

Finally, we combine the datasets into one.

In [13]:
# Interleave the four languages split by split; the language order
# (English, German, Czech, Slovene) is the same for all three splits.
raw_dataset_train, raw_dataset_val, raw_dataset_test = (
    interleave_datasets(list(per_language_splits))
    for per_language_splits in (
        (raw_ds_en_train, raw_ds_de_train, raw_ds_cz_train, raw_ds_sl_train),
        (raw_ds_en_val, raw_ds_de_val, raw_ds_cz_val, raw_ds_sl_val),
        (raw_ds_en_test, raw_ds_de_test, raw_ds_cz_test, raw_ds_sl_test),
    )
)

In [14]:
# Show the combined splits (features and row counts).
raw_dataset_train, raw_dataset_val, raw_dataset_test

(Dataset({
     features: ['Original', 'Paraphrase'],
     num_rows: 55544
 }),
 Dataset({
     features: ['Original', 'Paraphrase'],
     num_rows: 9800
 }),
 Dataset({
     features: ['Original', 'Paraphrase'],
     num_rows: 11532
 }))

## Save data

In [15]:
# Directory the CSV files are written to. The file names below are appended
# to it directly when saving, so the trailing slash is required.
# NOTE(review): presumably a mounted Google Drive folder in Colab — confirm it exists before saving.
DATA_OUTPUT_DIR = "/content/drive/MyDrive/data/"

# One CSV file per split of the combined dataset.
FILE_NAME_TRAIN = "paraphrases_multilingual_subsampled_train.csv"
FILE_NAME_VAL = "paraphrases_multilingual_subsampled_val.csv"
FILE_NAME_TEST = "paraphrases_multilingual_subsampled_test.csv"

In [16]:
# Build the full target path of each split's CSV file.
train_csv_path = f"{DATA_OUTPUT_DIR}{FILE_NAME_TRAIN}"
val_csv_path = f"{DATA_OUTPUT_DIR}{FILE_NAME_VAL}"
test_csv_path = f"{DATA_OUTPUT_DIR}{FILE_NAME_TEST}"

raw_dataset_train.to_csv(train_csv_path)
raw_dataset_val.to_csv(val_csv_path)
# Kept as the cell's final expression so that its return value is still displayed.
raw_dataset_test.to_csv(test_csv_path)

Creating CSV from Arrow format:   0%|          | 0/56 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

2088745