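# Create news dataset

Builds a combined Ukrainian + Russian news dataset: each language contributes `lang_part_len` rows, the result is split 99% / 1% into train / test, and it is saved under `PROCESSED_DATA_FOLDER`.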

1. Import dependencies

In [1]:
import random

import numpy as np
import torch

from transformers import AutoTokenizer
from datasets import concatenate_datasets

from src.data.ukrainian_news import load_ukrainian_news_dataset
from src.data.lenta_ru_extended import load_lenta_ru_extended_dataset
from src.data.rus_news_classifier import load_rus_news_classifier_dataset

from src.definitions import PROCESSED_DATA_FOLDER

2. Prepare environment (random seeds and constants)

In [2]:
# Tunable constants for the whole notebook.
model_checkpoint = "FacebookAI/xlm-roberta-base"  # checkpoint whose tokenizer is loaded below
lang_part_len = 152_000  # rows kept per language in the combined dataset
random_seed = 42  # shared by every seeded call in this notebook

# Seed each RNG the notebook imports so a fresh "Restart & Run All" is repeatable.
np.random.seed(random_seed)
torch.manual_seed(random_seed)
random.seed(random_seed)

In [3]:
# Load the tokenizer that belongs to the configured checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Borrow EOS as the padding token only when the tokenizer has no pad token of
# its own. XLM-RoBERTa tokenizers already define a dedicated `<pad>` token;
# overwriting it with EOS would make padding indistinguishable from real
# end-of-sequence markers and diverge from the checkpoint's own pad id.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

3. Load ua-news dataset

In [None]:
# Load the Ukrainian news dataset through the project loader.
# NOTE(review): `rows_count` presumably caps how many rows are loaded; it must
# stay >= `lang_part_len`, because that many rows are taken from `ua_news` later.
ua_news = load_ukrainian_news_dataset(
    tokenizer,
    rows_count=200_000,
)

In [None]:
# Show the dataset summary (a bare last expression renders the object's repr).
ua_news

4. Load extended lenta ru dataset

In [4]:
# Load the extended Lenta.ru dataset through the project loader, passing the shared tokenizer.
# NOTE(review): the loader presumably tokenizes the articles (the resulting dataset
# exposes `input_ids` / `attention_mask`) — confirm in src.data.lenta_ru_extended.
ext_lenta_ru = load_lenta_ru_extended_dataset(tokenizer)

In [5]:
# Inspect the loaded dataset: feature names and row count.
ext_lenta_ru

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 326179
})

5. Load ru news dataset

In [6]:
# Load the second Russian news source through the project loader, passing the shared tokenizer.
# NOTE(review): the loader presumably tokenizes the texts (the resulting dataset
# exposes `input_ids` / `attention_mask`) — confirm in src.data.rus_news_classifier.
ru_news = load_rus_news_classifier_dataset(tokenizer)

In [7]:
# Inspect the loaded dataset: feature names and row count.
ru_news

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 53527
})

6. Concatenate, split, and save the RU datasets

In [8]:
# Merge both Russian sources into one dataset labelled as the "train" split.
ru_ds = concatenate_datasets(
    [ext_lenta_ru, ru_news],
    split="train",
)

In [9]:
# Sanity check: the row count should equal the sum of the two RU sources
# (326179 + 53527 = 379706 in the recorded run).
ru_ds

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 379706
})

In [11]:
ru_ds = ru_ds.shuffle(random_seed).train_test_split(test_size=0.01)

In [12]:
ru_ds.save_to_disk(PROCESSED_DATA_FOLDER / "ru-news")

Saving the dataset (0/1 shards):   0%|          | 0/150480 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1520 [00:00<?, ? examples/s]

7. Build the balanced UA + RU dataset

In [10]:
# Shuffle with the shared seed, then keep the first `lang_part_len` rows
# so the Russian part has the same size as the Ukrainian part.
ru_ds = (
    ru_ds
    .shuffle(random_seed)
    .take(lang_part_len)
)

In [None]:
# Shuffle with the shared seed, then keep the first `lang_part_len` rows
# so the Ukrainian part has the same size as the Russian part.
ua_ds = (
    ua_news
    .shuffle(random_seed)
    .take(lang_part_len)
)

In [None]:
# Stack the equally sized UA and RU parts into one dataset labelled "train".
ds = concatenate_datasets(
    [ua_ds, ru_ds],
    split="train",
)

In [None]:
# Show the combined dataset summary (expected rows: 2 * lang_part_len).
ds

8. Split

In [None]:
# Mix the UA and RU rows, then hold out 1% as the test split.
# The seed is also passed to `train_test_split` so the partition is reproducible
# from this call alone instead of depending on ambient RNG state (i.e. on which
# cells happened to run before this one).
ds = ds.shuffle(seed=random_seed).train_test_split(test_size=0.01, seed=random_seed)

In [None]:
# Show the resulting train/test splits and their sizes.
ds

9. Save

In [None]:
# Persist the combined UA + RU train/test splits under the processed-data folder.
ds.save_to_disk(PROCESSED_DATA_FOLDER / "ua-ru-news")