In [2]:
from pathlib import Path
from tqdm.auto import tqdm

In [3]:
# Folder holding every corpus artifact; the relative path is resolved against
# the working directory the notebook is run from.
DATA_FOLDER_PATH = Path('../data/')
# Raw text corpus that the tokenizer is trained on and the dataset is built from.
DATA_FILE_PATH = DATA_FOLDER_PATH.joinpath("ubertext.wikipedia.filter_rus_gcld+short.text_only.txt")
# Location the Hugging Face dataset is saved to and reloaded from.
HF_DATASET_PATH = DATA_FOLDER_PATH.joinpath("ubertext_wiki_sentsplit_hfdataset")
# .npy file that receives the flat array of token ids.
TOKENIZED_DATASET_PATH = DATA_FOLDER_PATH.joinpath("ubertext.wikipedia.filter_rus_gcld+short.text_only.tokenized.npy")

In [4]:
from tokenizers.trainers import BpeTrainer
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_from_disk


## Train tokenizer

In [None]:
# Untrained BPE tokenizer; input is pre-tokenized with `Whitespace` before the
# BPE model sees it, and "[UNK]" is the model's unknown token.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Whitespace()

# "[UNK]" is the only special token registered with the trainer; every other
# trainer setting is left at the library default.
trainer = BpeTrainer(special_tokens=["[UNK]"])

# The training API takes a list of file paths as strings, so the Path object
# is converted explicitly.
corpus_files = [str(DATA_FILE_PATH)]
tokenizer.train(corpus_files, trainer)

In [None]:
# Persist the trained tokenizer next to the other corpus artifacts. The path is
# built from DATA_FOLDER_PATH instead of repeating the "../data/" literal, so
# relocating the data folder only requires editing the configuration cell.
tokenizer.save(str(DATA_FOLDER_PATH / "tokenizer-ubertext-wiki.json"))

## Test the trained tokenizer

In [7]:
# Load the serialized tokenizer from the data folder. The path is built from
# DATA_FOLDER_PATH instead of a hard-coded "../data/" literal so it follows the
# configuration cell like every other artifact path.
tokenizer = Tokenizer.from_file(str(DATA_FOLDER_PATH / "tokenizer-ubertext-wiki.json"))

In [26]:
# Smoke test: encode one Ukrainian sample sentence and display its token ids.
sample_text = "Привіт, цікава людина. Що ж ти тут робиш?"
sample_encoding = tokenizer.encode(sample_text)
sample_encoding.ids

[19580,
 21062,
 12,
 24777,
 19424,
 24996,
 14,
 23955,
 892,
 19421,
 21595,
 20439,
 910,
 31]

In [32]:
# Decode a single token id back to text to spot-check a vocabulary entry.
tokenizer.decode([10])

'*'

## Split and format to HF dataset

In [4]:
from datasets import Dataset

In [5]:
# Read the whole corpus into memory, one list entry per line of the file.
# The encoding is given explicitly so decoding does not depend on the platform
# default (assumes the corpus file is UTF-8 -- confirm against the data source).
# `readlines()` keeps line endings, so every entry still ends with "\n".
with open(DATA_FILE_PATH, encoding="utf-8") as f:
    # tqdm wraps the already-read list, so the bar has a known total.
    train_sentences = list(tqdm(f.readlines()))

  0%|          | 0/29531702 [00:00<?, ?it/s]

In [6]:
# Number of lines read from the corpus file.
len(train_sentences)

29531702

In [7]:
# Inspect one raw entry (index 100) to see what a corpus line looks like.
train_sentences[100]

'Країнознавство\xa0— наука, що вивчає . \n'

In [8]:
# Wrap the list of corpus lines in a Dataset with a single "text" column.
dataset = Dataset.from_dict({"text": train_sentences})

In [11]:
# Write the dataset to HF_DATASET_PATH so it can be reloaded from disk later.
dataset.save_to_disk(HF_DATASET_PATH)

Saving the dataset (0/10 shards):   0%|          | 0/29531702 [00:00<?, ? examples/s]

## Tokenize dataset

In [5]:
# Reload the dataset from HF_DATASET_PATH; rebinds `dataset`.
dataset = load_from_disk(HF_DATASET_PATH)

In [28]:
from array import array

# Encode every dataset row and append its token ids to one flat sequence.
# The ids go into a typed array (typecode "q", signed 64-bit) rather than a
# list of Python int objects: with hundreds of millions of tokens a list costs
# several times more memory per element, while the typed array stores each id
# in 8 bytes and converts to a NumPy int64 array through the buffer protocol.
# Rows are concatenated back to back; no separator id is inserted between them.
dataset_tokenized = array("q")
for d in tqdm(dataset):
    dataset_tokenized.extend(tokenizer.encode(d["text"]).ids)

  0%|          | 0/29531702 [00:00<?, ?it/s]

In [5]:
import numpy as np

In [29]:
# Materialize the collected token ids as one int64 vector, then store it in
# NumPy's .npy format at TOKENIZED_DATASET_PATH.
train_encoded = np.array(dataset_tokenized, dtype=np.int64)
np.save(file=TOKENIZED_DATASET_PATH, arr=train_encoded)

In [6]:
# Sanity-check the saved file by displaying its shape. The file is memory-mapped
# read-only instead of being loaded, because only the header-derived shape is
# needed and the array holds hundreds of millions of int64 values.
np.load(TOKENIZED_DATASET_PATH, mmap_mode="r").shape

(738603343,)

## Push to HF hub

In [6]:
# Upload the dataset to the Hub repository "nikiandr/ubertext2_wiki";
# private=True requests a private repository.
dataset.push_to_hub("nikiandr/ubertext2_wiki", private=True)

Pushing dataset shards to the dataset hub:   0%|          | 0/10 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/2954 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]