In [1]:
import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# Load only the "train" split of the HAERAE-HUB/KOREAN-WEBTEXT dataset.
dataset1 = datasets.load_dataset(
    "HAERAE-HUB/KOREAN-WEBTEXT",
    split="train",
)
dataset1

Dataset({
    features: ['text', 'source', 'token_count', '__index_level_0__'],
    num_rows: 1284879
})

In [16]:
# Load only the "train" split of the blueapple8259/c4-ko-cleaned-2 dataset.
dataset2 = datasets.load_dataset(
    "blueapple8259/c4-ko-cleaned-2",
    split="train",
)
dataset2

Dataset({
    features: ['text'],
    num_rows: 2261464
})

In [None]:
# Load only the "train" split of the HAERAE-HUB/KOREAN-SyntheticText-1.5B dataset.
dataset3 = datasets.load_dataset(
    "HAERAE-HUB/KOREAN-SyntheticText-1.5B",
    split="train",
)
dataset3

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 1552370
})

In [14]:
# From the maywell/korean_textbooks dataset, collect only the rows that a
# classifier scored 3 or higher (the "scored_over_3" configuration).
dataset4 = datasets.load_dataset(
    "devngho/korean-textbooks-edu",
    name="scored_over_3",
    split="train",
)
dataset4

Dataset({
    features: ['text', 'score'],
    num_rows: 1735255
})

In [19]:
from datasets import concatenate_datasets

# Stack the four loaded corpora into a single dataset, in load order.
source_datasets = [dataset1, dataset2, dataset3, dataset4]
combined_dataset = concatenate_datasets(source_datasets)
combined_dataset

Dataset({
    features: ['text', 'source', 'token_count', '__index_level_0__', 'score'],
    num_rows: 6833968
})

In [21]:
import hashlib

# Fingerprints of every text that `filter_duplicates` has already accepted.
# Only fixed-size digests are stored, so the set does not keep millions of
# full documents alive in memory for the duration of the filter pass.
seen = set()


def filter_duplicates(example):
    """Return True the first time a text is encountered, False on exact repeats.

    Args:
        example: A single dataset row; its "text" field is the deduplication key.

    Returns:
        bool: True to keep the row, False to drop it as an exact duplicate.

    Note:
        State is kept in the module-level ``seen`` set, so every row must pass
        through this function in the same process for duplicates to be caught.
    """
    text = example["text"]
    if isinstance(text, str):
        # A 128-bit BLAKE2b digest stands in for the full string; accidental
        # collisions are negligible at this scale. "surrogatepass" keeps the
        # encode step from failing on unusual code points.
        key = hashlib.blake2b(
            text.encode("utf-8", "surrogatepass"), digest_size=16
        ).digest()
    else:
        # Non-string values (e.g. None) are used directly as the key.
        key = text

    if key in seen:
        return False
    seen.add(key)
    return True

# Keep only the first occurrence of each text. `filter_duplicates` tracks what
# it has seen in a module-level set, so this pass is run without `num_proc`.
unique_dataset = combined_dataset.filter(function=filter_duplicates)
unique_dataset

Filter: 100%|██████████| 6833968/6833968 [02:18<00:00, 49167.91 examples/s]


Dataset({
    features: ['text', 'source', 'token_count', '__index_level_0__', 'score'],
    num_rows: 6832738
})

In [22]:
# Keep only the "text" column and drop every other column.
columns_to_remove = [
    name
    for name in unique_dataset.column_names
    if name != "text"
]
text_only_dataset = unique_dataset.remove_columns(columns_to_remove)
text_only_dataset

Dataset({
    features: ['text'],
    num_rows: 6832738
})

In [23]:
# Remove rows whose text is shorter than 100 characters.
def filter_short_texts(example, min_chars=100):
    """Return True when the row's text has at least `min_chars` characters.

    Args:
        example: A single dataset row with a "text" field.
        min_chars: Minimum number of characters required to keep the row.
            Defaults to 100.

    Returns:
        bool: True to keep the row, False to drop it. A missing (None) text
        is treated as too short and dropped.
    """
    text = example["text"]
    if text is None:
        # len(None) would raise; a null text carries no content to keep.
        return False
    return len(text) >= min_chars

# Apply the minimum-length filter to the text-only rows.
filtered_dataset = text_only_dataset.filter(function=filter_short_texts)
filtered_dataset

Filter: 100%|██████████| 6832738/6832738 [04:03<00:00, 28089.51 examples/s]


Dataset({
    features: ['text'],
    num_rows: 6826068
})

In [27]:
# Shuffle with a fixed seed so the row order can be reproduced.
shuffle_seed = 5768112
shuffled_dataset = filtered_dataset.shuffle(seed=shuffle_seed)

In [28]:
# Upload the shuffled rows as the "train" split of the target Hub repository.
# NOTE(review): the saved cell output names a different repo id than the one
# used here, presumably stale output from an earlier run; re-run to confirm.
shuffled_dataset.push_to_hub(
    repo_id="minpeter/pretrain-korean-dedup",
    split="train",
)

Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 45.70ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:03<00:00, 34.21ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:03<00:00, 35.22ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 45.97ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 46.20ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 45.57ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 46.18ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 46.85ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 45.38ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 45.88ba/s]
Creating parquet from Arrow format: 100%|██████████| 107/107 [00:02<00:00, 46.61ba/s]
Creating parquet from Arrow format: 100%|██████████| 1

CommitInfo(commit_url='https://huggingface.co/datasets/minpeter/pretrained-tiny-ko/commit/f270bf60eae0e7ce84bef433de4d4a8d5c9a90c6', commit_message='Upload dataset (part 00001-of-00002)', commit_description='', oid='f270bf60eae0e7ce84bef433de4d4a8d5c9a90c6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/minpeter/pretrained-tiny-ko', endpoint='https://huggingface.co', repo_type='dataset', repo_id='minpeter/pretrained-tiny-ko'), pr_revision=None, pr_num=None)

In [None]:
from transformers import AutoTokenizer

# Tokenization setup: the pretrained tokenizer, and the maximum number of
# tokens per segment that `tokenize` passes as `max_length`.
tokenizer = AutoTokenizer.from_pretrained("kakaocorp/kanana-nano-2.1b-base")
context_length = 512

def tokenize(element):
    """
    Tokenize ``element["text"]`` with a per-segment cap of `context_length` tokens.

    Truncation is combined with `return_overflowing_tokens`, so a text longer
    than `context_length` is divided into multiple segments. Segment lengths
    are also requested via `return_length`. The tokenizer output is returned
    unchanged.
    """
    return tokenizer(
        element["text"],
        max_length=context_length,
        truncation=True,
        return_length=True,
        return_overflowing_tokens=True,
    )

# Tokenize the whole dataset in batches across worker processes. The original
# columns are removed so only the tokenizer's outputs remain.
# NOTE(review): the saved output shows pool workers raising part-way through
# this run; the traceback is cut off, so the cause is not visible here.
tokenized_dataset = shuffled_dataset.map(
    tokenize,
    batched=True,
    batch_size=5_000,  # tune to the memory available
    num_proc=64,  # tune to the number of CPU cores
    remove_columns=shuffled_dataset.column_names,
)
print(tokenized_dataset)

Map (num_proc=64):  23%|██▎       | 1595000/6826068 [01:35<05:19, 16376.34 examples/s]Process ForkPoolWorker-172:
Process ForkPoolWorker-155:
Traceback (most recent call last):
  File "/data/minpeter/github.com/minpeter/mirco-ko-llama/.venv/lib/python3.13/site-packages/multiprocess/process.py", line 314, in _bootstrap
    self.run()
    ~~~~~~~~^^
  File "/data/minpeter/github.com/minpeter/mirco-ko-llama/.venv/lib/python3.13/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
    ~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/data/minpeter/github.com/minpeter/mirco-ko-llama/.venv/lib/python3.13/site-packages/multiprocess/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ~~~~^^^^^^^^^^^^^^^
  File "/data/minpeter/github.com/minpeter/mirco-ko-llama/.venv/lib/python3.13/site-packages/datasets/utils/py_utils.py", line 688, in _write_generator_to_queue
    for i, result in enumerate(func(**kwargs