In [1]:
from datasets import load_dataset
from datasets import concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


## Korean datasets

In [2]:
# Fetch every split of the dataset and merge them into a single table.
fw2_edu_ko = concatenate_datasets(
    list(load_dataset("minpeter/fineweb-2-edu-korean-score-2", num_proc=128).values())
)
# Drop every column other than "text".
fw2_edu_ko = fw2_edu_ko.remove_columns(
    [name for name in fw2_edu_ko.column_names if name != "text"]
)
fw2_edu_ko

Dataset({
    features: ['text'],
    num_rows: 14221010
})

In [3]:
# HAERAE web text: merge all splits, then keep only the "text" column.
haerae_webtext = concatenate_datasets(
    list(load_dataset("HAERAE-HUB/KOREAN-WEBTEXT", num_proc=128).values())
)
haerae_webtext = haerae_webtext.remove_columns(
    [name for name in haerae_webtext.column_names if name != "text"]
)
print(haerae_webtext)

# HAERAE synthetic text: same treatment as above.
haerae_synthetic_text = concatenate_datasets(
    list(load_dataset("HAERAE-HUB/KOREAN-SyntheticText-1.5B", num_proc=128).values())
)
haerae_synthetic_text = haerae_synthetic_text.remove_columns(
    [name for name in haerae_synthetic_text.column_names if name != "text"]
)
print(haerae_synthetic_text)

Dataset({
    features: ['text'],
    num_rows: 1284879
})
Dataset({
    features: ['text'],
    num_rows: 1552370
})


In [4]:

import datasets

# Load the four geulgyeol repositories one by one. Each is flattened across
# its splits and reduced to the "text" column, then printed for inspection.
geulgyeol_byncsa = concatenate_datasets(
    list(load_dataset("geulgyeol/geulgyeol-by-nc-sa", num_proc=128).values())
)
geulgyeol_byncsa = geulgyeol_byncsa.remove_columns(
    [name for name in geulgyeol_byncsa.column_names if name != "text"]
)
print(geulgyeol_byncsa)

geulgyeol_bysa = concatenate_datasets(
    list(load_dataset("geulgyeol/geulgyeol-by-sa", num_proc=128).values())
)
geulgyeol_bysa = geulgyeol_bysa.remove_columns(
    [name for name in geulgyeol_bysa.column_names if name != "text"]
)
print(geulgyeol_bysa)

geulgyeol_by = concatenate_datasets(
    list(load_dataset("geulgyeol/geulgyeol-by", num_proc=128).values())
)
geulgyeol_by = geulgyeol_by.remove_columns(
    [name for name in geulgyeol_by.column_names if name != "text"]
)
print(geulgyeol_by)

geulgyeol_bync = concatenate_datasets(
    list(load_dataset("geulgyeol/geulgyeol-by-nc", num_proc=128).values())
)
geulgyeol_bync = geulgyeol_bync.remove_columns(
    [name for name in geulgyeol_bync.column_names if name != "text"]
)
print(geulgyeol_bync)

geulgyeol_to_combine = [geulgyeol_byncsa, geulgyeol_bysa, geulgyeol_by, geulgyeol_bync]

# Cast every part to one explicit schema before concatenating.
# NOTE(review): presumably the parts do not all share the same "text" feature
# type, which would make a direct concatenation fail -- confirm.
new_features = datasets.Features({'text': datasets.Value('string')})
corrected_datasets = [
    part.cast(new_features, num_proc=128) for part in geulgyeol_to_combine
]

geulgyeol = datasets.concatenate_datasets(corrected_datasets)

print("✨ 데이터셋 병합 완료!")
print(geulgyeol)
print(geulgyeol.features)

Dataset({
    features: ['text'],
    num_rows: 121972
})
Dataset({
    features: ['text'],
    num_rows: 110820
})
Dataset({
    features: ['text'],
    num_rows: 204395
})
Dataset({
    features: ['text'],
    num_rows: 1313581
})
✨ 데이터셋 병합 완료!
Dataset({
    features: ['text'],
    num_rows: 1750768
})
{'text': Value('string')}


In [5]:
# Blue House national petitions.
heegyu_petitions = concatenate_datasets(
    list(load_dataset("heegyu/korean-petitions", num_proc=128).values())
)
# This dataset keeps its body in "content"; rename it to "text" so it lines up
# with the other sources, then drop every remaining column.
heegyu_petitions = heegyu_petitions.rename_column("content", "text")
heegyu_petitions = heegyu_petitions.remove_columns(
    [name for name in heegyu_petitions.column_names if name != "text"]
)
heegyu_petitions

Dataset({
    features: ['text'],
    num_rows: 436660
})

In [6]:
# Stack all Korean sources (each already reduced to a single "text" column).
combined_korean_dataset = concatenate_datasets(
    [fw2_edu_ko, haerae_webtext, haerae_synthetic_text, geulgyeol, heegyu_petitions]
)

# Report the merged size in millions of rows.
print(f"combined_korean_dataset total row: {combined_korean_dataset.num_rows / 1_000_000:.2f}M")

combined_korean_dataset total row: 19.25M


In [7]:
import os
import re

def clean_text(text):
    """Return ``text`` without un-encodable characters and without outer whitespace.

    Args:
        text: The value to clean. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        str: An empty string for non-string input; otherwise ``text`` with
        every character that cannot be encoded as UTF-8 (lone surrogates in
        U+D800-U+DFFF) removed, and leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        return ""  # Non-string values are mapped to an empty string.

    # Round-trip through UTF-8 with errors='ignore'. This drops lone surrogate
    # code points, which otherwise raise
    # "UnicodeEncodeError: ... surrogates not allowed" when the text is
    # serialized later. The result of a strict UTF-8 decode can never contain
    # surrogates, so no further surrogate filtering is required.
    text = text.encode('utf-8', errors='ignore').decode('utf-8')

    # Strip last: removing an invalid character can expose whitespace that was
    # previously shielded from strip() (e.g. "abc \udd2b" -> "abc ").
    return text.strip()

# Use all but 8 of the available CPUs, and never fewer than one worker.
# os.cpu_count() returns None when the CPU count cannot be determined, so fall
# back to 1 instead of failing with a TypeError on `None - 8`.
num_processors = max(1, (os.cpu_count() or 1) - 8)
print(f"Total CPUs: {os.cpu_count()}, Using {num_processors} processes for mapping.")

print("\n2. 텍스트 정제 및 정규화를 시작합니다... (.map)")
# Run clean_text over the "text" field of every row, in parallel.
cleaned_korean_dataset = combined_korean_dataset.map(
    lambda example: {'text': clean_text(example['text'])},
    num_proc=num_processors,
)

# Row count is printed as a sanity check on the result of the map.
print(f"cleaned_korean_dataset total row: {cleaned_korean_dataset.num_rows / 1_000_000:.2f}M")


Total CPUs: 128, Using 120 processes for mapping.

2. 텍스트 정제 및 정규화를 시작합니다... (.map)
cleaned_korean_dataset total row: 19.25M


In [8]:
# Module-level registry of texts that have already been accepted; used for
# exact-match duplicate removal.
seen_texts = set()

def is_high_quality_and_unique(example):
    """Decide whether a row passes the length filter and is not a duplicate.

    A row is accepted only if its ``'text'`` is at least 100 characters long
    and has not been accepted before. Accepted texts are recorded in the
    module-level ``seen_texts`` set; rejected texts are not recorded.

    Args:
        example: Mapping with a ``'text'`` entry.

    Returns:
        bool: True if the row should be kept, False otherwise.
    """
    candidate = example['text']

    # Length check first, so short texts never reach the set lookup.
    keep = len(candidate) >= 100 and candidate not in seen_texts
    if keep:
        seen_texts.add(candidate)
    return keep

# --- 3. Quality and duplicate filtering (.filter) ---
# Length filtering and duplicate removal both operate on the cleaned text.
print("\n3. 품질 및 중복 필터링을 시작합니다... (.filter)")
# Kept single-process on purpose: the filter relies on the global
# `seen_texts` set, which may conflict when num_proc > 1. A different
# deduplication approach may be needed for very large data.
final_korean_dataset = cleaned_korean_dataset.filter(
    function=is_high_quality_and_unique,
    num_proc=1,
)

print(f"final_korean_dataset total row: {final_korean_dataset.num_rows / 1_000_000:.4f}M")


3. 품질 및 중복 필터링을 시작합니다... (.filter)
final_korean_dataset total row: 19.0010M


In [9]:
# English sources: take the first 21 parquet shards of each smollm-corpus
# subset. Swap the split for "train[:1000]" to get a quick small sample.
cosmopedia = load_dataset(
    "HuggingFaceTB/smollm-corpus",
    split="train",
    num_proc=128,
    data_files=[
        f'cosmopedia-v2/train-{shard:05d}-of-00104.parquet' for shard in range(21)
    ],
)
# Keep only the "text" column.
cosmopedia = cosmopedia.remove_columns(
    [name for name in cosmopedia.column_names if name != "text"]
)
print(cosmopedia)


fineweb = load_dataset(
    "HuggingFaceTB/smollm-corpus",
    split="train",
    num_proc=128,
    data_files=[
        f'fineweb-edu-dedup/train-{shard:05d}-of-00234.parquet' for shard in range(21)
    ],
)
# Keep only the "text" column.
fineweb = fineweb.remove_columns(
    [name for name in fineweb.column_names if name != "text"]
)
print(fineweb)

Dataset({
    features: ['text'],
    num_rows: 7902069
})
Dataset({
    features: ['text'],
    num_rows: 17066364
})


In [10]:
# Merge the two English sources into one table.
combined_english_dataset = concatenate_datasets([cosmopedia, fineweb])

# No extra cleaning or filtering is applied to the English side here; the
# "final" name is simply another reference to the combined dataset.
final_english_dataset = combined_english_dataset
print(f"final_english_dataset total row: {final_english_dataset.num_rows / 1_000_000:.2f}M")

final_english_dataset total row: 24.97M


In [11]:

# Join the Korean and English halves into one corpus.
tiny_corpus = concatenate_datasets([final_korean_dataset, final_english_dataset])
print(f"tiny_corpus total row: {tiny_corpus.num_rows / 1_000_000:.2f}M")
# Shuffle with a fixed seed so the row order is reproducible.
tiny_corpus = tiny_corpus.shuffle(seed=5768112)

tiny_corpus total row: 43.97M


In [12]:
import os

# Use all but 8 of the available CPUs, and never fewer than one worker.
# os.cpu_count() returns None when the CPU count cannot be determined, so fall
# back to 1 instead of failing with a TypeError on `None - 8`.
num_processors = max(1, (os.cpu_count() or 1) - 8)
print(f"Total CPUs: {os.cpu_count()}, Using {num_processors} processes for mapping.")

# Publish the shuffled corpus as a public dataset repository. The call is the
# last expression of the cell so its return value is displayed.
tiny_corpus.push_to_hub("minpeter/tiny-corpus", private=False, num_proc=num_processors)

Total CPUs: 128, Using 120 processes for mapping.


Creating parquet from Arrow format:   0%|          | 0/92 [00:00<?, ?ba/s]<?, ? shards/s][A
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:10<00:00,  4.03s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:11<00:00,  4.04s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:12<00:00,  4.04s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:12<00:00,  4.05s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:12<00:00,  4.05s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:12<00:00,  4.05s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:12<00:00,  4.05s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:13<00:00,  4.06s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:14<00:00,  4.07s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:13<00:00,  4.06s/ba]
Creating parquet from Arrow format: 100%|██████████| 92/92 [06:13<0

CommitInfo(commit_url='https://huggingface.co/datasets/minpeter/tiny-corpus/commit/6ed87afb78737c3b7fe2666b27a5c3f68c7301d2', commit_message='Upload dataset (part 00007-of-00008)', commit_description='', oid='6ed87afb78737c3b7fe2666b27a5c3f68c7301d2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/minpeter/tiny-corpus', endpoint='https://huggingface.co', repo_type='dataset', repo_id='minpeter/tiny-corpus'), pr_revision=None, pr_num=None)

In [14]:
from datasets import Dataset

# Read the whole file and break it into non-empty, whitespace-trimmed pieces
# separated by blank lines.
with open("../input.txt", encoding="utf-8") as f:
    chunks = [piece for piece in map(str.strip, f.read().split("\n\n")) if piece]

# Glue every 5 consecutive pieces back together into one sample; the final
# sample may hold fewer than 5 pieces.
grouped = [
    "\n\n".join(chunks[start:start + 5])
    for start in range(0, len(chunks), 5)
]

# Build a single-column dataset and hold out 20% as a test split (seeded shuffle).
ds = Dataset.from_dict({"text": grouped})
ds = ds.train_test_split(test_size=0.2, shuffle=True, seed=5768112)
ds


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1156
    })
    test: Dataset({
        features: ['text'],
        num_rows: 289
    })
})