# Preprocess the SmolLM Dataset


## Load the Dataset


In [None]:
import os
from datasets import load_dataset, interleave_datasets

# Hub repository and config name for each corpus subset; the two lists are
# paired position by position.
dataset_paths = ["HuggingFaceTB/smollm-corpus", "HuggingFaceTB/smollm-corpus"]
dataset_names = ["cosmopedia-v2", "fineweb-edu-dedup"]
#probabilities=[0.111, 0.016 , 0.873]

# All downloads are cached under this directory, which is created on demand.
local_dir = "train-gpt2-data"
DATA_CACHE_DIR = os.path.join("/lambda/nfs", local_dir)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# Load the "train" split of every (repository, config) pair.
dataset_objs = [
    load_dataset(ds_path, ds_name, split="train", cache_dir=DATA_CACHE_DIR)
    for ds_path, ds_name in zip(dataset_paths, dataset_names, strict=False)
]
#ds = interleave_datasets(
#    dataset_objs, probabilities=probabilities, seed=1337)


In [None]:
import boto3
import gzip
from datasets import load_dataset
from botocore.exceptions import ClientError

# Worker-process count passed to the `datasets` calls in this cell.
num_proc = 16
# Bucket that download_contents() fetches blobs from.
bucket_name = "softwareheritage"

# NOTE(review): both credential fields are empty strings in this file --
# confirm how credentials are expected to be supplied before running.
session = boto3.Session(
    aws_access_key_id="",
    aws_secret_access_key="",
)
s3 = session.client("s3")

def download_contents(blob_id):
    """Fetch one gzip-compressed blob from S3 and return its decoded text.

    The object is read from ``bucket_name`` under the key
    ``content/<blob_id>``, decompressed, and decoded as UTF-8 with
    undecodable bytes dropped.

    Returns:
        ``{"text": <content>, "download_success": True}`` on success, or
        ``{"text": "", "download_success": False}`` when S3 reports
        ``NoSuchKey`` for the object.

    Raises:
        ClientError: for any S3 client error other than ``NoSuchKey``.
    """
    key = f"content/{blob_id}"
    try:
        response = s3.get_object(Bucket=bucket_name, Key=key)
        with gzip.GzipFile(fileobj=response['Body']) as stream:
            raw = stream.read()
        return {"text": raw.decode("utf-8", errors="ignore"), "download_success": True}
    except ClientError as err:
        # Only a missing object is tolerated; everything else propagates.
        if err.response['Error']['Code'] != 'NoSuchKey':
            raise
        print(f"File not found: {key}")
        return {"text": "", "download_success": False}

# Load the python-edu subset into the same cache directory as the other
# subsets (DATA_CACHE_DIR is the only cache-path name this file defines).
ds = load_dataset(
    "HuggingFaceTB/smollm-corpus",
    "python-edu",
    split="train",
    num_proc=num_proc,
    cache_dir=DATA_CACHE_DIR,
)

# download_contents receives each row's blob_id and adds the "text" and
# "download_success" columns.
ds = ds.map(download_contents, input_columns="blob_id", num_proc=num_proc)

# Filter out failed downloads
ds = ds.filter(lambda x: x['download_success'])

# Optionally, print the first example to verify the data
# (skipped when every download failed and the dataset is empty).
if len(ds) > 0:
    print(ds[0])

dataset_objs.append(ds)

## Setup


In [None]:
import time
import math
from functools import partial
import concurrent.futures as cf

import numpy as np
from transformers import AutoTokenizer

# tiktoken alternative, currently disabled:
#import tiktoken
#enc = tiktoken.get_encoding("gpt2")
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-360M")

# os.cpu_count() returns None when the count cannot be determined; fall back
# to a single worker so the division below stays valid.
num_cpus = os.cpu_count() or 1
print(f"""
sytem statistics:
-----------------
cpu count: {num_cpus}""")

# Total number of documents across every loaded dataset.
total_docs = sum(len(dataset_obj) for dataset_obj in dataset_objs)
# Even split of the documents over the available CPUs, rounded up.
docs_per_cpu = math.ceil(total_docs / num_cpus)
print(f"""
dataset statistics
------------------
documents: {total_docs:,}
docs_per_cpu: {docs_per_cpu:,}""")


## Dummy Preprocessing Operation

In [None]:
def count_tokens(dataset, _tokenizer, idx):
    """Return how many tokens ``_tokenizer.encode`` yields for one document.

    Args:
        dataset: Indexable collection whose items have a ``'text'`` entry.
        _tokenizer: Object exposing ``encode(text)`` that returns a sized
            sequence of tokens.
        idx: Position of the document inside ``dataset``.
    """
    text = dataset[idx]['text']
    return len(_tokenizer.encode(text))


# Dry run: tokenize every document in parallel and only count the tokens.
# Executor.map() rejects a chunksize below 1, which docs_per_cpu // 100
# produces for small corpora, so clamp it.
chunksize = max(1, docs_per_cpu // 100)

with cf.ProcessPoolExecutor(max_workers=num_cpus) as ex:
    start = time.time()
    documents = 0
    tokens = 0

    for dataset in dataset_objs:
        # Bind the dataset and tokenizer so workers only receive an index.
        f = partial(count_tokens, dataset, tokenizer)
        for result in ex.map(f, range(len(dataset)), chunksize=chunksize):
            documents += 1
            tokens += result
            if documents % 1000 == 0:
                # "\r" keeps the progress counter on a single console line.
                print(f"processed {documents:,}", end="\r")

    print(f"processed documents in {time.time()-start:0.2f} seconds")
    print(f"total tokens: {tokens:,}")
    print(f"total documents: {documents:,}")
    # Sanity check: every document counted in the setup cell was visited.
    assert documents == total_docs

## Actual Preprocessing Operation

In [None]:
# Length of the token buffer that is filled and written out as one shard.
SHARD_SIZE = 100_000_000
# Sub-directory of DATA_CACHE_DIR that receives the shard files.
output_dir = "processed"
os.makedirs(os.path.join(DATA_CACHE_DIR, output_dir), exist_ok=True)

def write_shard(shard, shard_idx):
    """Save one shard under ``DATA_CACHE_DIR/output_dir`` with ``np.savez``.

    Shards whose index is a multiple of 100 (index 0 included) are labelled
    ``valid``; every other shard is labelled ``train``. The label and the
    index are both part of the file name.

    Args:
        shard: Array of tokens to store.
        shard_idx: Running index of the shard.
    """
    split = "valid" if shard_idx % 100 == 0 else "train"
    file_name = f"smol_lm_corpus_{split}_{shard_idx}"
    np.savez(os.path.join(DATA_CACHE_DIR, output_dir, file_name), shard)

def tokenize(dataset, encoder, idx):
    """Encode one document and prepend the end-of-text token id.

    Args:
        dataset: Indexable collection whose items have a ``'text'`` entry.
        encoder: Tokenizer exposing ``encode(text)`` that returns a list of
            ids. The end-of-text id is read from a ``_special_tokens``
            mapping when the encoder has one, and otherwise obtained through
            ``convert_tokens_to_ids`` (the Hugging Face tokenizer API).
        idx: Position of the document inside ``dataset``.

    Returns:
        ``[eot_id] + token_ids`` for the document.
    """
    special_tokens = getattr(encoder, "_special_tokens", None)
    if special_tokens is not None:
        eot = special_tokens['<|endoftext|>']
    else:
        eot = encoder.convert_tokens_to_ids('<|endoftext|>')
    return [eot] + encoder.encode(dataset[idx]['text'])


# Executor.map() rejects a chunksize below 1, which docs_per_cpu // 100
# produces for small corpora, so clamp it.
chunksize = max(1, docs_per_cpu // 100)

with cf.ProcessPoolExecutor(max_workers=num_cpus) as ex:
    start = time.time()

    docs_processed = 0
    shards_written = 0
    tokens_generated = 0
    shard_token_count = 0  # number of valid tokens currently in `shard`

    # Reusable buffer; only shard[:shard_token_count] holds meaningful data.
    # NOTE(review): uint16 assumes every token id is below 65536 -- confirm
    # against the tokenizer's vocabulary size.
    shard = np.empty((SHARD_SIZE,), dtype=np.uint16)

    # Walk every dataset so the total_docs check at the end can hold; the
    # shard buffer carries over from one dataset to the next.
    for dataset in dataset_objs:
        # `tokenizer` is the only encoder this file defines.
        f = partial(tokenize, dataset, tokenizer)

        for tokens in ex.map(f, range(len(dataset)), chunksize=chunksize):
            docs_processed += 1
            tokens_generated += len(tokens)

            if docs_processed % 10_000 == 0:
                print(f"processed {docs_processed:,} documents | generated {tokens_generated:,} tokens | wrote {shards_written} shards", end="\r")

            # Flush as many full shards as this document completes; a loop
            # (rather than a single flush) copes with documents longer than
            # the free space plus one whole shard.
            offset = 0
            while shard_token_count + (len(tokens) - offset) >= SHARD_SIZE:
                remainder = SHARD_SIZE - shard_token_count
                shard[shard_token_count:] = tokens[offset:offset + remainder]
                write_shard(shard, shards_written)
                shards_written += 1
                offset += remainder
                shard_token_count = 0

            # Whatever is left fits in the current shard without filling it.
            leftover = len(tokens) - offset
            if leftover:
                shard[shard_token_count:shard_token_count + leftover] = tokens[offset:]
                shard_token_count += leftover

    # Write the final, partially filled shard. Only the filled prefix is
    # saved so stale tokens from the previous shard are not written again.
    if shard_token_count > 0:
        write_shard(shard[:shard_token_count], shards_written)
        shards_written += 1
    print(f"processed {docs_processed:,} documents | generated {tokens_generated:,} tokens | wrote {shards_written} shards", end="\r")
    print(f"finished in {time.time()-start:.2f} seconds")
    # Sanity check: every document counted in the setup cell was visited.
    assert docs_processed == total_docs
    print(f"total shards written: {shards_written:,}")
    print(f"total tokens: {tokens_generated:,}")