In [None]:
import os
import sys

# Treat the parent of the current working directory as the project root.
project_root = os.path.dirname(os.getcwd())

# Append the project's `src` directory to the module search path so the
# imports in the following cells can be resolved from it.
sys.path.append("/".join((project_root, "src")))

In [None]:
from pathlib import Path
import shutil

import ray

from config import gpt2_nano_cfg as cfg 
from preprocessor.datasource_processor import DatasourceProcessor
from preprocessor.chunk_processor import ChunkProcessor
from preprocessor.token_processor import TokenProcessor

In [None]:
def generate_chunk_tokn_ids(cfg, source_file_paths, split):
    """Run the source files through the preprocessing pipeline and write Parquet.

    The pipeline is built with Ray Data in three stages, each limited to one
    CPU and a concurrency of one:

    1. ``map`` with a ``DatasourceProcessor`` configured for the TEXT format.
    2. ``map`` with the tokenizer whose class is resolved through
       ``TokenProcessor.create`` from ``cfg['ray_data']['tokenizer_class']``.
    3. ``flat_map`` with a ``ChunkProcessor`` built from
       ``cfg["model"]["block_size"]`` and ``cfg["model"]["stride"]``.

    The result is written as Parquet to
    ``cfg["dataset"]["chunked_tokens"] / split``. If that directory already
    exists it is deleted first, so each call replaces the previous output.

    Args:
        cfg: Nested configuration mapping; the keys read are listed above.
        source_file_paths: Items handed to ``ray.data.from_items``
            (presumably paths of text files -- confirm against
            ``DatasourceProcessor``).
        split: Name of the sub-directory to write to (e.g. ``"train"``).

    Returns:
        None. The output is the Parquet data written to disk.
    """
    # Resource settings shared by every stage of the pipeline.
    stage_opts = {"num_cpus": 1, "concurrency": 1}

    # Stage 1: wrap the inputs in a dataset and apply the datasource processor.
    paths_ds = ray.data.from_items(source_file_paths)
    source_reader = DatasourceProcessor(source_format=DatasourceProcessor.TEXT)
    loaded_ds = paths_ds.map(source_reader, **stage_opts)

    # Stage 2: build the configured tokenizer and apply it to each record.
    tokenizer_cfg = cfg['ray_data']['tokenizer_class']
    tokenizer = TokenProcessor.create(tokenizer_cfg['name'])(**tokenizer_cfg['args'])
    tokenized_ds = loaded_ds.map(tokenizer, **stage_opts)

    # Stage 3: chunk the tokenized records; flat_map lets one input record
    # yield several output records.
    model_cfg = cfg["model"]
    chunker = ChunkProcessor(
        block_size=model_cfg["block_size"],
        stride=model_cfg["stride"],
    )
    chunks_ds = tokenized_ds.flat_map(chunker, **stage_opts)

    target_path = Path(cfg["dataset"]["chunked_tokens"]) / split

    # Clear any previous output so stale files are not left next to new ones.
    if not os.path.exists(target_path):
        print(f"The directory {target_path} does not exist.")
    else:
        shutil.rmtree(target_path)
        print(f"The directory {target_path} has been removed.")

    chunks_ds.write_parquet(target_path, concurrency=1)


In [None]:
source_path = Path(cfg["dataset"]["path"])
# Sort the matches: glob yields files in a filesystem-dependent order, so an
# unsorted list would make the train/validate split differ between runs or
# machines.
source_file_paths = sorted(source_path.glob("*.txt"))

# Do not call this variable `len`: rebinding the built-in makes this cell fail
# with "TypeError: 'int' object is not callable" when it is re-run, and breaks
# every later `len(...)` call in the kernel.
num_source_files = len(source_file_paths)

train_ratio = cfg["ray_data"]["train_ratio"]

# The leading `train_ratio` fraction of the files is used for training and the
# remainder for validation; the boundary index is computed once so both
# slices are guaranteed to agree.
split_index = int(num_source_files * train_ratio)
train_file_paths = source_file_paths[:split_index]
validate_file_paths = source_file_paths[split_index:]


In [None]:
# Build the chunked-token Parquet output for the training split ...
generate_chunk_tokn_ids(cfg, source_file_paths=train_file_paths, split="train")
# ... and then for the validation split.
generate_chunk_tokn_ids(cfg, source_file_paths=validate_file_paths, split="validate")