In [None]:
import os
import sys

# The parent of the current working directory is treated as the project root.
project_root = os.path.dirname(os.getcwd())

# Make the project's `src` directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run in the same kernel.
src_dir = f"{project_root}/src"
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
from pathlib import Path
import shutil

import ray

from config import gpt2_cfg as cfg 
from preprocessor.datasource_processor import DatasourceProcessor
from preprocessor.chunk_processor import ChunkProcessor
from preprocessor.token_processor import TokenProcessor


In [None]:
# Directory holding the input parquet files, taken from the dataset config.
source_path = Path(cfg["dataset"]["path"])

# Path.glob yields entries in filesystem order, which is not guaranteed to be
# stable; sorting gives the pipeline a reproducible input order across runs.
file_paths = sorted(source_path.glob("*.parquet"))

# Fail fast: with no inputs, a later cell would still delete the existing
# output directory even though there is nothing to write in its place.
if not file_paths:
    raise FileNotFoundError(f"No .parquet files found in {source_path}")

In [None]:
# Wrap the discovered parquet paths in a Ray Dataset so the following stages
# can be applied to them through Ray Data.
file_path_ds = ray.data.from_items(items=file_paths)

In [None]:
# Stage 1 - apply a parquet-configured DatasourceProcessor to every file path.
# (Judging by the result name this yields text rows; see DatasourceProcessor.)
datasource_map_options = {"num_cpus": 2, "concurrency": 2}
datasource_processor = DatasourceProcessor(source_format="parquet")
texts_ds = file_path_ds.map(datasource_processor, **datasource_map_options)

In [None]:

# Stage 2 - build the tokenizer described in the config and map it over texts_ds.
tokenizer_cfg = cfg['ray_data']['tokenizer_class']

# TokenProcessor.create resolves the configured name to a callable, which is
# then instantiated with the configured keyword arguments.
tokenizer_class = TokenProcessor.create(tokenizer_cfg['name'])
tokenizer_args = tokenizer_cfg['args']
tokenizer = tokenizer_class(**tokenizer_args)

tokens_ds = texts_ds.map(tokenizer, num_cpus=4, concurrency=4)

In [None]:
# Stage 3 - run a ChunkProcessor, configured with the model's block_size and
# stride, over tokens_ds. flat_map lets one input row produce several output rows.
model_cfg = cfg["model"]
block_size = model_cfg["block_size"]
stride = model_cfg["stride"]

chunk_processor = ChunkProcessor(block_size=block_size, stride=stride)
chunked_tokens_ds = tokens_ds.flat_map(
    chunk_processor,
    num_cpus=8,
    concurrency=8,
)

In [None]:
# Output root for the chunked tokens. Any previous output found there is
# deleted recursively before the new data is written.
target_path = Path(cfg["dataset"]["chunked_tokens"])

# NOTE(review): this removes the whole output root, while the write below only
# recreates its "validate" sub-directory - confirm no sibling outputs are lost.
if not os.path.exists(target_path):
    print(f"The directory {target_path} does not exist.")
else:
    shutil.rmtree(target_path)
    print(f"The directory {target_path} has been removed.")


In [None]:
# Persist the chunked tokens as parquet files under <target_path>/validate.
# Ray Data evaluates lazily, so this write is what actually runs the stages above.
validate_dir = target_path / "validate"
chunked_tokens_ds.write_parquet(validate_dir, concurrency=8)

