In [1]:
import os
project_root = os.path.dirname(os.getcwd())
import sys
# Add the directory to the Python path
sys.path.append(f"{project_root}/src")

In [2]:
from pathlib import Path
import shutil

import ray

from config import gpt2_nano_cfg as cfg 
from preprocessor.datasource_processor import DatasourceProcessor
from preprocessor.chunk_processor import ChunkProcessor
from preprocessor.token_processor import TokenProcessor

In [3]:
def generate_chunk_tokn_ids(cfg,source_file_paths,split):
    file_path_ds = ray.data.from_items(source_file_paths)
    datasource_processor = DatasourceProcessor(source_format="txt")
    texts_ds = file_path_ds.map(datasource_processor,
                            num_cpus=1,
                            concurrency=1)

    tokenizer_class = TokenProcessor.create(cfg['ray_data']['tokenizer_class']['name'])
    tokenizer_args =  cfg['ray_data']['tokenizer_class']['args']
    tokenizer= tokenizer_class(**tokenizer_args)
    tokens_ds = texts_ds.map(tokenizer,
                               num_cpus=1,
                               concurrency=1)
    block_size = cfg["model"]["block_size"]
    stride = cfg["model"]["stride"]
    chunk_processor = ChunkProcessor(block_size=block_size, stride=stride)
    chunked_tokens_ds = tokens_ds.flat_map(chunk_processor,
                                  num_cpus=1,
                                  concurrency=1)
    target_path = Path(cfg["dataset"]["chunked_tokens"]).joinpath(split)

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
        print(f"The directory {target_path} has been removed.")
    else:
        print(f"The directory {target_path} does not exist.")

    chunked_tokens_ds.write_parquet(target_path,concurrency=1)


In [4]:
source_path = Path(cfg["dataset"]["path"])
source_file_paths = list(source_path.glob("*.txt"))

len = len(source_file_paths)

train_ratio = cfg["ray_data"]["train_ratio"]

train_file_paths = source_file_paths[:int(len*train_ratio)]
validate_file_paths = source_file_paths[int(len*train_ratio):]


In [5]:
validate_file_paths

[PosixPath('/workspaces/CaiZi/dataset/shakespeare_char/input copy 3.txt'),
 PosixPath('/workspaces/CaiZi/dataset/shakespeare_char/input copy 7.txt')]

In [6]:
generate_chunk_tokn_ids(cfg,train_file_paths,"train")
generate_chunk_tokn_ids(cfg,validate_file_paths,"validate")

2024-09-03 04:27:02,506	INFO worker.py:1598 -- Connecting to existing Ray cluster at address: 192.168.2.113:6379...
2024-09-03 04:27:02,510	INFO worker.py:1774 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
2024-09-03 04:27:02,550	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-03_03-09-33_107155_713028/logs/ray-data
2024-09-03 04:27:02,550	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[Map(DatasourceProcessor)->Map(CharTokenizer)->FlatMap(ChunkProcessor)->Write]


The directory /workspaces/CaiZi/dataset/shakespeare_char/chunked_tokens/train has been removed.


Running 0: 0.00 row [00:00, ? row/s]

- Map(DatasourceProcessor)->Map(CharTokenizer)->FlatMap(ChunkProcessor)->Write 1: 0.00 row [00:00, ? row/s]

2024-09-03 04:27:03,668	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-03_03-09-33_107155_713028/logs/ray-data
2024-09-03 04:27:03,668	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[Map(DatasourceProcessor)->Map(CharTokenizer)->FlatMap(ChunkProcessor)->Write]


The directory /workspaces/CaiZi/dataset/shakespeare_char/chunked_tokens/validate does not exist.


Running 0: 0.00 row [00:00, ? row/s]

- Map(DatasourceProcessor)->Map(CharTokenizer)->FlatMap(ChunkProcessor)->Write 1: 0.00 row [00:00, ? row/s]