In [1]:
import os
import sys

# The project root is taken to be the parent of the current working
# directory (i.e. the notebook is expected to be launched from a
# sub-directory of the project).
project_root = os.path.dirname(os.getcwd())

# Make the modules under <project_root>/src importable. The membership check
# keeps the cell idempotent: re-running it does not pile up duplicate
# entries on sys.path.
src_dir = f"{project_root}/src"
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import shutil
from pathlib import Path

import ray
from datasets import load_from_disk,load_dataset

from config import gpt2_cfg as cfg
from datasource_processor import DatasourceProcessor
from text_split_processor import TextSplitProcessor
from chunk_processor import ChunkProcessor
from token_processor import TokenProcessor


In [3]:
# Directory of the first configured dataset source.
source_path = Path(cfg["dataset"]["source"][0]["path"])

# Upper bound on how many parquet files this run processes.
MAX_SOURCE_FILES = 20

# Path.glob yields entries in filesystem order, which is not guaranteed to be
# stable between runs or machines. Sorting first makes the selected subset
# reproducible.
file_paths = sorted(source_path.glob("*.parquet"))[:MAX_SOURCE_FILES]

In [4]:
# Build a Ray Dataset with one row per selected source file path; this is the
# input of the processing pipeline assembled in the following cells.
file_path_ds = ray.data.from_items(items=file_paths)

2024-09-02 07:49:33,942	INFO worker.py:1598 -- Connecting to existing Ray cluster at address: 192.168.2.113:6379...
2024-09-02 07:49:33,946	INFO worker.py:1774 -- Connected to Ray cluster. View the dashboard at http://127.0.0.1:8265


In [5]:
# Stage 1: apply the project's DatasourceProcessor (configured for parquet
# input) to every file-path row.
# NOTE(review): the row schema it produces is defined in the project module,
# not in this notebook — confirm there before relying on column names.
datasource_processor = DatasourceProcessor(source_format="parquet")
texts_ds = file_path_ds.map(
    datasource_processor,
    concurrency=16,
    num_cpus=16,
)

In [6]:
# Stage 2: run the project's TextSplitProcessor over each row, parameterised
# by the train ratio read from the config.
# NOTE(review): presumably this splits each row's texts into train/validate
# parts — confirm against TextSplitProcessor.
train_ratio = cfg["ray_data"]["train_ratio"]
text_split_processor = TextSplitProcessor(train_ratio=train_ratio)
texts_split_ds = texts_ds.map(
    text_split_processor,
    concurrency=4,
    num_cpus=4,
)


In [7]:

# Stage 3: tokenize. The tokenizer implementation is chosen by name from the
# config and instantiated with the configured keyword arguments (in the run
# recorded below it resolved to TikTokenizer, per the execution plan log).
tokenizer_cfg = cfg['ray_data']['tokenizer_class']
tokenizer_class = TokenProcessor.create(tokenizer_cfg['name'])
tokenizer_args = tokenizer_cfg['args']
tokenizer = tokenizer_class(**tokenizer_args)
tokens_ds = texts_split_ds.map(
    tokenizer,
    concurrency=4,
    num_cpus=4,
)

In [8]:
# Stage 4: pass each tokenized row through the project's ChunkProcessor,
# configured with the model's block size and stride.
# NOTE(review): presumably this cuts token sequences into block_size windows
# advanced by stride — confirm against ChunkProcessor.
model_cfg = cfg["model"]
block_size = model_cfg["block_size"]
stride = model_cfg["stride"]
chunk_processor = ChunkProcessor(block_size=block_size, stride=stride)
chunked_tokens_ds = tokens_ds.map(
    chunk_processor,
    concurrency=8,
    num_cpus=8,
)

In [9]:
# Output location for the chunked tokens of the first configured source.
target_path = Path(cfg["dataset"]["source"][0]["chunked_tokens"])

# Clear any previous output so stale shards do not mix with this run.
# rmtree also copes with nested directories, which a flat unlink()-per-entry
# pass followed by rmdir() cannot remove. A plain file or a symlink sitting at
# the target path is simply unlinked (rmtree refuses to operate on symlinks).
if target_path.is_dir() and not target_path.is_symlink():
    shutil.rmtree(target_path)
elif target_path.exists() or target_path.is_symlink():
    target_path.unlink()

# Writing is what triggers execution of the whole pipeline (see the
# streaming-executor log emitted by this cell). The path is passed as a
# string, matching write_parquet's documented `path: str` parameter.
chunked_tokens_ds.write_parquet(str(target_path))

# train_chunked_tokens = []
# validate_chunked_tokens = []

# for item in chunked_tokens.iter_rows():
#     train_chunked_tokens.extend(item["train"])
#     validate_chunked_tokens.extend(item["validate"])

#self.train_chunked_tokens = ray.data.from_items(train_chunked_tokens)
#self.validate_chunked_tokens = ray.data.from_items(validate_chunked_tokens)


2024-09-02 07:49:34,602	INFO streaming_executor.py:108 -- Starting execution of Dataset. Full logs are in /tmp/ray/session_2024-09-02_07-07-11_852929_41461/logs/ray-data
2024-09-02 07:49:34,603	INFO streaming_executor.py:109 -- Execution plan of Dataset: InputDataBuffer[Input] -> TaskPoolMapOperator[Map(DatasourceProcessor)] -> TaskPoolMapOperator[Map(TextSplitProcessor)->Map(TikTokenizer)] -> TaskPoolMapOperator[Map(ChunkProcessor)] -> TaskPoolMapOperator[Write]


Running 0: 0.00 row [00:00, ? row/s]

- Map(DatasourceProcessor) 1: 0.00 row [00:00, ? row/s]

- Map(TextSplitProcessor)->Map(TikTokenizer) 2: 0.00 row [00:00, ? row/s]

- Map(ChunkProcessor) 3: 0.00 row [00:00, ? row/s]

- Write 4: 0.00 row [00:00, ? row/s]