In [None]:
import os
import sys

# The project root is taken to be the parent of the kernel's current working
# directory, so this cell assumes the notebook is run from a directory that
# sits directly under the project root.
project_root = os.path.dirname(os.getcwd())

# Make the modules under <project_root>/src importable by the cells below.
# The membership check keeps the cell idempotent: re-running it does not
# stack duplicate entries onto sys.path.
src_dir = f"{project_root}/src"
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
from pathlib import Path

import ray

from config import gpt2_cfg as cfg 
from datasource_processor import DatasourceProcessor
from text_split_processor import TextSplitProcessor
from chunk_processor import ChunkProcessor
from token_processor import TokenProcessor


In [None]:
# --- Stage 1: load ---------------------------------------------------------
# Build one dataset row per configured source path and run DatasourceProcessor
# over each row.
text_dataset = cfg["text_dataset"]
data_sources = [Path(source_entry["path"]) for source_entry in text_dataset["source"]]
document_paths = ray.data.from_items(data_sources)
texts = document_paths.map(DatasourceProcessor())

# --- Stage 2: split --------------------------------------------------------
# TextSplitProcessor is configured with the train ratio from the ray_data
# section (presumably yielding train/validate portions per row -- TODO confirm
# against TextSplitProcessor).
ray_data_cfg = cfg["ray_data"]
train_ratio = ray_data_cfg["train_ratio"]
text_split_processor = TextSplitProcessor(train_ratio=train_ratio)
texts = texts.map(text_split_processor)

# --- Stage 3: tokenize -----------------------------------------------------
# The tokenizer is looked up by name via TokenProcessor.create and then
# instantiated with the keyword arguments given in the config.
tokenizer_spec = ray_data_cfg["tokenizer_class"]
tokenizer_class = TokenProcessor.create(tokenizer_spec["name"])
tokenizer_args = tokenizer_spec["args"]
tokenizer = tokenizer_class(**tokenizer_args)
tokens = texts.map(tokenizer)

# --- Stage 4: chunk --------------------------------------------------------
# block_size and stride both come from the model section of the config.
model_cfg = cfg["model"]
block_size, stride = model_cfg["block_size"], model_cfg["stride"]
chunk_processor = ChunkProcessor(block_size=block_size, stride=stride)
chunked_tokens = tokens.map(chunk_processor)

# --- Stage 5: persist ------------------------------------------------------
# Write the chunked rows as parquet to the configured "processed" location.
chunked_tokens.write_parquet(text_dataset["processed"])

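# NOTE: disabled code, kept for reference. It gathers the "train" and
# "validate" entries of every row into Python lists and rebuilds Ray datasets
# from them. The `self.` assignments would not run as-is in a notebook cell,
# because no `self` is defined here.
#
# train_chunked_tokens = []
# validate_chunked_tokens = []

# for item in chunked_tokens.iter_rows():
#     train_chunked_tokens.extend(item["train"])
#     validate_chunked_tokens.extend(item["validate"])

# self.train_chunked_tokens = ray.data.from_items(train_chunked_tokens)
# self.validate_chunked_tokens = ray.data.from_items(validate_chunked_tokens)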
