# Create Tiny Smoke Dataset

This notebook creates **tiny subsets** of the main dataset for fast smoke testing and validation.

## Features

- **Random sampling**: Each dataset is drawn by random sampling from the samples whose text is no longer than `max_text_length_chars`, and dataset N uses N as its random seed
- **Seed-based organization**: All datasets are organized by seed (seed0, seed1, seed2, ...)
- **Configurable count**: Number of datasets created is controlled by `config/test/dataset_creation.yaml`
- **Edge case detection**: Helps catch issues like UTF-8 surrogates, special characters, and data format problems

## Outputs

- Random datasets: `../dataset_tiny/seed{N}/{train.json, validation.json}` (N = 0, 1, 2, ..., num_datasets-1)
  - Each seed uses random sampling with its respective seed value
- Data config: `../config/data/resume_tiny.yaml`

After running this, you can point `01_orchestrate_training.ipynb` at `resume_tiny.yaml` for fast orchestration tests.


In [1]:
from pathlib import Path
import json
import yaml
import random
from typing import List, Dict, Any
import sys

def find_project_root(start: Path) -> Path:
    """
    Locate the project root by searching upward for the dataset-creation config.

    Args:
        start: Directory to start searching from (normally the notebook's
            working directory).

    Returns:
        The nearest directory (``start`` itself or any of its ancestors) that
        contains ``config/test/dataset_creation.yaml``. When no such directory
        exists, falls back to the parent of ``start`` if ``start`` is named
        ``tests``, and to ``start`` itself otherwise.
    """
    marker = Path("config") / "test" / "dataset_creation.yaml"
    # Walk every ancestor, not just the immediate parent, so the notebook
    # also works when launched from a nested sub-directory.
    for candidate in (start, *start.parents):
        if (candidate / marker).exists():
            return candidate
    return start.parent if start.name == "tests" else start


# Auto-detect project root directory
notebook_dir = Path.cwd()
root_dir = find_project_root(notebook_dir)

# Add src to path to import shared utilities
if str(root_dir / "src") not in sys.path:
    sys.path.insert(0, str(root_dir / "src"))

from shared.yaml_utils import load_yaml

# All settings for this notebook come from one central YAML file.
config_path = root_dir.joinpath("config", "test", "dataset_creation.yaml")
config = load_yaml(config_path)

# Input: raw data location (configured paths are joined onto the project root).
RAW_DATA_DIR = root_dir.joinpath(config["raw_data_dir"])
RAW_TRAIN_FILE = RAW_DATA_DIR.joinpath(config["raw_train_file"])

# Output: destination directory plus the size/count knobs for the tiny datasets.
BASE_DATASET_DIR = root_dir.joinpath(config["base_dataset_dir"])
NUM_TRAIN_SAMPLES = config["num_train_samples"]
NUM_VAL_SAMPLES = config["num_val_samples"]
MAX_TEXT_LENGTH_CHARS = config["max_text_length_chars"]
NUM_DATASETS = config["num_datasets"]

# Data-config template to copy from, and where the generated copy is written.
BASE_CONFIG_PATH = root_dir.joinpath(config["base_config_path"])
TINY_CONFIG_PATH = root_dir.joinpath(config["tiny_config_path"])

# Metadata fields written into the generated data config.
DATASET_NAME = config["dataset_name"]
DATASET_VERSION = config["dataset_version"]
DATASET_DESCRIPTION = config["dataset_description"]

# Echo the resolved settings so a wrong root or config is obvious at a glance.
print(f"Loaded configuration from: {config_path}")
print(f"Raw train file: {RAW_TRAIN_FILE.resolve()}")
print(f"Base dataset directory: {BASE_DATASET_DIR.resolve()}")
print(f"Will create {NUM_DATASETS} random datasets (seed0, seed1, ..., seed{NUM_DATASETS - 1})")


Loaded configuration from: c:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\config\test\dataset_creation.yaml
Raw train file: C:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\dataset\train.json
Base dataset directory: C:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\dataset_tiny
Will create 5 random datasets (seed0, seed1, ..., seed4)


In [2]:
def load_full_dataset(train_file_path: Path) -> List[Dict[str, Any]]:
    """
    Read all samples from the raw training JSON file.

    Args:
        train_file_path: Location of the train.json file to read.

    Returns:
        The decoded top-level JSON list of samples.

    Raises:
        FileNotFoundError: When nothing exists at ``train_file_path``.
        ValueError: When the decoded JSON is not a list, or is an empty list.
    """
    if not train_file_path.exists():
        raise FileNotFoundError(f"Train file not found: {train_file_path}")

    # Decode as UTF-8 explicitly instead of relying on the platform default.
    raw_json = train_file_path.read_text(encoding="utf-8")
    samples = json.loads(raw_json)

    if not (isinstance(samples, list) and samples):
        raise ValueError("Expected train.json to be a non-empty list of samples")

    return samples


def filter_short_samples(
    samples: List[Dict[str, Any]],
    max_length: int
) -> List[Dict[str, Any]]:
    """
    Keep only the samples whose text fits within a character budget.

    A sample with no "text" key is treated as having empty text; a sample
    whose "text" value is not a string is dropped.

    Args:
        samples: Sample dictionaries to filter.
        max_length: Largest allowed text length, in characters (inclusive).

    Returns:
        A new list holding the qualifying samples in their original order.
    """
    def has_short_text(candidate: Dict[str, Any]) -> bool:
        body = candidate.get("text", "")
        return isinstance(body, str) and len(body) <= max_length

    return [candidate for candidate in samples if has_short_text(candidate)]


def save_dataset_split(
    train_samples: List[Dict[str, Any]],
    val_samples: List[Dict[str, Any]],
    output_dir: Path
) -> None:
    """
    Write a train/validation split to ``train.json`` and ``validation.json``.

    The output directory (and any missing parents) is created first. Both
    files are UTF-8 encoded, indented JSON with non-ASCII characters written
    as-is rather than escaped.

    Args:
        train_samples: Samples to store in ``train.json``.
        val_samples: Samples to store in ``validation.json``.
        output_dir: Directory that receives both files.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # Train is written before validation, one file at a time.
    splits = (
        ("train.json", train_samples),
        ("validation.json", val_samples),
    )
    for file_name, split_samples in splits:
        with (output_dir / file_name).open("w", encoding="utf-8") as handle:
            json.dump(split_samples, handle, ensure_ascii=False, indent=2)


# Load the raw data once, then keep only samples short enough for smoke runs.
full_dataset = load_full_dataset(RAW_TRAIN_FILE)
short_samples = filter_short_samples(full_dataset, MAX_TEXT_LENGTH_CHARS)

# One split needs train + validation samples drawn from the filtered pool,
# so fail early if the pool cannot supply that many.
required_samples = NUM_TRAIN_SAMPLES + NUM_VAL_SAMPLES
if required_samples > len(short_samples):
    raise ValueError(
        f"Not enough short samples (<= {MAX_TEXT_LENGTH_CHARS} chars). "
        f"Found {len(short_samples)}, need at least {required_samples}."
    )

print(f"Loaded {len(full_dataset)} total samples")
print(f"Filtered to {len(short_samples)} samples with text <= {MAX_TEXT_LENGTH_CHARS} chars")


Loaded 5960 total samples
Filtered to 1836 samples with text <= 2500 chars


In [3]:
def create_random_dataset(
    short_samples: List[Dict[str, Any]],
    seed: int,
    output_dir: Path,
    num_train: int,
    num_val: int
) -> None:
    """
    Create a randomly sampled dataset with a specific random seed.

    Draws ``num_train + num_val`` distinct samples, uses the first
    ``num_train`` as the training split and the rest as the validation split,
    writes both via ``save_dataset_split`` and prints a short summary.

    Args:
        short_samples: List of filtered short samples to draw from.
        seed: Random seed for reproducibility.
        output_dir: Directory to save the dataset.
        num_train: Number of training samples.
        num_val: Number of validation samples.

    Raises:
        ValueError: If ``num_train + num_val`` exceeds ``len(short_samples)``
            (raised by the sampling call).
    """
    # A private generator yields the same draw as seeding the module-level
    # RNG, but does not reset the global random state for unrelated code.
    rng = random.Random(seed)
    sampled = rng.sample(short_samples, num_train + num_val)

    train_samples = sampled[:num_train]
    val_samples = sampled[num_train:]

    save_dataset_split(train_samples, val_samples, output_dir)

    print(f"Created random dataset (seed={seed}) in {output_dir}")
    print(f"  Train: {len(train_samples)} samples")
    print(f"  Validation: {len(val_samples)} samples")


# Build one dataset per seed; the config decides how many seeds there are.
for seed in range(NUM_DATASETS):
    dataset_dir = BASE_DATASET_DIR / f"seed{seed}"
    create_random_dataset(
        short_samples=short_samples,
        seed=seed,
        output_dir=dataset_dir,
        num_train=NUM_TRAIN_SAMPLES,
        num_val=NUM_VAL_SAMPLES,
    )


Created random dataset (seed=0) in c:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\dataset_tiny\seed0
  Train: 8 samples
  Validation: 2 samples
Created random dataset (seed=1) in c:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\dataset_tiny\seed1
  Train: 8 samples
  Validation: 2 samples
Created random dataset (seed=2) in c:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\dataset_tiny\seed2
  Train: 8 samples
  Validation: 2 samples
Created random dataset (seed=3) in c:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\dataset_tiny\seed3
  Train: 8 samples
  Validation: 2 samples
Created random dataset (seed=4) in c:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\dataset_tiny\seed4
  Train: 8 samples
  Validation: 2 samples


In [4]:
# All datasets are now created in cell [3] above using the seed-based approach.
# This cell is kept for backward compatibility but is no longer needed.
# Every seed-based dataset, including seed0, is a random sample that is reproducible for its seed.


In [5]:
def create_data_config(
    base_config_path: Path,
    output_config_path: Path,
    dataset_name: str,
    dataset_version: str,
    description: str
) -> None:
    """
    Create a data config YAML by copying and modifying a base config.

    The base config is loaded, its ``name``, ``version`` and ``description``
    keys are overwritten, and the result is written to ``output_config_path``
    with the original key order preserved.

    Args:
        base_config_path: Path to base data config.
        output_config_path: Path to save the new config. Missing parent
            directories are created.
        dataset_name: Name for the new dataset.
        dataset_version: Version string for the dataset.
        description: Description of the dataset.

    Raises:
        FileNotFoundError: If base config doesn't exist.
        ValueError: If the base config is not a YAML mapping (for example,
            an empty file).
    """
    if not base_config_path.exists():
        raise FileNotFoundError(f"Base data config not found: {base_config_path}")

    with base_config_path.open("r", encoding="utf-8") as f:
        data_config = yaml.safe_load(f)

    # safe_load returns None for an empty file and may return a list or a
    # scalar; fail with a clear message instead of a TypeError on assignment.
    if not isinstance(data_config, dict):
        raise ValueError(
            f"Expected base data config to be a YAML mapping: {base_config_path}"
        )

    data_config["name"] = dataset_name
    data_config["version"] = dataset_version
    data_config["description"] = description

    # Create the destination directory first, as save_dataset_split does.
    output_config_path.parent.mkdir(parents=True, exist_ok=True)

    with output_config_path.open("w", encoding="utf-8") as f:
        yaml.safe_dump(data_config, f, sort_keys=False)


# The paths and metadata passed below were loaded from the central config in cell [1].
create_data_config(
    base_config_path=BASE_CONFIG_PATH,
    output_config_path=TINY_CONFIG_PATH,
    dataset_name=DATASET_NAME,
    dataset_version=DATASET_VERSION,
    description=DATASET_DESCRIPTION,
)

print(f"Created data config: {TINY_CONFIG_PATH.resolve()}")


Created data config: C:\Users\HOANG PHI LONG DANG\repos\resume-ner-azureml\config\data\resume_tiny.yaml
