In [None]:
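NOTE: After you click the "Generate training_config.yaml" button in the last cell:

* `config/training_config.yaml` is created (written to `../config/` relative to this notebook; an existing file is overwritten)
* The generated config is ready to be validated with `config_loader.py`
* The config is ready to be used to train your dialect-specific LLM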

In [None]:
import json
import yaml
from pathlib import Path
import ipywidgets as widgets
from IPython.display import display

# Project files this notebook reads and writes, relative to the notebook's directory.
output_path = Path("../config/training_config.yaml")
tokenizer_path = Path("../tokenizer/custom_tokenizer.json")
metadata_path = Path("../datasets/sample_dataset_metadata.json")

# Parse the dataset metadata (UTF-8 JSON); later cells use it to pre-fill the form.
meta = json.loads(metadata_path.read_text(encoding="utf-8"))


In [None]:
# --- Identity section of the form, pre-filled from the dataset metadata ---

# Dialects offered by default. A Dropdown rejects a `value` that is not one of
# its `options`, so a dialect recorded in the metadata but missing from this
# list is appended instead of crashing the cell.
dialect_options = ["Eastern", "Central", "Western", "English Phonetics"]
if meta["dialect"] not in dialect_options:
    dialect_options.append(meta["dialect"])

language = widgets.Text(value=meta["language"], description="Language")
dialect = widgets.Dropdown(options=dialect_options, value=meta["dialect"], description="Dialect")
# Text widgets only accept strings; str() keeps a numeric JSON version (e.g. 1.0) from raising.
version = widgets.Text(value=str(meta["version"]), description="Version")

# Default run ID derived from the metadata dialect. Whitespace is collapsed to
# "-" so a dialect such as "English Phonetics" does not put a space in the ID.
default_run_id = "-".join(meta["dialect"].lower().split()) + "-llm-qlora-v1"
run_id = widgets.Text(value=default_run_id, description="Run ID")

tokenizer_path_display = widgets.Text(value=str(tokenizer_path), description="Tokenizer Path", layout=widgets.Layout(width="95%"))

display(language, dialect, version, run_id, tokenizer_path_display)


In [None]:
# --- Dataset location and training hyper-parameters ---
dataset_path = widgets.Text(value="../datasets/kanienkeha_prefixes.jsonl", description="Dataset Path", layout=widgets.Layout(width="95%"))
batch_size = widgets.IntSlider(value=2, min=1, max=16, description="Batch Size")
epochs = widgets.IntSlider(value=3, min=1, max=10, description="Epochs")
# FloatSlider's default readout format is ".2f", which renders every value in
# this range (1e-5 .. 1e-3) as "0.00". Scientific notation with one decimal
# shows each 1e-5 step exactly (e.g. 2.0e-4, 2.1e-4).
learning_rate = widgets.FloatSlider(
    value=2e-4,
    min=1e-5,
    max=1e-3,
    step=1e-5,
    readout_format=".1e",
    description="Learning Rate",
)
gradient_steps = widgets.IntSlider(value=8, min=1, max=64, description="Grad Steps")

display(dataset_path, batch_size, epochs, learning_rate, gradient_steps)


In [None]:
# --- Ethics worksheet location and offline switch ---
# Both values are read by the save callback when the config is generated.
ethics_worksheet = widgets.Text(
    description="Ethics CSV",
    value="../ethics-protocols/ethics_team_input.csv",
    layout=widgets.Layout(width="95%"),
)
offline_mode = widgets.Checkbox(
    description="Allow Offline Mode",
    value=True,
)

display(ethics_worksheet, offline_mode)


In [None]:
# Button that triggers generation of the YAML config; its click handler is attached below.
save_btn = widgets.Button(
    button_style="success",
    description="Generate training_config.yaml",
)

def _slugify(text):
    """Lower-case ``text`` and join its whitespace-separated parts with "_"."""
    return "_".join(text.lower().split())


def on_save_clicked(b):
    """Write the current form values to ``output_path`` as a YAML training config.

    Reads the module-level widgets (``language``, ``dialect``, ``version``,
    ``run_id``, ``tokenizer_path_display``, ``dataset_path``, ``batch_size``,
    ``epochs``, ``learning_rate``, ``gradient_steps``, ``ethics_worksheet``,
    ``offline_mode``) together with ``metadata_path``, merges them with the
    fixed defaults below, and overwrites ``output_path``. The parent directory
    is created when missing.

    Args:
        b: The clicked button, as passed by ``Button.on_click``. Unused.
    """
    # Identifier-safe forms of the free-text fields. A dialect such as
    # "English Phonetics" would otherwise put a space into the dataset name
    # and the model output directory.
    language_slug = _slugify(language.value)
    dialect_slug = _slugify(dialect.value)

    config = {
        "project_name": "mini-indig-llm-kit",
        "language": language.value,
        "dialect": dialect.value,
        "version": version.value,
        "run_id": run_id.value,

        "base_model": {
            "name": "meta-llama/Llama-3-8B",
            "revision": "main",
            "quantization": "4bit",
            "architecture": "llama",
            "tokenizer_path": tokenizer_path_display.value
        },

        "trainer": "qlora",
        "precision": "bf16",
        "use_flash_attention": False,
        "gradient_checkpointing": True,

        "dataset": {
            "name": f"{language_slug}_{dialect_slug}_dataset",
            "path": dataset_path.value,
            "metadata": str(metadata_path),
            "format": "jsonl",
            "text_field": "text",
            "max_seq_length": 2048,
            "shuffle": True,
            "num_workers": 2,
            "tokenization_rules": "../tokenizer/kanienkeha_vocab_rules.yaml"
        },

        "training": {
            "epochs": epochs.value,
            "batch_size": batch_size.value,
            "gradient_accumulation_steps": gradient_steps.value,
            # Slider values can carry binary floating-point noise (e.g.
            # 0.00030000000000000003). The slider moves in 1e-5 steps, so
            # rounding to 8 decimals removes the noise without losing precision.
            "learning_rate": round(learning_rate.value, 8),
            "weight_decay": 0.01,
            "warmup_steps": 20,
            "logging_steps": 10,
            "save_steps": 50,
            "eval_steps": 25,
            "seed": 42
        },

        "output": {
            "output_dir": f"../models/{dialect_slug}_llm",
            "model_card": "../models/generated_model_card.md",
            "save_total_limit": 2,
            "save_strategy": "steps",
            "eval_strategy": "steps"
        },

        "offline": {
            "tokenizer_cache": "../tokenizer/",
            "model_cache": "../models/base/",
            "allow_offline_mode": offline_mode.value
        },

        "evaluation": {
            "enabled": True,
            "notebook": "../notebooks/5_evaluation_metrics.ipynb",
            "metrics": [
                "exact_match",
                "morpheme_accuracy",
                "prefix_precision",
                "custom_speaker_validation"
            ]
        },

        "ethics": {
            "worksheet_path": ethics_worksheet.value,
            "respect_data_sovereignty": True,
            "require_contributor_consent": True,
            "dialect_guidelines_path": "../ethics-protocols/dialect_ethics.md"
        },

        "collaboration": {
            "contributors_file": str(metadata_path),
            "editable_notes": "../ethics-protocols/notes_by_team.csv",
            "session_tracking": "enabled"
        }
    }

    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        # safe_dump emits only standard YAML tags, so the file can be read back
        # with yaml.safe_load. sort_keys=False keeps the section order above;
        # allow_unicode=True writes non-ASCII characters as-is.
        yaml.safe_dump(config, f, allow_unicode=True, sort_keys=False)

    print(f"✅ training_config.yaml written to {output_path}")

# Output produced inside a widget callback is not guaranteed to appear under
# the cell (JupyterLab sends it to the log console). Capturing it in an Output
# widget shown below the button makes the confirmation message, and any
# traceback raised while saving, visible. clear_output=True replaces the
# previous message on each click.
save_output = widgets.Output()
save_btn.on_click(save_output.capture(clear_output=True)(on_save_clicked))
display(save_btn, save_output)
