In [None]:
# Build the notebook-wide `config` dict.
# Option 1: load it from a YAML file if one is present.
# Option 2: otherwise fall back to the defaults defined inline below.
config_path = "config/training_config.yaml"

if Path(config_path).exists():
    # Read as UTF-8 explicitly so non-ASCII values (model names, prompts)
    # load the same way regardless of the platform's default encoding.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty document and can also return a
    # list or scalar. Every later cell indexes config["..."], so fail here
    # with a clear message instead of an opaque TypeError further down.
    if not isinstance(config, dict):
        raise ValueError(
            f"{config_path} must contain a YAML mapping at the top level, "
            f"got {type(config).__name__}"
        )
    print("✅ Loaded configuration from file")
else:
    # Option 2: Define configuration here
    config = {
        "model": {
            "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
        },
        "training": {
            "type": "lora",
            "lora": {
                "rank": 8,
                "alpha": 16,
                "dropout": 0.0,
                "scale": 10.0,
                "num_layers": 8
            }
        },
        "dataset": {
            "path": "data/sample_dataset.jsonl",
            "output_dir": "data",
            "validation_split": 0.2,
            "max_seq_length": 2048,
            "format_type": "chat"
        },
        "hyperparameters": {
            "batch_size": 2,
            "learning_rate": 1e-5,
            "num_iterations": 100,
            "val_batches": 25,
            "steps_per_report": 10,
            "steps_per_eval": 50
        },
        "checkpointing": {
            "save_every": 100,
            "output_dir": "adapters"
        },
        "distributed": {
            "enabled": False
        }
    }
    print("✅ Using default configuration")

# Display configuration
print("\nCurrent Configuration:")
print(json.dumps(config, indent=2))

### Quick Configuration Editor

Modify key parameters easily:

In [None]:
# Quick overrides: uncomment any line below and edit its value to change the
# run without touching the configuration defined earlier.

# config["model"]["name"] = "mistralai/Mistral-7B-Instruct-v0.2"
# config["hyperparameters"]["batch_size"] = 4
# config["hyperparameters"]["num_iterations"] = 1000
# config["distributed"]["enabled"] = True

# Echo the four settings the overrides above can change, in the same order.
# Each value is looked up just before it is printed.
for label, section, key in (
    ("Model:", "model", "name"),
    ("Batch size:", "hyperparameters", "batch_size"),
    ("Iterations:", "hyperparameters", "num_iterations"),
    ("Distributed:", "distributed", "enabled"),
):
    print(label, config[section][key])

## 3. Initialize Components

In [None]:
# Logging first, so anything the components below emit is captured.
logger = setup_logging(log_level="INFO")

# The cache manager reports where models are cached and which are present.
cache_manager = ModelCacheManager()
print(f"Cache directory: {cache_manager.cache_dir}")

# Report the cache contents, one model per line, or say that it is empty.
cached_models = cache_manager.list_cached_models()
if not cached_models:
    print("\nNo cached models found")
else:
    print("\nCached models:")
    for model in cached_models:
        print(f"  - {model}")

## 4. Dataset Preparation

Prepare your dataset for training. You can do any of the following:
1. Use an existing dataset
2. Create a sample dataset for testing
3. Load and format your own data

In [None]:
# Locate the raw dataset named in the config; create a sample one if missing.
dataset_path = Path(config["dataset"]["path"])

if not dataset_path.exists():
    print(f"Dataset not found at {dataset_path}")
    print("Creating sample dataset for testing...")

    # parents=True so a nested dataset path (e.g. data/raw/train.jsonl) works
    # even when none of its parent directories exist yet; without it mkdir
    # raises FileNotFoundError for anything deeper than one level.
    dataset_path.parent.mkdir(parents=True, exist_ok=True)

    # Create sample dataset
    create_sample_dataset(dataset_path, num_examples=200)

# Prepare dataset (format and split). "shuffle" and "seed" are optional config
# keys and default to True / 42 when absent.
print("\nPreparing dataset...")
dataset_info = prepare_dataset(
    file_path=dataset_path,
    output_dir=config["dataset"]["output_dir"],
    val_ratio=config["dataset"]["validation_split"],
    format_type=config["dataset"]["format_type"],
    shuffle=config["dataset"].get("shuffle", True),
    seed=config["dataset"].get("seed", 42)
)

print("\nDataset prepared:")
print(f"  Training examples: {dataset_info['train_size']}")
print(f"  Validation examples: {dataset_info['valid_size']}")

### Explore Dataset

In [None]:
# Read the prepared training split back in and preview its first three records.
train_data = load_jsonl(dataset_info['train'])

print("Sample training examples:")
for example_no, example in enumerate(train_data[:3], start=1):
    print(f"\nExample {example_no}:")

    # Records without a "messages" key are shown as-is.
    if "messages" not in example:
        print(f"  {example}")
        continue

    # Records with "messages": one line per message, content cut to 100 chars.
    for msg in example["messages"]:
        preview = msg['content'][:100]
        print(f"  {msg['role']}: {preview}...")

## 5. Training Setup

Configure training parameters and callbacks.

In [None]:
# Flatten the nested notebook config into the flat dict passed to train_model.
lora_cfg = config["training"]["lora"]
hyper_cfg = config["hyperparameters"]
checkpoint_cfg = config["checkpointing"]
dataset_cfg = config["dataset"]

training_config = dict(
    fine_tune_type=config["training"]["type"],
    num_layers=lora_cfg["num_layers"],
    # The whole LoRA section is passed through unchanged (same dict object).
    lora_parameters=lora_cfg,
    batch_size=hyper_cfg["batch_size"],
    iters=hyper_cfg["num_iterations"],
    learning_rate=hyper_cfg["learning_rate"],
    val_batches=hyper_cfg["val_batches"],
    steps_per_report=hyper_cfg["steps_per_report"],
    steps_per_eval=hyper_cfg["steps_per_eval"],
    save_every=checkpoint_cfg["save_every"],
    adapter_path=checkpoint_cfg["output_dir"],
    max_seq_length=dataset_cfg["max_seq_length"],
    # The "optimization" section is optional; checkpointing defaults to off.
    grad_checkpoint=config.get("optimization", {}).get("gradient_checkpointing", False),
)

print("Training configuration ready!")
for label, key in (
    ("Training type", "fine_tune_type"),
    ("Iterations", "iters"),
    ("Learning rate", "learning_rate"),
):
    print(f"{label}: {training_config[key]}")

## 6. Single-Node Training

Train the model on a single node. This is the simplest way to get started.

In [None]:
# Single-node run: time the train_model call and summarise the outcome.
print("Starting single-node training...")
print("=" * 50)

start_time = time.time()
results = train_model(
    model_name=config["model"]["name"],
    data_path=config["dataset"]["output_dir"],
    training_config=training_config,
    distributed=False
)
training_time = time.time() - start_time

print("\n" + "=" * 50)
print(f"Training completed in {training_time:.2f} seconds")

if not results.get('success'):
    print(f"❌ Training failed: {results.get('error')}")
else:
    print("✅ Training successful!")
    print(f"Adapter saved to: {results.get('adapter_path')}")
    # Final losses are optional in the result dict; print those present.
    for loss_key, loss_label in (
        ('final_train_loss', "Final training loss"),
        ('final_val_loss', "Final validation loss"),
    ):
        if loss_key in results:
            print(f"{loss_label}: {results[loss_key]:.4f}")

### Visualize Training Progress

In [None]:
# Draw loss curves when the run succeeded and recorded training losses.
has_curves = results.get('success') and 'train_losses' in results

if not has_curves:
    print("No training data to visualize")
else:
    fig, ax = plt.subplots(figsize=(10, 6))

    # Each loss series is unpacked as (iteration, loss) pairs into x/y
    # sequences; empty or missing series are skipped.
    if results['train_losses']:
        train_iters, train_losses = zip(*results['train_losses'])
        ax.plot(train_iters, train_losses, 'b-', label='Training Loss', alpha=0.7)

    if results.get('val_losses'):
        val_iters, val_losses = zip(*results['val_losses'])
        ax.plot(val_iters, val_losses, 'r-', label='Validation Loss', alpha=0.7)

    ax.set_xlabel('Iteration')
    ax.set_ylabel('Loss')
    ax.set_title('Training Progress')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

## 7. Distributed Training

For distributed training, you need to:
1. Set up MPI on your nodes
2. Configure the hostfile
3. Run this notebook using `mpirun`

Example command:
```bash
mpirun --hostfile hosts.json -np 2 jupyter notebook distributed_training.ipynb
```

In [None]:
# Query the distributed environment; `size` and `rank` are reused by the
# distributed-training cell that follows.
world, size, rank = setup_distributed()

if size <= 1:
    print("💻 Single node mode")
    print("To use distributed training, run this notebook with mpirun")
else:
    print("🌐 Distributed mode detected!")
    print(f"World size: {size}")
    print(f"Current rank: {rank}")

In [None]:
# Distributed training (only run if you want to test distributed mode).
# Runs only when the config enables it AND more than one process is present.
if config["distributed"]["enabled"] and size > 1:
    print("Starting distributed training...")
    print("=" * 50)

    start_time = time.time()

    # Create distributed callback
    dist_callback = DistributedTrainingCallback(world_size=size, rank=rank)

    # Run distributed training
    dist_results = train_model(
        model_name=config["model"]["name"],
        data_path=config["dataset"]["output_dir"],
        training_config=training_config,
        distributed=True,
        callback=dist_callback
    )

    dist_training_time = time.time() - start_time

    if rank == 0:  # Only print from rank 0
        print("\n" + "=" * 50)
        print(f"Distributed training completed in {dist_training_time:.2f} seconds")

        # `training_time` is set by the single-node training cell. If that
        # cell was skipped, referencing the name directly raises NameError,
        # so look it up defensively and avoid dividing by zero.
        baseline_time = globals().get("training_time")
        if baseline_time is not None and dist_training_time > 0:
            print(f"Speedup: {baseline_time / dist_training_time:.2f}x")
        else:
            print("Speedup: n/a (run the single-node training cell first for a baseline)")
else:
    print("Distributed training not enabled or not in distributed environment")

## 8. Model Evaluation

Test the trained model with some prompts.

In [None]:
# Prompts come from the optional "evaluation" config section; these defaults
# are used when that section or its "test_prompts" key is absent.
default_prompts = [
    "What is machine learning?",
    "Explain quantum computing in simple terms.",
    "How does photosynthesis work?"
]
test_prompts = config.get("evaluation", {}).get("test_prompts", default_prompts)

# Baseline: query the base model (no adapter) with the first prompt only.
print("=== Base Model Responses ===")
first_prompt_only = test_prompts[:1]
for prompt in first_prompt_only:
    print(f"\nPrompt: {prompt}")
    response = evaluate_model(
        test_prompt=prompt,
        model_name=config["model"]["name"],
        max_tokens=50
    )
    print(f"Response: {response}")

In [None]:
# Repeat the first-prompt check with the trained adapter applied, but only
# when the training run reported success.
if not results.get('success'):
    print("No trained model available for evaluation")
else:
    print("\n=== Fine-tuned Model Responses ===")
    adapter_path = results.get('adapter_path')

    for prompt in test_prompts[:1]:  # first prompt only
        print(f"\nPrompt: {prompt}")
        response = evaluate_model(
            test_prompt=prompt,
            model_name=config["model"]["name"],
            adapter_path=adapter_path,
            max_tokens=50
        )
        print(f"Response: {response}")

## 9. Save and Export Results

Save training results and configuration for future reference.

In [None]:
# Persist a JSON summary of a successful run next to the notebook.
if results.get('success'):
    results_file = Path("training_results.json")
    model_name = config["model"]["name"]
    saved_adapter = results.get('adapter_path')

    # Summary record: run metadata plus the full config used for the run.
    save_results = {
        "model": model_name,
        "training_time": training_time,
        "final_train_loss": results.get('final_train_loss'),
        "final_val_loss": results.get('final_val_loss'),
        # Stored as a string whatever type the adapter path has.
        "adapter_path": str(saved_adapter),
        "config": config,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    with results_file.open('w') as f:
        json.dump(save_results, f, indent=2)

    print(f"✅ Results saved to {results_file}")
    print(f"✅ Adapter saved to {saved_adapter}")
    print("\nYou can now use the adapter for inference with:")
    print(f"mlx_lm.generate --model {model_name} --adapter-path {saved_adapter} --prompt 'Your prompt here'")

## 10. Tips and Troubleshooting

### Memory Issues
- Reduce `batch_size` if you run out of memory
- Enable `gradient_checkpointing` in the config
- Use a smaller model or reduce `max_seq_length`

### Distributed Training
- Ensure all nodes have the same code and data
- Check that MPI is properly configured by running `mpirun -np 2 hostname`
- Use Thunderbolt for best performance between Macs

### Dataset Format
- Use the `prepare_dataset` function to format your data
- Ensure your data has the correct keys (instruction, response, etc.)
- Validate your dataset with `validate_dataset`

### Next Steps
1. Try different models from Hugging Face
2. Experiment with hyperparameters
3. Create custom datasets for your use case
4. Scale up with distributed training