# Fine-Tune Model with Synthetic Data

This notebook demonstrates training a magnetic distortion classifier using synthetically generated sensor data. This approach eliminates the need for external sensor logs and provides full control over the training data characteristics.

## Process:
1. **Generate Training Data**: Create multiple synthetic datasets with different magnetic distortion levels
2. **Create Spectrograms**: Convert time-series data to spectrograms for audio classification
3. **Prepare Dataset**: Build HuggingFace dataset with proper labels and splitting
4. **Configure Model**: Set up Audio Spectrogram Transformer (AST) for magnetic distortion classification
5. **Train Model**: Fine-tune the model with configurable parameters
6. **Evaluate**: Test the model and save for deployment

## Key Features:
- **Self-contained**: Generates its own training data, so no external sensor logs are required (the Python packages are installed in the first cell)
- **Configurable**: Easy to adjust distortion levels, data amounts, and training parameters
- **Debug Mode**: Fast training for testing (`DEBUG_MODE = True`)
- **Production Ready**: Full training capabilities for deployment
- **Modular**: Uses new modular architecture for maintainability

## Configuration:
- Set `DEBUG_MODE = True` for fast testing with minimal data
- Adjust `NUM_SEQUENCES` to control the amount of training data
- Modify distortion parameters to customize the classification task

In [None]:
# Install required packages
# (IPython `%pip` magics: these lines only work inside a notebook kernel,
# not in a plain Python interpreter.)
%pip install datasets[audio]==3.0.1
%pip install mcap==1.2.1
%pip install torch
%pip install torchaudio
%pip install transformers[torch]==4.46.2
%pip install nstrumenta==0.1.3
%pip install evaluate
%pip install numpy

# Clone repository if in Colab (needed for source files and utilities)
import sys
import os

# Colab is detected by the presence of the `google.colab` module in the
# running kernel; no import of it is attempted here.
if "google.colab" in sys.modules:
    print("🔄 Detected Google Colab - setting up repository...")
    
    # Check if repo is already cloned (relative to the current working
    # directory), so re-running this cell does not clone a second time.
    if not os.path.exists("time-series-classifier"):
        print("📥 Cloning time-series-classifier repository...")
        !git clone https://github.com/nstrumenta/time-series-classifier.git
    else:
        print("✓ Repository already exists")
    
    # Change to repo directory so later cells can resolve `src/` and
    # `scripts/` relative to the working directory.
    %cd time-series-classifier
    print(f"✓ Working directory: {os.getcwd()}")
else:
    # Outside Colab nothing is cloned; the notebook is assumed to already
    # live inside a checkout of the repository.
    print("🖥️ Detected local environment")

In [None]:
import sys
import os

# Setup paths for both Colab and local environments
def setup_environment():
    """Locate the project's ``src`` and ``scripts`` directories and put them on ``sys.path``.

    In Colab the two directories are taken directly from the current working
    directory. Locally, the current directory and then each of its ancestors
    is searched for a directory that contains *both* ``src`` and ``scripts``;
    the first match wins. If nothing matches, the paths fall back to
    ``<cwd>/src`` and ``<cwd>/scripts`` (which may not exist).

    Only paths that exist and are not already on ``sys.path`` are appended.

    Returns:
        tuple[str, str]: ``(src_dir, scripts_dir)``.
    """
    current_dir = os.getcwd()

    if "google.colab" in sys.modules:
        # In Colab, we should be in the repo directory after the clone cell
        src_dir = os.path.join(current_dir, "src")
        scripts_dir = os.path.join(current_dir, "scripts")
    else:
        # Local development - walk upwards until the repo root is found.
        src_dir = None
        scripts_dir = None

        check_dir = os.path.abspath(current_dir)
        while True:
            potential_src = os.path.join(check_dir, "src")
            potential_scripts = os.path.join(check_dir, "scripts")
            if os.path.isdir(potential_src) and os.path.isdir(potential_scripts):
                src_dir = potential_src
                scripts_dir = potential_scripts
                break

            parent_dir = os.path.dirname(check_dir)
            if parent_dir == check_dir:
                # Reached the filesystem root without a match.
                break
            check_dir = parent_dir

        if not src_dir:
            # Fallback to current directory structure
            src_dir = os.path.abspath(os.path.join(current_dir, "src"))
            scripts_dir = os.path.abspath(os.path.join(current_dir, "scripts"))

    # Add paths to sys.path (skipping missing directories and duplicates)
    for path in [src_dir, scripts_dir]:
        if os.path.exists(path) and path not in sys.path:
            sys.path.append(path)
            print(f"✓ Added to path: {path}")

    return src_dir, scripts_dir

# Setup environment: resolve the project directories and register them on
# sys.path before the project imports below are attempted.
src_dir, scripts_dir = setup_environment()

# Import utilities with fallback handling
def import_with_fallback():
    """Import the project helpers, degrading step by step when one is missing.

    Three independent lookups are performed, each reporting its outcome on
    stdout and in the returned status dict:

    1. ``script_utils.init_script_environment`` supplies the Nstrumenta
       client; if that import fails, a client is built by hand from the
       ``NSTRUMENTA_API_KEY`` environment variable (read from Colab user
       data first when running in Colab).
    2. ``create_dataset`` / ``spectrogram_from_timeseries`` come from
       ``mcap_utils``, else from the legacy ``mcap_utilities`` module, else
       both are ``None``.
    3. ``SyntheticDataGenerator`` comes from ``synthetic``, else ``None``.

    Returns:
        tuple: ``(nst_client, imports, create_dataset,
        spectrogram_from_timeseries, SyntheticDataGenerator)`` where
        ``imports`` maps ``'script_utils'``, ``'mcap_utils'`` and
        ``'synthetic'`` to the outcome of each lookup.
    """
    client = None
    status = {}

    # --- 1. Nstrumenta client -------------------------------------------
    try:
        from script_utils import init_script_environment
        # The first element of the returned pair is not needed here.
        _, client = init_script_environment()
        status['script_utils'] = True
        print("✓ Using new script_utils environment setup")

    except ImportError:
        print("⚠️ script_utils not available, using manual setup...")
        status['script_utils'] = False

        from nstrumenta import NstrumentaClient

        if "google.colab" in sys.modules:
            # Copy the Colab secret into the environment so the lookup
            # below is the same in both environments.
            from google.colab import userdata
            os.environ["NSTRUMENTA_API_KEY"] = userdata.get("NSTRUMENTA_API_KEY")

        client = NstrumentaClient(os.getenv("NSTRUMENTA_API_KEY"))

        # Connection check is informational only; failure does not abort.
        try:
            print(f"✓ Connected to project: {client.get_project()}")
        except Exception as e:
            print(f"⚠️ Could not verify project connection: {e}")

    # --- 2. MCAP helpers -------------------------------------------------
    try:
        from mcap_utils import create_dataset as dataset_builder
        from mcap_utils import spectrogram_from_timeseries as spectrogram_builder
        status['mcap_utils'] = 'modular'
        print("✓ Using new modular mcap_utils")
    except ImportError:
        try:
            import mcap_utilities as legacy_mcap
            status['mcap_utils'] = 'legacy'
            dataset_builder = legacy_mcap.create_dataset
            spectrogram_builder = legacy_mcap.spectrogram_from_timeseries
            print("⚠️ Using legacy mcap_utilities")
        except ImportError:
            status['mcap_utils'] = 'failed'
            dataset_builder = None
            spectrogram_builder = None
            print("❌ Could not import MCAP utilities")

    # --- 3. Synthetic data generator --------------------------------------
    try:
        from synthetic import SyntheticDataGenerator as generator_cls
        status['synthetic'] = True
        print("✓ Synthetic data generator available")
    except ImportError:
        status['synthetic'] = False
        generator_cls = None
        print("⚠️ Synthetic data generator not available")

    return client, status, dataset_builder, spectrogram_builder, generator_cls

# Run the import probe once and keep its results as notebook-level names
(
    nst_client,
    imports,
    create_dataset,
    spectrogram_from_timeseries,
    SyntheticDataGenerator,
) = import_with_fallback()

# Report what was found so a failed import is visible before training starts
print("\n📋 Environment Summary:")
print(f"  - Script utils: {'✓' if imports.get('script_utils') else '⚠️'}")
print(f"  - MCAP utils: {imports.get('mcap_utils', 'failed')}")
print(f"  - Synthetic: {'✓' if imports.get('synthetic') else '⚠️'}")
print(f"  - Working directory: {os.getcwd()}")
# Counts sys.path entries whose text contains 'src' or 'scripts'
print(f"  - Python path includes: {sum(1 for p in sys.path if 'src' in p or 'scripts' in p)} project directories")

In [None]:
# Configuration for synthetic data generation and training
import uuid

# Master switch: True selects the small, fast preset below
DEBUG_MODE = True  # Set to False for full training

# Synthetic data generation configuration
NUM_SEQUENCES = 8 if not DEBUG_MODE else 2  # Number of synthetic sequences to generate
SEQUENCE_DURATION = 60.0  # Duration of each sequence in seconds
SAMPLE_RATE = 100  # Sample rate for synthetic data

# Each preset assigns, in order: max_steps, num_train_epochs,
# per_device_train_batch_size, per_device_eval_batch_size, eval_steps.
if not DEBUG_MODE:
    model_id = "MAG_DIST_SYNTHETIC"
    (
        max_steps,
        num_train_epochs,
        per_device_train_batch_size,
        per_device_eval_batch_size,
        eval_steps,
    ) = (300, 3, 2, 4, 50)
    print("🚀 FULL TRAINING MODE")
    print("Using full parameters for production training")
else:
    model_id = "MAG_DIST_DEBUG"
    (
        max_steps,
        num_train_epochs,
        per_device_train_batch_size,
        per_device_eval_batch_size,
        eval_steps,
    ) = (10, 1, 1, 1, 5)
    print("🐛 DEBUG MODE ENABLED")
    print("Using reduced parameters for fast testing")

# Short random tag (first 8 hex digits of a UUID4) so every run gets its
# own workspace folder
session_id = uuid.uuid4().hex[:8]
working_folder = f"./temp/{model_id}_{session_id}"

print(f"🎯 Training Session: {session_id}")
print(f"📊 Will generate {NUM_SEQUENCES} synthetic sequences")
print(f"⏱️  Each sequence: {SEQUENCE_DURATION} seconds")
print(f"📈 Sample rate: {SAMPLE_RATE} Hz")

# Working directory management with cross-platform support
def setup_working_directory_portable(path):
    """Create ``path`` if needed, make it the current directory, and return it.

    Args:
        path: Directory to use as the workspace; may be relative.

    Returns:
        str: The absolute form of ``path``.
    """
    workspace = os.path.abspath(path)
    # exist_ok lets the same workspace be reused across repeated calls
    os.makedirs(workspace, exist_ok=True)
    os.chdir(workspace)
    print(f"✓ Working directory set to: {os.getcwd()}")
    return workspace

def reset_to_initial_directory():
    """Return the process to its starting directory before a workspace is created.

    Outside Colab this defers to ``script_utils.reset_to_initial_cwd`` when
    the notebook-level ``imports`` dict reports ``script_utils`` as
    available; otherwise (including when ``imports`` is not defined or the
    import fails) it only prints the current directory.

    In Colab it climbs from the current directory until it finds a directory
    containing both ``.git`` and ``src`` and changes into it; if the climb
    reaches ``/`` the working directory is left untouched.
    """
    if "google.colab" not in sys.modules:
        # Local environment
        try:
            if imports.get('script_utils'):
                from script_utils import reset_to_initial_cwd
                reset_to_initial_cwd()
            else:
                print(f"Current working directory: {os.getcwd()}")
        except (ImportError, NameError):
            print(f"Current working directory: {os.getcwd()}")
        return

    # Colab: only the first two indicators are actually required
    repo_indicators = [".git", "src", "scripts", "notebooks"]
    required_markers = repo_indicators[:2]

    def looks_like_repo_root(directory):
        return all(
            os.path.exists(os.path.join(directory, marker))
            for marker in required_markers
        )

    candidate = os.getcwd()
    while candidate != "/" and not looks_like_repo_root(candidate):
        candidate = os.path.dirname(candidate)

    if candidate == "/":
        print("⚠️ Could not find repo root, staying in current directory")
    else:
        os.chdir(candidate)
        print(f"✓ Reset to repo root: {os.getcwd()}")

# Go back to the starting directory first so the relative workspace path
# is not resolved inside a previous run's folder
reset_to_initial_directory()

# Enter the per-session workspace; the portable helper is used whenever
# script_utils is unavailable or fails to import
try:
    if not imports.get('script_utils'):
        setup_working_directory_portable(working_folder)
    else:
        from script_utils import setup_working_directory
        setup_working_directory(working_folder)
except (ImportError, NameError):
    setup_working_directory_portable(working_folder)

# Echo the effective configuration for this run
print(
    f"📁 Training workspace: {working_folder}",
    f"🤖 Model ID: {model_id}",
    "📊 Training parameters:",
    f"  - Max steps: {max_steps}",
    f"  - Epochs: {num_train_epochs}",
    f"  - Train batch size: {per_device_train_batch_size}",
    f"  - Eval batch size: {per_device_eval_batch_size}",
    sep="\n",
)

In [None]:
from transformers import ASTFeatureExtractor
import os
import json

# Feature extractor of the pretrained AST checkpoint; it is passed to the
# spectrogram step further down
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)
print("✓ Feature extractor loaded")

# Both MCAP helpers are mandatory: stop now with a clear message rather
# than failing later with "'NoneType' object is not callable"
for helper_name, helper in (
    ("create_dataset", create_dataset),
    ("spectrogram_from_timeseries", spectrogram_from_timeseries),
):
    if helper is None:
        print(f"❌ {helper_name} function not available")
        print("This is required for training. Please check module imports.")
        raise ImportError("Required MCAP utilities not available")

# Generate synthetic training data
print("🔬 Generating synthetic training datasets...")
print("=" * 50)

# Build the generator, retrying the import if the earlier probe found nothing
try:
    if SyntheticDataGenerator is None:
        from synthetic import SyntheticDataGenerator
    generator = SyntheticDataGenerator()
    print("✓ Synthetic data generator initialized")
except Exception as e:
    print(f"❌ Could not initialize synthetic data generator: {e}")
    raise

def _diagonal_calibration(scale):
    """Return a zero-bias calibration whose 3x3 matrix has ``scale`` on the diagonal."""
    return {
        "bias": dict.fromkeys(("x", "y", "z"), 0.0),
        "matrix": [
            [scale if row == col else 0.0 for col in range(3)]
            for row in range(3)
        ],
    }


# Plan template shared by every sequence; "segments" is filled in per sequence
base_plan = {
    "initialization": {
        "pose": {
            "origin": {"lat": 38.446, "lng": -122.687, "height": 0.0},
            "position": {"x": 0.0, "y": 0.0, "z": 0.0},
            "rotation": {"w": 1.0, "x": 0.0, "y": 0.0, "z": 0.0},
        },
        "start_time_ns": 0,
        "sample_rate": SAMPLE_RATE,
        "mag": {"calibration": _diagonal_calibration(0.00053)},
        "acc": {"calibration": _diagonal_calibration(1.0)},
        "gyro": {"calibration": _diagonal_calibration(1.0)},
    },
    "segments": [],
}

# Distortion classes and the numeric value handed to the generator for each
distortion_configs = [
    {"level": level_name, "value": level_value}
    for level_name, level_value in (("none", 0.0), ("low", 1.0), ("high", 2.5))
]

# Rotation settings (roll, pitch, yaw) cycled through by the sequences
motion_patterns = [
    dict(zip(("roll", "pitch", "yaw"), angles))
    for angles in (
        (0, 0, 0),       # Stationary
        (45, 0, 0),      # Roll motion
        (0, 30, 0),      # Pitch motion
        (0, 0, 60),      # Yaw motion
        (30, 30, 30),    # Complex motion
    )
]

# Generate synthetic datasets
#
# For every sequence index this builds a one-segment plan, asks the
# generator for the raw MCAP file and its labels, converts the MCAP file to
# a spectrogram, and records the (spectrogram, labels) pair for the dataset
# step. A failure in one sequence is reported and skipped so the remaining
# sequences are still produced; the cell only aborts if nothing succeeded.
import copy  # stdlib; imported here so this cell does not depend on another cell's imports

file_pairs = []
generated_files = []

for seq_idx in range(NUM_SEQUENCES):
    # Cycle through distortion levels and motion patterns
    distortion = distortion_configs[seq_idx % len(distortion_configs)]
    motion = motion_patterns[seq_idx % len(motion_patterns)]

    # Deep-copy the template: a shallow copy would share the nested
    # "initialization" dicts between all sequences, so any in-place change
    # made while generating one sequence would leak into the next.
    plan = copy.deepcopy(base_plan)
    plan["segments"] = [
        {
            "name": f"segment_{seq_idx}_{distortion['level']}",
            "duration_s": SEQUENCE_DURATION,
            # Copy so the shared motion_patterns entry is never handed out
            "rotation_rpy_degrees": dict(motion),
            "magnetic_distortion": distortion["value"],
            "mag_distortion": {"level": distortion["level"]}
        }
    ]

    # Generate filenames (relative to the current working directory)
    sequence_name = f"training_sequence_{seq_idx:02d}_{distortion['level']}"
    mcap_file = f"{sequence_name}.mcap"
    labels_file = f"{sequence_name}.labels.json"
    spectrogram_file = f"{sequence_name}.spectrogram.mcap"

    print(f"📊 Generating sequence {seq_idx+1}/{NUM_SEQUENCES}: {sequence_name}")
    print(f"  - Distortion: {distortion['level']} ({distortion['value']})")
    print(f"  - Motion: roll={motion['roll']}°, pitch={motion['pitch']}°, yaw={motion['yaw']}°")

    try:
        # Generate synthetic data
        generator.generate(plan_data=plan, output_file=mcap_file, verbose=False)

        # Generate labels
        generator.generate_labels(plan, labels_file)

        # Create spectrogram
        print("  🔄 Creating spectrogram...")
        spectrogram_from_timeseries(mcap_file, spectrogram_file, feature_extractor=feature_extractor)

        # Only register the sequence when all three artifacts exist
        if all(os.path.exists(f) for f in [mcap_file, labels_file, spectrogram_file]):
            file_pairs.append([spectrogram_file, labels_file])
            generated_files.extend([mcap_file, labels_file, spectrogram_file])

            # Show file sizes for verification
            mcap_size = os.path.getsize(mcap_file)
            spec_size = os.path.getsize(spectrogram_file)
            print(f"  ✓ Generated: MCAP={mcap_size:,}B, Spectrogram={spec_size:,}B")
        else:
            print(f"  ❌ Failed to generate all files for {sequence_name}")

    except Exception as e:
        # Deliberate best-effort: report and move on to the next sequence
        print(f"  ❌ Error generating {sequence_name}: {e}")

print(f"\n✅ Generated {len(file_pairs)} training sequences successfully!")
print(f"📁 Total files created: {len(generated_files)}")

if len(file_pairs) == 0:
    print("❌ No training data was generated!")
    print("🔧 Check the synthetic data generator setup and try again")
    raise ValueError("No training data available")

# Show what was generated
#
# Tallies the distortion level recorded on every labelled event across all
# generated label files, then reports the total duration of usable data.
from collections import Counter  # stdlib; local to keep the cell self-contained

print("\n📋 Training Data Summary:")
distortion_counts = Counter()
for spectrogram_file, labels_file in file_pairs:
    # Read the labels to see what distortion levels we have
    with open(labels_file, 'r', encoding="utf-8") as f:
        labels_data = json.load(f)

    for event in labels_data.get('events', []):
        # `or {}` tolerates an explicit `"metadata": null`
        level = (event.get('metadata') or {}).get('mag_distortion', 'unknown')
        # The plan passes mag_distortion as {"level": ...}; if the label file
        # carries that dict through, unwrap it — a dict is unhashable and
        # could not be used as a counter key.
        if isinstance(level, dict):
            level = level.get('level', 'unknown')
        # str() keeps the keys sortable even if levels of mixed types appear
        distortion_counts[str(level)] += 1

print("Distortion level distribution:")
for level, count in sorted(distortion_counts.items()):
    print(f"  - {level}: {count} sequences")

total_duration = len(file_pairs) * SEQUENCE_DURATION
print(f"Total training data: {total_duration} seconds ({total_duration/60:.1f} minutes)")

In [None]:
from transformers import ASTFeatureExtractor

# Define which pretrained model to use and instantiate its feature extractor.
# NOTE(review): this assigns the same checkpoint name and loads the same
# extractor as the data-generation cell above; presumably kept so the cells
# below can run after a kernel restart without regenerating data — confirm.
pretrained_model = "MIT/ast-finetuned-audioset-10-10-0.4593"
feature_extractor = ASTFeatureExtractor.from_pretrained(pretrained_model)

In [None]:
# Build the HuggingFace dataset from the (spectrogram, labels) pairs
print("🔄 Creating dataset from generated synthetic data...")

# Both numeric and textual label spellings collapse onto the three class names
label_aliases = {
    "0": "none",    # Map numeric labels to descriptive names
    "1": "low",
    "2": "high",
    "none": "none",  # Already correct
    "low": "low",    # Already correct
    "high": "high",  # Already correct
}

dataset = create_dataset(
    file_pairs=file_pairs,
    use_unlabeled_sections=False,  # Use only labeled events for cleaner training
    unlabeled_section_label="none",  # Default for any unlabeled sections
    aggregate_labels=True,
    aggregate_label_dict=label_aliases,
)

print("✓ Dataset created from synthetic data")

# Persist the dataset; debug runs go to a separate folder
dataset_dir = "dataset_debug" if DEBUG_MODE else "dataset"
dataset.save_to_disk(dataset_dir)
print(f"✓ Dataset saved to: {dataset_dir}")

# Display dataset information: size, feature names, per-class sample counts
# and a coarse class-balance verdict.
from collections import Counter  # stdlib; local to keep the cell self-contained

print("\n📊 Dataset Information:")
print(f"  - Total samples: {len(dataset)}")
print(f"  - Features: {list(dataset.features.keys())}")

# Show label distribution
if 'labels' in dataset.features:
    label_feature = dataset.features['labels']
    label_names = label_feature.names
    print(f"  - Classes ({len(label_names)}): {label_names}")

    # Count samples per class
    if len(dataset) > 0:
        # Read only the label column: iterating whole rows would decode
        # every feature of every sample just to look at one integer.
        # int() normalises the ids in case the column yields non-int scalars.
        label_counts = Counter(
            label_names[int(label_id)] for label_id in dataset['labels']
        )

        print("  - Label distribution:")
        total_samples = len(dataset)
        for label, count in sorted(label_counts.items()):
            percentage = (count / total_samples) * 100
            print(f"    - {label}: {count} samples ({percentage:.1f}%)")

        # Check for class balance (smallest class relative to the largest;
        # classes with zero samples are not part of label_counts)
        max_count = max(label_counts.values())
        min_count = min(label_counts.values())
        balance_ratio = min_count / max_count if max_count > 0 else 0

        if balance_ratio > 0.8:
            print(f"  ✅ Well-balanced dataset (ratio: {balance_ratio:.2f})")
        elif balance_ratio > 0.5:
            print(f"  ✓ Reasonably balanced dataset (ratio: {balance_ratio:.2f})")
        else:
            print(f"  ⚠️ Imbalanced dataset (ratio: {balance_ratio:.2f}) - consider generating more data")

# Describe the first sample: array-like values by shape/dtype, the rest by type and value
if len(dataset) > 0:
    sample = dataset[0]
    print("\n📋 Sample Data Structure:")
    for field_name, field_value in sample.items():
        if not hasattr(field_value, 'shape'):
            print(f"  - {field_name}: {type(field_value)} = {field_value}")
        else:
            print(f"  - {field_name}: shape {field_value.shape}, dtype {field_value.dtype}")

if not DEBUG_MODE:
    print("\n🚀 Production dataset ready for full training")
else:
    print("\n🐛 Debug mode: Using reduced dataset for fast testing")
    print("   For production training, set DEBUG_MODE = False")

# Advisory size check: warns, but never stops the notebook
min_samples_needed = 10
if len(dataset) >= min_samples_needed:
    print(f"✅ Dataset size is sufficient for training ({len(dataset)} samples)")
else:
    print(f"⚠️ Warning: Only {len(dataset)} samples available")
    print(f"   Recommended minimum: {min_samples_needed} samples")
    print("   Consider increasing NUM_SEQUENCES or SEQUENCE_DURATION")

In [None]:
from transformers import ASTConfig, ASTForAudioClassification, TrainingArguments, Trainer
import torch
import numpy as np
import evaluate

print("🤖 Configuring model...")

# Load configuration from the pretrained model
config = ASTConfig.from_pretrained(pretrained_model)

# Access the ClassLabel feature for the labels.
# Once the split step below has run, `dataset` is a DatasetDict (a dict of
# splits) and has no `.features` of its own; read the schema from the train
# split in that case so this cell can be re-run without an AttributeError.
label_source = dataset["train"] if isinstance(dataset, dict) else dataset
label_feature = label_source.features["labels"]
label_names = label_feature.names

print(f"✓ Label names: {label_names}")

# Update model configuration for our specific task: class count plus the
# name<->index mappings stored alongside the model
config.num_labels = len(label_names)
config.label2id = {label: i for i, label in enumerate(label_names)}
config.id2label = {i: label for label, i in config.label2id.items()}

print(f"✓ Model configured for {config.num_labels} classes")
print(f"  - Label mapping: {config.label2id}")

# Split training data if test split doesn't exist.
# A DatasetDict is a dict of splits, so the membership test is only
# meaningful for that type; on a plain Dataset, `"test" in dataset` would
# fall back to iterating (and decoding) every row.
if not isinstance(dataset, dict) or "test" not in dataset:
    print("🔄 Splitting dataset into train/test...")
    try:
        dataset = dataset.train_test_split(
            test_size=0.2, shuffle=True, seed=42, stratify_by_column="labels"
        )
    except ValueError as split_error:
        # Stratification can be rejected for very small datasets (for
        # example a class with a single sample); a random split keeps
        # debug runs going, at the cost of possibly uneven class coverage.
        print(f"⚠️ Stratified split failed ({split_error}); falling back to a random split")
        dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
    print("✓ Dataset split:")
    print(f"  - Training samples: {len(dataset['train'])}")
    print(f"  - Test samples: {len(dataset['test'])}")

# Initialize model.
# `config.num_labels` was changed to match this task, so the classifier head
# no longer has the shape stored in the checkpoint. ignore_mismatched_sizes
# lets from_pretrained load the matching weights and freshly initialise the
# head instead of raising a size-mismatch error.
model = ASTForAudioClassification.from_pretrained(
    pretrained_model,
    config=config,
    ignore_mismatched_sizes=True,
)
print("✓ Model initialized with custom configuration")

# Training arguments - use the configuration from earlier.
# Evaluation and logging happen every `eval_steps`; checkpoints every second
# evaluation, which keeps save_steps a multiple of eval_steps as required
# when load_best_model_at_end is enabled.
# NOTE: per the TrainingArguments documentation a positive max_steps takes
# precedence over num_train_epochs.
training_args = TrainingArguments(
    output_dir=f"./runs{'_debug' if DEBUG_MODE else ''}",
    eval_strategy="steps",
    eval_steps=eval_steps,
    logging_steps=eval_steps,
    save_steps=eval_steps * 2,
    max_steps=max_steps,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    warmup_ratio=0.1,
    logging_dir=f"./logs{'_debug' if DEBUG_MODE else ''}",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    push_to_hub=False,
    remove_unused_columns=False,
    dataloader_num_workers=0,  # Avoid multiprocessing issues in notebooks
)

print("✓ Training arguments configured")
print(f"  - Output directory: {training_args.output_dir}")
print(f"  - Logging directory: {training_args.logging_dir}")
print(f"  - Max steps: {training_args.max_steps}")
print(f"  - Batch sizes: train={training_args.per_device_train_batch_size}, eval={training_args.per_device_eval_batch_size}")

# Accuracy metric object, loaded once and reused by every evaluation pass
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn raw model outputs into the accuracy metric.

    Args:
        eval_pred: A pair ``(predictions, labels)``; the predicted class is
            the argmax of ``predictions`` along axis 1.

    Returns:
        Whatever ``accuracy_metric.compute`` returns for the predicted
        class ids against ``labels``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)


print("✓ Metrics configured (accuracy)")

In [None]:
# Wire model, arguments, data splits and metric function into a Trainer
print("🏃 Initializing trainer...")

trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": dataset["train"],
    "eval_dataset": dataset["test"],
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

print("✓ Trainer initialized")

# Run the fine-tuning loop
print("🚀 Starting training...")
print(f"📊 Training on {len(dataset['train'])} samples, evaluating on {len(dataset['test'])} samples")

print(
    "🐛 Debug mode: Training will be quick but not optimal"
    if DEBUG_MODE
    else "⏳ Full training mode: This may take a while..."
)

train_result = trainer.train()

print("✓ Training completed!")
print(f"📈 Final training loss: {train_result.training_loss:.4f}")

# One more evaluation pass on the test split for the headline number
print("📊 Running final evaluation...")
eval_result = trainer.evaluate()
print("✓ Evaluation completed!")
print(f"📊 Final accuracy: {eval_result['eval_accuracy']:.4f}")

import tarfile

# Write model weights and feature-extractor config into one folder;
# debug runs use a separate folder
model_dir = "model_debug" if DEBUG_MODE else "model"
print(f"💾 Saving model to {model_dir}...")

for artifact in (model, feature_extractor):
    artifact.save_pretrained(model_dir)

print(f"✓ Model saved to {model_dir}/")

# Pack the folder into a gzip tarball; inside the archive the folder is
# always named "model", whichever mode produced it
model_tar_filename = f"{model_id}.model.tar.gz"

print(f"📦 Creating model archive: {model_tar_filename}")
with tarfile.open(model_tar_filename, "w:gz") as archive:
    archive.add(model_dir, arcname="model")

print(f"✓ Model archive created: {model_tar_filename}")

# Upload model archive.
# `upload_with_prefix` is not defined or imported anywhere in the visible
# notebook, so probe for it explicitly. Only the name lookup is guarded:
# wrapping the whole upload call in `except NameError` would mistake a
# NameError raised inside the helper for "helper missing" and upload again.
print("📤 Uploading model...")
try:
    uploader = upload_with_prefix
except NameError:
    uploader = None

if uploader is not None:
    uploader(nst_client, model_tar_filename, "", overwrite=True)
    print("✓ Model uploaded successfully")
else:
    # Fallback upload straight through the client, using the archive's
    # file name as the remote name
    nst_client.upload(model_tar_filename, model_tar_filename, overwrite=True)
    print("⚠️ Model uploaded using legacy method")

# Closing report: what was trained, on how much data, and where it was written
print("🎉 Fine-tuning pipeline completed successfully!")
print(
    "\n📋 Summary:",
    f"  - Model ID: {model_id}",
    f"  - Training samples: {len(dataset['train'])}",
    f"  - Test samples: {len(dataset['test'])}",
    f"  - Final accuracy: {eval_result['eval_accuracy']:.4f}",
    f"  - Classes: {label_names}",
    f"  - Model saved: {model_dir}/",
    f"  - Archive: {model_tar_filename}",
    sep="\n",
)