# Data Collection Pipeline for Preference-to-Cost ML Model

This notebook collects 200 training samples by:
1. Sampling random preferences
2. Translating to cost parameters
3. Sending to ODL API
4. Collecting route results

**Estimated time**: ~5 hours (200 samples × 90 seconds each)

In [1]:
import copy
import json
import os
import time
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests

print("✓ Imports successful")

✓ Imports successful


In [2]:
import json

# The full source model lives two directories above this notebook.
with open('../../test_model_mathias_fixed.json', 'r') as f:
    model = json.load(f)

# Shrink the problem to the first 25 jobs and first 8 vehicles.
# Whole job entries are sliced, so each dropped job goes away together with
# all of its stops (original note: both pickup and delivery).
model['data'].update(
    jobs=model['data']['jobs'][:25],
    vehicles=model['data']['vehicles'][:8],
)

# Persist the reduced model next to the notebook; later cells load this file.
with open('test_model_25jobs_8vehicles.json', 'w') as f:
    json.dump(model, f, indent=2)

## Configuration

**⚠️ CHANGE THESE VALUES:**

In [3]:
# API Configuration
# Endpoint and credentials are read from environment variables so that secrets
# never get stored in the notebook file. The placeholder defaults for
# USERNAME / PASSWORD are the values the initialisation cell checks for.
BASE_URL = os.environ.get("ODL_BASE_URL", "https://optimizer-0.staging.zenderatms.com")
USERNAME = os.environ.get("ODL_USERNAME", "your-username")
PASSWORD = os.environ.get("ODL_PASSWORD", "your-password")

# Model file (25 jobs, 8 vehicles)
MODEL_PATH = "test_model_25jobs_8vehicles.json"

# Data collection settings
N_SAMPLES = 200
OUTPUT_FILE = "training_data.json"
CHECKPOINT_EVERY = 10  # Save every 10 samples
WAIT_SECONDS = 90  # Wait time for optimization

# Credentials are deliberately not echoed.
print(f"Configuration:")
print(f"  Base URL: {BASE_URL}")
print(f"  Model: {MODEL_PATH}")
print(f"  Samples to collect: {N_SAMPLES}")
print(f"  Output: {OUTPUT_FILE}")

Configuration:
  Base URL: https://optimizer-0.staging.zenderatms.com
  Model: test_model_25jobs_8vehicles.json
  Samples to collect: 200
  Output: training_data.json


## Data Collector Class

In [None]:
class PreferenceCostDataCollector:
    """Collects training data for preference-to-cost translation.

    Each sample is produced by drawing random preference weights, mapping them
    to cost parameters with a fixed heuristic, uploading a copy of the base
    model with those costs to the ODL API, waiting for the optimiser, and
    summarising the returned plan. Successful samples are appended to
    ``collected_data`` by the caller and persisted with ``save_data``.
    """

    def __init__(self, base_url, username, password, model_path, seed=None):
        """Initialize data collector.

        Args:
            base_url: Root URL of the ODL API; a trailing slash is stripped.
            username: User name for HTTP basic authentication.
            password: Password for HTTP basic authentication.
            model_path: Path to the JSON base model that every sample modifies.
            seed: Optional seed for the preference sampler. Pass a value to make
                the sequence of sampled preferences reproducible.
        """
        self.base_url = base_url.rstrip('/')
        self.username = username
        self.password = password

        # Dedicated generator so sampling does not depend on global NumPy state
        self.rng = np.random.default_rng(seed)

        # Initialize session
        self.session = requests.Session()
        self.session.auth = (username, password)
        self.session.headers.update({
            'Content-Type': 'application/json',
            'Accept': 'application/json'
        })

        # Load base model
        with open(model_path, 'r') as f:
            self.base_model = json.load(f)

        print(f"✓ Base model loaded")
        print(f"  Jobs: {len(self.base_model['data']['jobs'])}")
        print(f"  Vehicles: {len(self.base_model['data']['vehicles'])}")

        # Data storage
        self.collected_data = []

    def sample_preferences(self):
        """Sample random preference values.

        Returns:
            Dict with ``parking_importance``, ``time_importance`` and
            ``distance_importance``, each an independent uniform draw from
            [0, 1) stored as a plain ``float`` so it is JSON-serialisable.
        """
        return {
            'parking_importance': float(self.rng.uniform(0.0, 1.0)),
            'time_importance': float(self.rng.uniform(0.0, 1.0)),
            'distance_importance': float(self.rng.uniform(0.0, 1.0))
        }

    def preferences_to_costs(self, preferences):
        """Translate preferences to cost parameters using heuristic.

        Args:
            preferences: Dict with the three ``*_importance`` keys produced by
                ``sample_preferences``.

        Returns:
            Dict with ``costPerTravelHour`` (3 decimals), ``costPerKm``
            (4 decimals) and ``parking_multiplier`` (2 decimals).
        """
        # Baseline costs
        baseline_cost_per_hour = 1.0
        baseline_cost_per_km = 0.028
        baseline_parking_mult = 1.0

        # Map preferences to costs
        # High time_importance -> increase costPerTravelHour (range: 0.5 to 5.0)
        cost_per_travel_hour = baseline_cost_per_hour * (0.5 + preferences['time_importance'] * 4.5)

        # High distance_importance -> increase costPerKm (range: 0.014 to 0.49)
        cost_per_km = baseline_cost_per_km * (0.5 + preferences['distance_importance'] * 17.0)

        # High parking_importance -> increase parking multiplier (range: 0.2 to 5.0)
        parking_multiplier = baseline_parking_mult * (0.2 + preferences['parking_importance'] * 4.8)

        return {
            'costPerTravelHour': round(cost_per_travel_hour, 3),
            'costPerKm': round(cost_per_km, 4),
            'parking_multiplier': round(parking_multiplier, 2)
        }

    # watch out for parking cost on pickup stops
    def apply_costs_to_model(self, costs):
        """Apply cost parameters to model.

        Works on a deep copy, so ``self.base_model`` is never modified.

        Args:
            costs: Dict as returned by ``preferences_to_costs``.

        Returns:
            A new model dict in which every vehicle that has a ``definition``
            carries the given travel costs and every stop parking cost is
            scaled by ``parking_multiplier``.
        """
        model = copy.deepcopy(self.base_model)

        # Apply vehicle costs
        for vehicle in model['data']['vehicles']:
            if 'definition' in vehicle:
                vehicle['definition']['costPerTravelHour'] = costs['costPerTravelHour']
                vehicle['definition']['costPerKm'] = costs['costPerKm']

        # Apply parking multiplier
        for job in model['data']['jobs']:
            for stop in job.get('stops') or []:
                parking = stop.get('parking')
                # Skip stops without parking info, including an explicit JSON null
                if isinstance(parking, dict) and 'cost' in parking:
                    parking['cost'] = parking['cost'] * costs['parking_multiplier']

        return model

    def send_to_odl(self, model, model_id):
        """Send model to ODL API.

        Issues ``PUT {base_url}/models/{model_id}`` with the model as JSON.

        Returns:
            True when the server answers 200, 201 or 204; False on any other
            status or on any exception (the problem is printed, not raised).
        """
        url = f"{self.base_url}/models/{model_id}"

        try:
            response = self.session.put(url, json=model, timeout=60)

            if response.status_code in [200, 201, 204]:
                return True
            else:
                print(f"  ✗ PUT failed: {response.status_code}")
                return False

        except Exception as e:
            print(f"  ✗ Error: {e}")
            return False

    def get_plan(self, model_id, wait_seconds=90):
        """Get optimized plan from ODL.

        Sleeps ``wait_seconds`` to give the optimiser time to run, then issues
        ``GET {base_url}/models/{model_id}/optimiserstate/plan``.

        Returns:
            The decoded JSON plan on HTTP 200, otherwise None (non-200 status
            or any exception, which is printed rather than raised).
        """
        # Wait for optimization
        time.sleep(wait_seconds)

        url = f"{self.base_url}/models/{model_id}/optimiserstate/plan"

        try:
            response = self.session.get(url, timeout=30)

            if response.status_code == 200:
                return response.json()
            else:
                print(f"  ✗ GET failed: {response.status_code}")
                return None

        except Exception as e:
            print(f"  ✗ Error: {e}")
            return None

    def delete_model(self, model_id):
        """Delete model from server (best effort).

        Ordinary errors are reported and ignored because cleanup must never
        abort data collection. ``KeyboardInterrupt`` is deliberately NOT
        caught, so a kernel interrupt during cleanup still stops the run.
        """
        url = f"{self.base_url}/models/{model_id}"
        try:
            self.session.delete(url, timeout=10)
        except Exception as e:
            print(f"  ⚠ Could not delete model {model_id}: {e}")

    def extract_route_features(self, plan):
        """Extract features from optimized routes.

        Args:
            plan: Plan dict as returned by ``get_plan``. Missing or null
                ``vehiclePlans`` / ``unplannedJobs`` are treated as empty.

        Returns:
            Dict with ``total_distance_km``, ``total_travel_hours``,
            ``total_cost``, ``total_stops``, ``vehicles_used`` (vehicles with
            at least one planned stop) and ``unplanned_jobs``.
        """
        vehicle_plans = plan.get('vehiclePlans') or []

        # Aggregate metrics
        total_distance_m = 0
        total_travel_seconds = 0
        total_cost = 0
        total_stops = 0
        vehicles_used = 0

        for vehicle in vehicle_plans:
            time_stats = vehicle.get('timeStatistics') or {}
            total_distance_m += time_stats.get('travelMetres', 0)
            total_travel_seconds += time_stats.get('travelSeconds', 0)
            total_cost += time_stats.get('cost', 0)

            n_stops = len(vehicle.get('plannedStops') or [])
            total_stops += n_stops
            if n_stops > 0:
                vehicles_used += 1

        return {
            'total_distance_km': total_distance_m / 1000,
            'total_travel_hours': total_travel_seconds / 3600,
            'total_cost': total_cost,
            'total_stops': total_stops,
            'vehicles_used': vehicles_used,
            'unplanned_jobs': len(plan.get('unplannedJobs') or [])
        }

    def collect_sample(self, sample_num, wait_seconds=90):
        """Collect one training sample.

        Args:
            sample_num: Number used in log output, the model ID and the sample.
            wait_seconds: Seconds to wait for the optimiser before fetching
                the plan.

        Returns:
            Dict with ``sample_num``, ``timestamp``, ``preferences``, ``costs``
            and ``features``, or None when the upload or the plan fetch failed.
        """
        print(f"\n{'='*60}")
        print(f"Sample {sample_num}")
        print(f"{'='*60}")

        # 1. Sample preferences
        preferences = self.sample_preferences()
        print(f"Preferences:")
        print(f"  parking_importance: {preferences['parking_importance']:.3f}")
        print(f"  time_importance: {preferences['time_importance']:.3f}")
        print(f"  distance_importance: {preferences['distance_importance']:.3f}")

        # 2. Translate to costs
        costs = self.preferences_to_costs(preferences)
        print(f"Costs:")
        print(f"  costPerTravelHour: {costs['costPerTravelHour']:.3f}")
        print(f"  costPerKm: {costs['costPerKm']:.4f}")
        print(f"  parking_multiplier: {costs['parking_multiplier']:.2f}")

        # 3. Apply costs to model
        model = self.apply_costs_to_model(costs)

        # 4. Send to ODL (timestamp suffix keeps IDs unique across re-runs)
        model_id = f"sample_{sample_num}_{int(time.time())}"
        print(f"Sending to ODL (ID: {model_id})...")

        if not self.send_to_odl(model, model_id):
            print(f"  ✗ Failed to send model")
            return None

        print(f"  ✓ Model uploaded, waiting {wait_seconds} seconds for optimization...")

        # From here on the model exists on the server: always clean it up,
        # even if the wait is interrupted or feature extraction raises.
        try:
            # 5. Get plan
            plan = self.get_plan(model_id, wait_seconds=wait_seconds)

            if plan is None:
                print(f"  ✗ Failed to get plan")
                return None

            # 6. Extract features
            features = self.extract_route_features(plan)
            print(f"Route features:")
            print(f"  distance: {features['total_distance_km']:.2f} km")
            print(f"  time: {features['total_travel_hours']:.2f} hours")
            print(f"  cost: {features['total_cost']:.2f}")
            print(f"  stops: {features['total_stops']}")
            print(f"  vehicles: {features['vehicles_used']}")

            # 7. Create sample
            return {
                'sample_num': sample_num,
                'timestamp': datetime.now().isoformat(),
                'preferences': preferences,
                'costs': costs,
                'features': features
            }
        finally:
            # 8. Clean up
            self.delete_model(model_id)

    def save_data(self, output_file):
        """Save collected data to JSON file.

        The data is first written to ``<output_file>.tmp`` and then moved over
        the target, so an interrupted write cannot leave a truncated
        checkpoint behind.
        """
        target = Path(output_file)
        tmp_path = target.with_name(target.name + '.tmp')
        with open(tmp_path, 'w') as f:
            json.dump(self.collected_data, f, indent=2)
        tmp_path.replace(target)
        print(f"💾 Saved {len(self.collected_data)} samples to {output_file}")

print("✓ Class defined")

## Initialize Data Collector

In [None]:
# Validate configuration.
# Fail fast: without valid credentials no `collector` exists, and every later
# cell (including the multi-hour collection loop) would only fail with a
# confusing NameError.
if not USERNAME or not PASSWORD or USERNAME == "your-username" or PASSWORD == "your-password":
    raise ValueError("❌ ERROR: Please set your USERNAME and PASSWORD in the configuration cell above!")

# Initialize collector (loads the base model and opens the HTTP session)
collector = PreferenceCostDataCollector(
    base_url=BASE_URL,
    username=USERNAME,
    password=PASSWORD,
    model_path=MODEL_PATH
)
print("\n✓ Collector initialized and ready!")

## Test with Single Sample

Run this first to make sure everything works before collecting all 200 samples.

In [None]:
# Smoke test: push a single sample (numbered 0) through the whole pipeline
# before committing to the long collection run.
test_sample = collector.collect_sample(sample_num=0, wait_seconds=WAIT_SECONDS)

if not test_sample:
    print("\n✗ Test failed - check your configuration")
else:
    print("\n✓ Test successful!")
    print("\nSample data structure:")
    print(json.dumps(test_sample, indent=2))

## Collect Full Dataset

⚠️ **This will take ~5 hours to complete!**

The notebook will:
- Collect 200 samples
- Save checkpoints every 10 samples
- Show progress and time estimates

You can stop the run at any time with Kernel → Interrupt; the samples collected up to that point will still be written to the output file.

In [None]:
# Banner describing the run that is about to start.
print(
    f"\n{'='*60}",
    f"DATA COLLECTION PIPELINE",
    f"{'='*60}",
    f"Target samples: {N_SAMPLES}",
    f"Output file: {OUTPUT_FILE}",
    f"Checkpoint every: {CHECKPOINT_EVERY} samples",
    f"Wait time: {WAIT_SECONDS} seconds per sample",
    f"Estimated total time: {(N_SAMPLES * WAIT_SECONDS) / 3600:.1f} hours",
    f"{'='*60}\n",
    sep="\n",
)

start_time = time.time()

try:
    for i in range(1, N_SAMPLES + 1):
        # A failure in one sample must not end the run; only a kernel
        # interrupt (handled by the outer try) stops the loop early.
        try:
            sample = collector.collect_sample(i, wait_seconds=WAIT_SECONDS)

            if sample is None:
                print(f"✗ Sample {i} failed, skipping...")
            else:
                collector.collected_data.append(sample)
                print(f"✓ Sample {i} collected successfully")

            # Periodic checkpoint plus a progress / ETA report
            if i % CHECKPOINT_EVERY == 0:
                collector.save_data(OUTPUT_FILE)
                elapsed = time.time() - start_time
                avg_time = elapsed / i
                remaining = (N_SAMPLES - i) * avg_time
                print(
                    f"\n{'='*60}",
                    f"CHECKPOINT: {i}/{N_SAMPLES} samples collected",
                    f"Elapsed: {elapsed/60:.1f} minutes",
                    f"Estimated remaining: {remaining/60:.1f} minutes",
                    f"{'='*60}\n",
                    sep="\n",
                )

        except Exception as e:
            print(f"✗ Unexpected error on sample {i}: {e}")

except KeyboardInterrupt:
    print(f"\n\nInterrupted by user!")
    print(f"Collected {len(collector.collected_data)} samples so far")

# Final save: runs after normal completion and after an interrupt alike
collector.save_data(OUTPUT_FILE)

elapsed = time.time() - start_time
print(
    f"\n{'='*60}",
    f"DATA COLLECTION COMPLETE",
    f"{'='*60}",
    f"Total samples collected: {len(collector.collected_data)}",
    f"Total time: {elapsed/60:.1f} minutes",
    sep="\n",
)
# The average is only defined when at least one sample succeeded
if len(collector.collected_data) > 0:
    print(f"Average time per sample: {elapsed/len(collector.collected_data):.1f} seconds")
print(f"Data saved to: {OUTPUT_FILE}")
print(f"{'='*60}\n")

## Analyze Collected Data

Quick analysis of the collected dataset.

In [None]:
# Read back what the collection run wrote to disk
with open(OUTPUT_FILE, 'r') as f:
    data = json.load(f)

print(f"Dataset Summary:")
print(f"  Total samples: {len(data)}")

# Flatten each nested sample into one row: the three sub-dicts become
# columns prefixed with pref_ / cost_ / feat_ (in that order).
df_records = [
    {
        'sample_num': sample['sample_num'],
        **{
            f'{prefix}_{k}': v
            for prefix, section in (('pref', 'preferences'), ('cost', 'costs'), ('feat', 'features'))
            for k, v in sample[section].items()
        },
    }
    for sample in data
]

df = pd.DataFrame(df_records)

print(f"\nDataFrame shape: {df.shape}")
print(f"\nFirst few rows:")
print(df.head())

# One describe() table per column group
for heading, columns in (
    ("\nPreferences statistics:", ['pref_parking_importance', 'pref_time_importance', 'pref_distance_importance']),
    ("\nCosts statistics:", ['cost_costPerTravelHour', 'cost_costPerKm', 'cost_parking_multiplier']),
    ("\nFeatures statistics:", ['feat_total_distance_km', 'feat_total_travel_hours', 'feat_total_cost']),
):
    print(heading)
    print(df[columns].describe())

## Next Steps

Now that you have collected the training data:

1. ✅ **Data collected**: `training_data.json`
2. 🔜 **Train regression model**: preferences → costs
3. 🔜 **Validate model performance**
4. 🔜 **Deploy for real-time preference translation**

The next notebook will focus on training the ML model!