# ML Model Training: Preferences → Cost Parameters

Train a regression model to translate user preferences into ODL cost parameters.

## Goal:
- **Input:** User preferences (parking, time, distance importance)
- **Output:** Cost parameters (costPerTravelHour, costPerKm, parking_multiplier)

## Approach:
1. Load training data from Pareto Top-N selection
2. Train multiple regression models
3. Evaluate and compare
4. Save best model for deployment

## 1. Imports

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import pickle

# Plot defaults applied to every figure drawn later in the notebook:
# seaborn's white grid theme and a 12x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Imports successful")

## 2. Configuration

In [None]:
# --- Input ---------------------------------------------------------------
# JSON file holding the training samples; point this at your own data.
TRAINING_DATA_FILE = 'training_data_v4.json'

# --- Outputs -------------------------------------------------------------
# Pickle destinations for the selected model and for the input scaler.
MODEL_SAVE_PATH = 'preference_to_cost_model.pkl'
SCALER_SAVE_PATH = 'preference_scaler.pkl'

# --- Split settings ------------------------------------------------------
# Fraction of samples held out for testing, and the seed shared by the
# split and by every model so that runs are repeatable.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Echo the effective settings so they are recorded in the cell output.
summary_lines = [
    "Configuration:",
    f"  Training data: {TRAINING_DATA_FILE}",
    f"  Test size: {TEST_SIZE*100:.0f}%",
    f"  Model output: {MODEL_SAVE_PATH}",
]
print("\n".join(summary_lines))

## 3. Load and Explore Data

In [None]:
# Load the training samples from disk.
# The encoding is pinned to UTF-8 (the encoding JSON is exchanged in) instead
# of relying on the platform's locale default, which differs between systems.
with open(TRAINING_DATA_FILE, 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Fail early with a readable message: the preview below indexes element 0,
# and every later cell needs at least one sample to work with.
if not training_data:
    raise ValueError(f"No training samples found in {TRAINING_DATA_FILE}")

print(f"Loaded {len(training_data)} training samples")

# Show first sample
print("\nSample structure:")
print(json.dumps(training_data[0], indent=2))

In [None]:
# Flatten every nested sample into one flat row for tabular analysis.
# Each entry names a section of the sample and the fields copied out of it;
# the order here is the column order of the resulting DataFrame.
column_sources = (
    # Inputs
    ('preferences', ('parking_importance', 'time_importance', 'distance_importance')),
    # Outputs
    ('costs', ('costPerTravelHour', 'costPerKm', 'parking_multiplier')),
    # Metadata
    ('metadata', ('pool_id', 'pareto_distance')),
)

data = [
    {
        field: sample[section][field]
        for section, fields in column_sources
        for field in fields
    }
    for sample in training_data
]

df = pd.DataFrame(data)
print(f"\nDataFrame shape: {df.shape}")
df.head()

## 4. Data Analysis

In [None]:
# Summary statistics for the numeric columns.
print("Dataset Statistics:\n")
print(df.describe())

# Output diversity: how many distinct cost triples exist, and how many
# samples map onto each one on average.
cost_columns = ['costPerTravelHour', 'costPerKm', 'parking_multiplier']
unique_costs = df[cost_columns].drop_duplicates()
n_unique = len(unique_costs)
print(f"\nUnique cost combinations: {n_unique}")
print(f"Samples per unique output: {len(df) / n_unique:.1f}")

# value_counts() orders by descending frequency, so the first and last
# entries are the most and least frequent pool ids.
pool_id_counts = df['pool_id'].value_counts()
most_id, most_n = pool_id_counts.index[0], pool_id_counts.iloc[0]
least_id, least_n = pool_id_counts.index[-1], pool_id_counts.iloc[-1]
print(f"\nMost common pool_id: {most_id} ({most_n} times)")
print(f"Least common pool_id: {least_id} ({least_n} times)")

In [None]:
# One histogram per preference input, drawn side by side.
input_panels = [
    ('parking_importance', 'Parking Importance Distribution'),
    ('time_importance', 'Time Importance Distribution'),
    ('distance_importance', 'Distance Importance Distribution'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (column, title) in zip(axes, input_panels):
    ax.hist(df[column], bins=30, edgecolor='black', alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('Importance')

plt.tight_layout()
plt.show()

print("✓ Inputs are well-distributed (Dirichlet sampling)")

In [None]:
# One histogram per cost output; each title also reports how many distinct
# values that output takes in the dataset.
output_panels = [
    ('costPerTravelHour', 'Cost Per Travel Hour', 'Cost ($/hour)', 'orange'),
    ('costPerKm', 'Cost Per Km', 'Cost ($/km)', 'green'),
    ('parking_multiplier', 'Parking Multiplier', 'Multiplier', 'red'),
]

fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (column, label, x_label, colour) in zip(axes, output_panels):
    ax.hist(df[column], bins=20, edgecolor='black', alpha=0.7, color=colour)
    ax.set_title(f'{label}\n({len(df[column].unique())} unique values)')
    ax.set_xlabel(x_label)

plt.tight_layout()
plt.show()

## 5. Prepare Data for Training

In [None]:
# Pull the model inputs (X: preferences) and targets (y: cost parameters)
# out of the DataFrame as plain NumPy arrays.
feature_columns = ['parking_importance', 'time_importance', 'distance_importance']
target_columns = ['costPerTravelHour', 'costPerKm', 'parking_multiplier']

X = df[feature_columns].to_numpy()
y = df[target_columns].to_numpy()

print(f"X shape: {X.shape}  (preferences)")
print(f"y shape: {y.shape}  (cost parameters)")

# Hold out a test portion; the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

n_train, n_test = len(X_train), len(X_test)
print(f"\nTrain set: {n_train} samples")
print(f"Test set:  {n_test} samples")

In [None]:
# Standardize the inputs for the neural network (tree-based models do not
# need it). The scaler learns its statistics from the training split only
# and is then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Data standardized for neural networks")
scaled_mean = X_train_scaled.mean(axis=0)
scaled_std = X_train_scaled.std(axis=0)
print(f"  Mean: {scaled_mean}")
print(f"  Std:  {scaled_std}")

## 6. Train Multiple Models

In [None]:
# Candidate regressors, keyed by the display name used in later cells.
# Every entry is fit on a 3-column target (costPerTravelHour, costPerKm,
# parking_multiplier), so each one must support multi-output regression.
models = {}

# 1. Random Forest: accepts a 2-D target directly.
models['Random Forest'] = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

# 2. Gradient Boosting: GradientBoostingRegressor only accepts a 1-D target
#    and raises ValueError when fit on the 3-column y, so it is wrapped in
#    MultiOutputRegressor, which fits one independent booster per column.
models['Gradient Boosting'] = MultiOutputRegressor(
    GradientBoostingRegressor(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=RANDOM_STATE
    )
)

# 3. Neural Network: accepts a 2-D target directly; the training cell feeds
#    it the standardized inputs.
models['Neural Network'] = MLPRegressor(
    hidden_layer_sizes=(64, 32, 16),
    activation='relu',
    max_iter=1000,
    early_stopping=True,
    random_state=RANDOM_STATE
)

print(f"Training {len(models)} models...\n")

In [None]:
# Fit every candidate, score it on the held-out split, and keep the fitted
# model together with its predictions and metrics under its display name.
output_names = ['costPerTravelHour', 'costPerKm', 'parking_multiplier']
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Only the neural network is trained and evaluated on standardized
    # inputs; every other model sees the raw preferences.
    wants_scaled = name == 'Neural Network'
    fit_inputs = X_train_scaled if wants_scaled else X_train
    eval_inputs = X_test_scaled if wants_scaled else X_test

    model.fit(fit_inputs, y_train)
    y_pred = model.predict(eval_inputs)

    # Per-output error metrics, one column of y at a time.
    metrics = {
        out_name: {
            'MSE': mean_squared_error(y_test[:, col], y_pred[:, col]),
            'MAE': mean_absolute_error(y_test[:, col], y_pred[:, col]),
            'R2': r2_score(y_test[:, col], y_pred[:, col]),
        }
        for col, out_name in enumerate(output_names)
    }

    # A single score over all outputs at once (r2_score aggregates the
    # per-output scores when given 2-D arrays).
    overall_r2 = r2_score(y_test, y_pred)

    results[name] = {
        'model': model,
        'predictions': y_pred,
        'metrics': metrics,
        'overall_r2': overall_r2,
    }

    print(f"  Overall R² = {overall_r2:.4f}")
    for out_name, scores in metrics.items():
        print(f"    {out_name}: R²={scores['R2']:.4f}, MAE={scores['MAE']:.4f}")

print("\n✓ All models trained!")

## 7. Compare Models

In [None]:
# One row per model: its overall R² followed by the R² and MAE of every
# individual output, in the order the metrics were recorded.
comparison = [
    {
        'Model': model_name,
        'Overall R²': result['overall_r2'],
        **{
            f'{out_name} {label}': scores[key]
            for out_name, scores in result['metrics'].items()
            for label, key in (('R²', 'R2'), ('MAE', 'MAE'))
        },
    }
    for model_name, result in results.items()
]

# Best overall R² first.
comparison_df = pd.DataFrame(comparison).sort_values('Overall R²', ascending=False)

print("\nModel Comparison:")
print(comparison_df.to_string(index=False))

# The top row after sorting is the winner.
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']
best_r2 = top_row['Overall R²']
print(f"\n🏆 Best Model: {best_model_name} (R² = {best_r2:.4f})")

## 8. Visualize Best Model Performance

In [None]:
# Look up the winning model and the test-set predictions it produced.
best_entry = results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['predictions']

# One predicted-vs-actual scatter panel per output.
output_names = ['costPerTravelHour', 'costPerKm', 'parking_multiplier']
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for col, name in enumerate(output_names):
    ax = axes[col]
    actual = y_test[:, col]
    predicted = best_predictions[:, col]

    ax.scatter(actual, predicted, alpha=0.6, s=50)

    # Diagonal reference: points on this line are predicted exactly. It
    # spans the combined range of the actual and predicted values.
    lo = min(actual.min(), predicted.min())
    hi = max(actual.max(), predicted.max())
    ax.plot([lo, hi], [lo, hi], 'r--', linewidth=2, label='Perfect prediction')

    scores = best_entry['metrics'][name]

    ax.set_xlabel(f'Actual {name}')
    ax.set_ylabel(f'Predicted {name}')
    ax.set_title(f'{name}\nR² = {scores["R2"]:.4f}, MAE = {scores["MAE"]:.4f}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle(f'{best_model_name} - Predictions vs Actual', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Residuals (actual minus predicted) against the predicted value, one panel
# per output, with a zero line for reference.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for col, name in enumerate(output_names):
    ax = axes[col]
    predicted = best_predictions[:, col]
    residuals = y_test[:, col] - predicted

    ax.scatter(predicted, residuals, alpha=0.6, s=50)
    ax.axhline(y=0, color='r', linestyle='--', linewidth=2)

    ax.set_xlabel(f'Predicted {name}')
    ax.set_ylabel('Residuals')
    ax.set_title(f'{name} Residuals')
    ax.grid(True, alpha=0.3)

plt.suptitle(f'{best_model_name} - Residual Analysis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("✓ If residuals are randomly scattered around 0, the model is unbiased")

## 9. Feature Importance (for tree-based models)

In [None]:
# Feature importance for the winning model, when it can provide one.
# Detection is capability-based rather than keyed on the model's display
# name, so wrapped estimators are handled too: a MultiOutputRegressor has no
# feature_importances_ of its own, but the fitted per-target estimators in
# its estimators_ list may each expose one.
feature_names = ['parking_importance', 'time_importance', 'distance_importance']

if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
else:
    sub_estimators = getattr(best_model, 'estimators_', None)
    if (
        sub_estimators is not None
        and len(sub_estimators) > 0
        and all(hasattr(est, 'feature_importances_') for est in sub_estimators)
    ):
        # Average the per-target importances into one value per feature.
        importances = np.mean(
            [est.feature_importances_ for est in sub_estimators], axis=0
        )
    else:
        importances = None

if importances is not None:
    # Plot
    plt.figure(figsize=(8, 5))
    plt.bar(feature_names, importances, edgecolor='black', alpha=0.7)
    plt.title(f'Feature Importance - {best_model_name}')
    plt.ylabel('Importance')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

    print("Feature Importances:")
    for name, imp in zip(feature_names, importances):
        print(f"  {name}: {imp:.4f}")
else:
    print(f"Feature importance not available for {best_model_name}")

## 10. Test on New Preferences

In [None]:
# Hand-picked preference mixes: one dominated by each factor, plus a
# near-even split.
test_cases = [
    {'parking_importance': 0.8, 'time_importance': 0.1, 'distance_importance': 0.1},
    {'parking_importance': 0.1, 'time_importance': 0.8, 'distance_importance': 0.1},
    {'parking_importance': 0.1, 'time_importance': 0.1, 'distance_importance': 0.8},
    {'parking_importance': 0.33, 'time_importance': 0.33, 'distance_importance': 0.34}
]

# Column order the model was trained with.
feature_order = ('parking_importance', 'time_importance', 'distance_importance')

print("Test Predictions:\n")

for case_no, prefs in enumerate(test_cases, start=1):
    # A single-row 2-D array, as predict() expects.
    X_new = np.array([[prefs[key] for key in feature_order]])

    # The neural network was trained on standardized inputs, so new inputs
    # must pass through the same scaler.
    if best_model_name == 'Neural Network':
        X_new = scaler.transform(X_new)

    hour_cost, km_cost, multiplier = best_model.predict(X_new)[0]

    print(f"Case {case_no}: parking={prefs['parking_importance']:.2f}, "
          f"time={prefs['time_importance']:.2f}, "
          f"distance={prefs['distance_importance']:.2f}")
    print(f"  Predicted costs:")
    print(f"    costPerTravelHour = {hour_cost:.2f}")
    print(f"    costPerKm = {km_cost:.4f}")
    print(f"    parking_multiplier = {multiplier:.2f}")
    print()

## 11. Save Best Model

In [None]:
# Persist the winning estimator.
with open(MODEL_SAVE_PATH, 'wb') as model_file:
    pickle.dump(best_model, model_file)

print(f"✓ Best model ({best_model_name}) saved to: {MODEL_SAVE_PATH}")

# Only the neural network was trained on standardized inputs, so the scaler
# is written out only when that model won.
uses_scaler = best_model_name == 'Neural Network'
if uses_scaler:
    with open(SCALER_SAVE_PATH, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)
    print(f"✓ Scaler saved to: {SCALER_SAVE_PATH}")

# Describe the saved model in a JSON sidecar. Metric values are converted to
# plain floats so that json can serialize them.
best_metrics = results[best_model_name]['metrics']
metadata = {
    'model_type': best_model_name,
    'overall_r2': float(best_r2),
    'training_samples': len(X_train),
    'test_samples': len(X_test),
    'unique_outputs': int(len(unique_costs)),
    'metrics': {
        out_name: {stat: float(value) for stat, value in scores.items()}
        for out_name, scores in best_metrics.items()
    },
    'input_features': ['parking_importance', 'time_importance', 'distance_importance'],
    'output_features': ['costPerTravelHour', 'costPerKm', 'parking_multiplier'],
    'requires_scaling': uses_scaler,
}

with open('model_metadata.json', 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=2)

print(f"✓ Metadata saved to: model_metadata.json")

## 12. Model Usage Example

In [None]:
# Example: How to load and use the saved model.
# This cell only prints a static snippet for copy-pasting; it does not load
# or run anything itself.
# NOTE(review): the snippet is a fixed string literal, so its file names are
# hard-coded rather than taken from MODEL_SAVE_PATH / SCALER_SAVE_PATH, and
# its scaler steps are always shown commented out. When the saved model is
# the Neural Network (requires_scaling is true in model_metadata.json), those
# commented lines must be enabled by whoever uses the snippet.
print("Example usage code:\n")
print("""
# Load model
import pickle
import numpy as np

with open('preference_to_cost_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Optional: Load scaler if using Neural Network
# with open('preference_scaler.pkl', 'rb') as f:
#     scaler = pickle.load(f)

# Get user preferences
preferences = {
    'parking_importance': 0.7,
    'time_importance': 0.2,
    'distance_importance': 0.1
}

# Prepare input
X = np.array([[
    preferences['parking_importance'],
    preferences['time_importance'],
    preferences['distance_importance']
]])

# Scale if needed
# X = scaler.transform(X)  # Only for Neural Network

# Predict costs
costs = model.predict(X)[0]

result = {
    'costPerTravelHour': costs[0],
    'costPerKm': costs[1],
    'parking_multiplier': costs[2]
}

print(result)
""")

## Summary

✅ Trained multiple regression models
✅ Evaluated and compared performance
✅ Saved best model for deployment
✅ Model translates preferences → cost parameters

### Next Steps:
1. Integrate model into your application
2. Get user preferences (parking, time, distance importance)
3. Predict cost parameters using model
4. Send costs to ODL API for route optimization
5. Return optimized routes to user!