# SmartPave Analytics: Machine Learning Modeling

## Overview
This notebook trains machine learning models to predict pavement degradation, estimate repair costs, and prioritize maintenance activities.

## Objectives
- Train degradation prediction model
- Build cost estimation model
- Develop priority scoring algorithm
- Evaluate model performance
- Create ensemble predictions


In [None]:
# Import libraries for machine learning
import time
import warnings

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Suppress all Python warnings for the rest of the notebook
warnings.filterwarnings('ignore')

# Connect to Snowflake via the currently active Snowpark session
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Select the workshop database first, then the analytics schema inside it
for context_statement in (
    "USE DATABASE DOT_workshop_test",
    "USE SCHEMA smartpave_analytics",
):
    session.sql(context_statement).collect()

print("ML libraries imported successfully!")


In [None]:
# Load processed features from Snowflake
#
# Produces the frames and column names consumed by the modeling cells:
#   features_df          - engineered pavement features
#   maintenance_df       - maintenance records (used for cost prediction)
#   date_col / condition_score_col / segment_id_col
#                        - actual names of the key feature columns, or None
#                          when the features table has no such column
print("Loading processed features for ML modeling...")

# Load the features table created in notebook 2
features_df = session.sql("SELECT * FROM DOT_workshop_test.smartpave_analytics.pavement_features").to_pandas()
print(f"Loaded {len(features_df):,} feature records")

# Load additional data for cost prediction
maintenance_df = session.sql("SELECT * FROM DOT_workshop_test.smartpave_analytics.maintenance_records").to_pandas()
if 'DATE' in maintenance_df.columns:
    maintenance_df['DATE'] = pd.to_datetime(maintenance_df['DATE'])

print(f"Loaded {len(maintenance_df):,} maintenance records")

# Display feature information
print(f"\nFeature columns available: {len(features_df.columns)}")
print(f"Sample features: {list(features_df.columns)[:10]}...")

# Detect key columns dynamically (case-insensitive). If several columns differ
# only by case, the last one wins, as with a sequential scan of the columns.
columns_by_upper_name = {col.upper(): col for col in features_df.columns}
date_col = columns_by_upper_name.get('DATE')
condition_score_col = columns_by_upper_name.get('CONDITION_SCORE')
segment_id_col = columns_by_upper_name.get('SEGMENT_ID')

# Parse the feature dates the same way as the maintenance dates above, so the
# time-based split (quantile on dates) and the merge on (segment, date) keys
# operate on matching datetime64 columns rather than on raw objects/strings.
if date_col is not None:
    features_df[date_col] = pd.to_datetime(features_df[date_col])

print(f"\nDetected key columns:")
print(f"  Date: {date_col}")
print(f"  Condition Score: {condition_score_col}")
print(f"  Segment ID: {segment_id_col}")

# Show sample data
print(f"\nSample of features data:")
print(features_df.head())


In [None]:
# Model 1: Pavement Condition Prediction
#
# Trains four tree-based regressors on a sample of the engineered features to
# predict the condition score, evaluates each on a held-out split, and keeps
# the model with the highest test R² in `best_model`.
#
# Reads:  features_df, condition_score_col, date_col
# Writes: features_sample, available_features, X, y, X_train/X_test,
#         y_train/y_test, models, results, best_model_name, best_model
print("="*60)
print("MODEL 1: PAVEMENT CONDITION PREDICTION")
print("="*60)

# Fail early with a clear message instead of an opaque KeyError on `None`
if condition_score_col is None:
    raise ValueError(
        "No CONDITION_SCORE column was detected in the features table; "
        "cannot train the condition prediction model."
    )

# WORKSHOP OPTIMIZATION: Use sample for faster training
print("🔧 WORKSHOP MODE: Using sample for faster ML training...")
sample_size = min(50000, len(features_df))  # Max 50K records for ML
features_sample = features_df.sample(n=sample_size, random_state=42).copy()
print(f"Using sample: {len(features_sample):,} records for ML training")

# Select features that actually exist in our data
feature_candidates = [
    'month', 'quarter', 'year', 'season', 'days_since_last_inspection', 'degradation_rate',
    'traffic_stress', 'heavy_truck_impact', 'traffic_category', 'TRAFFIC_VOLUME',
    'freeze_thaw_damage', 'precipitation_damage', 'temperature_stress', 'total_weather_damage',
    'days_since_last_maintenance', 'maintenance_frequency', 'avg_maintenance_cost', 'total_maintenance_cost',
    'is_highway', 'is_arterial', 'is_local', 'ROAD_LATITUDE', 'ROAD_LONGITUDE',
    'condition_avg_30d', 'condition_avg_90d', 'condition_std_30d', 'condition_std_90d',
    'condition_lag_1m', 'condition_lag_3m', 'condition_lag_6m', 'condition_trend_3m',
    'traffic_weather_interaction', 'risk_score'
]
available_features = [
    feature for feature in feature_candidates if feature in features_sample.columns
]

print(f"Available features for ML: {len(available_features)}")
print(f"Features: {available_features}")

# Prepare data - handle categorical variables
print("Preparing features for ML...")

# Encode categorical variables
X_processed = features_sample[available_features].copy()

# Handle season column (categorical)
if 'season' in X_processed.columns:
    le_season = LabelEncoder()
    X_processed['season_encoded'] = le_season.fit_transform(X_processed['season'].fillna('Unknown'))
    X_processed = X_processed.drop('season', axis=1)

# Handle traffic_category column (categorical)
if 'traffic_category' in X_processed.columns:
    le_traffic = LabelEncoder()
    X_processed['traffic_category_encoded'] = le_traffic.fit_transform(X_processed['traffic_category'].fillna('Unknown'))
    X_processed = X_processed.drop('traffic_category', axis=1)

# Fill remaining NaN values and ensure numeric types
X = X_processed.fillna(0).astype(float)
y = features_sample[condition_score_col]

# Rows without a target cannot be used for supervised training, and a NaN
# target makes the estimators' fit() fail, so drop them from X and y together.
has_target = y.notna()
if not has_target.all():
    print(f"Dropping {int((~has_target).sum()):,} records with missing {condition_score_col}")
    X = X[has_target]
    y = y[has_target]

print(f"Features after encoding: {X.shape}")
print(f"Feature columns: {list(X.columns)}")

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split data (time-aware split if date available): the earliest ~80% of the
# records train the model and the most recent ~20% evaluate it.
X_train = X_test = y_train = y_test = None
if date_col:
    # Parse dates here so the quantile works even if the column arrived as
    # strings/date objects; keep the Series on X's index so the boolean mask
    # lines up with X and y without relying on implicit reindexing.
    sample_dates = pd.to_datetime(features_sample.loc[X.index, date_col])
    split_date = sample_dates.quantile(0.8)
    train_mask = sample_dates < split_date

    if train_mask.any() and not train_mask.all():
        X_train = X[train_mask]
        X_test = X[~train_mask]
        y_train = y[train_mask]
        y_test = y[~train_mask]
    else:
        # e.g. every record shares one date, or no date could be parsed
        print("⚠️ Time-based split left an empty partition - falling back to random split")

if X_train is None:
    # Random split if no date (or the time-based split was degenerate)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")

# Train multiple models (workshop optimized - fewer estimators)
models = {
    'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=50, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=50, random_state=42),
    'LightGBM': lgb.LGBMRegressor(n_estimators=50, random_state=42, verbose=-1)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Metrics (train vs. test side by side makes overfitting visible)
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))

    results[name] = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'model': model,
        'training_time': training_time
    }

    print(f"  Train R²: {train_r2:.3f}, Test R²: {test_r2:.3f}")
    print(f"  Train RMSE: {train_rmse:.3f}, Test RMSE: {test_rmse:.3f}")
    print(f"  Training time: {training_time:.1f}s")

# Find best model (highest R² on the held-out data)
best_model_name = max(results, key=lambda x: results[x]['test_r2'])
best_model = results[best_model_name]['model']
print(f"\n🏆 Best model: {best_model_name} (R² = {results[best_model_name]['test_r2']:.3f})")


In [None]:
# Model 2: Maintenance Cost Prediction
#
# Joins the sampled features to the maintenance records on (segment, date),
# then trains regressors that predict the maintenance cost of each record.
#
# Reads:  features_sample, maintenance_df, segment_id_col, date_col,
#         condition_score_col
# Writes: cost_results (empty dict when the model could not be trained),
#         cost_features, cost_col and, on success, best_cost_model_name
print("="*60)
print("MODEL 2: MAINTENANCE COST PREDICTION")
print("="*60)

# Prepare cost prediction data
print("Preparing cost prediction data...")

cost_results = {}
cost_features = None
cost_col = None

# With test_size=0.2 this leaves at least two test records, the minimum for
# R² to be defined.
min_cost_records = 10

# The merge needs join keys on BOTH sides: SEGMENT_ID/DATE in the maintenance
# data and the detected segment/date columns in the features.
maintenance_has_keys = {'SEGMENT_ID', 'DATE'}.issubset(maintenance_df.columns)
features_have_keys = segment_id_col is not None and date_col is not None

if maintenance_has_keys and features_have_keys:
    # Check what columns are available in maintenance data
    print(f"Available maintenance columns: {list(maintenance_df.columns)}")

    available_maintenance_cols = ['SEGMENT_ID', 'DATE']

    # Find cost column (could be 'COST', 'cost', or similar): first match wins
    cost_col = next(
        (col for col in maintenance_df.columns if 'COST' in col.upper()),
        None,
    )
    if cost_col is not None:
        available_maintenance_cols.append(cost_col)

    if 'REPAIR_TYPE' in maintenance_df.columns:
        available_maintenance_cols.append('REPAIR_TYPE')

    print(f"Using maintenance columns: {available_maintenance_cols}")
    print(f"Cost column found: {cost_col}")

    if cost_col:
        # Normalise both date keys to datetime so the merge does not fail (or
        # silently match nothing) when one side holds strings/date objects.
        merge_left = features_sample.assign(
            **{date_col: pd.to_datetime(features_sample[date_col])}
        )
        merge_right = maintenance_df[available_maintenance_cols].assign(
            DATE=pd.to_datetime(maintenance_df['DATE'])
        )
        cost_features = merge_left.merge(
            merge_right,
            left_on=[segment_id_col, date_col],
            right_on=['SEGMENT_ID', 'DATE'],
            how='inner'
        )
        # A record without a cost has no target to learn from
        cost_features = cost_features.dropna(subset=[cost_col])
        print(f"Cost prediction dataset: {len(cost_features):,} records")

        # Encode repair type if available
        if 'REPAIR_TYPE' in cost_features.columns:
            le_repair_type = LabelEncoder()
            cost_features['repair_type_encoded'] = le_repair_type.fit_transform(cost_features['REPAIR_TYPE'].fillna('Unknown'))
        else:
            print("⚠️ REPAIR_TYPE not available - using dummy repair type")
            cost_features['repair_type_encoded'] = 0
    else:
        print("⚠️ No cost column found in maintenance data")

    if cost_features is not None and len(cost_features) >= min_cost_records:
        # Select available features for cost prediction
        cost_feature_candidates = [
            condition_score_col, 'traffic_stress', 'heavy_truck_impact', 'total_weather_damage',
            'days_since_last_maintenance', 'maintenance_frequency', 'avg_maintenance_cost',
            'is_highway', 'is_arterial', 'is_local', 'repair_type_encoded'
        ]

        cost_available_features = [f for f in cost_feature_candidates if f in cost_features.columns]

        X_cost = cost_features[cost_available_features].fillna(0)
        y_cost = cost_features[cost_col]  # Use the detected cost column

        print(f"Cost prediction features: {X_cost.shape}")
        print(f"Cost targets: {y_cost.shape}")
        print(f"Features used: {cost_available_features}")

        # Split cost data
        X_cost_train, X_cost_test, y_cost_train, y_cost_test = train_test_split(
            X_cost, y_cost, test_size=0.2, random_state=42)

        # Train cost prediction models (workshop optimized)
        cost_models = {
            'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42, n_jobs=-1),
            'XGBoost': xgb.XGBRegressor(n_estimators=50, random_state=42),
            'LightGBM': lgb.LGBMRegressor(n_estimators=50, random_state=42, verbose=-1)
        }

        for name, model in cost_models.items():
            print(f"\nTraining {name} for cost prediction...")
            start_time = time.time()
            model.fit(X_cost_train, y_cost_train)
            training_time = time.time() - start_time

            y_cost_pred = model.predict(X_cost_test)
            r2 = r2_score(y_cost_test, y_cost_pred)
            rmse = np.sqrt(mean_squared_error(y_cost_test, y_cost_pred))

            cost_results[name] = {'r2': r2, 'rmse': rmse, 'model': model, 'training_time': training_time}
            print(f"  R²: {r2:.3f}, RMSE: ${rmse:,.0f}")
            print(f"  Training time: {training_time:.1f}s")

        # Find best cost model
        best_cost_model_name = max(cost_results, key=lambda x: cost_results[x]['r2'])
        print(f"\n🏆 Best cost model: {best_cost_model_name} (R² = {cost_results[best_cost_model_name]['r2']:.3f})")
    else:
        if cost_features is not None:
            # The inner join matched too few feature rows to split and score
            print(f"⚠️ Only {len(cost_features):,} matched records (need at least {min_cost_records})")
        print("⚠️ Cannot proceed with cost prediction - no cost data available")

else:
    print("⚠️ Maintenance data not available or missing required columns")
    print("Skipping cost prediction model...")


In [None]:
# Model Performance Summary
#
# Prints the held-out metrics of every trained condition model, the cost
# models when any were trained, and the workshop-mode settings in effect.
banner = "="*60
print(banner)
print("MODEL PERFORMANCE SUMMARY")
print(banner)

print("🏆 CONDITION PREDICTION MODELS:")
for model_name, metrics in results.items():
    print(f"  {model_name}:")
    print(f"    Test R²: {metrics['test_r2']:.3f}")
    print(f"    Test RMSE: {metrics['test_rmse']:.3f}")
    print(f"    Training time: {metrics['training_time']:.1f}s")

# The cost cell may not have run, or may have produced no models
cost_models_trained = 'cost_results' in locals() and bool(cost_results)

if not cost_models_trained:
    print("\n⚠️ Cost prediction models not available")
else:
    print("\n🏆 COST PREDICTION MODELS:")
    for model_name, metrics in cost_results.items():
        print(f"  {model_name}:")
        print(f"    Test R²: {metrics['r2']:.3f}")
        print(f"    Test RMSE: ${metrics['rmse']:,.0f}")
        print(f"    Training time: {metrics['training_time']:.1f}s")

print("\n📊 WORKSHOP OPTIMIZATIONS:")
for optimization_note in (
    f"Used sample data: {len(features_sample):,} records",
    "Reduced estimators: 50 (vs 100)",
    "Parallel processing: Enabled",
    f"Total features used: {len(available_features)}",
):
    print(f"  - {optimization_note}")

print("\n✅ ML MODELING COMPLETE!")
print(f"Best condition model: {best_model_name}")
if cost_models_trained:
    print(f"Best cost model: {best_cost_model_name}")
