# SmartPave Analytics: Machine Learning Modeling

## Overview
This notebook trains machine learning models to predict pavement degradation, estimate repair costs, and prioritize maintenance activities.

## Objectives
- Train degradation prediction model
- Build cost estimation model
- Develop priority scoring algorithm
- Evaluate model performance
- Create ensemble predictions


In [None]:
# Import libraries for machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder
import xgboost as xgb
import lightgbm as lgb
import joblib
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings(action='ignore')

# Connect to Snowflake: reuse the active Snowpark session and select the
# database and schema for the session.
from snowflake.snowpark.context import get_active_session
session = get_active_session()
for _context_statement in ("USE DATABASE DOT_workshop_test",
                           "USE SCHEMA smartpave_analytics"):
    session.sql(_context_statement).collect()

print("ML libraries imported successfully!")


In [None]:
# Load processed features from Snowflake
# Pulls the feature table and the maintenance records into pandas DataFrames
# (`features_df`, `maintenance_df`) and prints a short summary of each.
print("Loading processed features for ML modeling...")

# Load the features table created in notebook 2
features_df = session.sql("SELECT * FROM DOT_workshop_test.smartpave_analytics.pavement_features").to_pandas()
# Normalize the date column to pandas datetimes, the same way it is done for
# the maintenance records below. Later cells take a quantile of this column for
# the time-aware split and merge on it against maintenance_df['date'], so both
# frames need the same datetime dtype.
# NOTE(review): this is a no-op if the column already arrives as datetime64.
features_df['date'] = pd.to_datetime(features_df['date'])
print(f"Loaded {len(features_df):,} feature records")

# Load additional data for cost prediction
maintenance_df = session.sql("SELECT * FROM DOT_workshop_test.smartpave_analytics.maintenance_records").to_pandas()
maintenance_df['date'] = pd.to_datetime(maintenance_df['date'])

print(f"Loaded {len(maintenance_df):,} maintenance records")

# Display feature information
print(f"\nFeature columns available: {len(features_df.columns)}")
print(f"Features: {list(features_df.columns)}")
print(f"\nSample of features data:")
print(features_df.head())


In [None]:
# Model 1: Pavement Condition Prediction
# Fits several tree-based regressors on `condition_score`, scores each one on
# a chronological hold-out, and keeps the model with the highest test R².
print("="*60)
print("MODEL 1: PAVEMENT CONDITION PREDICTION")
print("="*60)

# Prepare features for condition prediction
feature_columns = ['days_since_last_repair', 'season', 'month', 'year', 'condition_trend',
                  'traffic_stress', 'traffic_volume', 'weather_damage', 'precipitation_30d_avg',
                  'freeze_thaw_30d_sum', 'total_maintenance_cost', 'repair_count', 
                  'avg_repair_cost', 'avg_effectiveness', 'days_since_last_maintenance',
                  'lanes', 'segment_length_miles', 'distance_from_center']

# Handle categorical variables: integer-encode road type and region so they
# can be passed to the regressors. The fitted encoders stay available as
# `le_road_type` / `le_region`.
le_road_type = LabelEncoder()
le_region = LabelEncoder()
features_df['road_type_encoded'] = le_road_type.fit_transform(features_df['road_type'])
features_df['region_encoded'] = le_region.fit_transform(features_df['region'])

# Add encoded categorical features
feature_columns.extend(['road_type_encoded', 'region_encoded'])

# Prepare data (missing feature values are replaced with 0)
X = features_df[feature_columns].fillna(0)
y = features_df['condition_score']

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Split data (time-aware split): rows dated before the 80th-percentile date
# are used for training, everything from that date onward is held out.
# NOTE(review): `features_df_sorted` is no longer needed for the mask; it is
# kept only in case later cells refer to it.
features_df_sorted = features_df.sort_values('date')
split_date = features_df_sorted['date'].quantile(0.8)
# Build the mask from features_df itself so it has the same index and row
# order as X and y. A mask taken from the sorted copy only selects the right
# rows because pandas silently re-aligns it by index label on every use.
train_mask = features_df['date'] < split_date

X_train = X.loc[train_mask]
X_test = X.loc[~train_mask]
y_train = y.loc[train_mask]
y_test = y.loc[~train_mask]

print(f"Training set: {len(X_train):,} samples")
print(f"Test set: {len(X_test):,} samples")

# Train multiple models
models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
}

# Train and evaluate models; `results` maps model name -> metrics + fitted model
results = {}
for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Metrics (R² and RMSE on both splits, to expose over-fitting)
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    
    results[name] = {
        'train_r2': train_r2,
        'test_r2': test_r2,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'model': model
    }
    
    print(f"  Train R²: {train_r2:.3f}, Test R²: {test_r2:.3f}")
    print(f"  Train RMSE: {train_rmse:.3f}, Test RMSE: {test_rmse:.3f}")

# Find best model
# NOTE(review): the winner is chosen by its test-set R², so the score printed
# below is an optimistic estimate of performance on unseen data.
best_model_name = max(results.keys(), key=lambda x: results[x]['test_r2'])
best_model = results[best_model_name]['model']
print(f"\n🏆 Best model: {best_model_name} (R² = {results[best_model_name]['test_r2']:.3f})")


In [None]:
# Model 2: Maintenance Cost Prediction
# Joins the feature rows to maintenance records, fits several regressors on
# the recorded repair `cost`, and keeps the one with the highest test R².
print("="*60)
print("MODEL 2: MAINTENANCE COST PREDICTION")
print("="*60)

# Prepare cost prediction data: keep only feature rows that have a maintenance
# record for the same segment on the same date.
cost_features = features_df.merge(maintenance_df[['segment_id', 'date', 'cost', 'repair_type']], 
                                 on=['segment_id', 'date'], how='inner')

# Fail early with a clear message if the join matched nothing; otherwise the
# problem only shows up later as an empty-sample error from train_test_split.
if cost_features.empty:
    raise ValueError(
        "No maintenance records matched the feature rows on (segment_id, date); "
        "cannot train cost prediction models."
    )

# Encode repair type as integers so it can be used as a model feature
le_repair_type = LabelEncoder()
cost_features['repair_type_encoded'] = le_repair_type.fit_transform(cost_features['repair_type'])

# Features for cost prediction
cost_feature_columns = ['condition_score', 'days_since_last_repair', 'traffic_stress', 
                       'weather_damage', 'lanes', 'segment_length_miles', 'repair_type_encoded']

# Missing feature values are replaced with 0
X_cost = cost_features[cost_feature_columns].fillna(0)
y_cost = cost_features['cost']

print(f"Cost prediction features: {X_cost.shape}")
print(f"Cost targets: {y_cost.shape}")

# Split cost data (random 80/20 split with a fixed seed for repeatability)
X_cost_train, X_cost_test, y_cost_train, y_cost_test = train_test_split(
    X_cost, y_cost, test_size=0.2, random_state=42)

# Train cost prediction models
cost_models = {
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBRegressor(n_estimators=100, random_state=42),
    'LightGBM': lgb.LGBMRegressor(n_estimators=100, random_state=42, verbose=-1)
}

# `cost_results` maps model name -> test R², test RMSE and the fitted model
cost_results = {}
for name, model in cost_models.items():
    print(f"\nTraining {name} for cost prediction...")
    model.fit(X_cost_train, y_cost_train)
    
    y_cost_pred = model.predict(X_cost_test)
    r2 = r2_score(y_cost_test, y_cost_pred)
    rmse = np.sqrt(mean_squared_error(y_cost_test, y_cost_pred))
    
    cost_results[name] = {'r2': r2, 'rmse': rmse, 'model': model}
    print(f"  R²: {r2:.3f}, RMSE: ${rmse:,.0f}")

# Find best cost model and bind the fitted estimator, mirroring `best_model`
# in the condition-prediction cell.
best_cost_model_name = max(cost_results.keys(), key=lambda x: cost_results[x]['r2'])
best_cost_model = cost_results[best_cost_model_name]['model']
print(f"\n🏆 Best cost model: {best_cost_model_name} (R² = {cost_results[best_cost_model_name]['r2']:.3f})")
