# SmartPave Analytics: Data Exploration

## Overview
This notebook explores the pavement condition and maintenance data for 16,000 miles of roadway infrastructure.

## Objectives
- Load and examine the dataset structure
- Understand data quality and completeness
- Create initial visualizations
- Identify patterns and trends
- Prepare for feature engineering


## Setup and Imports


In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
# 'seaborn-v0_8' is a Matplotlib style-sheet name.
# NOTE(review): presumably needs Matplotlib >= 3.6, where the seaborn styles were renamed - confirm.
plt.style.use('seaborn-v0_8')
# Sets the seaborn color palette used by plots drawn after this call.
sns.set_palette("husl")

# Configure pandas display
# None lifts the column limit so wide frames print every column; row output is capped at 100.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("Libraries imported successfully!")


## Data Loading


In [None]:
# Set database context and load datasets from Snowflake tables
#
# Produces four module-level DataFrames used by every later cell:
# roads_df, condition_df, maintenance_df and traffic_df. A table that cannot
# be queried yields an empty DataFrame so the rest of the notebook still runs.
print("Loading datasets from Snowflake...")

# Connect to Snowflake and set context
from snowflake.snowpark.context import get_active_session
session = get_active_session()
session.sql("USE DATABASE DOT_workshop_test").collect()
session.sql("USE SCHEMA smartpave_analytics").collect()

# Substrings that mark a column name as date-related in the diagnostics below.
DATE_KEYWORDS = ('date', 'time', 'year', 'month', 'day')


def find_date_like_columns(df):
    """Return the column names of *df* that contain one of DATE_KEYWORDS."""
    return [col for col in df.columns if any(word in col.lower() for word in DATE_KEYWORDS)]


def add_datetime_column(df):
    """Add a lowercase ``date`` datetime column built from the frame's DATE column.

    The DATE column is matched case-insensitively. The frame is modified in
    place. A failed conversion is reported and leaves the frame untouched, so
    one bad value cannot discard data that was loaded successfully.
    """
    date_col = next((col for col in df.columns if col.upper() == 'DATE'), None)
    if date_col is None:
        print("   ⚠️ No 'DATE' column found")
        # Look for any date-like columns
        print(f"   Date-like columns found: {find_date_like_columns(df)}")
        return

    try:
        converted = pd.to_datetime(df[date_col])
    except (ValueError, TypeError, OverflowError) as e:
        print(f"   ⚠️ Could not convert '{date_col}' to datetime: {e}")
        return

    df['date'] = converted
    print(f"   ✅ Date column '{date_col}' converted to datetime")


def load_table(table, title, unit, loading_message, parse_dates=False):
    """Load one table of the smartpave_analytics schema into a DataFrame.

    Args:
        table: Table name inside DOT_workshop_test.smartpave_analytics.
        title: Human-readable name used in the progress output.
        unit: Noun printed after the row count (e.g. "segments", "records").
        loading_message: Line printed before the query runs.
        parse_dates: When True, also derive the ``date`` datetime column.

    Returns:
        The loaded DataFrame, or an empty DataFrame when the query fails.
    """
    print("\n" + "="*50)
    print(loading_message)
    # Only the query itself is guarded: later steps must not be able to
    # replace a successfully loaded frame with an empty one.
    try:
        df = session.sql(f"SELECT * FROM DOT_workshop_test.smartpave_analytics.{table}").to_pandas()
    except Exception as e:
        print(f"❌ Error loading {table}: {e}")
        return pd.DataFrame()

    print(f"✅ {title}: {len(df):,} {unit}")
    print(f"   Columns: {list(df.columns)}")
    if len(df) == 0:
        print(f"   ⚠️ No data in {table} table")
        return df

    print(f"   Sample data:\n{df.head()}")
    if parse_dates:
        add_datetime_column(df)
    return df


# Check if tables exist first
print("Checking if tables exist...")
tables_check = session.sql("""
    SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = 'SMARTPAVE_ANALYTICS'
""").to_pandas()
print(f"Available tables: {list(tables_check['TABLE_NAME'])}")

# Check table schemas to see what columns actually exist
print("\nChecking table schemas...")
for table in ['road_network', 'pavement_condition', 'maintenance_records', 'traffic_data']:
    try:
        schema_info = session.sql(f"""
            SELECT column_name, data_type 
            FROM information_schema.columns 
            WHERE table_schema = 'SMARTPAVE_ANALYTICS' 
            AND table_name = '{table.upper()}'
            ORDER BY ordinal_position
        """).to_pandas()
        print(f"\n{table} table schema:")
        print(schema_info)
    except Exception as e:
        print(f"Could not get schema for {table}: {e}")

# Load the four datasets; only the two time-stamped tables get a parsed date column.
roads_df = load_table('road_network', 'Road network', 'segments',
                      "Loading road network data...")
condition_df = load_table('pavement_condition', 'Pavement condition', 'records',
                          "Loading pavement condition data...", parse_dates=True)
maintenance_df = load_table('maintenance_records', 'Maintenance records', 'records',
                            "Loading maintenance records...", parse_dates=True)
traffic_df = load_table('traffic_data', 'Traffic data', 'records',
                        "Loading traffic data...")

print("\n" + "="*60)
print("DATA LOADING SUMMARY")
print("="*60)
print(f"Road network: {len(roads_df):,} records")
print(f"Pavement condition: {len(condition_df):,} records")
print(f"Maintenance records: {len(maintenance_df):,} records")
print(f"Traffic data: {len(traffic_df):,} records")

# Check for date columns specifically
print("\n" + "="*60)
print("DATE COLUMN ANALYSIS")
print("="*60)
for name, df in [("Road Network", roads_df), ("Pavement Condition", condition_df),
                 ("Maintenance Records", maintenance_df), ("Traffic Data", traffic_df)]:
    if len(df) > 0:
        date_cols = find_date_like_columns(df)
        print(f"{name}: {date_cols if date_cols else 'No date columns found'}")

if len(condition_df) == 0:
    print("\n🚨 CRITICAL: No pavement condition data loaded!")
    print("   This will prevent ML modeling from working.")
    print("   Please check if data was loaded into the tables.")
elif not any(col.upper() == 'DATE' for col in condition_df.columns):
    print("\n🚨 CRITICAL: No date column in pavement condition data!")
    print("   This will severely impact ML modeling capabilities.")
    print("   Time-based features are essential for pavement analysis.")


In [None]:
# Dataset Overview and Basic Statistics
# Prints row counts, date ranges, the road-type mix, and summary statistics for
# condition scores and maintenance costs. Column names are matched
# case-insensitively; a missing column produces a notice instead of an error.
print("="*60)
print("DATASET OVERVIEW")
print("="*60)

# Row counts of the four loaded frames
print(f"📊 Dataset Summary:")
print(f"   • Road segments: {len(roads_df):,}")
print(f"   • Pavement records: {len(condition_df):,}")
print(f"   • Maintenance records: {len(maintenance_df):,}")
print(f"   • Traffic records: {len(traffic_df):,}")

# Date ranges rely on the lowercase 'date' column; report its absence otherwise
print(f"\n📅 Date Ranges:")
if 'date' not in condition_df.columns:
    print("   • Pavement data: No date column available")
else:
    pavement_dates = condition_df['date']
    print(f"   • Pavement data: {pavement_dates.min().strftime('%Y-%m-%d')} to {pavement_dates.max().strftime('%Y-%m-%d')}")

if 'date' not in maintenance_df.columns:
    print("   • Maintenance data: No date column available")
else:
    maintenance_dates = maintenance_df['date']
    print(f"   • Maintenance data: {maintenance_dates.min().strftime('%Y-%m-%d')} to {maintenance_dates.max().strftime('%Y-%m-%d')}")

# Share of segments per road type
print(f"\n🛣️ Road Types:")
road_type_col = next((name for name in roads_df.columns if name.upper() == 'ROAD_TYPE'), None)

if not road_type_col:
    print("   • No road_type column found")
else:
    road_type_counts = roads_df[road_type_col].value_counts()
    for road_type, count in road_type_counts.items():
        print(f"   • {road_type}: {count:,} segments ({count/len(roads_df)*100:.1f}%)")

# Mean, median and extremes of the condition score
print(f"\n📈 Pavement Condition Statistics:")
condition_score_col = next((name for name in condition_df.columns if name.upper() == 'CONDITION_SCORE'), None)

if not condition_score_col:
    print("   • No condition_score column found")
else:
    scores = condition_df[condition_score_col]
    print(f"   • Average condition score: {scores.mean():.1f}")
    print(f"   • Median condition score: {scores.median():.1f}")
    print(f"   • Best condition: {scores.max():.1f}")
    print(f"   • Worst condition: {scores.min():.1f}")

# Total, mean and maximum repair cost
print(f"\n💰 Maintenance Cost Statistics:")
cost_col = next((name for name in maintenance_df.columns if name.upper() == 'COST'), None)

if not cost_col:
    print("   • No cost column found")
else:
    costs = maintenance_df[cost_col]
    print(f"   • Total maintenance cost: ${costs.sum():,.0f}")
    print(f"   • Average repair cost: ${costs.mean():,.0f}")
    print(f"   • Most expensive repair: ${costs.max():,.0f}")
    print(f"   • Total repairs: {len(maintenance_df):,}")


In [None]:
# Data Quality Analysis
# Reports missing values, duplicate rows and dtypes for every loaded frame,
# then flags condition-score outliers with the 1.5 * IQR rule.
print("="*60)
print("DATA QUALITY ANALYSIS")
print("="*60)

# Check for missing values
print("🔍 Missing Values Analysis:")
datasets = {
    'Road Network': roads_df,
    'Pavement Condition': condition_df,
    'Maintenance Records': maintenance_df,
    'Traffic Data': traffic_df
}

for name, df in datasets.items():
    missing = df.isnull().sum()
    total = len(df)
    print(f"\n{name}:")
    if total == 0:
        # An empty frame has nothing to verify; "no missing values" would be misleading.
        print("   ⚠️ No rows loaded - nothing to check")
    elif missing.sum() == 0:
        print("   ✅ No missing values")
    else:
        for col, missing_count in missing[missing > 0].items():
            pct = (missing_count / total) * 100
            print(f"   ⚠️ {col}: {missing_count:,} missing ({pct:.1f}%)")

# Check for duplicates
print(f"\n🔍 Duplicate Records Analysis:")
for name, df in datasets.items():
    duplicates = df.duplicated().sum()
    print(f"   • {name}: {duplicates:,} duplicate records")

# Check data types
print(f"\n🔍 Data Types Analysis:")
for name, df in datasets.items():
    print(f"\n{name}:")
    for col, dtype in df.dtypes.items():
        print(f"   • {col}: {dtype}")

# Check for outliers in condition scores (if column exists)
print(f"\n🔍 Outlier Analysis (Condition Scores):")
condition_score_col = next(
    (col for col in condition_df.columns if col.upper() == 'CONDITION_SCORE'), None
)

if not condition_score_col:
    print("   • No condition_score column found - cannot perform outlier analysis")
elif len(condition_df) == 0:
    # A table can have the column but no rows; the percentage below would divide by zero.
    print("   • No pavement condition records loaded - cannot perform outlier analysis")
else:
    scores = condition_df[condition_score_col]
    Q1 = scores.quantile(0.25)
    Q3 = scores.quantile(0.75)
    IQR = Q3 - Q1
    # Values beyond 1.5 * IQR outside the quartiles count as outliers.
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = condition_df[(scores < lower_bound) | (scores > upper_bound)]
    print(f"   • Outliers detected: {len(outliers):,} ({len(outliers)/len(condition_df)*100:.1f}%)")
    print(f"   • Normal range: {lower_bound:.1f} to {upper_bound:.1f}")


In [None]:
# Visualization 1: Condition Score Distribution
# Three panels: histogram of condition scores, box plot of scores per road
# type, and the monthly mean score. Each panel shows a placeholder message
# when the columns it needs are missing.
plt.figure(figsize=(15, 5))

# Get column names (handle case sensitivity)
condition_score_col = None
segment_id_col = None
road_type_col = None
date_col = None

for col in condition_df.columns:
    if col.upper() == 'CONDITION_SCORE':
        condition_score_col = col
    elif col.upper() == 'SEGMENT_ID':
        segment_id_col = col

# A SEGMENT_ID found in roads_df overrides the one from condition_df. The merge
# below uses one key name, so both frames must spell the column identically.
for col in roads_df.columns:
    if col.upper() == 'ROAD_TYPE':
        road_type_col = col
    elif col.upper() == 'SEGMENT_ID':
        segment_id_col = col

# No break: when several columns match (e.g. 'DATE' and 'date') the last one wins.
for col in condition_df.columns:
    if col.upper() == 'DATE':
        date_col = col

# Histogram of condition scores
plt.subplot(1, 3, 1)
if condition_score_col:
    plt.hist(condition_df[condition_score_col], bins=50, alpha=0.7, color='skyblue', edgecolor='black')
    plt.title('Distribution of Pavement Condition Scores')
    plt.xlabel('Condition Score')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'No condition_score column found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Condition Score Distribution - Data Not Available')

# Box plot by road type
plt.subplot(1, 3, 2)
if condition_score_col and segment_id_col and road_type_col:
    try:
        condition_with_road_type = condition_df.merge(roads_df[[segment_id_col, road_type_col]], on=segment_id_col)
        print(f"Columns after merge: {list(condition_with_road_type.columns)}")

        # Find the actual road type column name after merge
        merged_road_type_col = next(
            (col for col in condition_with_road_type.columns if col.upper() == 'ROAD_TYPE'), None
        )

        if merged_road_type_col:
            road_types = condition_with_road_type[merged_road_type_col].unique()
            box_data = [condition_with_road_type[condition_with_road_type[merged_road_type_col] == rt][condition_score_col].values
                        for rt in road_types]
            plt.boxplot(box_data, labels=road_types)
            plt.title('Condition Scores by Road Type')
            plt.xlabel('Road Type')
            plt.ylabel('Condition Score')
            plt.xticks(rotation=45)
            plt.grid(True, alpha=0.3)
        else:
            plt.text(0.5, 0.5, 'Road type column not found after merge', ha='center', va='center', transform=plt.gca().transAxes)
            plt.title('Condition by Road Type - Column Not Found')
    except Exception as e:
        # Best effort: a failed merge or plot must not abort the other panels.
        plt.text(0.5, 0.5, f'Error in merge: {str(e)[:50]}...', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Condition by Road Type - Merge Error')
else:
    plt.text(0.5, 0.5, 'Required columns not found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Condition by Road Type - Data Not Available')

# Time series of average condition
plt.subplot(1, 3, 3)
if date_col and condition_score_col:
    # The matched column is not guaranteed to be datetime dtype (an empty table,
    # for instance, never went through the conversion step), and the .dt accessor
    # raises on non-datetime data. Coerce first; unparseable values become NaT
    # and drop out of the grouping.
    record_dates = pd.to_datetime(condition_df[date_col], errors='coerce')
    monthly_condition = condition_df.groupby(record_dates.dt.to_period('M'))[condition_score_col].mean()
    if monthly_condition.empty:
        plt.text(0.5, 0.5, 'No dated condition records to plot', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Condition Over Time - Data Not Available')
    else:
        plt.plot(monthly_condition.index.astype(str), monthly_condition.values, marker='o', linewidth=2)
        plt.title('Average Condition Score Over Time')
        plt.xlabel('Month')
        plt.ylabel('Average Condition Score')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'Date or condition_score column not found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Condition Over Time - Data Not Available')

plt.tight_layout()
plt.show()

print("📊 Visualization 1 Complete: Condition Score Analysis")


In [None]:
# Visualization 2: Maintenance Cost Analysis
# Three panels: histogram of repair costs, mean cost per repair type, and total
# spending per month. Each panel shows a placeholder message when the columns
# or rows it needs are missing.
plt.figure(figsize=(15, 5))

# Get column names (handle case sensitivity)
cost_col = None
repair_type_col = None
date_col = None

# No break: when several columns match (e.g. 'DATE' and 'date') the last one wins.
for col in maintenance_df.columns:
    if col.upper() == 'COST':
        cost_col = col
    elif col.upper() == 'REPAIR_TYPE':
        repair_type_col = col
    elif col.upper() == 'DATE':
        date_col = col

# Maintenance cost distribution
plt.subplot(1, 3, 1)
if cost_col:
    plt.hist(maintenance_df[cost_col], bins=50, alpha=0.7, color='lightcoral', edgecolor='black')
    plt.title('Distribution of Maintenance Costs')
    plt.xlabel('Cost ($)')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'No cost column found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Cost Distribution - Data Not Available')

# Cost by repair type
plt.subplot(1, 3, 2)
if cost_col and repair_type_col:
    cost_by_type = maintenance_df.groupby(repair_type_col)[cost_col].mean().sort_values(ascending=False)
    if cost_by_type.empty:
        # NOTE(review): pandas is expected to raise when asked to bar-plot an
        # empty Series, so an empty table gets a placeholder instead.
        plt.text(0.5, 0.5, 'No maintenance records to plot', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Cost by Repair Type - Data Not Available')
    else:
        cost_by_type.plot(kind='bar', color='lightgreen')
        plt.title('Average Cost by Repair Type')
        plt.xlabel('Repair Type')
        plt.ylabel('Average Cost ($)')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'Required columns not found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Cost by Repair Type - Data Not Available')

# Monthly maintenance spending
plt.subplot(1, 3, 3)
if date_col and cost_col:
    # The matched column is not guaranteed to be datetime dtype (an empty table,
    # for instance, never went through the conversion step), and the .dt accessor
    # raises on non-datetime data. Coerce first; unparseable values become NaT
    # and drop out of the grouping.
    repair_dates = pd.to_datetime(maintenance_df[date_col], errors='coerce')
    monthly_costs = maintenance_df.groupby(repair_dates.dt.to_period('M'))[cost_col].sum()
    if monthly_costs.empty:
        plt.text(0.5, 0.5, 'No dated maintenance records to plot', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Monthly Spending - Data Not Available')
    else:
        plt.plot(monthly_costs.index.astype(str), monthly_costs.values, marker='s', linewidth=2, color='orange')
        plt.title('Monthly Maintenance Spending')
        plt.xlabel('Month')
        plt.ylabel('Total Cost ($)')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)
else:
    plt.text(0.5, 0.5, 'Date or cost column not found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Monthly Spending - Data Not Available')

plt.tight_layout()
plt.show()

print("📊 Visualization 2 Complete: Maintenance Cost Analysis")


In [None]:
# Visualization 3: Traffic and Condition Correlation
# Three panels: traffic volume vs. condition score, a correlation heatmap of
# the condition metrics, and monthly condition trends per road type.
plt.figure(figsize=(15, 5))

# Resolve every column this cell needs from the frame it belongs to. Nothing is
# inherited from earlier cells: date_col in particular may have been last set
# from maintenance_df, while this cell uses it to index condition data.
condition_score_col = None
segment_id_col = None
road_type_col = None
date_col = None
traffic_volume_col = None
roughness_col = None
cracking_col = None
pothole_col = None

for col in condition_df.columns:
    upper_name = col.upper()
    if upper_name == 'CONDITION_SCORE':
        condition_score_col = col
    elif upper_name == 'SEGMENT_ID':
        segment_id_col = col
    elif upper_name == 'DATE':
        date_col = col
    elif upper_name == 'ROUGHNESS_INDEX':
        roughness_col = col
    elif upper_name == 'CRACKING_PERCENT':
        cracking_col = col
    elif upper_name == 'POTHOLE_COUNT':
        pothole_col = col

# A SEGMENT_ID found in roads_df overrides the one from condition_df. The merges
# below use one key name, so both frames must spell the column identically.
for col in roads_df.columns:
    upper_name = col.upper()
    if upper_name == 'TRAFFIC_VOLUME':
        traffic_volume_col = col
    elif upper_name == 'ROAD_TYPE':
        road_type_col = col
    elif upper_name == 'SEGMENT_ID':
        segment_id_col = col

# Reset so the heatmap can never pick up a merged frame left over from an earlier run.
condition_with_traffic = None

# Traffic volume vs condition score
plt.subplot(1, 3, 1)
if condition_score_col and segment_id_col and traffic_volume_col:
    try:
        condition_with_traffic = condition_df.merge(roads_df[[segment_id_col, traffic_volume_col]], on=segment_id_col)
        plt.scatter(condition_with_traffic[traffic_volume_col], condition_with_traffic[condition_score_col],
                    alpha=0.5, s=10, color='purple')
        plt.title('Traffic Volume vs Condition Score')
        plt.xlabel('Traffic Volume (vehicles/day)')
        plt.ylabel('Condition Score')
        plt.grid(True, alpha=0.3)
    except Exception as e:
        # Best effort: a failed merge or plot must not abort the other panels.
        plt.text(0.5, 0.5, f'Error: {str(e)[:30]}...', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Traffic vs Condition - Error')
else:
    plt.text(0.5, 0.5, 'Required columns not found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Traffic vs Condition - Data Not Available')

# Correlation heatmap
plt.subplot(1, 3, 2)
correlation_cols = [condition_score_col, traffic_volume_col, roughness_col, cracking_col, pothole_col]
if not all(correlation_cols):
    plt.text(0.5, 0.5, 'Required columns not found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Correlation Matrix - Data Not Available')
elif condition_with_traffic is None:
    # The heatmap reads the frame merged for the first panel; without it there is nothing to correlate.
    plt.text(0.5, 0.5, 'Merged traffic data not available', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Correlation Matrix - Data Not Available')
else:
    try:
        correlation_data = condition_with_traffic[correlation_cols].corr()
        sns.heatmap(correlation_data, annot=True, cmap='coolwarm', center=0, square=True)
        plt.title('Feature Correlation Matrix')
    except Exception as e:
        plt.text(0.5, 0.5, f'Error: {str(e)[:30]}...', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Correlation Matrix - Error')

# Condition degradation over time by road type
plt.subplot(1, 3, 3)
if date_col and condition_score_col and segment_id_col and road_type_col:
    try:
        condition_with_road_type = condition_df.merge(roads_df[[segment_id_col, road_type_col]], on=segment_id_col)
        merged_road_type_col = next(
            (col for col in condition_with_road_type.columns if col.upper() == 'ROAD_TYPE'), None
        )

        if merged_road_type_col:
            for road_type in condition_with_road_type[merged_road_type_col].unique():
                road_data = condition_with_road_type[condition_with_road_type[merged_road_type_col] == road_type]
                # Coerce so the .dt accessor works even when the column is not datetime dtype.
                road_dates = pd.to_datetime(road_data[date_col], errors='coerce')
                monthly_condition = road_data.groupby(road_dates.dt.to_period('M'))[condition_score_col].mean()
                plt.plot(monthly_condition.index.astype(str), monthly_condition.values,
                         marker='o', label=road_type, linewidth=2)
            plt.title('Condition Trends by Road Type')
            plt.xlabel('Month')
            plt.ylabel('Average Condition Score')
            plt.legend()
            plt.xticks(rotation=45)
            plt.grid(True, alpha=0.3)
        else:
            plt.text(0.5, 0.5, 'Road type column not found', ha='center', va='center', transform=plt.gca().transAxes)
            plt.title('Condition Trends - Column Not Found')
    except Exception as e:
        plt.text(0.5, 0.5, f'Error: {str(e)[:30]}...', ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Condition Trends - Error')
else:
    plt.text(0.5, 0.5, 'Required columns not found', ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Condition Trends - Data Not Available')

plt.tight_layout()
plt.show()

print("📊 Visualization 3 Complete: Traffic and Condition Analysis")


In [None]:
# Summary and Key Insights
# Prints headline metrics, counts poor-condition records, relates maintenance
# cost to network length, and lists initial recommendations. Every section
# degrades to a notice when its columns or rows are missing.
print("="*60)
print("KEY INSIGHTS FROM DATA EXPLORATION")
print("="*60)

# Thresholds used throughout this cell
POOR_CONDITION_THRESHOLD = 30        # scores below this count as poor condition
PREVENTIVE_MAINTENANCE_CEILING = 50  # upper end of the preventive-maintenance band
HIGH_TRAFFIC_THRESHOLD = 25000       # traffic volume above this counts as high traffic

# Get column names for calculations
segment_length_col = None
cost_col = None
condition_score_col = None
segment_id_col = None
road_type_col = None
repair_type_col = None
traffic_volume_col = None

for col in roads_df.columns:
    if col.upper() == 'SEGMENT_LENGTH_MILES':
        segment_length_col = col
    elif col.upper() == 'ROAD_TYPE':
        road_type_col = col
    elif col.upper() == 'TRAFFIC_VOLUME':
        traffic_volume_col = col

for col in maintenance_df.columns:
    if col.upper() == 'COST':
        cost_col = col
    elif col.upper() == 'REPAIR_TYPE':
        repair_type_col = col

for col in condition_df.columns:
    if col.upper() == 'CONDITION_SCORE':
        condition_score_col = col
    elif col.upper() == 'SEGMENT_ID':
        segment_id_col = col

# Calculate key metrics
total_road_miles = roads_df[segment_length_col].sum() if segment_length_col else 0
total_maintenance_cost = maintenance_df[cost_col].sum() if cost_col else 0
avg_condition = condition_df[condition_score_col].mean() if condition_score_col else 0
worst_condition = condition_df[condition_score_col].min() if condition_score_col else 0
best_condition = condition_df[condition_score_col].max() if condition_score_col else 0

print(f"📊 Infrastructure Overview:")
print(f"   • Total road network: {total_road_miles:,.0f} miles")
print(f"   • Total maintenance investment: ${total_maintenance_cost:,.0f}")
print(f"   • Average condition score: {avg_condition:.1f}/100")
print(f"   • Condition range: {worst_condition:.1f} to {best_condition:.1f}")

# Identify problem areas
if condition_score_col:
    poor_condition = condition_df[condition_df[condition_score_col] < POOR_CONDITION_THRESHOLD]
    # An empty table has the column but no rows; avoid dividing by zero.
    poor_pct = len(poor_condition) / len(condition_df) * 100 if len(condition_df) > 0 else 0.0
    print(f"\n⚠️ Problem Areas Identified:")
    # These are counts of condition records, not segments: the unique-segment
    # count is computed separately in the recommendations below.
    print(f"   • Condition records in poor condition (<{POOR_CONDITION_THRESHOLD}): {len(poor_condition):,}")
    print(f"   • Percentage needing attention: {poor_pct:.1f}%")

    # Try to get high-traffic poor condition if we can merge the data
    if segment_id_col and traffic_volume_col:
        try:
            condition_with_traffic = condition_df.merge(roads_df[[segment_id_col, traffic_volume_col]], on=segment_id_col)
            high_traffic_poor_condition = condition_with_traffic[
                (condition_with_traffic[condition_score_col] < POOR_CONDITION_THRESHOLD) &
                (condition_with_traffic[traffic_volume_col] > HIGH_TRAFFIC_THRESHOLD)
            ]
            print(f"   • High-traffic poor condition: {len(high_traffic_poor_condition):,}")
        except Exception as e:
            # Best effort: the key column may be spelled differently in roads_df.
            print(f"   • High-traffic analysis: Could not complete ({str(e)[:50]}...)")
else:
    print(f"\n⚠️ Problem Areas: Cannot analyze - condition_score column not found")

# Cost analysis
if cost_col and segment_length_col and total_road_miles > 0:
    cost_per_mile = total_maintenance_cost / total_road_miles
    print(f"\n💰 Cost Analysis:")
    print(f"   • Average cost per mile: ${cost_per_mile:,.0f}")

    if repair_type_col:
        # idxmax() raises on an empty result, so only report when there is data.
        mean_cost_by_repair = maintenance_df.groupby(repair_type_col)[cost_col].mean().dropna()
        if not mean_cost_by_repair.empty:
            most_expensive = mean_cost_by_repair.idxmax()
            print(f"   • Most expensive repair type: {most_expensive}")

    print(f"   • Average repair cost: ${maintenance_df[cost_col].mean():,.0f}")
else:
    print(f"\n💰 Cost Analysis: Cannot complete - required columns not found")

# Recommendations
print(f"\n🎯 Initial Recommendations:")
if condition_score_col and segment_id_col:
    poor_records = condition_df[condition_df[condition_score_col] < POOR_CONDITION_THRESHOLD]
    poor_condition_count = len(poor_records)
    unique_segments = poor_records[segment_id_col].nunique()
    print(f"   • Review {poor_condition_count:,} condition records scoring below {POOR_CONDITION_THRESHOLD}")
    print(f"   • Prioritize {unique_segments:,} unique segments needing repair")
    print(f"   • Consider preventive maintenance for segments scoring {POOR_CONDITION_THRESHOLD}-{PREVENTIVE_MAINTENANCE_CEILING}")
else:
    print(f"   • Cannot provide specific recommendations - data columns not found")

if road_type_col:
    road_type_counts = roads_df[road_type_col].value_counts()
    # value_counts() is empty for an empty frame and idxmax() would raise on it.
    if not road_type_counts.empty:
        most_common_road_type = road_type_counts.idxmax()
        print(f"   • Monitor {most_common_road_type} roads (most common type)")

print(f"\n✅ Data exploration complete! Ready for feature engineering.")


In [None]:
# Additional Analysis (if needed)
# Final availability report: row counts and column lists for each frame. The
# status icon and the closing message reflect whether a frame has any rows, so
# an empty load is not reported as a success.
print("="*60)
print("ADDITIONAL ANALYSIS")
print("="*60)

# (title, frame, unit printed after the row count)
availability = [
    ("Road network", roads_df, "segments"),
    ("Pavement condition", condition_df, "records"),
    ("Maintenance records", maintenance_df, "records"),
    ("Traffic data", traffic_df, "records"),
]

# Check if we have all the data we need for comprehensive analysis
print("Data availability check:")
for title, frame, unit in availability:
    status = "✅" if len(frame) > 0 else "⚠️"
    print(f"{status} {title}: {len(frame):,} {unit}")

# Check column availability
print(f"\nColumn availability:")
for title, frame, _ in availability:
    print(f"{title} columns: {list(frame.columns)}")

empty_datasets = [title for title, frame, _ in availability if len(frame) == 0]
if empty_datasets:
    print(f"\n⚠️ Data exploration finished, but these datasets are empty: {', '.join(empty_datasets)}")
    print("Load the missing data before proceeding to feature engineering and ML modeling.")
else:
    print(f"\n🎉 Data exploration notebook completed successfully!")
    print(f"Ready to proceed to feature engineering and ML modeling.")
