# METR-LA Traffic Dataset Analysis

**Task 6.2: Data Analysis Notebook**  
Traffic Prediction System - Exploratory Data Analysis

This notebook provides a comprehensive analysis of the METR-LA traffic dataset, including:
- Dataset overview and structure
- Sensor location analysis (207 sensors)
- Traffic pattern analysis
- Missing data patterns
- Data quality assessment
- Statistical insights and visualizations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import json
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plotting defaults applied to every figure in this notebook: the stock
# matplotlib style, seaborn's "husl" palette, 12x8 figures and 10pt text.
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Confirm the environment is ready and stamp the run with the current time.
print("✅ Libraries imported successfully")
print(f"📅 Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

## 1. Dataset Loading and Overview

Load the METR-LA dataset and examine its structure.

In [None]:
# Locate the raw input files. All paths are relative to the notebook's
# working directory.
data_dir = Path("../data/raw")
metadata_file = data_dir / "metr_la_sensor_metadata.csv"
sample_file = data_dir / "metr_la_sample_data.csv"

# Sensor metadata is always required.
sensors_df = pd.read_csv(metadata_file)

# Traffic readings: the sample file is preferred; the full dataset is only
# read when no sample exists, and the cell aborts when neither is present.
if not sample_file.exists():
    full_file = data_dir / "metr_la_traffic_data.csv"
    if not full_file.exists():
        print("❌ No traffic data found. Please run the dataset generation script first.")
        raise FileNotFoundError("Traffic dataset not found")
    print("📊 Loading full dataset (this may take a moment...)")
    traffic_df = pd.read_csv(full_file)
else:
    traffic_df = pd.read_csv(sample_file)
    print("📊 Using sample dataset for analysis")

# Headline numbers; memory is the deep (object-inclusive) footprint in MB.
memory_mb = traffic_df.memory_usage(deep=True).sum() / 1024 / 1024
print(f"\n📈 Dataset Overview:")
print(f"   Sensors: {len(sensors_df):,}")
print(f"   Traffic Records: {len(traffic_df):,}")
print(f"   Memory Usage: {memory_mb:.1f} MB")

In [None]:
# Examine data structure
#
# Shows the column/dtype report and the first five rows of both frames.
# DataFrame.info() writes its report directly to stdout and returns None,
# so it is called on its own; wrapping it in print() would emit a stray
# "None" line after each report.
print("📋 Sensor Metadata Structure:")
sensors_df.info()
print("\n📊 First 5 sensors:")
print(sensors_df.head())

print("\n" + "="*60)
print("📋 Traffic Data Structure:")
traffic_df.info()
print("\n📊 First 5 traffic records:")
print(traffic_df.head())

## 2. Sensor Location Analysis

Analyze the 207 sensor locations across Los Angeles.

In [None]:
# Sensor distribution by road type
#
# Counts the sensors on each road type, prints each type's share, then draws
# that share as a pie chart next to the mean lane count per type as bars.
road_type_counts = sensors_df['road_type'].value_counts()
sensor_total = len(sensors_df)

print("🛣️ Sensor Distribution by Road Type:")
for road_type, count in road_type_counts.items():
    percentage = (count / sensor_total) * 100
    print(f"   {road_type.title()}: {count} sensors ({percentage:.1f}%)")

# Mean number of lanes per road type, computed before any drawing starts.
lane_dist = sensors_df.groupby('road_type')['lanes'].mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
ax1, ax2 = axes

# Left panel: share of sensors per road type.
ax1.pie(
    road_type_counts.values,
    labels=road_type_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
ax1.set_title('Sensor Distribution by Road Type')

# Right panel: average lane count per road type.
ax2.bar(
    lane_dist.index,
    lane_dist.values,
    color=['skyblue', 'lightcoral', 'lightgreen'],
)
ax2.set_title('Average Lanes by Road Type')
ax2.set_ylabel('Average Number of Lanes')
ax2.set_xlabel('Road Type')

plt.tight_layout()
plt.show()

In [None]:
# Geographic distribution analysis
#
# Prints the bounding box of the sensor network, plots every sensor coloured
# by road type, and reports the sensor density over that bounding box.

# Bounding box, computed once and reused by every print below.
lat_min, lat_max = sensors_df['latitude'].min(), sensors_df['latitude'].max()
lon_min, lon_max = sensors_df['longitude'].min(), sensors_df['longitude'].max()
lat_span = lat_max - lat_min
lon_span = lon_max - lon_min

print("🗺️ Geographic Distribution:")
print(f"   Latitude range: {lat_min:.4f} to {lat_max:.4f}")
print(f"   Longitude range: {lon_min:.4f} to {lon_max:.4f}")
# 69 and 54.6 are the approximate miles-per-degree factors this notebook
# applies to latitude and longitude respectively.
print(f"   Geographic span: ~{(lat_span * 69):.1f} miles N-S")
print(f"                   ~{(lon_span * 54.6):.1f} miles E-W")

# Plot sensor locations
plt.figure(figsize=(15, 10))

# One scatter series per road type. Road types without a dedicated colour
# fall back to gray instead of raising KeyError, and missing road types
# (NaN) are skipped because they have no label to plot under.
colors = {'highway': 'red', 'arterial': 'blue', 'local': 'green'}
fallback_color = 'gray'
for road_type in sensors_df['road_type'].dropna().unique():
    subset = sensors_df[sensors_df['road_type'] == road_type]
    plt.scatter(subset['longitude'], subset['latitude'],
                c=colors.get(road_type, fallback_color),
                label=str(road_type).title(),
                alpha=0.7, s=60)

plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('METR-LA Traffic Sensor Locations in Los Angeles')
plt.legend()
plt.grid(True, alpha=0.3)
plt.axis('equal')
plt.show()

# Density over the bounding box; a degenerate (zero-area) box has no
# meaningful density, so it is reported as such rather than as inf/nan.
bbox_area = lat_span * lon_span
if bbox_area > 0:
    print(f"\n📍 Sensor density: {len(sensors_df) / bbox_area:.1f} sensors per square degree")
else:
    print("\n📍 Sensor density: n/a (sensor bounding box has zero area)")

## 3. Traffic Pattern Analysis

Analyze traffic speed and volume patterns across different road types and time periods.

In [None]:
# Convert timestamp to datetime and derive the calendar features used by the
# later cells: hour of day, day name, and a weekday flag (Monday-Friday).
traffic_df['timestamp'] = pd.to_datetime(traffic_df['timestamp'])
traffic_df['hour'] = traffic_df['timestamp'].dt.hour
traffic_df['day_of_week'] = traffic_df['timestamp'].dt.day_name()
traffic_df['weekday'] = traffic_df['timestamp'].dt.weekday < 5

# Overall statistics
print("🚗 Traffic Statistics Overview:")
print(f"   Date range: {traffic_df['timestamp'].min()} to {traffic_df['timestamp'].max()}")
print(f"   Unique sensors in data: {traffic_df['sensor_id'].nunique()}")
print(f"   Total measurements: {len(traffic_df):,}")

# describe() returns count, mean, std, min, quartiles and max. The count row
# is a number of readings, not a speed or a volume, so it is printed without
# the physical unit that the remaining rows carry.
print("\n🏎️ Speed Statistics:")
speed_stats = traffic_df['speed_mph'].describe()
for stat, value in speed_stats.items():
    if stat == 'count':
        print(f"   Count: {value:,.0f} readings")
    else:
        print(f"   {stat.title()}: {value:.2f} mph")

print("\n🚛 Volume Statistics:")
volume_stats = traffic_df['volume_vehicles_per_hour'].describe()
for stat, value in volume_stats.items():
    if stat == 'count':
        print(f"   Count: {value:,.0f} readings")
    else:
        print(f"   {stat.title()}: {value:.0f} vehicles/hour")

In [None]:
# Hourly traffic patterns
#
# Aggregates speed and volume by hour of day (mean and standard deviation),
# plots each as a line with a ±1 standard deviation band, and lists the
# three hours with the lowest mean speed.
hourly_stats = traffic_df.groupby('hour').agg({
    'speed_mph': ['mean', 'std'],
    'volume_vehicles_per_hour': ['mean', 'std']
}).round(2)

# Collapse the two-level (metric, statistic) labels into "metric_statistic".
hourly_stats.columns = ['_'.join(col) for col in hourly_stats.columns]

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 12))

# Both panels share the same layout; only the metric, line format and text
# differ, so they are described here and drawn by a single loop.
panels = (
    (ax1, 'speed_mph', 'b-', 'Average Speed',
     'Average Traffic Speed by Hour of Day', 'Speed (mph)'),
    (ax2, 'volume_vehicles_per_hour', 'r-', 'Average Volume',
     'Average Traffic Volume by Hour of Day', 'Volume (vehicles/hour)'),
)
for ax, metric, line_fmt, line_label, panel_title, y_label in panels:
    hours = hourly_stats.index
    center = hourly_stats[metric + '_mean']
    spread = hourly_stats[metric + '_std']

    ax.plot(hours, center, line_fmt, linewidth=2, label=line_label)
    ax.fill_between(hours, center - spread, center + spread,
                    alpha=0.3, label='±1 Std Dev')
    ax.set_title(panel_title)
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel(y_label)
    ax.legend()
    ax.grid(True, alpha=0.3)
    ax.set_xticks(range(0, 24, 2))

plt.tight_layout()
plt.show()

# The three hours with the lowest mean speed are reported as rush hours.
min_speed_hours = hourly_stats['speed_mph_mean'].nsmallest(3)
print("⏰ Rush Hour Analysis (lowest average speeds):")
for hour, speed in min_speed_hours.items():
    print(f"   {hour}:00 - {speed:.1f} mph")

In [None]:
# Traffic patterns by road type
#
# Summarises speed and volume per road type (mean, std, min, max) and draws
# side-by-side box plots for the highway, arterial and local groups.
road_type_stats = traffic_df.groupby('road_type').agg({
    'speed_mph': ['mean', 'std', 'min', 'max'],
    'volume_vehicles_per_hour': ['mean', 'std', 'min', 'max']
}).round(2)

print("🛣️ Traffic Patterns by Road Type:")
print(road_type_stats)

# Road types in plotting order, with their tick labels.
plotted_types = ['highway', 'arterial', 'local']
tick_names = ['Highway', 'Arterial', 'Local']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# One box plot per metric; missing readings are dropped per group.
box_panels = (
    (ax1, 'speed_mph', 'Speed Distribution by Road Type', 'Speed (mph)'),
    (ax2, 'volume_vehicles_per_hour', 'Volume Distribution by Road Type', 'Volume (vehicles/hour)'),
)
for ax, column, panel_title, y_label in box_panels:
    samples = []
    for rt in plotted_types:
        group_rows = traffic_df[traffic_df['road_type'] == rt]
        samples.append(group_rows[column].dropna())
    ax.boxplot(samples, labels=tick_names)
    ax.set_title(panel_title)
    ax.set_ylabel(y_label)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Missing Data Analysis

Analyze missing data patterns to understand sensor reliability and data quality.

In [None]:
# Overall missing data statistics
#
# Counts missing speed and volume readings, then ranks sensors by the share
# of their records that lack a speed reading.
total_records = len(traffic_df)
missing_speed = traffic_df['speed_mph'].isna().sum()
missing_volume = traffic_df['volume_vehicles_per_hour'].isna().sum()

# A record is complete only when BOTH measurements are present. Subtracting
# the missing-speed count alone would also count rows that lack only the
# volume reading as complete.
complete_records = (
    traffic_df[['speed_mph', 'volume_vehicles_per_hour']].notna().all(axis=1).sum()
)

print("❓ Missing Data Overview:")
print(f"   Total records: {total_records:,}")
print(f"   Missing speed readings: {missing_speed:,} ({missing_speed/total_records*100:.2f}%)")
print(f"   Missing volume readings: {missing_volume:,} ({missing_volume/total_records*100:.2f}%)")
print(f"   Complete records: {complete_records:,} ({complete_records/total_records*100:.2f}%)")

# Missing data by sensor: number of missing speed readings and number of
# records (non-null timestamps) per sensor.
sensor_missing = traffic_df.groupby('sensor_id').agg({
    'speed_mph': lambda x: x.isna().sum(),
    'timestamp': 'count'
}).rename(columns={'speed_mph': 'missing_count', 'timestamp': 'total_count'})

# Percentage of each sensor's records without a speed reading, worst first.
sensor_missing['missing_rate'] = (sensor_missing['missing_count'] / sensor_missing['total_count'] * 100).round(2)
sensor_missing = sensor_missing.sort_values('missing_rate', ascending=False)

print(f"\n📊 Sensor Reliability (Top 10 sensors with most missing data):")
print(sensor_missing.head(10))

# The frame is sorted worst-first, so the best sensors are at the tail.
print(f"\n🏆 Best performing sensors (lowest missing data rates):")
print(sensor_missing.tail(5))

In [None]:
# Visualize missing data patterns
#
# Top panel: histogram of per-sensor missing-speed rates.
# Bottom panel: percentage of records without a speed reading, per hour.

# Hourly missing rate: missing speed readings divided by all records in
# that hour, as a percentage rounded to two decimals.
records_by_hour = traffic_df.groupby('hour')
hourly_missing = records_by_hour['speed_mph'].apply(lambda x: x.isna().sum())
total_by_hour = records_by_hour.size()
hourly_missing_rate = (hourly_missing / total_by_hour * 100).round(2)

fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(15, 10))

# Missing data rate distribution across sensors
ax1.hist(sensor_missing['missing_rate'], bins=20, edgecolor='black', alpha=0.7)
ax1.set_title('Distribution of Missing Data Rates Across Sensors')
ax1.set_xlabel('Missing Data Rate (%)')
ax1.set_ylabel('Number of Sensors')
ax1.grid(True, alpha=0.3)

# Missing data by hour of day
ax2.bar(
    hourly_missing_rate.index,
    hourly_missing_rate.values,
    color='coral',
    alpha=0.7,
)
ax2.set_title('Missing Data Rate by Hour of Day')
ax2.set_xlabel('Hour of Day')
ax2.set_ylabel('Missing Data Rate (%)')
ax2.grid(True, alpha=0.3)
ax2.set_xticks(range(0, 24, 2))

plt.tight_layout()
plt.show()

# The three hours with the highest missing rate.
peak_missing_hours = hourly_missing_rate.nlargest(3)
print(f"\n⏰ Peak missing data hours:")
for hour, rate in peak_missing_hours.items():
    print(f"   {hour}:00 - {rate:.2f}% missing")

## 5. Data Quality Assessment

Assess overall data quality and identify potential issues.

In [None]:
# Data quality checks
#
# Four independent checks: speed range, volume range, per-sensor sampling
# interval, and duplicate (timestamp, sensor) pairs.
print("🔍 Data Quality Assessment:")

# 1. Speed value ranges: non-null readings below 0 or above 100 mph are
#    flagged as invalid.
valid_speeds = traffic_df['speed_mph'].dropna()
speed_out_of_range = (valid_speeds < 0) | (valid_speeds > 100)
invalid_speeds = valid_speeds[speed_out_of_range]
invalid_speed_pct = len(invalid_speeds) / len(valid_speeds) * 100
print(f"\n🏎️ Speed Data Quality:")
print(f"   Valid speed readings: {len(valid_speeds):,}")
print(f"   Invalid speeds (<0 or >100 mph): {len(invalid_speeds)} ({invalid_speed_pct:.3f}%)")
print(f"   Speed range: {valid_speeds.min():.1f} to {valid_speeds.max():.1f} mph")

# 2. Volume value ranges: non-null readings below 0 are flagged as invalid.
valid_volumes = traffic_df['volume_vehicles_per_hour'].dropna()
negative_volume = valid_volumes < 0
invalid_volumes = valid_volumes[negative_volume]
invalid_volume_pct = len(invalid_volumes) / len(valid_volumes) * 100
print(f"\n🚛 Volume Data Quality:")
print(f"   Valid volume readings: {len(valid_volumes):,}")
print(f"   Invalid volumes (<0): {len(invalid_volumes)} ({invalid_volume_pct:.3f}%)")
print(f"   Volume range: {valid_volumes.min():.0f} to {valid_volumes.max():.0f} vehicles/hour")


# 3. Timestamp continuity: median gap between consecutive readings of each
#    sensor, compared with the expected 5-minute cadence.
def _median_gap_seconds(stamps):
    """Return the median gap, in seconds, between time-ordered stamps."""
    gaps = stamps.sort_values().diff()
    return gaps.dt.total_seconds().median()


print(f"\n📅 Temporal Data Quality:")
time_diffs = traffic_df.groupby('sensor_id')['timestamp'].apply(_median_gap_seconds)
expected_interval = 300  # 5 minutes in seconds
on_schedule = (time_diffs == expected_interval).sum()
print(f"   Expected measurement interval: {expected_interval} seconds (5 minutes)")
print(f"   Median actual interval: {time_diffs.median():.0f} seconds")
print(f"   Sensors with correct timing: {on_schedule}/{len(time_diffs)}")

# 4. Duplicate records: rows repeating an earlier (timestamp, sensor_id) pair.
duplicates = traffic_df.duplicated(subset=['timestamp', 'sensor_id']).sum()
duplicate_pct = duplicates / len(traffic_df) * 100
print(f"\n🔄 Data Consistency:")
print(f"   Duplicate records: {duplicates} ({duplicate_pct:.3f}%)")

In [None]:
# Correlation analysis
#
# Draws the pairwise correlation heatmap of the numeric traffic variables and
# interprets the speed/volume correlation in words.
print("📊 Variable Correlation Analysis:")

# Calculate correlations
numeric_cols = ['speed_mph', 'volume_vehicles_per_hour', 'hour', 'latitude', 'longitude']
correlation_matrix = traffic_df[numeric_cols].corr()

# Plot correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0,
            square=True, fmt='.3f', cbar_kws={'label': 'Correlation Coefficient'})
plt.title('Correlation Matrix of Traffic Variables')
plt.tight_layout()
plt.show()

# Key correlations
speed_volume_corr = traffic_df['speed_mph'].corr(traffic_df['volume_vehicles_per_hour'])
print(f"\n🔗 Key Relationships:")
print(f"   Speed vs Volume correlation: {speed_volume_corr:.3f}")

# Interpret the coefficient. An undefined (NaN) coefficient and a clearly
# positive one get their own messages; without these branches both would
# fall through to the "weak" message, since every comparison with NaN is
# False and positive values never satisfy the negative thresholds.
if pd.isna(speed_volume_corr):
    print("   ➡ Correlation undefined: not enough paired, non-constant readings")
elif speed_volume_corr < -0.3:
    print("   ➡ Strong negative correlation: Higher volume leads to lower speeds")
elif speed_volume_corr < -0.1:
    print("   ➡ Moderate negative correlation: Some relationship between volume and speed")
elif speed_volume_corr > 0.1:
    print("   ➡ Positive correlation: Higher volume coincides with higher speeds")
else:
    print("   ➡ Weak correlation: Volume and speed relationship is complex")

## 6. Summary and Key Insights

Summarize the key findings from the METR-LA dataset analysis.

In [None]:
# Generate comprehensive analysis summary
#
# Collects the results computed by the earlier cells into one nested dict,
# writes it to ../data/processed/metr_la_analysis_summary.json, and prints
# the key insights and recommendations.
#
# json.dump only accepts str/int/float/bool/None as dict keys and cannot
# encode NumPy integer scalars, so pandas/NumPy results are converted
# explicitly before they enter the dict:
#   * road_type_stats has two-level (metric, statistic) column labels, which
#     are joined into "metric_statistic" string keys;
#   * NumPy integer results (e.g. the duplicate count) go through int();
#   * summary statistics go through float().


def _rounded(value, ndigits=2):
    """Return *value* as a built-in float rounded to *ndigits* places."""
    return round(float(value), ndigits)


# Frequently used pieces, pulled out once.
speed_series = traffic_df['speed_mph']
volume_series = traffic_df['volume_vehicles_per_hour']
first_timestamp = traffic_df['timestamp'].min()
last_timestamp = traffic_df['timestamp'].max()
sensors_with_data = int(traffic_df['sensor_id'].nunique())

lat_min, lat_max = float(sensors_df['latitude'].min()), float(sensors_df['latitude'].max())
lon_min, lon_max = float(sensors_df['longitude'].min()), float(sensors_df['longitude'].max())

# Fraction (0-1) and percentage (0-100) of records without a speed reading.
missing_fraction = float(missing_speed) / total_records
missing_percent = missing_fraction * 100

# Per-road-type statistics with JSON-safe string keys:
# {"speed_mph_mean": {"highway": ..., ...}, ...}
road_type_performance = {
    '_'.join(column): {
        str(road_type): float(value)
        for road_type, value in road_type_stats[column].items()
    }
    for column in road_type_stats.columns
}

analysis_summary = {
    "analysis_metadata": {
        "analysis_date": datetime.now().isoformat(),
        "dataset_type": "METR-LA Traffic Dataset",
        "analyst": "Traffic Prediction System",
        "analysis_version": "1.0"
    },
    "dataset_overview": {
        "total_sensors": len(sensors_df),
        "sensors_in_traffic_data": sensors_with_data,
        "total_traffic_records": len(traffic_df),
        "date_range": {
            "start": first_timestamp.isoformat(),
            "end": last_timestamp.isoformat(),
            "duration_days": int((last_timestamp - first_timestamp).days)
        },
        "measurement_frequency": "5 minutes"
    },
    "sensor_distribution": {
        "by_road_type": {str(name): int(count) for name, count in road_type_counts.items()},
        "geographic_coverage": {
            "latitude_range": [lat_min, lat_max],
            "longitude_range": [lon_min, lon_max],
            # Bounding-box area using the notebook's miles-per-degree factors
            # (69 for latitude, 54.6 for longitude).
            "coverage_area_sq_miles": round(((lat_max - lat_min) * 69) *
                                            ((lon_max - lon_min) * 54.6), 1)
        }
    },
    "traffic_patterns": {
        "speed_statistics": {
            "overall_mean_mph": _rounded(speed_series.mean()),
            "overall_std_mph": _rounded(speed_series.std()),
            "min_mph": _rounded(speed_series.min()),
            "max_mph": _rounded(speed_series.max())
        },
        "volume_statistics": {
            "overall_mean_vph": _rounded(volume_series.mean()),
            "overall_std_vph": _rounded(volume_series.std()),
            "min_vph": _rounded(volume_series.min()),
            "max_vph": _rounded(volume_series.max())
        },
        "rush_hour_analysis": {
            "lowest_speed_hours": [int(hour) for hour in min_speed_hours.index[:3]],
            "peak_missing_data_hours": [int(hour) for hour in peak_missing_hours.index[:3]]
        },
        "road_type_performance": road_type_performance
    },
    "data_quality": {
        "missing_data": {
            "overall_missing_rate_percent": round(missing_percent, 2),
            "sensors_with_high_missing_rate": int((sensor_missing['missing_rate'] > 10).sum()),
            "sensors_with_low_missing_rate": int((sensor_missing['missing_rate'] < 5).sum()),
            "median_sensor_missing_rate": _rounded(sensor_missing['missing_rate'].median())
        },
        "data_validity": {
            "invalid_speed_readings": len(invalid_speeds),
            "invalid_volume_readings": len(invalid_volumes),
            "duplicate_records": int(duplicates)
        },
        "temporal_consistency": {
            "median_measurement_interval_seconds": float(time_diffs.median()),
            "sensors_with_correct_timing": int((time_diffs == expected_interval).sum())
        }
    },
    "key_insights": [
        f"Dataset covers {len(sensors_df)} sensors across Los Angeles with {sensors_with_data} sensors having traffic data",
        f"Missing data rate is {missing_percent:.1f}%, indicating {'good' if missing_fraction < 0.1 else 'moderate'} data quality",
        f"Clear rush hour patterns observed with lowest speeds at {min_speed_hours.index[0]}:00 ({min_speed_hours.iloc[0]:.1f} mph average)",
        f"Highway sensors show highest speeds (avg {road_type_stats.loc['highway', ('speed_mph', 'mean')]:.1f} mph) vs local roads ({road_type_stats.loc['local', ('speed_mph', 'mean')]:.1f} mph)",
        f"Speed-volume correlation of {speed_volume_corr:.3f} indicates {'strong' if abs(speed_volume_corr) > 0.5 else 'moderate' if abs(speed_volume_corr) > 0.3 else 'weak'} relationship"
    ],
    "recommendations": [
        "Implement forward-fill imputation for missing speed/volume data",
        "Focus preprocessing on rush hour periods (7-9 AM, 5-7 PM) for traffic prediction models",
        "Consider road type as a key feature for traffic prediction algorithms",
        "Monitor sensor reliability and flag sensors with >10% missing data for maintenance",
        "Use 5-minute aggregation windows to match native measurement frequency"
    ]
}

# Save analysis summary, creating the output directory on first use.
summary_file = Path("../data/processed/metr_la_analysis_summary.json")
summary_file.parent.mkdir(parents=True, exist_ok=True)

with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(analysis_summary, f, indent=2)

print("✅ METR-LA Dataset Analysis Complete!")
print(f"📋 Analysis summary saved to: {summary_file}")
print("\n🔍 Key Insights:")
for i, insight in enumerate(analysis_summary['key_insights'], 1):
    print(f"   {i}. {insight}")

print("\n💡 Recommendations for Data Preprocessing:")
for i, rec in enumerate(analysis_summary['recommendations'], 1):
    print(f"   {i}. {rec}")

## Next Steps

Based on this analysis, the next steps in the data pipeline are:

1. **Data Preprocessing (Task 6.3)**: Clean and preprocess the dataset based on identified patterns
2. **HDFS Upload (Task 6.4)**: Upload the cleaned dataset to HDFS for big data processing
3. **Feature Engineering**: Create additional features based on rush hour patterns and road types
4. **Model Training**: Use the insights to inform traffic prediction model development

The analysis shows that the METR-LA dataset has good coverage across Los Angeles with reasonable data quality, making it suitable for traffic prediction model training.