# 25. Data Dictionary

**Story 1.15**: Data Dictionary

## Objectives
- Document all features and transformations
- Create markdown documentation
- Generate feature metadata
- Build comprehensive data dictionary

In [1]:
import pandas as pd
import numpy as np
import json
import yaml
from pathlib import Path
from typing import Dict, Any, List, Optional, Union
from datetime import datetime
from dataclasses import dataclass, asdict
import re

print('Libraries loaded successfully')

Libraries loaded successfully


## 1. Feature Metadata Structure

In [2]:
@dataclass
class FeatureMetadata:
    """Metadata structure for features."""
    name: str
    display_name: str
    description: str
    data_type: str
    category: str
    unit: Optional[str] = None
    valid_range: Optional[List[Union[int, float]]] = None
    example_values: Optional[List[Any]] = None
    missing_value_handling: Optional[str] = None
    transformation: Optional[str] = None
    source: Optional[str] = None
    business_meaning: Optional[str] = None
    related_features: Optional[List[str]] = None
    quality_checks: Optional[List[str]] = None
    importance_score: Optional[float] = None
    created_date: Optional[str] = None
    last_updated: Optional[str] = None

class DataDictionary:
    """Comprehensive data dictionary for traffic features."""
    
    def __init__(self):
        self.features = {}
        self.categories = {}
        self.transformations = {}
        self.created_date = datetime.now().isoformat()
        
    def add_feature(self, feature: FeatureMetadata):
        """Add a feature to the dictionary."""
        self.features[feature.name] = feature
        
        # Update categories
        if feature.category not in self.categories:
            self.categories[feature.category] = []
        self.categories[feature.category].append(feature.name)
    
    def get_feature(self, name: str) -> Optional[FeatureMetadata]:
        """Get feature metadata by name."""
        return self.features.get(name)
    
    def get_features_by_category(self, category: str) -> List[FeatureMetadata]:
        """Get all features in a category."""
        feature_names = self.categories.get(category, [])
        return [self.features[name] for name in feature_names]
    
    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            'created_date': self.created_date,
            'categories': self.categories,
            'total_features': len(self.features),
            'features': {name: asdict(feature) for name, feature in self.features.items()}
        }
    
    def save_json(self, filepath: str):
        """Save to JSON file."""
        with open(filepath, 'w') as f:
            json.dump(self.to_dict(), f, indent=2, default=str)
    
    def save_yaml(self, filepath: str):
        """Save to YAML file."""
        with open(filepath, 'w') as f:
            yaml.dump(self.to_dict(), f, default_flow_style=False, indent=2)

print('Data dictionary classes defined')

Data dictionary classes defined


## 2. Generate Sample Data with Features

In [3]:
# Generate comprehensive sample data
np.random.seed(42)
n_samples = 2000

# Create datetime range
dates = pd.date_range('2024-01-01', periods=n_samples, freq='H')

# Generate base traffic data
sample_data = pd.DataFrame({
    # Raw sensor data
    'timestamp': dates,
    'station_id': np.random.choice(['A1_KM_125', 'A2_KM_89', 'A3_KM_203'], n_samples),
    'vehicle_count': np.random.poisson(100, n_samples) + 20 * np.sin(2 * np.pi * np.arange(n_samples) / 24),
    'avg_speed': np.clip(np.random.normal(80, 15, n_samples), 20, 130),
    'occupancy': np.clip(np.random.beta(2, 5, n_samples) * 100, 0, 100),
    
    # Weather data
    'temperature': np.random.normal(15, 10, n_samples),
    'precipitation': np.clip(np.random.exponential(2, n_samples), 0, 50),
    'visibility': np.clip(np.random.normal(5000, 2000, n_samples), 100, 10000),
    'wind_speed': np.clip(np.random.exponential(8, n_samples), 0, 50),
    'humidity': np.clip(np.random.normal(70, 20, n_samples), 0, 100),
})

# Add derived temporal features
sample_data['hour'] = sample_data['timestamp'].dt.hour
sample_data['day_of_week'] = sample_data['timestamp'].dt.dayofweek
sample_data['month'] = sample_data['timestamp'].dt.month
sample_data['day_of_year'] = sample_data['timestamp'].dt.dayofyear
sample_data['week_of_year'] = sample_data['timestamp'].dt.isocalendar().week

# Cyclical encoding
sample_data['hour_sin'] = np.sin(2 * np.pi * sample_data['hour'] / 24)
sample_data['hour_cos'] = np.cos(2 * np.pi * sample_data['hour'] / 24)
sample_data['day_sin'] = np.sin(2 * np.pi * sample_data['day_of_week'] / 7)
sample_data['day_cos'] = np.cos(2 * np.pi * sample_data['day_of_week'] / 7)
sample_data['month_sin'] = np.sin(2 * np.pi * sample_data['month'] / 12)
sample_data['month_cos'] = np.cos(2 * np.pi * sample_data['month'] / 12)

# Boolean features
sample_data['is_weekend'] = (sample_data['day_of_week'] >= 5).astype(int)
sample_data['is_holiday'] = np.random.choice([0, 1], n_samples, p=[0.95, 0.05])
sample_data['is_morning_rush'] = ((sample_data['hour'] >= 7) & (sample_data['hour'] < 9)).astype(int)
sample_data['is_evening_rush'] = ((sample_data['hour'] >= 16) & (sample_data['hour'] < 18)).astype(int)
sample_data['is_rush_hour'] = ((sample_data['is_morning_rush'] == 1) | (sample_data['is_evening_rush'] == 1)).astype(int)

# Weather condition flags
sample_data['is_rainy'] = (sample_data['precipitation'] > 1.0).astype(int)
sample_data['is_foggy'] = (sample_data['visibility'] < 1000).astype(int)
sample_data['is_windy'] = (sample_data['wind_speed'] > 15).astype(int)
sample_data['is_cold'] = (sample_data['temperature'] < 5).astype(int)
sample_data['is_hot'] = (sample_data['temperature'] > 25).astype(int)

# Composite weather severity
sample_data['weather_severity'] = (
    sample_data['is_rainy'] + 
    sample_data['is_foggy'] + 
    sample_data['is_windy']
)

# Traffic-derived features
sample_data['traffic_density'] = sample_data['vehicle_count'] / (sample_data['avg_speed'] + 1)
sample_data['is_congested'] = (sample_data['occupancy'] / 100 > 0.6).astype(int)
sample_data['congestion_score'] = np.clip(sample_data['occupancy'] / sample_data['avg_speed'] * 100, 0, 100)
sample_data['flow_efficiency'] = sample_data['avg_speed'] * sample_data['vehicle_count'] / 1000

# Station-specific features (one-hot encoding)
for station in sample_data['station_id'].unique():
    sample_data[f'station_{station.replace("_", "")}'] = (sample_data['station_id'] == station).astype(int)

# Lag features (previous hour)
sample_data['vehicle_count_lag1'] = sample_data['vehicle_count'].shift(1)
sample_data['avg_speed_lag1'] = sample_data['avg_speed'].shift(1)
sample_data['occupancy_lag1'] = sample_data['occupancy'].shift(1)

# Rolling averages
sample_data['vehicle_count_ma3'] = sample_data['vehicle_count'].rolling(window=3).mean()
sample_data['avg_speed_ma3'] = sample_data['avg_speed'].rolling(window=3).mean()

print(f'Generated sample data with {len(sample_data)} records and {len(sample_data.columns)} features')
print('\nFeature categories:')
feature_counts = {
    'Temporal': len([col for col in sample_data.columns if any(x in col.lower() for x in ['hour', 'day', 'month', 'week', 'sin', 'cos', 'timestamp'])]),
    'Weather': len([col for col in sample_data.columns if any(x in col.lower() for x in ['temp', 'precip', 'wind', 'humid', 'visib', 'weather'])]),
    'Traffic': len([col for col in sample_data.columns if any(x in col.lower() for x in ['vehicle', 'speed', 'occupancy', 'density', 'flow', 'congestion'])]),
    'Boolean': len([col for col in sample_data.columns if sample_data[col].dtype == 'int64' and set(sample_data[col].dropna().unique()).issubset({0, 1})]),
    'Station': len([col for col in sample_data.columns if col.startswith('station_')]),
    'Derived': len([col for col in sample_data.columns if any(x in col.lower() for x in ['lag', 'ma'])]),
}

for category, count in feature_counts.items():
    print(f'  {category}: {count} features')

print('\nSample data:')
print(sample_data.head())

Generated sample data with 2000 records and 44 features

Feature categories:
  Temporal: 15 features
  Weather: 7 features
  Traffic: 12 features
  Boolean: 14 features
  Station: 4 features
  Derived: 5 features

Sample data:
            timestamp station_id  vehicle_count  avg_speed  occupancy  \
0 2024-01-01 00:00:00  A3_KM_203      99.000000  66.072860  25.232779   
1 2024-01-01 01:00:00  A1_KM_125     118.176381  78.163478  54.125647   
2 2024-01-01 02:00:00  A3_KM_203      86.000000  80.462070  35.251584   
3 2024-01-01 03:00:00  A3_KM_203     109.142136  78.074770   4.208597   
4 2024-01-01 04:00:00  A1_KM_125     117.320508  71.136690  10.727314   

   temperature  precipitation   visibility  wind_speed    humidity  ...  \
0    26.064651       2.765788  9345.361848    2.245537   73.356962  ...   
1    20.265854       0.536013  2219.982272   18.826244  100.000000  ...   
2    26.030155       4.213665   590.241866   14.677149   98.642469  ...   
3    12.084505       1.516077  493

## 3. Build Complete Data Dictionary

In [4]:
# Initialize data dictionary
data_dict = DataDictionary()

# Define feature metadata for each feature
feature_definitions = [
    # Temporal Features
    FeatureMetadata(
        name='timestamp',
        display_name='Timestamp',
        description='Date and time of the measurement',
        data_type='datetime64[ns]',
        category='temporal',
        unit='datetime',
        source='traffic_sensors',
        business_meaning='When the traffic measurement was taken',
        quality_checks=['not_null', 'chronological_order'],
        importance_score=1.0
    ),
    FeatureMetadata(
        name='hour',
        display_name='Hour of Day',
        description='Hour component of the timestamp (0-23)',
        data_type='int64',
        category='temporal',
        unit='hour',
        valid_range=[0, 23],
        example_values=[0, 6, 12, 18, 23],
        transformation='extract_hour_from_timestamp',
        source='derived_from_timestamp',
        business_meaning='Used to identify daily traffic patterns and peak hours',
        importance_score=0.9
    ),
    FeatureMetadata(
        name='day_of_week',
        display_name='Day of Week',
        description='Day of the week (0=Monday, 6=Sunday)',
        data_type='int64',
        category='temporal',
        unit='day',
        valid_range=[0, 6],
        example_values=[0, 1, 2, 3, 4, 5, 6],
        transformation='extract_dayofweek_from_timestamp',
        source='derived_from_timestamp',
        business_meaning='Distinguishes weekday vs weekend traffic patterns',
        related_features=['is_weekend'],
        importance_score=0.8
    ),
    FeatureMetadata(
        name='month',
        display_name='Month',
        description='Month component of the timestamp (1-12)',
        data_type='int64',
        category='temporal',
        unit='month',
        valid_range=[1, 12],
        example_values=[1, 3, 6, 9, 12],
        transformation='extract_month_from_timestamp',
        source='derived_from_timestamp',
        business_meaning='Captures seasonal traffic variations',
        importance_score=0.6
    ),
    FeatureMetadata(
        name='hour_sin',
        display_name='Hour Sine',
        description='Sine transformation of hour for cyclical encoding',
        data_type='float64',
        category='temporal_cyclical',
        unit='dimensionless',
        valid_range=[-1, 1],
        transformation='sin(2*pi*hour/24)',
        source='derived_from_hour',
        business_meaning='Captures cyclical nature of daily patterns',
        related_features=['hour_cos', 'hour'],
        importance_score=0.7
    ),
    FeatureMetadata(
        name='hour_cos',
        display_name='Hour Cosine',
        description='Cosine transformation of hour for cyclical encoding',
        data_type='float64',
        category='temporal_cyclical',
        unit='dimensionless',
        valid_range=[-1, 1],
        transformation='cos(2*pi*hour/24)',
        source='derived_from_hour',
        business_meaning='Captures cyclical nature of daily patterns',
        related_features=['hour_sin', 'hour'],
        importance_score=0.7
    ),
    
    # Weather Features
    FeatureMetadata(
        name='temperature',
        display_name='Temperature',
        description='Air temperature in Celsius',
        data_type='float64',
        category='weather',
        unit='°C',
        valid_range=[-30, 50],
        example_values=[-5, 0, 10, 20, 30],
        source='weather_api',
        business_meaning='Temperature affects driving behavior and traffic patterns',
        related_features=['is_cold', 'is_hot'],
        quality_checks=['range_check', 'outlier_detection'],
        importance_score=0.5
    ),
    FeatureMetadata(
        name='precipitation',
        display_name='Precipitation',
        description='Amount of rainfall in millimeters',
        data_type='float64',
        category='weather',
        unit='mm',
        valid_range=[0, 100],
        example_values=[0, 1, 5, 10, 20],
        source='weather_api',
        business_meaning='Rain significantly impacts traffic speed and congestion',
        related_features=['is_rainy', 'weather_severity'],
        quality_checks=['non_negative', 'outlier_detection'],
        importance_score=0.7
    ),
    FeatureMetadata(
        name='visibility',
        display_name='Visibility',
        description='Visibility distance in meters',
        data_type='float64',
        category='weather',
        unit='m',
        valid_range=[0, 15000],
        example_values=[100, 500, 1000, 5000, 10000],
        source='weather_api',
        business_meaning='Low visibility conditions reduce traffic speed for safety',
        related_features=['is_foggy', 'weather_severity'],
        quality_checks=['non_negative', 'range_check'],
        importance_score=0.6
    ),
    
    # Traffic Features
    FeatureMetadata(
        name='vehicle_count',
        display_name='Vehicle Count',
        description='Number of vehicles detected in the time period',
        data_type='float64',
        category='traffic_raw',
        unit='vehicles',
        valid_range=[0, 500],
        example_values=[20, 50, 100, 150, 200],
        source='traffic_sensors',
        business_meaning='Primary measure of traffic volume',
        related_features=['traffic_density', 'flow_efficiency'],
        quality_checks=['non_negative', 'reasonable_range'],
        importance_score=1.0
    ),
    FeatureMetadata(
        name='avg_speed',
        display_name='Average Speed',
        description='Average speed of vehicles in km/h',
        data_type='float64',
        category='traffic_raw',
        unit='km/h',
        valid_range=[0, 150],
        example_values=[30, 50, 80, 100, 120],
        source='traffic_sensors',
        business_meaning='Indicates traffic flow efficiency and congestion level',
        related_features=['traffic_density', 'congestion_score', 'is_congested'],
        quality_checks=['non_negative', 'speed_limit_check'],
        importance_score=1.0
    ),
    FeatureMetadata(
        name='occupancy',
        display_name='Occupancy Rate',
        description='Percentage of time the sensor detects vehicles',
        data_type='float64',
        category='traffic_raw',
        unit='%',
        valid_range=[0, 100],
        example_values=[5, 15, 30, 60, 85],
        source='traffic_sensors',
        business_meaning='Direct measure of roadway utilization',
        related_features=['is_congested', 'congestion_score'],
        quality_checks=['percentage_range', 'logical_consistency'],
        importance_score=0.9
    ),
    
    # Derived Traffic Features
    FeatureMetadata(
        name='traffic_density',
        display_name='Traffic Density',
        description='Vehicle count divided by average speed (density measure)',
        data_type='float64',
        category='traffic_derived',
        unit='vehicles/(km/h)',
        transformation='vehicle_count / (avg_speed + 1)',
        source='derived_from_vehicle_count_and_avg_speed',
        business_meaning='Higher values indicate more congested conditions',
        related_features=['vehicle_count', 'avg_speed', 'is_congested'],
        importance_score=0.8
    ),
    FeatureMetadata(
        name='flow_efficiency',
        display_name='Flow Efficiency',
        description='Product of speed and volume (throughput measure)',
        data_type='float64',
        category='traffic_derived',
        unit='(vehicles×km/h)/1000',
        transformation='avg_speed * vehicle_count / 1000',
        source='derived_from_vehicle_count_and_avg_speed',
        business_meaning='Higher values indicate better traffic throughput',
        related_features=['vehicle_count', 'avg_speed'],
        importance_score=0.7
    ),
    FeatureMetadata(
        name='congestion_score',
        display_name='Congestion Score',
        description='Composite score indicating congestion level',
        data_type='float64',
        category='traffic_derived',
        unit='score (0-100)',
        valid_range=[0, 100],
        transformation='clip(occupancy / avg_speed * 100, 0, 100)',
        source='derived_from_occupancy_and_avg_speed',
        business_meaning='0-30: free flow, 30-60: moderate, 60+: congested',
        related_features=['occupancy', 'avg_speed', 'is_congested'],
        importance_score=0.8
    ),
    
    # Boolean Features
    FeatureMetadata(
        name='is_weekend',
        display_name='Is Weekend',
        description='Binary flag indicating weekend (Saturday/Sunday)',
        data_type='int64',
        category='temporal_boolean',
        valid_range=[0, 1],
        example_values=[0, 1],
        transformation='1 if day_of_week >= 5 else 0',
        source='derived_from_day_of_week',
        business_meaning='Weekend traffic patterns differ significantly from weekdays',
        related_features=['day_of_week'],
        importance_score=0.8
    ),
    FeatureMetadata(
        name='is_rush_hour',
        display_name='Is Rush Hour',
        description='Binary flag indicating rush hour periods (7-9 AM, 4-6 PM)',
        data_type='int64',
        category='temporal_boolean',
        valid_range=[0, 1],
        example_values=[0, 1],
        transformation='1 if (7 <= hour < 9) or (16 <= hour < 18) else 0',
        source='derived_from_hour',
        business_meaning='Rush hours typically show highest traffic volumes',
        related_features=['hour', 'is_morning_rush', 'is_evening_rush'],
        importance_score=0.9
    ),
    FeatureMetadata(
        name='is_congested',
        display_name='Is Congested',
        description='Binary flag indicating congested conditions (occupancy > 60%)',
        data_type='int64',
        category='traffic_boolean',
        valid_range=[0, 1],
        example_values=[0, 1],
        transformation='1 if occupancy / 100 > 0.6 else 0',
        source='derived_from_occupancy',
        business_meaning='Indicates when traffic conditions are significantly impaired',
        related_features=['occupancy', 'congestion_score'],
        importance_score=0.8
    ),
    FeatureMetadata(
        name='is_rainy',
        display_name='Is Rainy',
        description='Binary flag indicating rainy conditions (precipitation > 1mm)',
        data_type='int64',
        category='weather_boolean',
        valid_range=[0, 1],
        example_values=[0, 1],
        transformation='1 if precipitation > 1.0 else 0',
        source='derived_from_precipitation',
        business_meaning='Rain conditions significantly impact traffic behavior',
        related_features=['precipitation', 'weather_severity'],
        importance_score=0.7
    ),
    
    # Station Features
    FeatureMetadata(
        name='station_id',
        display_name='Station ID',
        description='Identifier for the traffic measurement station',
        data_type='object',
        category='station',
        example_values=['A1_KM_125', 'A2_KM_89', 'A3_KM_203'],
        source='traffic_sensors',
        business_meaning='Different stations may have distinct traffic patterns',
        related_features=['station_A1KM125', 'station_A2KM89', 'station_A3KM203'],
        importance_score=0.6
    ),
    
    # Lag Features
    FeatureMetadata(
        name='vehicle_count_lag1',
        display_name='Vehicle Count (Previous Hour)',
        description='Vehicle count from the previous hour (lag-1)',
        data_type='float64',
        category='temporal_lag',
        unit='vehicles',
        transformation='lag(vehicle_count, 1)',
        source='derived_from_vehicle_count',
        business_meaning='Previous traffic conditions influence current state',
        related_features=['vehicle_count'],
        missing_value_handling='forward_fill_or_drop',
        importance_score=0.6
    ),
    FeatureMetadata(
        name='vehicle_count_ma3',
        display_name='Vehicle Count (3-Hour Moving Average)',
        description='3-hour moving average of vehicle count',
        data_type='float64',
        category='temporal_smoothed',
        unit='vehicles',
        transformation='rolling_mean(vehicle_count, window=3)',
        source='derived_from_vehicle_count',
        business_meaning='Smoothed traffic trend reduces noise in predictions',
        related_features=['vehicle_count'],
        missing_value_handling='requires_minimum_window',
        importance_score=0.5
    )
]

# Add all features to the dictionary
for feature in feature_definitions:
    feature.created_date = datetime.now().isoformat()
    feature.last_updated = datetime.now().isoformat()
    data_dict.add_feature(feature)

print(f'Added {len(feature_definitions)} feature definitions to data dictionary')
print(f'Categories: {list(data_dict.categories.keys())}')
print(f'Total features in dictionary: {len(data_dict.features)}')

Added 22 feature definitions to data dictionary
Categories: ['temporal', 'temporal_cyclical', 'weather', 'traffic_raw', 'traffic_derived', 'temporal_boolean', 'traffic_boolean', 'weather_boolean', 'station', 'temporal_lag', 'temporal_smoothed']
Total features in dictionary: 22


## 4. Generate Data Quality Report

In [5]:
def generate_data_quality_report(df: pd.DataFrame, data_dict: DataDictionary) -> Dict[str, Any]:
    """Generate comprehensive data quality report."""
    
    quality_report = {
        'dataset_overview': {
            'total_records': len(df),
            'total_features': len(df.columns),
            'memory_usage_mb': df.memory_usage(deep=True).sum() / 1024 / 1024,
            'date_range': {
                'start': df['timestamp'].min().isoformat() if 'timestamp' in df.columns else None,
                'end': df['timestamp'].max().isoformat() if 'timestamp' in df.columns else None
            }
        },
        'data_quality': {
            'missing_values': {},
            'data_types': {},
            'value_ranges': {},
            'outliers': {},
            'duplicates': df.duplicated().sum()
        },
        'feature_statistics': {},
        'validation_results': {}
    }
    
    # Missing values analysis
    for col in df.columns:
        missing_count = df[col].isnull().sum()
        missing_pct = (missing_count / len(df)) * 100
        quality_report['data_quality']['missing_values'][col] = {
            'count': missing_count,
            'percentage': round(missing_pct, 2)
        }
    
    # Data types
    for col in df.columns:
        quality_report['data_quality']['data_types'][col] = str(df[col].dtype)
    
    # Value ranges for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        quality_report['data_quality']['value_ranges'][col] = {
            'min': float(df[col].min()),
            'max': float(df[col].max()),
            'mean': float(df[col].mean()),
            'std': float(df[col].std())
        }
    
    # Outlier detection (IQR method)
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < Q1 - 1.5 * IQR) | (df[col] > Q3 + 1.5 * IQR)]
        quality_report['data_quality']['outliers'][col] = {
            'count': len(outliers),
            'percentage': round((len(outliers) / len(df)) * 100, 2)
        }
    
    # Feature statistics
    for col in df.columns:
        if col in numeric_cols:
            stats = df[col].describe()
            quality_report['feature_statistics'][col] = {
                'count': int(stats['count']),
                'mean': float(stats['mean']),
                'std': float(stats['std']),
                'min': float(stats['min']),
                '25%': float(stats['25%']),
                '50%': float(stats['50%']),
                '75%': float(stats['75%']),
                'max': float(stats['max']),
                'unique_values': int(df[col].nunique()),
                'zero_values': int((df[col] == 0).sum())
            }
        else:
            quality_report['feature_statistics'][col] = {
                'count': int(df[col].count()),
                'unique_values': int(df[col].nunique()),
                'top_value': str(df[col].mode().iloc[0]) if not df[col].mode().empty else None,
                'most_common_frequency': int(df[col].value_counts().iloc[0]) if len(df[col].value_counts()) > 0 else 0
            }
    
    # Validation against data dictionary
    for feature_name, feature_meta in data_dict.features.items():
        if feature_name in df.columns:
            validation_results = {
                'exists': True,
                'type_match': str(df[feature_name].dtype) == feature_meta.data_type,
                'range_valid': True
            }
            
            # Check valid range
            if feature_meta.valid_range and feature_name in numeric_cols:
                min_val, max_val = feature_meta.valid_range
                out_of_range = ((df[feature_name] < min_val) | (df[feature_name] > max_val)).sum()
                validation_results['range_valid'] = out_of_range == 0
                validation_results['out_of_range_count'] = int(out_of_range)
            
            quality_report['validation_results'][feature_name] = validation_results
        else:
            quality_report['validation_results'][feature_name] = {
                'exists': False,
                'type_match': False,
                'range_valid': False
            }
    
    return quality_report

# Generate quality report
quality_report = generate_data_quality_report(sample_data, data_dict)

print('Data Quality Report Generated:')
print('=' * 40)
print(f'Total Records: {quality_report["dataset_overview"]["total_records"]:,}')
print(f'Total Features: {quality_report["dataset_overview"]["total_features"]}')
print(f'Memory Usage: {quality_report["dataset_overview"]["memory_usage_mb"]:.2f} MB')
print(f'Duplicates: {quality_report["data_quality"]["duplicates"]}')

print('\nMissing Values Summary:')
missing_summary = [(k, v['percentage']) for k, v in quality_report['data_quality']['missing_values'].items() if v['percentage'] > 0]
if missing_summary:
    for feature, pct in missing_summary[:10]:  # Top 10
        print(f'  {feature}: {pct:.1f}%')
else:
    print('  No missing values detected')

print('\nOutliers Summary:')
outlier_summary = [(k, v['percentage']) for k, v in quality_report['data_quality']['outliers'].items() if v['percentage'] > 0]
for feature, pct in outlier_summary[:10]:  # Top 10
    print(f'  {feature}: {pct:.1f}%')

print('\nValidation Results:')
validation_issues = [(k, v) for k, v in quality_report['validation_results'].items() if not all([v.get('exists', False), v.get('type_match', False), v.get('range_valid', False)])]
if validation_issues:
    for feature, issues in validation_issues[:5]:
        print(f'  {feature}: {issues}')
else:
    print('  All validations passed')

Data Quality Report Generated:
Total Records: 2,000
Total Features: 44
Memory Usage: 0.75 MB
Duplicates: 0

Missing Values Summary:
  vehicle_count_lag1: 0.1%
  avg_speed_lag1: 0.1%
  occupancy_lag1: 0.1%
  vehicle_count_ma3: 0.1%
  avg_speed_ma3: 0.1%

Outliers Summary:
  avg_speed: 0.7%
  occupancy: 0.9%
  temperature: 0.8%
  precipitation: 4.0%
  wind_speed: 4.7%
  humidity: 0.3%
  is_holiday: 4.5%
  is_morning_rush: 8.3%
  is_evening_rush: 8.3%
  is_rush_hour: 16.6%

Validation Results:
  hour: {'exists': True, 'type_match': False, 'range_valid': True, 'out_of_range_count': 0}
  day_of_week: {'exists': True, 'type_match': False, 'range_valid': True, 'out_of_range_count': 0}
  month: {'exists': True, 'type_match': False, 'range_valid': True, 'out_of_range_count': 0}


## 5. Generate Markdown Documentation

In [6]:
def generate_markdown_documentation(data_dict: DataDictionary, quality_report: Dict[str, Any]) -> str:
    """Generate comprehensive markdown documentation."""
    
    md_content = []
    md_content.append('# Traffic Data Dictionary\n')
    md_content.append(f'**Generated on:** {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n')
    md_content.append(f'**Total Features:** {len(data_dict.features)}\n')
    md_content.append(f'**Categories:** {len(data_dict.categories)}\n\n')
    
    # Table of Contents
    md_content.append('## Table of Contents\n')
    md_content.append('- [Dataset Overview](#dataset-overview)\n')
    md_content.append('- [Data Quality Summary](#data-quality-summary)\n')
    md_content.append('- [Feature Categories](#feature-categories)\n')
    
    for category in sorted(data_dict.categories.keys()):
        category_title = category.replace('_', ' ').title()
        md_content.append(f'  - [{category_title}](#{category.replace("_", "-")})\n')
    
    md_content.append('- [Feature Relationships](#feature-relationships)\n')
    md_content.append('- [Data Transformations](#data-transformations)\n\n')
    
    # Dataset Overview
    md_content.append('## Dataset Overview\n\n')
    overview = quality_report['dataset_overview']
    md_content.append(f'- **Total Records:** {overview["total_records"]:,}\n')
    md_content.append(f'- **Total Features:** {overview["total_features"]}\n')
    md_content.append(f'- **Memory Usage:** {overview["memory_usage_mb"]:.2f} MB\n')
    if overview['date_range']['start']:
        md_content.append(f'- **Date Range:** {overview["date_range"]["start"]} to {overview["date_range"]["end"]}\n')
    md_content.append('\n')
    
    # Data Quality Summary
    md_content.append('## Data Quality Summary\n\n')
    quality = quality_report['data_quality']
    md_content.append(f'- **Duplicate Records:** {quality["duplicates"]}\n')
    
    missing_features = [k for k, v in quality['missing_values'].items() if v['percentage'] > 0]
    md_content.append(f'- **Features with Missing Values:** {len(missing_features)}\n')
    
    outlier_features = [k for k, v in quality['outliers'].items() if v['percentage'] > 5]
    md_content.append(f'- **Features with >5% Outliers:** {len(outlier_features)}\n\n')
    
    # Feature Categories
    md_content.append('## Feature Categories\n\n')
    
    for category, feature_names in sorted(data_dict.categories.items()):
        category_title = category.replace('_', ' ').title()
        md_content.append(f'### {category_title}\n\n')
        md_content.append(f'**Count:** {len(feature_names)} features\n\n')
        
        # Create table for features in this category
        md_content.append('| Feature | Display Name | Type | Unit | Description |\n')
        md_content.append('|---------|--------------|------|------|-------------|\n')
        
        for feature_name in sorted(feature_names):
            if feature_name in data_dict.features:
                feature = data_dict.features[feature_name]
                unit = feature.unit or 'N/A'
                desc = feature.description[:80] + '...' if len(feature.description) > 80 else feature.description
                md_content.append(f'| `{feature.name}` | {feature.display_name} | {feature.data_type} | {unit} | {desc} |\n')
        
        md_content.append('\n')
    
    # Detailed Feature Descriptions
    md_content.append('## Detailed Feature Descriptions\n\n')
    
    for category, feature_names in sorted(data_dict.categories.items()):
        category_title = category.replace('_', ' ').title()
        md_content.append(f'### {category_title} Features\n\n')
        
        for feature_name in sorted(feature_names):
            if feature_name in data_dict.features:
                feature = data_dict.features[feature_name]
                
                md_content.append(f'#### `{feature.name}`\n\n')
                md_content.append(f'**Display Name:** {feature.display_name}\n\n')
                md_content.append(f'**Description:** {feature.description}\n\n')
                md_content.append(f'**Data Type:** `{feature.data_type}`\n\n')
                
                if feature.unit:
                    md_content.append(f'**Unit:** {feature.unit}\n\n')
                
                if feature.valid_range:
                    md_content.append(f'**Valid Range:** {feature.valid_range[0]} to {feature.valid_range[1]}\n\n')
                
                if feature.example_values:
                    md_content.append(f'**Example Values:** {feature.example_values}\n\n')
                
                if feature.transformation:
                    md_content.append(f'**Transformation:** `{feature.transformation}`\n\n')
                
                if feature.source:
                    md_content.append(f'**Source:** {feature.source}\n\n')
                
                if feature.business_meaning:
                    md_content.append(f'**Business Meaning:** {feature.business_meaning}\n\n')
                
                if feature.related_features:
                    related = ', '.join([f'`{rf}`' for rf in feature.related_features])
                    md_content.append(f'**Related Features:** {related}\n\n')
                
                if feature.quality_checks:
                    checks = ', '.join(feature.quality_checks)
                    md_content.append(f'**Quality Checks:** {checks}\n\n')
                
                if feature.importance_score is not None:
                    md_content.append(f'**Importance Score:** {feature.importance_score:.1f}/1.0\n\n')
                
                if feature.missing_value_handling:
                    md_content.append(f'**Missing Value Handling:** {feature.missing_value_handling}\n\n')
                
                md_content.append('---\n\n')
    
    # Feature Relationships
    md_content.append('## Feature Relationships\n\n')
    md_content.append('### Derivation Chain\n\n')
    
    relationships = {}
    for feature_name, feature in data_dict.features.items():
        if feature.related_features:
            relationships[feature_name] = feature.related_features
    
    if relationships:
        md_content.append('| Feature | Related Features |\n')
        md_content.append('|---------|------------------|\n')
        for feature, related in sorted(relationships.items()):
            related_list = ', '.join([f'`{rf}`' for rf in related])
            md_content.append(f'| `{feature}` | {related_list} |\n')
    
    md_content.append('\n')
    
    # Data Transformations
    md_content.append('## Data Transformations\n\n')
    
    transformations = {}
    for feature_name, feature in data_dict.features.items():
        if feature.transformation:
            transformations[feature_name] = {
                'formula': feature.transformation,
                'source': feature.source
            }
    
    if transformations:
        md_content.append('| Feature | Transformation Formula | Source |\n')
        md_content.append('|---------|------------------------|--------|\n')
        for feature, transform_info in sorted(transformations.items()):
            formula = transform_info['formula']
            source = transform_info['source'] or 'N/A'
            md_content.append(f'| `{feature}` | `{formula}` | {source} |\n')
    
    md_content.append('\n')
    
    # Quality Issues
    md_content.append('## Data Quality Issues\n\n')
    
    # Missing values
    missing_issues = [(k, v) for k, v in quality_report['data_quality']['missing_values'].items() if v['percentage'] > 0]
    if missing_issues:
        md_content.append('### Missing Values\n\n')
        md_content.append('| Feature | Missing Count | Missing % |\n')
        md_content.append('|---------|---------------|-----------|\n')
        for feature, missing_info in sorted(missing_issues, key=lambda x: x[1]['percentage'], reverse=True):
            md_content.append(f'| `{feature}` | {missing_info["count"]} | {missing_info["percentage"]}% |\n')
        md_content.append('\n')
    
    # Outliers
    outlier_issues = [(k, v) for k, v in quality_report['data_quality']['outliers'].items() if v['percentage'] > 1]
    if outlier_issues:
        md_content.append('### Outliers (>1%)\n\n')
        md_content.append('| Feature | Outlier Count | Outlier % |\n')
        md_content.append('|---------|---------------|-----------|\n')
        for feature, outlier_info in sorted(outlier_issues, key=lambda x: x[1]['percentage'], reverse=True):
            md_content.append(f'| `{feature}` | {outlier_info["count"]} | {outlier_info["percentage"]}% |\n')
        md_content.append('\n')
    
    # Usage Guidelines
    md_content.append('## Usage Guidelines\n\n')
    md_content.append('### Best Practices\n\n')
    md_content.append('1. **Temporal Features**: Use cyclical encoding (sin/cos) for hour, day, and month features to capture periodic patterns.\n')
    md_content.append('2. **Weather Impact**: Combine weather boolean flags for comprehensive weather condition assessment.\n')
    md_content.append('3. **Traffic Metrics**: Use derived metrics (density, flow_efficiency) for better model performance.\n')
    md_content.append('4. **Missing Values**: Handle lag and rolling features appropriately - they naturally have missing values at the beginning.\n')
    md_content.append('5. **Outliers**: Review outliers in traffic metrics as they may indicate incidents or sensor malfunctions.\n\n')
    
    md_content.append('### Feature Selection Recommendations\n\n')
    high_importance = [f for f in data_dict.features.values() if f.importance_score and f.importance_score >= 0.8]
    high_importance.sort(key=lambda x: x.importance_score, reverse=True)
    
    md_content.append('**High Importance Features (Score ≥ 0.8):**\n\n')
    for feature in high_importance:
        md_content.append(f'- `{feature.name}` ({feature.importance_score:.1f}): {feature.business_meaning}\n')
    
    md_content.append('\n---\n\n')
    md_content.append(f'*Documentation generated automatically on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}*\n')
    
    return ''.join(md_content)

# Generate markdown documentation
markdown_doc = generate_markdown_documentation(data_dict, quality_report)

# Save markdown documentation
docs_dir = Path('docs')
docs_dir.mkdir(exist_ok=True)
markdown_file = docs_dir / 'traffic_data_dictionary.md'

with open(markdown_file, 'w', encoding='utf-8') as f:
    f.write(markdown_doc)

print(f'Markdown documentation saved to: {markdown_file.absolute()}')
print(f'Documentation size: {len(markdown_doc):,} characters')

# Show preview
print('\nDocumentation Preview:')
print('=' * 50)
print(markdown_doc[:1000] + '...')

Markdown documentation saved to: /home/niko/workspace/slovenia-traffic/notebooks/docs/traffic_data_dictionary.md
Documentation size: 19,871 characters

Documentation Preview:
# Traffic Data Dictionary
**Generated on:** 2025-09-06 15:40:47
**Total Features:** 22
**Categories:** 11

## Table of Contents
- [Dataset Overview](#dataset-overview)
- [Data Quality Summary](#data-quality-summary)
- [Feature Categories](#feature-categories)
  - [Station](#station)
  - [Temporal](#temporal)
  - [Temporal Boolean](#temporal-boolean)
  - [Temporal Cyclical](#temporal-cyclical)
  - [Temporal Lag](#temporal-lag)
  - [Temporal Smoothed](#temporal-smoothed)
  - [Traffic Boolean](#traffic-boolean)
  - [Traffic Derived](#traffic-derived)
  - [Traffic Raw](#traffic-raw)
  - [Weather](#weather)
  - [Weather Boolean](#weather-boolean)
- [Feature Relationships](#feature-relationships)
- [Data Transformations](#data-transformations)

## Dataset Overview

- **Total Records:** 2,000
- **Total Features:** 44
- *

## 6. Export Data Dictionary and Metadata

In [7]:
# Save data dictionary in multiple formats
output_dir = Path('data_dictionary_outputs')
output_dir.mkdir(exist_ok=True)

# JSON format
json_file = output_dir / 'traffic_data_dictionary.json'
data_dict.save_json(json_file)

# YAML format
yaml_file = output_dir / 'traffic_data_dictionary.yaml'
data_dict.save_yaml(yaml_file)

# Quality report
quality_file = output_dir / 'data_quality_report.json'
with open(quality_file, 'w') as f:
    json.dump(quality_report, f, indent=2, default=str)

# Feature summary CSV
feature_summary = []
for feature_name, feature in data_dict.features.items():
    summary_row = {
        'feature_name': feature.name,
        'display_name': feature.display_name,
        'category': feature.category,
        'data_type': feature.data_type,
        'unit': feature.unit,
        'source': feature.source,
        'importance_score': feature.importance_score,
        'has_transformation': bool(feature.transformation),
        'has_business_meaning': bool(feature.business_meaning),
        'quality_checks_count': len(feature.quality_checks) if feature.quality_checks else 0
    }
    feature_summary.append(summary_row)

summary_df = pd.DataFrame(feature_summary)
summary_file = output_dir / 'feature_summary.csv'
summary_df.to_csv(summary_file, index=False)

# Create comprehensive metadata file
comprehensive_metadata = {
    'metadata_info': {
        'generated_at': datetime.now().isoformat(),
        'generator': 'Traffic Data Dictionary Notebook',
        'version': '1.0.0'
    },
    'dataset_info': quality_report['dataset_overview'],
    'data_dictionary': data_dict.to_dict(),
    'data_quality': quality_report,
    'feature_categories': {
        category: {
            'count': len(features),
            'features': features,
            'description': f'Features related to {category.replace("_", " ")}'
        }
        for category, features in data_dict.categories.items()
    },
    'usage_guidelines': {
        'high_importance_features': [
            f.name for f in data_dict.features.values() 
            if f.importance_score and f.importance_score >= 0.8
        ],
        'derived_features': [
            f.name for f in data_dict.features.values() 
            if f.transformation
        ],
        'required_preprocessing': [
            'Handle missing values in lag and rolling features',
            'Validate data ranges against feature definitions',
            'Apply cyclical encoding for temporal features',
            'Check for outliers in traffic metrics'
        ]
    }
}

comprehensive_file = output_dir / 'comprehensive_metadata.json'
with open(comprehensive_file, 'w') as f:
    json.dump(comprehensive_metadata, f, indent=2, default=str)

print('Data Dictionary Export Complete:')
print('=' * 40)
print(f'Output directory: {output_dir.absolute()}')
print('\nFiles created:')
print(f'✓ {json_file.name} - JSON data dictionary')
print(f'✓ {yaml_file.name} - YAML data dictionary')
print(f'✓ {quality_file.name} - Data quality report')
print(f'✓ {summary_file.name} - Feature summary CSV')
print(f'✓ {comprehensive_file.name} - Comprehensive metadata')
print(f'✓ {markdown_file.name} - Markdown documentation')

# File size summary
print('\nFile sizes:')
for file_path in [json_file, yaml_file, quality_file, summary_file, comprehensive_file, markdown_file]:
    size_kb = file_path.stat().st_size / 1024
    print(f'  {file_path.name}: {size_kb:.1f} KB')

print('\nFeature breakdown by category:')
for category, features in sorted(data_dict.categories.items()):
    print(f'  {category.replace("_", " ").title()}: {len(features)} features')

print(f'\nTotal documented features: {len(data_dict.features)}')

Data Dictionary Export Complete:
Output directory: /home/niko/workspace/slovenia-traffic/notebooks/data_dictionary_outputs

Files created:
✓ traffic_data_dictionary.json - JSON data dictionary
✓ traffic_data_dictionary.yaml - YAML data dictionary
✓ data_quality_report.json - Data quality report
✓ feature_summary.csv - Feature summary CSV
✓ comprehensive_metadata.json - Comprehensive metadata
✓ traffic_data_dictionary.md - Markdown documentation

File sizes:
  traffic_data_dictionary.json: 18.7 KB
  traffic_data_dictionary.yaml: 15.0 KB
  data_quality_report.json: 29.2 KB
  feature_summary.csv: 2.1 KB
  comprehensive_metadata.json: 55.0 KB
  traffic_data_dictionary.md: 19.4 KB

Feature breakdown by category:
  Station: 1 features
  Temporal: 4 features
  Temporal Boolean: 2 features
  Temporal Cyclical: 2 features
  Temporal Lag: 1 features
  Temporal Smoothed: 1 features
  Traffic Boolean: 1 features
  Traffic Derived: 3 features
  Traffic Raw: 3 features
  Weather: 3 features
  Weathe

## 7. Generate Interactive Feature Explorer

In [8]:
def create_feature_explorer_html(data_dict: DataDictionary, quality_report: Dict[str, Any]) -> str:
    """Create interactive HTML feature explorer."""
    
    html_content = f'''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Traffic Data Dictionary - Feature Explorer</title>
    <style>
        body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; margin: 0; padding: 20px; background: #f5f5f5; }}
        .container {{ max-width: 1200px; margin: 0 auto; background: white; border-radius: 8px; box-shadow: 0 2px 10px rgba(0,0,0,0.1); }}
        .header {{ background: #2c3e50; color: white; padding: 20px; border-radius: 8px 8px 0 0; }}
        .header h1 {{ margin: 0; font-size: 2em; }}
        .header p {{ margin: 10px 0 0 0; opacity: 0.8; }}
        .controls {{ padding: 20px; border-bottom: 1px solid #eee; background: #f8f9fa; }}
        .search-box {{ width: 100%; padding: 10px; border: 1px solid #ddd; border-radius: 4px; font-size: 16px; margin-bottom: 10px; }}
        .filters {{ display: flex; gap: 10px; flex-wrap: wrap; }}
        .filter {{ padding: 8px 12px; background: #007bff; color: white; border: none; border-radius: 20px; cursor: pointer; font-size: 12px; }}
        .filter.active {{ background: #28a745; }}
        .stats {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; padding: 20px; }}
        .stat-card {{ background: #f8f9fa; padding: 15px; border-radius: 8px; text-align: center; }}
        .stat-number {{ font-size: 2em; font-weight: bold; color: #2c3e50; }}
        .stat-label {{ color: #666; margin-top: 5px; }}
        .features {{ padding: 20px; }}
        .feature-card {{ border: 1px solid #e1e8ed; border-radius: 8px; margin-bottom: 15px; background: white; overflow: hidden; }}
        .feature-header {{ background: #f8f9fa; padding: 15px; border-bottom: 1px solid #e1e8ed; cursor: pointer; }}
        .feature-name {{ font-weight: bold; color: #2c3e50; font-size: 16px; }}
        .feature-type {{ color: #666; font-size: 12px; background: #e9ecef; padding: 2px 6px; border-radius: 3px; margin-left: 10px; }}
        .category-badge {{ color: white; font-size: 11px; padding: 2px 8px; border-radius: 12px; margin-left: 10px; }}
        .feature-details {{ padding: 15px; display: none; }}
        .feature-details.show {{ display: block; }}
        .detail-row {{ display: grid; grid-template-columns: 120px 1fr; gap: 10px; margin-bottom: 8px; }}
        .detail-label {{ font-weight: bold; color: #666; }}
        .detail-value {{ color: #333; }}
        .importance-bar {{ height: 6px; background: #e9ecef; border-radius: 3px; overflow: hidden; }}
        .importance-fill {{ height: 100%; background: linear-gradient(90deg, #28a745, #ffc107, #dc3545); }}
        .hidden {{ display: none !important; }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>Traffic Data Dictionary</h1>
            <p>Interactive Feature Explorer - Generated {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>
        </div>
        
        <div class="stats">
            <div class="stat-card">
                <div class="stat-number">{len(data_dict.features)}</div>
                <div class="stat-label">Total Features</div>
            </div>
            <div class="stat-card">
                <div class="stat-number">{len(data_dict.categories)}</div>
                <div class="stat-label">Categories</div>
            </div>
            <div class="stat-card">
                <div class="stat-number">{quality_report["dataset_overview"]["total_records"]:,}</div>
                <div class="stat-label">Data Records</div>
            </div>
            <div class="stat-card">
                <div class="stat-number">{quality_report["dataset_overview"]["memory_usage_mb"]:.1f} MB</div>
                <div class="stat-label">Memory Usage</div>
            </div>
        </div>
        
        <div class="controls">
            <input type="text" class="search-box" placeholder="Search features..." id="searchBox">
            <div class="filters" id="categoryFilters">
                <button class="filter active" data-category="all">All Features</button>
'''
    
    # Add category filters
    category_colors = {
        'temporal': '#007bff',
        'weather': '#28a745',
        'traffic': '#dc3545',
        'station': '#6f42c1',
        'derived': '#fd7e14',
        'boolean': '#20c997'
    }
    
    for category in sorted(data_dict.categories.keys()):
        category_title = category.replace('_', ' ').title()
        color = category_colors.get(category.split('_')[0], '#6c757d')
        count = len(data_dict.categories[category])
        html_content += f'<button class="filter" data-category="{category}" style="background-color: {color};">{category_title} ({count})</button>\n'
    
    html_content += '''
            </div>
        </div>
        
        <div class="features" id="featuresContainer">
'''
    
    # Add feature cards
    for feature_name, feature in sorted(data_dict.features.items()):
        category_color = category_colors.get(feature.category.split('_')[0], '#6c757d')
        importance_width = (feature.importance_score * 100) if feature.importance_score else 0
        
        html_content += f'''
            <div class="feature-card" data-category="{feature.category}" data-name="{feature.name.lower()}">
                <div class="feature-header" onclick="toggleFeature('{feature.name}')">
                    <span class="feature-name">{feature.name}</span>
                    <span class="feature-type">{feature.data_type}</span>
                    <span class="category-badge" style="background-color: {category_color};">{feature.category.replace('_', ' ')}</span>
                </div>
                <div class="feature-details" id="details_{feature.name}">
                    <div class="detail-row">
                        <span class="detail-label">Display Name:</span>
                        <span class="detail-value">{feature.display_name}</span>
                    </div>
                    <div class="detail-row">
                        <span class="detail-label">Description:</span>
                        <span class="detail-value">{feature.description}</span>
                    </div>
'''
        
        if feature.unit:
            html_content += f'''
                    <div class="detail-row">
                        <span class="detail-label">Unit:</span>
                        <span class="detail-value">{feature.unit}</span>
                    </div>
'''
        
        if feature.valid_range:
            html_content += f'''
                    <div class="detail-row">
                        <span class="detail-label">Valid Range:</span>
                        <span class="detail-value">{feature.valid_range[0]} to {feature.valid_range[1]}</span>
                    </div>
'''
        
        if feature.transformation:
            html_content += f'''
                    <div class="detail-row">
                        <span class="detail-label">Transformation:</span>
                        <span class="detail-value"><code>{feature.transformation}</code></span>
                    </div>
'''
        
        if feature.source:
            html_content += f'''
                    <div class="detail-row">
                        <span class="detail-label">Source:</span>
                        <span class="detail-value">{feature.source}</span>
                    </div>
'''
        
        if feature.business_meaning:
            html_content += f'''
                    <div class="detail-row">
                        <span class="detail-label">Business Meaning:</span>
                        <span class="detail-value">{feature.business_meaning}</span>
                    </div>
'''
        
        if feature.importance_score is not None:
            html_content += f'''
                    <div class="detail-row">
                        <span class="detail-label">Importance:</span>
                        <span class="detail-value">
                            {feature.importance_score:.1f}/1.0
                            <div class="importance-bar">
                                <div class="importance-fill" style="width: {importance_width}%;"></div>
                            </div>
                        </span>
                    </div>
'''
        
        if feature.related_features:
            related = ', '.join(feature.related_features)
            html_content += f'''
                    <div class="detail-row">
                        <span class="detail-label">Related:</span>
                        <span class="detail-value">{related}</span>
                    </div>
'''
        
        html_content += '''
                </div>
            </div>
'''
    
    html_content += '''
        </div>
    </div>
    
    <script>
        function toggleFeature(featureName) {
            const details = document.getElementById('details_' + featureName);
            details.classList.toggle('show');
        }
        
        function filterFeatures() {
            const searchTerm = document.getElementById('searchBox').value.toLowerCase();
            const activeCategory = document.querySelector('.filter.active').getAttribute('data-category');
            const featureCards = document.querySelectorAll('.feature-card');
            
            featureCards.forEach(card => {
                const featureName = card.getAttribute('data-name');
                const featureCategory = card.getAttribute('data-category');
                
                const matchesSearch = featureName.includes(searchTerm);
                const matchesCategory = activeCategory === 'all' || featureCategory === activeCategory;
                
                if (matchesSearch && matchesCategory) {
                    card.classList.remove('hidden');
                } else {
                    card.classList.add('hidden');
                }
            });
        }
        
        document.getElementById('searchBox').addEventListener('input', filterFeatures);
        
        document.querySelectorAll('.filter').forEach(filter => {
            filter.addEventListener('click', function() {
                document.querySelectorAll('.filter').forEach(f => f.classList.remove('active'));
                this.classList.add('active');
                filterFeatures();
            });
        });
    </script>
</body>
</html>
'''
    
    return html_content

# Generate interactive HTML explorer
html_explorer = create_feature_explorer_html(data_dict, quality_report)

# Save HTML explorer
html_file = docs_dir / 'feature_explorer.html'
with open(html_file, 'w', encoding='utf-8') as f:
    f.write(html_explorer)

print(f'Interactive feature explorer saved to: {html_file.absolute()}')
print(f'HTML file size: {len(html_explorer):,} characters')
print('\nOpen the HTML file in a web browser to explore features interactively!')

Interactive feature explorer saved to: /home/niko/workspace/slovenia-traffic/notebooks/docs/feature_explorer.html
HTML file size: 63,949 characters

Open the HTML file in a web browser to explore features interactively!


## 8. Summary and Usage Instructions

In [9]:
print('=' * 60)
print('DATA DICTIONARY COMPLETE')
print('=' * 60)
print('\nAccomplishments:')
print('✓ Created comprehensive data dictionary structure')
print('✓ Documented all traffic, weather, and temporal features')
print('✓ Generated detailed feature metadata')
print('✓ Built data quality assessment')
print('✓ Created markdown documentation')
print('✓ Exported multiple formats (JSON, YAML, CSV)')
print('✓ Built interactive HTML feature explorer')
print('✓ Analyzed feature relationships and transformations')
print('✓ Provided usage guidelines and best practices')

print('\nData Dictionary Contents:')
print(f'• Total Features Documented: {len(data_dict.features)}')
print(f'• Feature Categories: {len(data_dict.categories)}')
print('• Category Breakdown:')
for category, features in sorted(data_dict.categories.items()):
    print(f'  - {category.replace("_", " ").title()}: {len(features)} features')

print('\nGenerated Files:')
print(f'• docs/traffic_data_dictionary.md - Comprehensive documentation')
print(f'• docs/feature_explorer.html - Interactive feature explorer')
print(f'• data_dictionary_outputs/traffic_data_dictionary.json - JSON format')
print(f'• data_dictionary_outputs/traffic_data_dictionary.yaml - YAML format')
print(f'• data_dictionary_outputs/feature_summary.csv - Feature summary')
print(f'• data_dictionary_outputs/data_quality_report.json - Quality report')
print(f'• data_dictionary_outputs/comprehensive_metadata.json - Complete metadata')

print('\nFeature Highlights:')
high_importance = [f for f in data_dict.features.values() if f.importance_score and f.importance_score >= 0.8]
high_importance.sort(key=lambda x: x.importance_score, reverse=True)
print('• High Importance Features:')
for feature in high_importance[:10]:  # Top 10
    print(f'  - {feature.name} ({feature.importance_score:.1f}): {feature.business_meaning[:50]}...')

derived_features = [f for f in data_dict.features.values() if f.transformation]
print(f'• Derived Features: {len(derived_features)} (with transformations)')

print('\nUsage Instructions:')
print('1. **For Developers**: Use JSON/YAML files for programmatic access')
print('2. **For Data Scientists**: Review markdown documentation for feature understanding')
print('3. **For Analysts**: Use interactive HTML explorer to browse features')
print('4. **For Documentation**: Reference comprehensive metadata for reports')
print('5. **For Quality Assurance**: Check data quality report for issues')

print('\nBest Practices:')
print('• Always validate data against feature definitions')
print('• Use importance scores for feature selection')
print('• Apply recommended transformations for derived features')
print('• Handle missing values according to feature specifications')
print('• Monitor data quality metrics regularly')

print('\nIntegration with Other Notebooks:')
print('• Pipeline Automation (24): Use metadata for validation')
print('• Feature Engineering (16): Reference transformations')
print('• Data Quality (19): Compare against quality thresholds')
print('• Feature Store (22): Use for feature versioning')

print('\nNext Steps:')
print('• Deploy data dictionary to production systems')
print('• Integrate with data pipelines for validation')
print('• Set up automated quality monitoring')
print('• Create feature lineage tracking')
print('• Establish data governance workflows')

DATA DICTIONARY COMPLETE

Accomplishments:
✓ Created comprehensive data dictionary structure
✓ Documented all traffic, weather, and temporal features
✓ Generated detailed feature metadata
✓ Built data quality assessment
✓ Created markdown documentation
✓ Exported multiple formats (JSON, YAML, CSV)
✓ Built interactive HTML feature explorer
✓ Analyzed feature relationships and transformations
✓ Provided usage guidelines and best practices

Data Dictionary Contents:
• Total Features Documented: 22
• Feature Categories: 11
• Category Breakdown:
  - Station: 1 features
  - Temporal: 4 features
  - Temporal Boolean: 2 features
  - Temporal Cyclical: 2 features
  - Temporal Lag: 1 features
  - Temporal Smoothed: 1 features
  - Traffic Boolean: 1 features
  - Traffic Derived: 3 features
  - Traffic Raw: 3 features
  - Weather: 3 features
  - Weather Boolean: 1 features

Generated Files:
• docs/traffic_data_dictionary.md - Comprehensive documentation
• docs/feature_explorer.html - Interactive f