In [None]:
# @title 1. Load Fixed Train/Test Data and Group by Traffic

# CELL 1 [TAG: parameters]
# =========================================================
# Defaults for a standalone run.
# When Airflow executes the notebook it OVERWRITES every value below.
# =========================================================

# --- MinIO connection ------------------------------------
# NOTE(review): these look like local-development placeholders; confirm that
# real credentials only ever arrive through injected parameters.
MINIO_ENDPOINT = "http://localhost:9000"
MINIO_ACCESS_KEY = "admin"
MINIO_SECRET_KEY = "password123"

# --- Run identity ----------------------------------------
RUN_TIMESTAMP = "2025-01-01_00-00-00"  # Injected by Airflow

# --- Inputs: fixed CSV files stored in MinIO -------------
# Columns read downstream: speed_kmh, plus acceleration_ms2 or acc_forward,
# plus an optional segment_id.
INPUT_TRAIN_CSV = "s3://models-quality-eval/data/train.csv"
INPUT_TEST_CSV = "s3://models-quality-eval/data/test.csv"

# --- Outputs: pickled grouped segments, one file per split
OUTPUT_TRAIN_DATA = "s3://models-quality-eval/2025-01-01_00-00-00/train/grouped_segments.pkl"
OUTPUT_TEST_DATA = "s3://models-quality-eval/2025-01-01_00-00-00/test/grouped_segments.pkl"

# --- Traffic classification ------------------------------
# Mean segment speed below the threshold => heavy traffic, otherwise light.
SPEED_THRESHOLD = 25.0     # km/h
# Minimum segment length; it is compared against a segment's row count, so it
# equals seconds only if the data has one row per second (TODO confirm).
MIN_DURATION_TRAIN = 5     # training data
MIN_DURATION_TEST = 0      # test data - accept all segments


In [None]:
# CELL 2: Imports
import pandas as pd
import numpy as np
import pickle
import s3fs


In [None]:
# CELL 3: MinIO Configuration
# Build the S3-compatible filesystem handle (`fs`) that the later cells use
# for every read and write.
# NOTE(review): nothing in this cell performs a request, so the message below
# presumably only confirms the handle was created - verify before relying on
# it as a connectivity check.
minio_options = {
    'key': MINIO_ACCESS_KEY,
    'secret': MINIO_SECRET_KEY,
    'client_kwargs': {'endpoint_url': MINIO_ENDPOINT},
}
fs = s3fs.S3FileSystem(**minio_options)

print(f"✅ Connected to MinIO at {MINIO_ENDPOINT}")


In [None]:
# CELL 4: Helper Function - Group by Traffic
def group_by_traffic(csv_path, speed_threshold, min_duration):
    """
    Load a CSV from MinIO and bucket its segments by traffic condition.

    The file is split into segments using the optional ``segment_id`` column
    (rows sharing an id form one segment); without that column the whole file
    is treated as a single segment. Each kept segment is classified by its
    mean speed.

    Args:
        csv_path: S3 path to the CSV file, opened through the module-level
            ``fs`` filesystem handle.
        speed_threshold: Speed threshold (km/h). A segment whose mean speed is
            strictly below it is heavy traffic, otherwise light traffic.
        min_duration: Minimum segment length. It is compared against the
            segment's row count, so it corresponds to seconds only if the data
            has one row per second (assumption - confirm against the source).

    Returns:
        List of [heavy_traffic_segments, light_traffic_segments].
        Each segment is a numpy array of shape (n, 2) with columns
        [speed_kmh, accel_ms2]. Zero-row segments are never returned.

    Raises:
        ValueError: If ``speed_kmh`` is missing, or if neither
            ``acceleration_ms2`` nor ``acc_forward`` is present.
    """
    print(f"\nLoading: {csv_path}")

    # Load CSV from MinIO
    with fs.open(csv_path, 'rb') as f:
        df = pd.read_csv(f)

    print(f"  Total rows: {len(df)}")
    print(f"  Columns: {list(df.columns)}")

    # Validate required columns (flexible acceleration column naming)
    if 'speed_kmh' not in df.columns:
        raise ValueError("Missing required column: speed_kmh")

    # Handle both acceleration column naming conventions
    if 'acceleration_ms2' in df.columns:
        accel_col = 'acceleration_ms2'
        print(f"  Using acceleration column: acceleration_ms2")
    elif 'acc_forward' in df.columns:
        accel_col = 'acc_forward'
        print(f"  Using acceleration column: acc_forward")
    else:
        raise ValueError("Missing acceleration column: need either 'acceleration_ms2' or 'acc_forward'")

    # Index 0: Heavy Traffic, Index 1: Light Traffic
    heavy_segments = []
    light_segments = []

    if 'segment_id' in df.columns:
        # A stable sort is required here: the default quicksort may reorder
        # rows that share a segment_id, scrambling the sample order inside a
        # segment. Mergesort keeps the original file order within each id.
        ordered = df.sort_values('segment_id', kind='mergesort')
        ids = ordered['segment_id']
        # A new block starts wherever the id differs from the previous row.
        # assign() returns a new frame, so the loaded frame is not mutated.
        ordered = ordered.assign(block_id=(ids != ids.shift()).cumsum())
        segments_iter = ordered.groupby('block_id')
    else:
        # No segmentation information: the whole file is one segment.
        print("  No segment_id found - treating as continuous data")
        segments_iter = [(0, df)]

    for _, segment_df in segments_iter:
        n_rows = len(segment_df)
        # Skip empty segments explicitly: with min_duration == 0 an empty file
        # would otherwise yield a (0, 2) array whose NaN mean lands in the
        # light-traffic bucket.
        if n_rows == 0 or n_rows < min_duration:
            continue

        # Extract speed and acceleration
        speed_kmh = segment_df['speed_kmh'].values
        accel_ms2 = segment_df[accel_col].values

        # Create segment array [speed, accel]
        segment_array = np.column_stack([speed_kmh, accel_ms2])

        # Classify by average speed. A NaN mean (any NaN speed value in the
        # segment) compares False and therefore falls into the light bucket.
        if np.mean(speed_kmh) < speed_threshold:
            heavy_segments.append(segment_array)
        else:
            light_segments.append(segment_array)

    heavy_count = len(heavy_segments)
    light_count = len(light_segments)
    segment_count = heavy_count + light_count

    print(f"  ✅ Processed {segment_count} segments:")
    print(f"     - Heavy Traffic: {heavy_count} segments")
    print(f"     - Light Traffic: {light_count} segments")

    return [heavy_segments, light_segments]


In [None]:
# CELL 5: Load and Group Train Data
train_rule = "=" * 60
print(train_rule)
print("PROCESSING TRAIN DATA")
print(train_rule)

# Arguments are positional: (csv_path, speed_threshold, min_duration).
train_grouped = group_by_traffic(INPUT_TRAIN_CSV, SPEED_THRESHOLD, MIN_DURATION_TRAIN)

# Persist the [heavy, light] segment lists as a pickle in MinIO.
with fs.open(OUTPUT_TRAIN_DATA, 'wb') as train_sink:
    pickle.dump(train_grouped, train_sink)

print(f"\n✅ Train data saved to: {OUTPUT_TRAIN_DATA}")


In [None]:
# CELL 6: Load and Group Test Data
test_rule = "=" * 60
print("\n" + test_rule)
print("PROCESSING TEST DATA")
print(test_rule)

# Arguments are positional: (csv_path, speed_threshold, min_duration).
test_grouped = group_by_traffic(INPUT_TEST_CSV, SPEED_THRESHOLD, MIN_DURATION_TEST)

# Persist the [heavy, light] segment lists as a pickle in MinIO.
with fs.open(OUTPUT_TEST_DATA, 'wb') as test_sink:
    pickle.dump(test_grouped, test_sink)

print(f"\n✅ Test data saved to: {OUTPUT_TEST_DATA}")


In [None]:
# CELL 7: Summary
# Count the segments once up front; index 0 is heavy traffic, index 1 is light.
train_heavy_n = len(train_grouped[0])
train_light_n = len(train_grouped[1])
test_heavy_n = len(test_grouped[0])
test_light_n = len(test_grouped[1])

summary_rule = "=" * 60
print("\n" + summary_rule)
print("SUMMARY")
print(summary_rule)

# Run configuration
print(f"Run Timestamp: {RUN_TIMESTAMP}")
print(f"Speed Threshold: {SPEED_THRESHOLD} km/h")
print(f"Min Duration (Train): {MIN_DURATION_TRAIN} seconds")
print(f"Min Duration (Test): {MIN_DURATION_TEST} seconds (accepts all segments)")

# Per-split segment counts
print(f"\nTrain Set:")
print(f"  - Heavy Traffic: {train_heavy_n} segments")
print(f"  - Light Traffic: {train_light_n} segments")
print(f"  - Total: {train_heavy_n + train_light_n} segments")
print(f"\nTest Set:")
print(f"  - Heavy Traffic: {test_heavy_n} segments")
print(f"  - Light Traffic: {test_light_n} segments")
print(f"  - Total: {test_heavy_n + test_light_n} segments")
print(f"\n✅ Pipeline ready for training!")
