In [None]:
# @title 1. Group Segments with Train/Test Split (80:20)

# CELL 1 [TAG: parameters]
# Every name below is a default; the orchestrator (Airflow) is expected to
# override them when the notebook is executed.

# --- Run identity and data locations ---------------------------------------
RUN_TIMESTAMP = "2025-01-01_00-00-00"      # Injected by Airflow
INPUT_FOLDER = "s3://processed-data/"      # Output from Step 0
# The default output paths embed the same timestamp as RUN_TIMESTAMP above.
OUTPUT_TRAIN_DATA = "s3://models-quality-eval/2025-01-01_00-00-00/train/grouped_segments.pkl"
OUTPUT_TEST_DATA = "s3://models-quality-eval/2025-01-01_00-00-00/test/grouped_segments.pkl"

# --- Segment filtering and classification ----------------------------------
MIN_DURATION = 15        # seconds
SPEED_THRESHOLD = 25.0   # km/h

# --- Train/test split --------------------------------------------------------
TRAIN_RATIO = 0.8        # 80% train, 20% test
RANDOM_SEED = 42         # For reproducibility

# --- Object storage (MinIO) --------------------------------------------------
# NOTE(review): these look like local-development placeholders; confirm that
# real credentials are always supplied at run time rather than committed here.
MINIO_ENDPOINT = "http://localhost:9000"
MINIO_ACCESS_KEY = "admin"
MINIO_SECRET_KEY = "password123"

In [None]:
# CELL 2: Imports

import os

import pandas as pd

import numpy as np

import pickle

import s3fs

from sklearn.model_selection import train_test_split

In [None]:
# CELL 3: MinIO Configuration

# Resolve the connection settings for the S3-compatible store with ONE rule for
# all three values: use the environment variable when it is set, otherwise keep
# the value from the parameters cell (its default, or whatever was injected).
# This way an injected endpoint is honoured and credentials supplied through
# the environment are actually used.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", MINIO_ENDPOINT)
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", MINIO_ACCESS_KEY)
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", MINIO_SECRET_KEY)

# Short aliases kept defined so any code reading the older names still works.
MINIO_KEY = MINIO_ACCESS_KEY
MINIO_SECRET = MINIO_SECRET_KEY

# Filesystem handle used for listing, reading and writing objects.
fs = s3fs.S3FileSystem(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    client_kwargs={'endpoint_url': MINIO_ENDPOINT}
)

# Same connection settings in the dict form accepted by pandas' `storage_options`.
storage_options = {
    "key": MINIO_ACCESS_KEY,
    "secret": MINIO_SECRET_KEY,
    "client_kwargs": {"endpoint_url": MINIO_ENDPOINT}
}

In [None]:
# CELL 4: Logic

def preprocess_and_group(folder):
    """Load processed trip CSVs and bucket their road-segment blocks by mean speed.

    Every ``*.csv`` object directly under ``folder`` is read through the
    module-level ``fs`` filesystem, sorted by ``seconds_elapsed`` and cut into
    runs of consecutive rows sharing the same ``segment_id``. Runs with fewer
    than ``MIN_DURATION`` rows are dropped; each remaining run becomes an
    ``(n_rows, 2)`` array with columns ``[speed_kmh, acceleration]``.

    Args:
        folder: Location of the CSV files, e.g. ``"s3://bucket/prefix/"``.
            The ``s3://`` scheme and the trailing slash are both optional.

    Returns:
        A two-element list ``[heavy, light]`` of arrays. A run goes to
        ``heavy`` when its mean ``speed_kmh`` is ``<= SPEED_THRESHOLD`` and to
        ``light`` otherwise. Both lists are empty if listing the folder fails
        (the error is printed, not raised).
    """
    grouped_segments = [[], []]  # Index 0: Heavy, Index 1: Light

    # fs.glob works on 'bucket/key' paths. Normalise the folder so the pattern
    # always targets files *inside* it: without the explicit '/', a folder given
    # as 'bucket/data' would also match siblings such as 'bucket/data_old.csv'.
    prefix = folder.replace('s3://', '').rstrip('/')
    pattern = f"{prefix}/*.csv" if prefix else "*.csv"

    try:
        file_paths = fs.glob(pattern)
    except Exception as e:
        # Best effort: report the listing failure and hand back empty groups.
        print(f"Error listing files in {folder}: {e}")
        return grouped_segments

    print(f"Found {len(file_paths)} processed trips.")

    for file_path in file_paths:
        print(f"Processing {file_path}...")

        # Read directly from object storage.
        with fs.open(file_path, 'rb') as f:
            df = pd.read_csv(f)

        # Contiguity below is only meaningful on time-ordered rows.
        df = df.sort_values('seconds_elapsed')

        # A new block starts wherever segment_id differs from the previous row;
        # the running sum of those change flags labels each contiguous run.
        df['block_id'] = (df['segment_id'] != df['segment_id'].shift()).cumsum()

        for _, block in df.groupby('block_id'):
            # Filter out short noise segments (row count compared to MIN_DURATION).
            if len(block) < MIN_DURATION:
                continue

            speed_kmh = block['speed_kmh'].values

            if 'acc_forward' in block.columns:
                # Preferred: acceleration already provided by the upstream step.
                accel_ms2 = block['acc_forward'].values
            else:
                # Fallback: forward difference of speed, converted from km/h to
                # m/s by the 3.6 factor; the last sample stays at zero.
                # NOTE(review): this equals m/s^2 only if rows are 1 s apart — confirm.
                accel_ms2 = np.zeros(len(speed_kmh))
                accel_ms2[:-1] = np.diff(speed_kmh) / 3.6

            # Stack into matrix: [Speed, Acceleration]
            segment_data = np.column_stack((speed_kmh, accel_ms2))

            # Classify on average speed. A NaN mean fails the comparison and
            # therefore lands in the Light group.
            if np.mean(speed_kmh) <= SPEED_THRESHOLD:
                grouped_segments[0].append(segment_data)  # Heavy
            else:
                grouped_segments[1].append(segment_data)  # Light

    return grouped_segments

In [None]:
# CELL 5: Execution & Train/Test Split

print("=== Preprocessing and Grouping ===")
data = preprocess_and_group(INPUT_FOLDER)

print(f"Result: {len(data[0])} Heavy segments, {len(data[1])} Light segments.")

# Seed NumPy's global generator so the shuffles below repeat from run to run.
np.random.seed(RANDOM_SEED)

# Same [heavy, light] layout as `data`; a group with no segments keeps its
# empty list in both outputs.
train_data = [[], []]
test_data = [[], []]

for group_idx, group_name in enumerate(("Heavy", "Light")):
    segments = data[group_idx]

    if not segments:
        print(f"Warning: No segments in {group_name} traffic group.")
        continue

    # Number of segments that go to the training side (rounded down).
    n_total = len(segments)
    n_train = int(n_total * TRAIN_RATIO)

    # Shuffle positions, then cut the shuffled order at n_train.
    indices = np.random.permutation(n_total)
    train_indices, test_indices = indices[:n_train], indices[n_train:]

    train_data[group_idx] = [segments[i] for i in train_indices]
    test_data[group_idx] = [segments[i] for i in test_indices]

    print(f"{group_name} Traffic: {len(train_data[group_idx])} train, {len(test_data[group_idx])} test segments")

print("\n=== Split Summary ===")
print(f"Train Ratio: {TRAIN_RATIO * 100}%")
print(f"Random Seed: {RANDOM_SEED}")
print(f"Run Timestamp: {RUN_TIMESTAMP}")

In [None]:
# CELL 6: Save Train and Test Data

# Each output is one pickled two-element list, [heavy_segments, light_segments],
# written through the `fs` filesystem handle.
# NOTE: pickle files execute code when loaded; consumers should only unpickle
# these objects from storage they trust.

# Save Training Data
print(f"\nSaving training data to {OUTPUT_TRAIN_DATA}...")
with fs.open(OUTPUT_TRAIN_DATA, 'wb') as f:
    pickle.dump(train_data, f)

# Save Test Data
print(f"Saving test data to {OUTPUT_TEST_DATA}...")
with fs.open(OUTPUT_TEST_DATA, 'wb') as f:
    pickle.dump(test_data, f)

# Status message: the check-mark had been stored as mis-decoded bytes; restored.
print("✅ Done. Train and test datasets saved successfully.")