# ParkSense - Production Training Pipeline 🔥🔥🔥🔥

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
import os

## 1. Data Loading
Loads historical sensor data exported from Supabase.

In [10]:
def load_data(file_path='data/supabase_snapshots.csv'):
    """
    Reads the snapshot CSV and parses its ``status_timestamp`` column.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        A DataFrame holding the CSV contents, with ``status_timestamp``
        converted through ``pd.to_datetime``.
    """
    print("Loading snapshot data...")
    snapshots = pd.read_csv(file_path)
    return snapshots.assign(
        status_timestamp=pd.to_datetime(snapshots['status_timestamp'])
    )

## 2. Neighborhood Grouping
Groups bays into blocks of 20 to reduce noise from individual sensors.

In [11]:
def neighborhood_grouping(df, group_size=20):
    """
    Groups individual bays into 'neighborhoods' of ``group_size`` units.

    Two columns are added to ``df`` in place (the same object is returned):

    * ``group_id``    - ``kerbsideid`` floored to a multiple of ``group_size``.
    * ``is_occupied`` - 1 where ``status`` equals ``'Present'``, otherwise 0.

    Args:
        df: Frame with ``kerbsideid`` and ``status`` columns.
        group_size: Number of consecutive kerbside ids per neighborhood.

    Returns:
        ``df`` itself, with the two columns added.

    Raises:
        ValueError: If ``group_size`` is not positive.
    """
    if group_size <= 0:
        raise ValueError("group_size must be a positive integer")

    print("Grouping bays into neighborhoods...")
    df['group_id'] = (df['kerbsideid'] // group_size) * group_size
    # Vectorised comparison instead of a per-row lambda; the explicit dtype
    # keeps the column int64 even when the frame is empty.
    df['is_occupied'] = (df['status'] == 'Present').astype('int64')
    return df

## 3. Time-Series Resampling
Converts raw events into standardized 15-minute intervals.

In [12]:
def timeseries_resampling(df):
    """
    Averages ``is_occupied`` per neighborhood over 15-minute buckets.

    Args:
        df: Frame with ``group_id``, ``status_timestamp`` and
            ``is_occupied`` columns.

    Returns:
        A frame with columns ``group_id``, ``timestamp`` (bucket label) and
        ``occupancy_ratio`` (mean of ``is_occupied`` in that bucket), sorted
        by ``group_id`` then ``timestamp``.
    """
    print("Preprocessing time-series into 15-min intervals...")
    quarter_hour = pd.Grouper(key='status_timestamp', freq='15min')
    mean_occupancy = df.groupby(['group_id', quarter_hour])['is_occupied'].mean()
    interval_frame = mean_occupancy.reset_index()
    # Positional rename: (group key, bucket label, aggregated value).
    interval_frame.columns = ['group_id', 'timestamp', 'occupancy_ratio']
    return interval_frame.sort_values(['group_id', 'timestamp'])

## 4. Feature Engineering
Creates the inputs (X) and target (y) for the model, including time features and lags.

In [13]:
def _aligned_shift(group_ts, periods, step):
    """Return per-group occupancy shifted by ``periods`` rows, NaN unless the time gap is exactly ``periods * step``."""
    per_group = group_ts.groupby('group_id')
    neighbour_time = per_group['timestamp'].shift(periods)
    neighbour_ratio = per_group['occupancy_ratio'].shift(periods)
    # Comparisons against NaT are False, so group edges become NaN as well.
    on_grid = (group_ts['timestamp'] - neighbour_time) == step * periods
    return neighbour_ratio.where(on_grid)


def feature_engineering(group_ts):
    """
    Builds time-based and lag features and defines the target variable.

    A plain ``shift`` moves values by *rows*, not by clock time. If the input
    skips an interval for some group, the "previous row" may be much older
    than 15 minutes. Each lag/target value is therefore kept only when the
    neighbouring row is exactly the expected number of 15-minute steps away;
    otherwise it becomes NaN and the row is dropped with the other
    incomplete rows.

    Args:
        group_ts: Frame with ``group_id``, ``timestamp`` (datetime) and
            ``occupancy_ratio`` columns. It is not modified; it is sorted
            internally, so the caller does not need to pre-sort it.

    Returns:
        Tuple ``(X, y, features)``: the feature frame, the target series
        (occupancy ratio one interval ahead), and the ordered list of feature
        column names.
    """
    print("Engineering features (lags and time-based)...")
    step = pd.Timedelta(minutes=15)

    # Row-based shifts are only meaningful on chronologically ordered groups.
    group_ts = group_ts.sort_values(['group_id', 'timestamp']).copy()
    group_ts['hour'] = group_ts['timestamp'].dt.hour
    group_ts['day_of_week'] = group_ts['timestamp'].dt.dayofweek

    # Lag Features: occupancy exactly 15m and 30m before this interval
    lag_15m = _aligned_shift(group_ts, 1, step)
    lag_30m = _aligned_shift(group_ts, 2, step)

    # Target Variable: occupancy exactly 15 minutes into the future
    target_15m = _aligned_shift(group_ts, -1, step)

    group_ts['lag_15m'] = lag_15m
    group_ts['lag_30m'] = lag_30m
    group_ts['target_15m'] = target_15m

    # Rows missing any lag or the target cannot be used for training.
    model_data = group_ts.dropna()

    features = ['group_id', 'occupancy_ratio', 'hour', 'day_of_week', 'lag_15m', 'lag_30m']
    X = model_data[features]
    y = model_data['target_15m']

    return X, y, features

## 5. Model Training
Trains the XGBoost Regressor on the processed dataset.

In [14]:
def train_model(X, y):
    """
    Fits an XGBoost regressor on the supplied features and target.

    Args:
        X: Feature matrix passed to ``fit``.
        y: Target values passed to ``fit``.

    Returns:
        The fitted ``XGBRegressor`` instance.
    """
    print(f"Training XGBoost model on {len(X)} samples...")
    hyperparameters = {
        'n_estimators': 300,
        'learning_rate': 0.05,
        'max_depth': 7,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
    }
    regressor = XGBRegressor(**hyperparameters)
    regressor.fit(X, y)
    return regressor

## 6. Model Export
Saves the model and feature metadata for use by the FastAPI backend.

In [15]:
def export_model(model, features, output_dir='models'):
    """
    Writes the trained model and its feature list into ``output_dir``.

    Args:
        model: Object exposing ``save_model(path)``.
        features: Feature names; written comma-separated to ``features.txt``.
        output_dir: Destination directory, created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    model_file = os.path.join(output_dir, 'parking_model_15m.ubj')
    feature_file = os.path.join(output_dir, 'features.txt')

    # Model first, then the feature list.
    model.save_model(model_file)
    feature_line = ",".join(features)
    with open(feature_file, 'w') as handle:
        handle.write(feature_line)

    print(f"Success! Model saved to {model_file}")
    print(f"Features expected by BE: {features}")

## Final Execution Function
This orchestrates the entire pipeline by calling the functions defined above.

In [16]:
def train_production_model():
    """
    Runs the whole pipeline: load -> process -> engineer -> train -> export.

    Every stage is called with its default arguments.
    """
    # Load the raw snapshots, then group and resample them
    raw_snapshots = load_data()
    interval_occupancy = timeseries_resampling(neighborhood_grouping(raw_snapshots))

    # Build the training matrices
    X, y, feature_names = feature_engineering(interval_occupancy)

    # Fit the model and export it together with the feature list
    export_model(train_model(X, y), feature_names)

# Entry point: run the full pipeline only when this file is executed directly
# (``__name__`` is "__main__"), so importing it does not start a training run.
if __name__ == "__main__":
    train_production_model()

Loading snapshot data...
Grouping bays into neighborhoods...
Preprocessing time-series into 15-min intervals...
Engineering features (lags and time-based)...
Training XGBoost model on 287065 samples...
Success! Model saved to models\parking_model_15m.ubj
Features expected by BE: ['group_id', 'occupancy_ratio', 'hour', 'day_of_week', 'lag_15m', 'lag_30m']
