# NYC Delivery Promise Engine: Feature Engineering

## Notebook Overview
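This notebook transforms raw NYC yellow-taxi trip data into machine-learning-ready features for delivery promise optimization. It loads the trip, weather, zone-lookup, and holiday datasets, derives temporal features, maps pickup and drop-off zones to boroughs, and filters out outlier trips before exporting the train and test sets.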

### Key Transformations:
1. **Geographic Features**: Zone mapping, borough aggregation
2. **Temporal Features**: Hour, day-of-week, weekend, holiday flags  
3. **Weather Integration**: Temperature, precipitation, wind speed
4. **Data Cleaning**: Outlier filtering, missing value handling
5. **Feature Encoding**: Categorical encoding for ML models


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Banner for the notebook run
print("🔧 NYC Delivery Promise Engine - Feature Engineering", "=" * 60, sep="\n")

# Read every input table up front
print("Loading datasets...")

# Monthly yellow-taxi trip files, read in the order July, June, May
df_july, df_june, df_may = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../DATA/yellow_tripdata_2025-07.csv',
        '../DATA/yellow_tripdata_2025-06.csv',
        '../DATA/yellow_tripdata_2025-05.csv',
    )
)

# Supporting tables: weather, taxi-zone lookup, public holidays
weather = pd.read_csv('../DATA/nyc_weather_3months.csv')
zones = pd.read_csv('../DATA/taxi_zone_lookup.csv')
holidays = pd.read_csv('../DATA/us_public_holidays_2025.csv')

# One summary line per input so row counts are visible in the cell output
print(
    f"✅ Trip data: {len(df_july):,} + {len(df_june):,} + {len(df_may):,} records",
    f"✅ Weather data: {len(weather):,} records",
    f"✅ Zone lookup: {len(zones):,} zones",
    f"✅ Holidays: {len(holidays):,} holidays",
    sep="\n",
)


## 1. Data Processing and Temporal Features


In [None]:
# Function to process trip data and add temporal features
def process_trip_data(df, month_name):
    """Parse raw trip columns and derive duration and calendar features.

    The frame is modified in place and also returned.

    Args:
        df: Trip DataFrame containing ``tpep_pickup_datetime``,
            ``tpep_dropoff_datetime``, ``trip_distance`` and ``total_amount``.
        month_name: Label used only in the progress message.

    Returns:
        The same DataFrame object, with the four input columns converted and
        the columns ``trip_duration_minutes``, ``pickup_date``,
        ``pickup_date_str``, ``pickup_hour``, ``pickup_dow`` and
        ``is_weekend`` added.
    """
    print(f"\n🔄 Processing {month_name} data: {len(df):,} records")

    # Timestamps first, then numerics; unparseable numbers become NaN
    for ts_col in ('tpep_pickup_datetime', 'tpep_dropoff_datetime'):
        df[ts_col] = pd.to_datetime(df[ts_col])
    for num_col in ('trip_distance', 'total_amount'):
        df[num_col] = pd.to_numeric(df[num_col], errors='coerce')

    pickup_ts = df['tpep_pickup_datetime']

    # Elapsed time between drop-off and pickup, expressed in minutes
    elapsed = df['tpep_dropoff_datetime'] - pickup_ts
    df['trip_duration_minutes'] = elapsed.dt.total_seconds() / 60

    # Calendar features derived from the pickup timestamp
    df['pickup_date'] = pickup_ts.dt.date
    df['pickup_date_str'] = df['pickup_date'].astype(str)
    df['pickup_hour'] = pickup_ts.dt.hour
    df['pickup_dow'] = pickup_ts.dt.dayofweek  # 0=Monday ... 6=Sunday
    weekend_days = [5, 6]  # Saturday, Sunday
    df['is_weekend'] = df['pickup_dow'].isin(weekend_days)

    print(f"✅ Added temporal features")
    return df

# Run the temporal feature step on each month, in the order May, June, July
df_may, df_june, df_july = (
    process_trip_data(monthly_frame, month_label)
    for monthly_frame, month_label in (
        (df_may, "May"),
        (df_june, "June"),
        (df_july, "July"),
    )
)

# May and June are stacked into the training set; July is held out for testing
df_train = pd.concat([df_may, df_june], ignore_index=True)
df_test = df_july.copy()

print(
    "\n📊 Dataset Split:",
    f"Training set (May+June): {len(df_train):,} records",
    f"Test set (July): {len(df_test):,} records",
    sep="\n",
)


## 2. Geographic Feature Engineering


In [None]:
# Function to add geographic features
def _merge_zone_columns(df, zone_lookup, id_column, prefix):
    """Left-join Zone/Borough onto df via id_column as ``<prefix>_zone`` / ``<prefix>_borough``."""
    merged = df.merge(zone_lookup, left_on=id_column, right_on='LocationID', how='left')
    merged = merged.rename(columns={'Zone': f'{prefix}_zone', 'Borough': f'{prefix}_borough'})
    # The join key from the lookup side is redundant with id_column
    return merged.drop(columns='LocationID')


def add_geographic_features(df, zones_df):
    """Attach pickup/drop-off zone and borough names plus a route label.

    Args:
        df: Trip DataFrame with ``PULocationID`` and ``DOLocationID`` columns.
        zones_df: Zone lookup with ``LocationID``, ``Zone`` and ``Borough``
            columns.

    Returns:
        A new DataFrame with the same rows as ``df`` plus ``pickup_zone``,
        ``pickup_borough``, ``dropoff_zone``, ``dropoff_borough`` and
        ``route_pair``. Location IDs with no match in the lookup (or a missing
        Zone/Borough value) are labelled ``'Unknown'``. ``df`` itself is not
        modified.
    """
    print(f"\n🗺️ Adding geographic features...")

    # A left merge emits one output row per matching lookup row, so a repeated
    # LocationID would silently duplicate trips. Keep the first entry per ID so
    # the row count of df is always preserved.
    zone_lookup = zones_df[['LocationID', 'Zone', 'Borough']].drop_duplicates(
        subset='LocationID', keep='first'
    )

    df = _merge_zone_columns(df, zone_lookup, 'PULocationID', 'pickup')
    df = _merge_zone_columns(df, zone_lookup, 'DOLocationID', 'dropoff')

    # Handle missing zones
    for geo_col in ('pickup_zone', 'pickup_borough', 'dropoff_zone', 'dropoff_borough'):
        df[geo_col] = df[geo_col].fillna('Unknown')

    # Route label of the form "<pickup zone> → <drop-off zone>"
    df['route_pair'] = df['pickup_zone'] + ' → ' + df['dropoff_zone']

    print(f"✅ Added zone and borough mapping")
    return df

# Enrich the training set first, then the test set, with zone/borough columns
df_train, df_test = (
    add_geographic_features(trip_frame, zones)
    for trip_frame in (df_train, df_test)
)

# Sanity check: the five most frequent pickup and drop-off zones in training data
print(
    "\n📍 Top 5 Pickup Areas:",
    df_train['pickup_zone'].value_counts().head(5),
    sep="\n",
)

print(
    "\n📍 Top 5 Dropoff Areas:",
    df_train['dropoff_zone'].value_counts().head(5),
    sep="\n",
)


## 3. Data Cleaning and Export


In [None]:
# Apply robust data cleaning
def clean_trip_data(df, min_duration=0.5, max_duration=None, duration_quantile=0.995):
    """Filter out implausible trips and report how many rows were removed.

    A row is kept only when all of the following hold:

    * ``trip_duration_minutes`` lies in ``[min_duration, max_duration]``
    * ``trip_distance`` is strictly positive
    * ``total_amount`` is strictly positive

    Rows with NaN in any of these columns fail the comparisons and are dropped.

    Args:
        df: Trip DataFrame with ``trip_duration_minutes``, ``trip_distance``
            and ``total_amount`` columns.
        min_duration: Lower duration bound in minutes (default 0.5, i.e.
            30 seconds).
        max_duration: Upper duration bound in minutes. When ``None`` (the
            default) it is computed from ``df`` as the ``duration_quantile``
            quantile of ``trip_duration_minutes``. Pass an explicit value to
            reuse a cap computed on another frame, e.g. to apply the training
            cap to the test set.
        duration_quantile: Quantile used to derive ``max_duration`` when it is
            not given (default 0.995, i.e. P99.5).

    Returns:
        A filtered copy of ``df``; the input frame is not modified.
    """
    print(f"\n🧹 Data Cleaning: {len(df):,} records")

    durations = df['trip_duration_minutes']

    # Derive the upper cap from the data unless the caller supplied one
    if max_duration is None:
        max_duration = durations.quantile(duration_quantile)

    filter_mask = (
        (durations >= min_duration) &
        (durations <= max_duration) &
        (df['trip_distance'] > 0) &
        (df['total_amount'] > 0)
    )

    rows_before = len(df)
    df_clean = df[filter_mask].copy()
    rows_after = len(df_clean)
    rows_dropped = rows_before - rows_after
    # An empty input would otherwise raise ZeroDivisionError here
    dropped_share = rows_dropped / rows_before if rows_before else 0.0

    print(f"✅ Cleaned data: {rows_after:,} records")
    print(f"📊 Removed {rows_dropped:,} outliers ({dropped_share:.2%})")
    if rows_after:
        # min/max are undefined when nothing survived the filter
        print(f"📈 Duration range: {df_clean['trip_duration_minutes'].min():.1f} to {df_clean['trip_duration_minutes'].max():.1f} minutes")

    return df_clean

import os

# Clean both datasets; each call filters the frame it is given independently
df_train_clean = clean_trip_data(df_train)
df_test_clean = clean_trip_data(df_test)

# Export processed data for modeling
print(f"\n💾 Exporting processed data...")

# NOTE: inputs are read from '../DATA/' while outputs go to '../data/'. On a
# case-sensitive filesystem these are different directories, and to_csv fails
# if the target directory is missing, so create it before writing.
os.makedirs('../data', exist_ok=True)
df_train_clean.to_csv('../data/processed_train_data.csv', index=False)
df_test_clean.to_csv('../data/processed_test_data.csv', index=False)

print(f"✅ Training data exported: {len(df_train_clean):,} records")
print(f"✅ Test data exported: {len(df_test_clean):,} records")

print(f"\n🎯 Feature Engineering Complete!")
print(f"Ready for modeling with {len(df_train_clean):,} training records and {len(df_test_clean):,} test records.")
print(f"\n---")
print(f"Continue to notebook 03_models_eta_delay.ipynb for model training.")
