In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
from sklearn.model_selection import KFold

In [10]:
# Read the cleaned taxi-trip CSV (relative to the notebook's working
# directory) and echo its shape and column names as a quick sanity check.
df = pd.read_csv(
    "../Data/cleaned/final_cleaned_jan_feb_2023_taxi_data.csv"
)
print(
    f"Dataset loaded successfully! Shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

# Helper that applies the binary, one-hot, and ordinal encodings

def base_encode(df):
    """Apply the binary, one-hot and ordinal encodings to a copy of ``df``.

    Steps, in order:

    1. ``is_weekend`` is cast to ``int``.
    2. ``pickup_borough``, ``dropoff_borough``, ``pickup_service_zone`` and
       ``dropoff_service_zone`` are one-hot encoded with ``pd.get_dummies``;
       ``drop_first=True`` omits the first level of each column.
    3. ``time_of_day`` is mapped to an integer rank stored in
       ``time_of_day_encoded`` and the original column is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``is_weekend``, ``time_of_day`` and the four one-hot
        columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encodings applied.

    Warns
    -----
    UserWarning
        If ``time_of_day`` holds non-null labels that are not in the ordinal
        mapping. Those rows are still encoded as NaN, but the labels are
        reported instead of being dropped silently.
    """
    # Local import so the function does not rely on a notebook-level import.
    import warnings

    df_encoded = df.copy()

    # 1. Binary encoding
    df_encoded['is_weekend'] = df_encoded['is_weekend'].astype(int)

    # 2. One-hot encoding
    onehot_cols = ['pickup_borough', 'dropoff_borough', 'pickup_service_zone', 'dropoff_service_zone']
    df_encoded = pd.get_dummies(df_encoded, columns=onehot_cols, drop_first=True)

    # 3. Ordinal encoding
    time_order = {
        'Early Morning': 0,
        'Morning Rush': 1,
        'Midday': 2,
        'Evening Rush': 3,
        'Night': 4
    }
    raw_time_of_day = df_encoded['time_of_day']
    ranked_time_of_day = raw_time_of_day.map(time_order)

    # Series.map yields NaN for any label missing from the dict; surface those
    # labels so a typo or a new category does not turn into silent NaNs.
    unmapped = raw_time_of_day[ranked_time_of_day.isna() & raw_time_of_day.notna()]
    if not unmapped.empty:
        unknown_labels = sorted(unmapped.unique().tolist(), key=str)
        warnings.warn(
            f"time_of_day contains labels missing from the ordinal mapping; "
            f"they were encoded as NaN: {unknown_labels}",
            UserWarning,
            stacklevel=2,
        )

    df_encoded['time_of_day_encoded'] = ranked_time_of_day

    return df_encoded.drop(columns='time_of_day')

# Use KFold for safe target encoding

def target_encode_zones_cv(df, target_column, zone_columns, n_splits=5):
    """Replace high-cardinality columns with out-of-fold target means.

    For every column in ``zone_columns`` the rows are split into ``n_splits``
    shuffled folds (fixed ``random_state=42``). Each row's encoded value is
    the mean of ``target_column`` for its category, computed only from the
    other folds, so a row never contributes to its own encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified. Any index is accepted because folds
        are applied by position, not by label.
    target_column : str
        Column whose per-category mean is used as the encoding.
    zone_columns : str or iterable of str
        Column(s) to encode. A single column name is accepted as well.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with one ``"<col>_target_encoded"`` column added per
        encoded column and the original ``zone_columns`` dropped.

    Notes
    -----
    A category that appears in a validation fold but not in the matching
    training fold (or a missing category value) has no per-category mean;
    it receives the training fold's overall mean of ``target_column``
    instead of NaN.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(zone_columns, str):
        zone_columns = [zone_columns]
    else:
        zone_columns = list(zone_columns)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    df_encoded = df.copy()

    for col in zone_columns:
        # Filled by position: kf.split yields positional indices, which only
        # coincide with index labels when the frame has a default RangeIndex.
        encoded = np.full(len(df), np.nan, dtype=float)

        for train_idx, val_idx in kf.split(df):
            train = df.iloc[train_idx]
            fold_means = train.groupby(col)[target_column].mean()
            # Fallback for categories the training fold has never seen.
            fold_prior = train[target_column].mean()

            val_encoded = df[col].iloc[val_idx].map(fold_means).astype(float)
            encoded[val_idx] = val_encoded.fillna(fold_prior).to_numpy()

        df_encoded[f"{col}_target_encoded"] = encoded

    # Drop the original high-cardinality columns to avoid confusion when modeling.
    return df_encoded.drop(columns=zone_columns)

# ---- Full encoding pipeline ----

# Base encodings first, then target-encode the pickup/dropoff zone columns
# against fare_per_minute.
high_cardinality_cols = ['pickup_zone', 'dropoff_zone']
df_encoded = target_encode_zones_cv(
    base_encode(df),
    target_column='fare_per_minute',
    zone_columns=high_cardinality_cols,
)

print(
    f"\nEncoding complete!",
    f"Encoded dataset shape: {df_encoded.shape}",
    sep="\n",
)

# Columns present after encoding that the raw frame did not have
new_cols = df_encoded.columns[~df_encoded.columns.isin(df.columns)].tolist()
print(f"\nNewly created columns: {new_cols}")

# Save the encoded dataset
output_path = "../Data/cleaned/encoded_taxi_data.csv"
df_encoded.to_csv(output_path, index=False)
print(f"\nEncoded dataset saved to: {output_path}")

Dataset loaded successfully! Shape: (5646828, 19)
Columns: ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'fare_amount', 'trip_duration_min', 'pickup_date', 'pickup_hour', 'pickup_day_of_week', 'pickup_borough', 'pickup_zone', 'pickup_service_zone', 'dropoff_borough', 'dropoff_zone', 'dropoff_service_zone', 'fare_per_minute', 'trip_speed', 'trip_speed_mph', 'time_of_day', 'is_weekend']

Encoding complete!
Encoded dataset shape: (5646828, 33)

Newly created columns: ['pickup_borough_Brooklyn', 'pickup_borough_EWR', 'pickup_borough_Manhattan', 'pickup_borough_Queens', 'pickup_borough_Staten Island', 'pickup_borough_Unknown', 'dropoff_borough_Brooklyn', 'dropoff_borough_EWR', 'dropoff_borough_Manhattan', 'dropoff_borough_Queens', 'dropoff_borough_Staten Island', 'dropoff_borough_Unknown', 'pickup_service_zone_Boro Zone', 'pickup_service_zone_EWR', 'pickup_service_zone_Yellow Zone', 'dropoff_service_zone_Boro Zone', 'dropoff_service_zone_EWR', 'dropoff_service_zone_Yell