# NYC Taxi Trip Data Analysis - Preprocessing

This notebook covers the data loading, cleaning, feature engineering, and feature selection steps for the NYC Taxi Trip Data Analysis project.

## 1. Setup and Imports

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Add the project root directory to the Python path so the `src` package can
# be imported. The entry is relative to the notebook's working directory, and
# the membership check keeps re-runs of this cell from appending duplicates.
if '..' not in sys.path:
    sys.path.append('..')

# Import project modules
from src.config import RESULTS_DIR, MODELS_DIR, DATA_DIR
from src.data.loader import load_taxi_data, load_parquet
from src.data.cleaner import (
    clean_yellow_taxi_data, clean_green_taxi_data, 
    clean_fhv_data, clean_fhvhv_data
)
from src.data.feature_engineering import engineer_features
from src.data.feature_selection import (
    select_features_mutual_info, select_features_lasso,
    compare_feature_selection_methods, get_common_features
)
from src.utils.helpers import print_dataframe_info, sample_dataframe

# Plot styling: apply seaborn's whitegrid style first, then set the matplotlib
# defaults for figure size and font size on top of it.
sns.set(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

# Make sure the results directory exists before anything is written to it
os.makedirs(RESULTS_DIR, exist_ok=True)

## 2. Data Loading

Let's load the yellow taxi data for January and February 2025.

In [None]:
# Choose which taxi dataset and which monthly files to load
taxi_type = 'yellow'
months = ['2025-01', '2025-02']

# Announce the load, then read the requested months through the project loader
print(f"Loading {taxi_type} taxi data for months: {months}")
df = load_taxi_data(taxi_type, months)

# Summarize the raw data with the project helper
raw_label = f"Raw {taxi_type} taxi data"
print_dataframe_info(df, raw_label)

### 2.1 Sample the Data

For faster processing, let's sample the data.

In [None]:
# Sample the data for faster processing
sample_size = 100000
if len(df) > sample_size:
    print(f"Sampling {sample_size} rows from {len(df)} total rows")
    # NOTE(review): no seed is passed here; whether the sample is reproducible
    # depends on sample_dataframe's defaults — confirm in src.utils.helpers.
    df_sampled = sample_dataframe(df, n=sample_size)
else:
    # Take a copy instead of aliasing `df`, so columns added to df_sampled
    # later in the notebook do not silently modify the raw frame as well.
    df_sampled = df.copy()

print(f"Sampled data shape: {df_sampled.shape}")

## 3. Exploratory Data Analysis

Let's explore the data to understand its structure and identify potential issues.

In [None]:
# Summary statistics of the sampled data (rendered as the cell output)
summary_stats = df_sampled.describe()
summary_stats

In [None]:
# Count missing values per column and express them as a percentage of all rows
missing = df_sampled.isna().sum()
missing_percent = missing / len(df_sampled) * 100
missing_info = pd.DataFrame({'Missing Values': missing, 'Percentage': missing_percent})

# Show only the columns that actually contain missing values
has_missing = missing_info['Missing Values'] > 0
missing_info.loc[has_missing]

In [None]:
# Histogram (50 bins, with a KDE overlay) of trip distance in the sample
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(df_sampled['trip_distance'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Trip Distance')
ax.set_xlabel('Trip Distance (miles)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Histogram (50 bins, with a KDE overlay) of fare amount in the sample
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(df_sampled['fare_amount'], bins=50, kde=True, ax=ax)
ax.set(
    title='Distribution of Fare Amount',
    xlabel='Fare Amount ($)',
    ylabel='Frequency',
)
ax.grid(True)
plt.show()

In [None]:
# Outlier check for trip distance: extremes, centre, and the upper tail
distance = df_sampled['trip_distance']
print("Trip distance statistics:")
print(f"Min: {distance.min()}")
print(f"Max: {distance.max()}")
print(f"Mean: {distance.mean():.2f}")
print(f"Median: {distance.median():.2f}")
for label, q in (("95th", 0.95), ("99th", 0.99)):
    print(f"{label} percentile: {distance.quantile(q):.2f}")

In [None]:
# Outlier check for fare amount: extremes, centre, and the upper tail
fare = df_sampled['fare_amount']
print("Fare amount statistics:")
print(f"Min: {fare.min()}")
print(f"Max: {fare.max()}")
print(f"Mean: {fare.mean():.2f}")
print(f"Median: {fare.median():.2f}")
for label, q in (("95th", 0.95), ("99th", 0.99)):
    print(f"{label} percentile: {fare.quantile(q):.2f}")

In [None]:
# Calculate trip duration in minutes (dropoff minus pickup, seconds / 60).
# assign() builds a new frame instead of writing a column into df_sampled in
# place, so a frame that is shared with (or sliced from) the raw data is left
# untouched and the cell gives the same result when it is re-run.
trip_minutes = (
    df_sampled['tpep_dropoff_datetime'] - df_sampled['tpep_pickup_datetime']
).dt.total_seconds() / 60
df_sampled = df_sampled.assign(trip_duration=trip_minutes)

# Visualize the distribution of trip duration
plt.figure(figsize=(12, 8))
sns.histplot(df_sampled['trip_duration'], bins=50, kde=True)
plt.title('Distribution of Trip Duration')
plt.xlabel('Trip Duration (minutes)')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

In [None]:
# Outlier check for the trip_duration column: extremes, centre, and upper tail
duration = df_sampled['trip_duration']
print("Trip duration statistics:")
print(f"Min: {duration.min():.2f}")
print(f"Max: {duration.max():.2f}")
print(f"Mean: {duration.mean():.2f}")
print(f"Median: {duration.median():.2f}")
for label, q in (("95th", 0.95), ("99th", 0.99)):
    print(f"{label} percentile: {duration.quantile(q):.2f}")

In [None]:
# Scatter plot of trip distance against trip duration (semi-transparent points)
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_sampled, x='trip_distance', y='trip_duration', alpha=0.5, ax=ax)
ax.set_title('Trip Distance vs. Duration')
ax.set_xlabel('Trip Distance (miles)')
ax.set_ylabel('Trip Duration (minutes)')
ax.grid(True)
plt.show()

In [None]:
# Scatter plot of trip distance against fare amount (semi-transparent points)
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_sampled, x='trip_distance', y='fare_amount', alpha=0.5, ax=ax)
ax.set(
    title='Trip Distance vs. Fare Amount',
    xlabel='Trip Distance (miles)',
    ylabel='Fare Amount ($)',
)
ax.grid(True)
plt.show()

## 4. Data Cleaning

Based on the EDA, let's clean the data to remove outliers and handle missing values.

In [None]:
# Run the project's yellow-taxi cleaning function on the sampled data.
# NOTE(review): the cleaner is hard-coded to yellow taxis while taxi_type is a
# variable — keep the two in sync if taxi_type is ever changed.
df_clean = clean_yellow_taxi_data(df_sampled)

# Report on the cleaned frame
cleaned_label = f"Cleaned {taxi_type} taxi data"
print_dataframe_info(df_clean, cleaned_label)

In [None]:
# Compare the frame before and after cleaning: shapes, then rows removed
rows_before = len(df_sampled)
rows_removed = rows_before - len(df_clean)
print(f"Original data shape: {df_sampled.shape}")
print(f"Cleaned data shape: {df_clean.shape}")
print(f"Removed {rows_removed} rows ({rows_removed / rows_before * 100:.2f}%)")

In [None]:
# Histogram (50 bins, with a KDE overlay) of trip duration in the cleaned data
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(df_clean['trip_duration'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Trip Duration (After Cleaning)')
ax.set_xlabel('Trip Duration (minutes)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Scatter plot of trip distance against trip duration in the cleaned data
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(data=df_clean, x='trip_distance', y='trip_duration', alpha=0.5, ax=ax)
ax.set(
    title='Trip Distance vs. Duration (After Cleaning)',
    xlabel='Trip Distance (miles)',
    ylabel='Trip Duration (minutes)',
)
ax.grid(True)
plt.show()

## 5. Feature Engineering

Let's create new features to improve model performance.

In [None]:
# Build the feature-engineered frame from the cleaned data via the project function
df_features = engineer_features(df_clean, taxi_type)

# Report on the feature-engineered frame
features_label = f"Feature-engineered {taxi_type} taxi data"
print_dataframe_info(df_features, features_label)

In [None]:
# Columns present after feature engineering that the cleaned frame did not have
# (original column order is preserved)
is_new = ~df_features.columns.isin(df_clean.columns)
new_features = df_features.columns[is_new].tolist()
print(f"New features created ({len(new_features)}):\n{new_features}")

In [None]:
# Histogram (50 bins, with a KDE overlay) of the speed_mph column
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(df_features['speed_mph'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Speed')
ax.set_xlabel('Speed (mph)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Box plot of trip duration grouped by the pickup-hour column
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df_features, x='tpep_pickup_datetime_hour', y='trip_duration', ax=ax)
ax.set(
    title='Trip Duration by Hour of Day',
    xlabel='Hour of Day',
    ylabel='Trip Duration (minutes)',
)
ax.grid(True)
plt.show()

In [None]:
# Box plot of trip duration grouped by the pickup day-of-week column
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df_features, x='tpep_pickup_datetime_dayofweek', y='trip_duration', ax=ax)
ax.set_title('Trip Duration by Day of Week')
ax.set_xlabel('Day of Week (0 = Monday, 6 = Sunday)')
ax.set_ylabel('Trip Duration (minutes)')
ax.grid(True)
plt.show()

In [None]:
# Box plot of trip duration grouped by the pickup period-of-day column
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df_features, x='tpep_pickup_datetime_period', y='trip_duration', ax=ax)
ax.set(
    title='Trip Duration by Period of Day',
    xlabel='Period of Day',
    ylabel='Trip Duration (minutes)',
)
ax.grid(True)
plt.show()

In [None]:
# Box plot of trip duration grouped by the distance_category column
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=df_features, x='distance_category', y='trip_duration', ax=ax)
ax.set_title('Trip Duration by Distance Category')
ax.set_xlabel('Distance Category')
ax.set_ylabel('Trip Duration (minutes)')
ax.grid(True)
plt.show()

## 6. Feature Selection

Let's select the most important features for modeling.

In [None]:
# Prepare data for feature selection: the target vector first
target_col = 'trip_duration'
y = df_features[target_col]

# NOTE(review): speed_mph (plotted earlier) looks like it is derived from trip
# duration; if so, it leaks the target into X — confirm in engineer_features.

# Identify categorical columns by dtype
cat_cols = df_features.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
# Also include integer columns that are likely categorical: ID/code/type-style
# names, or fewer than 20 distinct values
for col in df_features.select_dtypes(include=['int64', 'int32']).columns:
    if col.endswith(('ID', '_id', 'code', 'type')) or df_features[col].nunique() < 20:
        cat_cols.append(col)
# The target must never end up among the features, whatever dtype it has;
# previously it was only filtered out of the numerical list.
cat_cols = [col for col in cat_cols if col != target_col]

# Numerical columns: every numeric column except the target and the categorical ones
num_cols = [
    col
    for col in df_features.select_dtypes(include=['number']).columns.tolist()
    if col != target_col and col not in cat_cols
]

# Create feature matrix (categorical columns first, then numerical)
X = df_features[cat_cols + num_cols]

print(f"Feature matrix shape: {X.shape}")
print(f"Categorical columns ({len(cat_cols)}): {cat_cols}")
print(f"Numerical columns ({len(num_cols)}): {num_cols}")

In [None]:
# Run the project's feature selection comparison, passing k=max_features
max_features = 20
feature_selection_results = compare_feature_selection_methods(X, y, k=max_features)

# Report which features each method returned
for method, features in feature_selection_results.items():
    n_features = len(features)
    print(f"{method} ({n_features} features): {features}")

In [None]:
# Ask the project helper for the features shared across methods (min_methods=2)
selected_features = get_common_features(feature_selection_results, min_methods=2)
n_common = len(selected_features)
print(f"Common features selected by at least 2 methods ({n_common}):\n{selected_features}")

In [None]:
# Too many common features: re-select with mutual information, k=max_features
n_selected = len(selected_features)
if n_selected > max_features:
    print(f"Selected {n_selected} features, reducing to {max_features} using mutual information")
    _, selected_features = select_features_mutual_info(X, y, k=max_features)

# Too few (fewer than 5): also fall back to mutual information. The count is
# re-read because the branch above may have replaced selected_features.
n_selected = len(selected_features)
if n_selected < 5:
    print(f"Only {n_selected} features selected, using mutual information to select {max_features}")
    _, selected_features = select_features_mutual_info(X, y, k=max_features)

print(f"Final selected features ({len(selected_features)}):\n{selected_features}")

In [None]:
# Create a correlation matrix for the selected features plus the target.
# Correlation is only computed on numeric data, and since pandas 2.0
# DataFrame.corr() raises on non-numeric columns instead of dropping them.
# The categorical columns placed in X can end up in selected_features, so the
# frame is restricted to numeric/boolean columns before calling corr().
# list() keeps the concatenation correct if selected_features is not a list.
selected_features_with_target = list(selected_features) + [target_col]
corr_matrix = (
    df_features[selected_features_with_target]
    .select_dtypes(include=['number', 'bool'])
    .corr()
)

# Visualize the correlation matrix; the mask hides the upper triangle
# (including the diagonal) so each pair is shown once
plt.figure(figsize=(14, 12))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corr_matrix, mask=mask, cmap=cmap, vmax=1, vmin=-1, center=0,
            annot=True, fmt='.2f', square=True, linewidths=.5)
plt.title('Correlation Matrix of Selected Features', fontsize=16)
plt.tight_layout()
plt.show()

## 7. Save Processed Data

Let's save the processed data for use in the modeling notebook.

In [None]:
# Processed outputs go into a 'processed_data' sub-directory of RESULTS_DIR
processed_dir = os.path.join(RESULTS_DIR, 'processed_data')
os.makedirs(processed_dir, exist_ok=True)

# Full feature-engineered frame, written as CSV without the index.
# Note: CSV stores values as text, so dtypes are not preserved on reload.
features_path = os.path.join(processed_dir, f"{taxi_type}_features.csv")
df_features.to_csv(features_path, index=False)
print(f"Saved feature-engineered data to {features_path}")

# Names of the selected features, one per line
selected_path = os.path.join(processed_dir, f"{taxi_type}_selected_features.txt")
with open(selected_path, 'w') as f:
    f.write('\n'.join(selected_features))
print(f"Saved selected features to {selected_path}")

## 8. Summary

In this notebook, we have:

1. Loaded the NYC yellow taxi data for January and February 2025
2. Performed exploratory data analysis to understand the data structure and identify issues
3. Cleaned the data by removing outliers and handling missing values
4. Engineered new features to improve model performance
5. Selected the most important features for modeling
6. Saved the processed data for use in the modeling notebook

The next step is to train and evaluate different regression models to predict trip duration.