# NYC Taxi Tip Prediction
### Deep Learning Regression on 5.6M+ NYC Yellow Taxi Trip Records

This notebook builds a neural network to predict taxi tip amounts using trip metadata from the [NYC TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page) (Aug-Sep 2023).

**Pipeline**: Data Loading → Feature Engineering → Outlier Removal → Encoding & Scaling → Neural Network Training → Evaluation & Visualization

<a href="https://colab.research.google.com/github/ninadpshah/New-York-TLC-Tip-Prediction/blob/main/TLC_Trip_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Environment Setup

In [None]:
# Mount Google Drive (Colab only) to access the raw trip data files.
# The import is guarded so that Restart & Run All also works on a local Jupyter
# kernel, where the `google.colab` package is not installed.
try:
    from google.colab import drive
except ImportError:
    print("google.colab not available - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

# Core libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow as tf
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Report the runtime so results can be tied to a TensorFlow build and to
# whether an accelerator was visible to the kernel.
print(f"TensorFlow version: {tf.__version__}")
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU available: {bool(gpu_devices)}")

## 2. Data Loading

Load NYC TLC Yellow Taxi trip records for **August and September 2023** (5.6M+ trips combined). Source: [NYC TLC Trip Record Data](https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page).

In [None]:
# Load August and September 2023 yellow taxi trip data
DATA_DIR = '/content/drive/MyDrive/TLC_Yellow/'
date_cols = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']

# Both monthly files are read with the same options: the pickup/dropoff columns
# are parsed as datetimes and pandas' chunked low-memory parsing is disabled.
read_options = {'parse_dates': date_cols, 'low_memory': False}

august_data = pd.read_csv(f'{DATA_DIR}yellow_tripdata_2023-08.csv', **read_options)
september_data = pd.read_csv(f'{DATA_DIR}yellow_tripdata_2023-09.csv', **read_options)

# Stack the two months and renumber the rows 0..n-1
monthly_frames = [august_data, september_data]
combined_data = pd.concat(monthly_frames, ignore_index=True)

trip_count = len(combined_data)
column_names = list(combined_data.columns)
print(f"Combined dataset: {trip_count:,} trips")
print(f"Columns: {column_names}")

## 3. Data Preprocessing

Feature engineering, z-score outlier removal, categorical encoding, a 70/30 train/test split, and feature standardization.

In [ ]:
# --- Feature Selection ---
# Raw columns carried into the model; `tip_amount` is the regression target.
selected_features = ['trip_distance', 'RatecodeID', 'fare_amount',
                     'payment_type', 'Airport_fee', 'tip_amount']
taxi_filtered = combined_data.loc[:, selected_features].copy()

# --- Temporal Feature Engineering ---
# Derive weekday / hour / minute plus a combined hour-of-week index (0-167)
# from both the pickup and the dropoff timestamp.
datetime_sources = {'pickup': 'tpep_pickup_datetime',
                    'dropoff': 'tpep_dropoff_datetime'}
for prefix, col in datetime_sources.items():
    timestamp_parts = combined_data[col].dt
    weekday = timestamp_parts.weekday
    hour = timestamp_parts.hour
    taxi_filtered[f'{prefix}_weekday'] = weekday
    taxi_filtered[f'{prefix}_hour'] = hour
    taxi_filtered[f'{prefix}_minute'] = timestamp_parts.minute
    taxi_filtered[f'{prefix}_week_hour'] = weekday * 24 + hour

engineered_columns = list(taxi_filtered.columns)
print(f"Features after engineering: {engineered_columns}")
print(f"Shape: {taxi_filtered.shape}")

# --- Outlier Removal (Z-score) ---
def remove_outliers_zscore(df, columns, threshold=3):
    """Remove rows where any specified column has |z-score| >= threshold.

    Z-scores are computed per column with ``scipy.stats.zscore`` (population
    standard deviation, ``ddof=0``). Missing values are ignored when the
    column mean and standard deviation are estimated, so one NaN no longer
    turns the whole column's z-scores into NaN and wipes out every row.
    Rows that are themselves NaN in a checked column are still dropped,
    because their z-score cannot be compared against the threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : str or list of str
        Numeric column(s) to screen for outliers.
    threshold : float, default 3
        Rows are kept only if every checked column satisfies
        ``|z| < threshold``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows with the original index
        labels and all original columns. Returning a copy lets callers assign
        columns on the result without a ``SettingWithCopyWarning``.
    """
    # Accept a single column name as well as a list of names
    if isinstance(columns, str):
        columns = [columns]
    else:
        columns = list(columns)

    # Nothing to score: no rows, or no columns were requested
    if df.empty or not columns:
        return df.copy()

    # Work on a plain float array so behaviour does not depend on how a given
    # SciPy version treats DataFrame inputs; pd.NA is mapped to NaN.
    values = df[columns].to_numpy(dtype=float, na_value=np.nan)
    z_scores = np.abs(stats.zscore(values, axis=0, nan_policy='omit'))

    # NaN z-scores compare False, so rows with a missing value are dropped here
    keep = (z_scores < threshold).all(axis=1)
    return df.loc[keep].copy()

# Screen only the continuous trip measurements for extreme values
columns_to_check = ['trip_distance', 'fare_amount']
taxi_filtered_no_outliers = remove_outliers_zscore(taxi_filtered, columns_to_check)

rows_kept = len(taxi_filtered_no_outliers)
rows_removed = len(taxi_filtered) - rows_kept
print(f"After outlier removal: {rows_kept:,} records "
      f"(removed {rows_removed:,})")

# --- Categorical Encoding ---
# Each categorical column gets its own fitted LabelEncoder, kept in
# `label_encoders` so the integer codes can be mapped back to the raw values.
label_encoders = {}
categorical_columns = ['RatecodeID', 'payment_type', 'Airport_fee']
encoded_columns = {}
for col in categorical_columns:
    label_encoders[col] = LabelEncoder()
    encoded_columns[col] = label_encoders[col].fit_transform(
        taxi_filtered_no_outliers[col]
    )

# Build a new frame instead of assigning onto the filtered one: column
# assignment on the result of a boolean filter raises SettingWithCopyWarning.
# `assign` replaces the existing columns in place, so column order is unchanged.
taxi_filtered_no_outliers = taxi_filtered_no_outliers.assign(**encoded_columns)

# --- Train/Test Split & Scaling ---
# Separate the regression target from the predictors
y = taxi_filtered_no_outliers['tip_amount']
X = taxi_filtered_no_outliers.drop(columns='tip_amount')

# 70/30 split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize using statistics learned from the training rows only, then
# apply the same transform to the held-out rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

n_train, n_features = X_train_scaled.shape
n_test = X_test_scaled.shape[0]
print(f"\nTrain set: {n_train:,} samples")
print(f"Test set:  {n_test:,} samples")
print(f"Features:  {n_features}")

## 4. Exploratory Data Analysis

Inspect the cleaned dataset and visualize distributions, correlations, and feature relationships.

In [None]:
# Dataset summary after preprocessing
section_rule = '=' * 60
total_missing = taxi_filtered_no_outliers.isnull().sum().sum()

print(f"Dataset shape: {taxi_filtered_no_outliers.shape}")
print(f"Missing values: {total_missing}")
print(f"\n{section_rule}")
print("Feature Statistics:")
print(f"{section_rule}")
# Left as the bare last expression so the notebook renders the table richly
taxi_filtered_no_outliers.describe().round(2)

In [None]:
# Target variable distribution
tip_values = taxi_filtered_no_outliers['tip_amount']
mean_tip = tip_values.mean()

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(tip_values, bins=50, kde=True, color='steelblue', ax=ax)
ax.set_xlabel('Tip Amount ($)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Tip Amount (5.6M+ trips)')
# Dashed reference line at the mean tip
ax.axvline(mean_tip, color='red', linestyle='--', label=f"Mean: ${mean_tip:.2f}")
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Feature correlation heatmap
# Restrict to numeric columns before computing pairwise correlations
numeric_columns = taxi_filtered_no_outliers.select_dtypes(include=[np.number])
corr = numeric_columns.corr()

fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Heatmap')
fig.tight_layout()
plt.show()

In [None]:
# Pairwise feature relationships (sampled for performance)
# Clamp the sample size: DataFrame.sample raises ValueError when asked for more
# rows than exist (e.g. when the notebook is run on a small data subset).
pairplot_sample_size = min(5000, len(taxi_filtered_no_outliers))
sns.pairplot(
    taxi_filtered_no_outliers.sample(pairplot_sample_size, random_state=42),
    vars=['trip_distance', 'fare_amount', 'pickup_weekday', 'pickup_hour', 'tip_amount'],
    diag_kind='kde',
    plot_kws={'alpha': 0.4, 's': 10}
)
# y > 1 lifts the title above the top row of panels
plt.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

## 5. Model Training

Build a feed-forward neural network with dropout regularization for the regression task:
- **Architecture**: Dense(128, ReLU) → Dropout(0.5) → Dense(64, ReLU) → Dense(1)
- **Optimizer**: Adam (lr=0.001)
- **Loss**: Mean Squared Error
- **Training**: 50 epochs, batch size 8,196, 20% validation split, run on the GPU when one is available (otherwise on the CPU)

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout masks and
# batch shuffling are repeatable across Restart & Run All.
tf.keras.utils.set_random_seed(42)

# Build the model
# The input width is declared with an explicit Input object; passing
# `input_dim` to the first Dense layer is deprecated in current Keras.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1)  # Linear output for regression
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='mean_squared_error'
)

model.summary()

# Train with GPU acceleration
# Use the first GPU if TensorFlow can see one, otherwise fall back to the CPU
gpu_visible = bool(tf.config.list_physical_devices('GPU'))
if gpu_visible:
    device = '/GPU:0'
else:
    device = '/CPU:0'
print(f"\nTraining on: {device}")

# NOTE(review): 8196 is not a power of two - possibly 8192 was intended; kept as-is.
fit_options = {
    'epochs': 50,
    'batch_size': 8196,
    'validation_split': 0.2,  # last 20% of the training rows are held out for validation
    'verbose': 1,
}
with tf.device(device):
    history = model.fit(X_train_scaled, y_train, **fit_options)

# Evaluate on held-out test set
# Without an explicit batch size Keras evaluates/predicts 32 rows at a time,
# which means tens of thousands of steps on a multi-million-row test set.
# A large inference batch gives the same metrics (up to float rounding) far faster.
EVAL_BATCH_SIZE = 8192
test_loss = model.evaluate(X_test_scaled, y_test, batch_size=EVAL_BATCH_SIZE)
predictions = model.predict(X_test_scaled, batch_size=EVAL_BATCH_SIZE)

# Compute additional metrics
# The model was compiled with MSE loss and no extra metrics, so `test_loss`
# is the scalar test MSE.
test_rmse = np.sqrt(test_loss)
test_r2 = r2_score(y_test, predictions)
# `predictions` has shape (n, 1); flatten before subtracting from the 1-D target
test_mae = np.mean(np.abs(y_test.values - predictions.flatten()))

print(f"\n{'='*40}")
print(f"Test MSE:  {test_loss:.4f}")
print(f"Test RMSE: ${test_rmse:.2f}")
print(f"Test MAE:  ${test_mae:.2f}")
print(f"Test R²:   {test_r2:.4f}")
print(f"{'='*40}")

## 6. Evaluation & Visualization

Visualize training convergence, prediction distributions, and compare true vs. predicted tip amounts across key features.

In [None]:
# --- Training Convergence ---
# One curve per recorded loss series, in the order they should appear in the legend
loss_curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))

fig, ax = plt.subplots(figsize=(10, 5))
for history_key, curve_label in loss_curves:
    ax.plot(history.history[history_key], label=curve_label, linewidth=2)
ax.set_xlabel('Epoch')
ax.set_ylabel('MSE Loss')
ax.set_title('Training & Validation Loss over Epochs')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

# --- Prediction Density (KDE) ---
predictions_df = pd.DataFrame({
    'True Values': y_test.values,
    'Predictions': predictions.flatten()
})
# KDE on every test row is slow, so plot a fixed-seed subsample. The size is
# clamped because DataFrame.sample raises ValueError when asked for more rows
# than exist (e.g. when the notebook is run on a small data subset).
kde_sample_size = min(9999, len(predictions_df))
sample = predictions_df.sample(kde_sample_size, random_state=42)

fig, axs = plt.subplots(1, 2, figsize=(15, 5))
# Fix the x-range on both panels before drawing
for ax in axs:
    ax.set(xlim=[-3, 20])
sns.kdeplot(data=sample, ax=axs[0], bw_adjust=3)
axs[0].set_title('Prediction Density (KDE)')
axs[0].set_xlabel('Tip Amount ($)')
sns.kdeplot(data=sample, ax=axs[1], bw_adjust=3, cumulative=True)
axs[1].set_title('Cumulative Prediction Density')
axs[1].set_xlabel('Tip Amount ($)')
fig.tight_layout()
plt.show()

# --- True vs Predicted: Correlation Heatmap ---
# Raw (non-engineered) predictors only; the target itself is excluded
selected_features_without_tip = [
    name for name in selected_features if name != 'tip_amount'
]

# Re-index the test features 0..n-1 so they line up row-for-row with the
# positional prediction arrays.
test_features = X_test[selected_features_without_tip].reset_index(drop=True)
tip_columns = pd.DataFrame({
    'True Tip Amount': y_test.values,
    'Predicted Tip Amount': predictions.flatten()
})
comparison_df = pd.concat([tip_columns, test_features], axis=1)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(comparison_df.corr(), annot=True, cmap='coolwarm', fmt='.2f',
            linewidths=0.5, ax=ax)
ax.set_title('Correlation: Features vs True & Predicted Tip Amounts')
fig.tight_layout()
plt.show()

# --- True vs Predicted: Scatter Plots ---
# Flatten the (n, 1) prediction array once instead of once per panel
predicted_tips = predictions.flatten()
# (y-values, legend label, colour) for the two point clouds on each panel
tip_series = ((y_test, 'True', 'steelblue'), (predicted_tips, 'Predicted', 'coral'))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
for ax, feature in zip(axes, ['trip_distance', 'fare_amount']):
    feature_values = X_test[feature]
    feature_label = feature.replace('_', ' ').title()
    for tips, series_label, colour in tip_series:
        ax.scatter(feature_values, tips, alpha=0.15, s=3, label=series_label, color=colour)
    ax.set_xlabel(feature_label)
    ax.set_ylabel('Tip Amount ($)')
    ax.set_title(f'{feature_label} vs Tip Amount')
    ax.legend()
fig.tight_layout()
plt.show()