In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tkinter import Tk
from tkinter.filedialog import askopenfilename
import numpy as np
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from datetime import datetime

In [2]:
# Load your dataset
# 1: verify your working directory
print("Current working directory:", os.getcwd())

# Let user select their local file
#Tk().withdraw()  # Hide the root window
file_path = askopenfilename(title="Select your local finall cleaned taxi data CSV")

# Check and load
if not file_path or not os.path.exists(file_path):
    raise FileNotFoundError("File not found or not selected.")
else:
    df = pd.read_csv(file_path)
    print("Loaded file:", file_path)
    print("Initial shape:", df.shape)
print(f"Dataset loaded successfully! Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

#make function to encode binary, one-hot, and ordinal

def base_encode(df):
    df_encoded = df.copy()

    # 1. Binary Encoding
    df_encoded['is_weekend'] = df_encoded['is_weekend'].astype(int)

    # 2. One-Hot Encoding
    onehot_cols = ['pickup_borough', 'dropoff_borough', 'pickup_service_zone', 'dropoff_service_zone']
    df_encoded = pd.get_dummies(df_encoded, columns=onehot_cols, drop_first=True)

    # 3. Ordinal Encoding
    time_order = { 
        'Early Morning': 0,
        'Morning Rush': 1,
        'Midday': 2,
        'Evening Rush': 3,
        'Night': 4
    }
    df_encoded['time_of_day_encoded'] = df_encoded['time_of_day'].map(time_order)
    df_encoded.drop('time_of_day', axis=1, inplace=True)

    return df_encoded

# Use KFold for safe target encoding

def target_encode_zones_cv(df,target_column,zone_columns, n_splits=5):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    df_encoded = df.copy()
    
    for col in zone_columns:
        new_col = f"{col}_target_encoded"
        df_encoded[new_col] = np.nan

        for train_idx, val_idx in kf.split(df):
            train, val = df.iloc[train_idx], df.iloc[val_idx]
            means = train.groupby(col)[target_column].mean()
            df_encoded.loc[val_idx, new_col] = val[col].map(means)

    # drop the original high-cardinality columns to avoid confusion when modeling
    df_encoded.drop(columns=zone_columns, inplace=True)
    return df_encoded

# Run the full encoding pipeline

# Base encodings
df_encoded = base_encode(df)

# Target encoding for pickup/dropoff zones
high_cardinality_cols = ['pickup_zone', 'dropoff_zone']
df_encoded = target_encode_zones_cv(df_encoded, target_column='fare_per_minute', zone_columns=high_cardinality_cols)

print(f"\nEncoding complete!")
print(f"Encoded dataset shape: {df_encoded.shape}")

# Show new encoded columns
new_cols = [col for col in df_encoded.columns if col not in df.columns]
print(f"\nNewly created columns: {new_cols}")

#save the encoded daatset
output_path = "../Data/cleaned/encoded_taxi_data.csv"
df_encoded.to_csv(output_path, index=False)
print(f"\nEncoded dataset saved to: {output_path}")

Current working directory: c:\diksha\Summer Sem\DataAnalysis\Notebooks
Loaded file: C:/diksha/Summer Sem/DataAnalysis/Data/cleaned/final_cleaned_jan_feb_2023_taxi_data.csv
Initial shape: (5646828, 19)
Dataset loaded successfully! Shape: (5646828, 19)
Columns: ['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'trip_distance', 'fare_amount', 'trip_duration_min', 'pickup_date', 'pickup_hour', 'pickup_day_of_week', 'pickup_borough', 'pickup_zone', 'pickup_service_zone', 'dropoff_borough', 'dropoff_zone', 'dropoff_service_zone', 'fare_per_minute', 'trip_speed', 'trip_speed_mph', 'time_of_day', 'is_weekend']

Encoding complete!
Encoded dataset shape: (5646828, 33)

Newly created columns: ['pickup_borough_Brooklyn', 'pickup_borough_EWR', 'pickup_borough_Manhattan', 'pickup_borough_Queens', 'pickup_borough_Staten Island', 'pickup_borough_Unknown', 'dropoff_borough_Brooklyn', 'dropoff_borough_EWR', 'dropoff_borough_Manhattan', 'dropoff_borough_Queens', 'dropoff_borough_Staten Island', 'dropoff_

## Inital Lightweight Model (baseline)

I will start with a lightweight model to start the training testing using the target as fare_per_minute starting with lightGBM and linear regression and evalute with MAE and R sqaured. Come back to this when you can changed the target feature to Elli's effective_earnings_per_min (currently dont know how she added this to make it as target)

In [None]:
# 1: verify your working directory 
print("Current working directory:", os.getcwd())

# Let user select their local file
#Tk().withdraw()  # Hide the root window
file_path = askopenfilename(title="Select your local encoded taxi data CSV")

# Check and load
if not file_path or not os.path.exists(file_path):
    raise FileNotFoundError("File not found or not selected.")
else:
    df = pd.read_csv(file_path)
    print("Loaded file:", file_path)
    print("Initial shape:", df.shape)

#due to LightGBM needs all features to be numeric, we will ensure all categorical features are encoded properly
# Solution 1: Convert datetime columns to numeric features
import pandas as pd
import numpy as np
from datetime import datetime

# Assuming your dataframe is named 'df'
# Convert datetime columns to datetime type first
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
df['pickup_date'] = pd.to_datetime(df['pickup_date'])

# Option 1: Extract numeric features from datetime columns
df['pickup_hour'] = df['tpep_pickup_datetime'].dt.hour
df['pickup_day_of_week'] = df['tpep_pickup_datetime'].dt.dayofweek
df['pickup_month'] = df['tpep_pickup_datetime'].dt.month
df['pickup_year'] = df['tpep_pickup_datetime'].dt.year

df['dropoff_hour'] = df['tpep_dropoff_datetime'].dt.hour
df['dropoff_day_of_week'] = df['tpep_dropoff_datetime'].dt.dayofweek

# Calculate trip duration in minutes
df['trip_duration_minutes'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

# Drop the original datetime columns
df = df.drop(['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'pickup_date'], axis=1)

# Check data types to ensure all are numeric
print("Data types after conversion:")
print(df.dtypes)

# Check for any remaining object columns
object_columns = df.select_dtypes(include=['object']).columns
if len(object_columns) > 0:
    print(f"\nRemaining object columns: {list(object_columns)}")
    # Handle remaining object columns
    for col in object_columns:
        if df[col].dtype == 'object':
            # Try to convert to numeric, or encode categorically
            try:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            except:
                # If conversion fails, use label encoding
                from sklearn.preprocessing import LabelEncoder
                le = LabelEncoder()
                df[col] = le.fit_transform(df[col].astype(str))

# Verify all columns are now numeric
print("\nFinal data types:")
print(df.dtypes)

# Assuming 'target_column' is your target variable name
X = df.drop('fare_per_minute', axis=1)  
y = df['fare_per_minute']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create LightGBM datasets
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Train the model
params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, test_data],
    num_boost_round=100,
    callbacks=[
        early_stopping(stopping_rounds=10),
        log_evaluation(period=10)
    ]
)

# Evaluate the model
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\n MAE: {mae:.3f}")
print(f" R² Score: {r2:.3f}")

lgb.plot_importance(model, max_num_features=15, importance_type='gain')


Current working directory: c:\diksha\Summer Sem\DataAnalysis\Notebooks
Loaded file: C:/diksha/Summer Sem/DataAnalysis/Data/cleaned/encoded_taxi_data.csv
Initial shape: (5646828, 33)
Data types after conversion:
trip_distance                       float64
fare_amount                         float64
trip_duration_min                   float64
pickup_hour                           int32
pickup_day_of_week                    int32
fare_per_minute                     float64
trip_speed                          float64
trip_speed_mph                      float64
is_weekend                            int64
pickup_borough_Brooklyn                bool
pickup_borough_EWR                     bool
pickup_borough_Manhattan               bool
pickup_borough_Queens                  bool
pickup_borough_Staten Island           bool
pickup_borough_Unknown                 bool
dropoff_borough_Brooklyn               bool
dropoff_borough_EWR                    bool
dropoff_borough_Manhattan              bo

Discussion: MAE (Mean Absolute Error) = 0.033
This means that the models prediciton are, on avg off only about 0.033 units from the actual values (if prediciting taxi fares in $ then it is off about 3.3 cents on avg if we are looking at trip duration during hours it is off about 2 mins on avg)

R sqaured = 0.979
This model explains about 97.9 of the variance in our target variables and only about 2.1% reamins unexplained

These metrics show that the inital model is performing well! With high accuracy, low error rate

Notes: 
reults being this good might be due to overfitting and that the model memorized the training data rather than learning the patterns

could also mean data leakage, will need to check if any features inadvertently contain ingormation about the target that wouldnt be avilable for real-world predictions


next steps: Validate the results

check fare distributions, what is the typical range for fare_per_minute 2) cross validate across different time periods 3) business logic do the predicitions align with known taxi pricing rules? 


## Linear Regression (baseline model)

Next I will use a model that I am used to working with and comapre the results with the LightGBM model

In [16]:
# Split features and target
X = df.drop('fare_per_minute', axis=1)  
y = df['fare_per_minute']  

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Drop NaNs in X_train and sync y_train
X_train_lr = X_train.dropna()
y_train_lr = y_train.loc[X_train_lr.index]

# Drop NaNs in X_test and sync y_test
X_test_lr = X_test.dropna()
y_test_lr = y_test.loc[X_test_lr.index]

# Train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train_lr, y_train_lr)

# Make predictions
y_pred_lr = lr_model.predict(X_test_lr)

# Evaluate performance
mae_lr = mean_absolute_error(y_test_lr, y_pred_lr)
r2_lr = r2_score(y_test_lr, y_pred_lr)

print("Linear Regression Results:")
print(f"   MAE: {mae_lr:.3f}")
print(f"   R²:  {r2_lr:.3f}")

Linear Regression Results:
   MAE: 0.102
   R²:  0.777


Discussion for Linear Regression: 

MAE = 0.102 the predicitions are off by 10.2 cents per min on avg so for a ten minute ride that is about $1.02 error in total fare

R Sqaured = 0.777 means this model explains about 77.7% of the variance in fare per minute and that 22.3% of the variation remains unexplained

## Comparision of LightGBM and Linear Regression

From basic observation it is shown that LightGBM significantly outperforms Linear Regression due to: 

1) Non-linear realtionships: taxi fares likely have complex, non-linear patterns that LGBM captures better such as peak hpurs vs off-peak pricing, distance-based rate changes, location-specific surcharges

2) Feature Interactions: LGBM automatically finds interactions between features like time of day and location combinations

3) Flexiability: LGBM can model complex decision boundaries that linear regression cannot

Conclusion:  LightGBM model is substantially better for fare prediction, with 3x better accuracy. The Linear Regression still performs reasonably (R² = 0.777 is decent), but LightGBM's ability to capture complex patterns in taxi pricing makes a better choice.
Both models suggest that the feature engineering was effective, but LightGBM better exploits those features!
