In [1]:
from collections import Counter
from typing import List, Dict
import os
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from matplotlib.patheffects import withStroke
import pandas as pd
import os 
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [2]:
base_path = '../dataset_nyc_taxi_samples/parquet/'

In [105]:
def get_csv_dict(names: List[str], csv_root: str) -> Dict[str, pd.DataFrame]:
    csv_dict = {}

    for name in names:
        csv_dict[name] = {}
        csv_dict[name]["train"] = pd.read_csv(os.path.join(csv_root, f"{name}_train_1M.csv"))

        csv_dict[name]["valid"] = pd.read_csv(os.path.join(csv_root, f"{name}_valid_500k.csv"))
    
    return csv_dict

In [123]:
names = ["fhv", "green", "yellow"]

csv_dict = get_csv_dict(names, "../dataset_nyc_taxi_samples/csv")

  csv_dict[name]["train"] = pd.read_csv(os.path.join(csv_root, f"{name}_train_1M.csv"))
  csv_dict[name]["valid"] = pd.read_csv(os.path.join(csv_root, f"{name}_valid_500k.csv"))


In [124]:
def update_columns(df: pd.DataFrame, taxi_type: str) -> pd.DataFrame:

    if taxi_type == "green":
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])

        df['store_and_fwd_flag'] = df['store_and_fwd_flag'] == 'Y'
        df['store_and_fwd_flag'] = df['store_and_fwd_flag'].astype(int)

        df['trip_time'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
        df['trip_time'] = df['trip_time'].values.astype(float)//10**9

        df['month'] = df['lpep_pickup_datetime'].dt.month
        df['weekday'] = df['lpep_pickup_datetime'].dt.weekday
        df['hour'] = df['lpep_pickup_datetime'].dt.hour

        df = df.drop(columns=['lpep_pickup_datetime', 'lpep_dropoff_datetime', 'ehail_fee', 'total_amount'])


    if taxi_type == "yellow":
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])

        df['store_and_fwd_flag'] = df['store_and_fwd_flag'] == 'Y'
        df['store_and_fwd_flag'] = df['store_and_fwd_flag'].astype(int)

        df['trip_time'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
        df['trip_time'] = df['trip_time'].values.astype(float)//10**9

        df['month'] = df['tpep_pickup_datetime'].dt.month
        df['weekday'] = df['tpep_pickup_datetime'].dt.weekday
        df['hour'] = df['tpep_pickup_datetime'].dt.hour

        df = df.drop(columns=['tpep_pickup_datetime', 'tpep_dropoff_datetime', 'airport_fee', 'total_amount'])

    if taxi_type == "fhv":
        df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
        df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
        df['request_datetime'] = pd.to_datetime(df['request_datetime'])

        for flag in ['shared_request_flag', 'shared_match_flag', 'access_a_ride_flag', 'wav_request_flag', 'wav_match_flag']:
            df[flag] = df[flag] == 'Y'
            df[flag] = df[flag].astype(int)

        df['trip_time'] = df['dropoff_datetime'] - df['pickup_datetime']
        df['trip_time'] = df['trip_time'].values.astype(float)//10**9

        df['wait_time'] = df['pickup_datetime'] - df['request_datetime']
        df['wait_time'] = df['wait_time'].values.astype(float)//10**9

        df['month'] = df['pickup_datetime'].dt.month
        df['weekday'] = df['pickup_datetime'].dt.weekday
        df['hour'] = df['pickup_datetime'].dt.hour

        df = df.drop(columns=['hvfhs_license_num', 'dispatching_base_num', 'originating_base_num', 'airport_fee',
                                        'request_datetime', 'on_scene_datetime', 'pickup_datetime', 'dropoff_datetime',])
    
    df = df.dropna()

    return df

In [126]:
shapefile = gpd.read_file("../taxi_zones")
location_to_borough_mapping = dict(zip(shapefile['LocationID'], shapefile['borough']))
borough_to_borough_idx_mappng = {'Manhattan': 0, 'Queens': 1, 'Brooklyn': 2, 'Bronx': 3, 'EWR': 4, 'Staten Island': 5}

location_to_borough_idx_mapping = dict(zip(shapefile['LocationID'], shapefile['borough'].map(borough_to_borough_idx_mappng)))
shapefile['borough_id'] = shapefile['borough'].map(borough_to_borough_idx_mappng)


In [127]:
for name in names:
    # change id, map borough to id
    csv_dict[name]["train"]["PUBoroughID"] = (csv_dict[name]["train"]["PULocationID"]-1).map(location_to_borough_idx_mapping)
    csv_dict[name]["train"]["DOBoroughID"] = (csv_dict[name]["train"]["DOLocationID"]-1).map(location_to_borough_idx_mapping)
    csv_dict[name]["valid"]["PUBoroughID"] = (csv_dict[name]["valid"]["PULocationID"]-1).map(location_to_borough_idx_mapping)
    csv_dict[name]["valid"]["DOBoroughID"] = (csv_dict[name]["valid"]["DOLocationID"]-1).map(location_to_borough_idx_mapping)

    # drop nans on PUBoroughID, DOBoroughID
    len_train_before = len(csv_dict[name]["train"])
    len_valid_before = len(csv_dict[name]["valid"])

    csv_dict[name]["train"].dropna(subset=["PUBoroughID", "DOBoroughID"], inplace=True)
    csv_dict[name]["valid"].dropna(subset=["PUBoroughID", "DOBoroughID"], inplace=True)

    csv_dict[name]["train"] = update_columns(csv_dict[name]["train"], name)
    csv_dict[name]["valid"] = update_columns(csv_dict[name]["valid"], name)

    len_train_after = len(csv_dict[name]["train"])
    len_valid_after = len(csv_dict[name]["valid"])

    print(f"{name} - train reduced: {(len_train_before-len_train_after)/len_train_before}%, num samples: {len_train_before-len_train_after}")
    print(f"{name} - valid reduced: {(len_valid_before-len_valid_after)/len_valid_before}%, num samples: {len_valid_before-len_valid_after}")



fhv - train reduced: 0.04675354286691461%, num samples: 46752
fhv - valid reduced: 0.046842594722158334%, num samples: 23422
green - train reduced: 0.009437283118493555%, num samples: 9437
green - valid reduced: 0.00951161953521859%, num samples: 4756
yellow - train reduced: 0.005849181324621063%, num samples: 5849
yellow - valid reduced: 0.0057577927194620996%, num samples: 2879


## Train regression model

In [128]:
csv_dict[name]["train"].columns

Index(['VendorID', 'passenger_count', 'trip_distance', 'RatecodeID',
       'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'payment_type',
       'fare_amount', 'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'congestion_surcharge', 'retail_price',
       'temperature_2m (°C)', 'relative_humidity_2m (%)', 'dew_point_2m (°C)',
       'apparent_temperature (°C)', 'precipitation (mm)', 'rain (mm)',
       'snowfall (cm)', 'snow_depth (m)', 'surface_pressure (hPa)',
       'cloud_cover (%)', 'wind_speed_10m (km/h)', 'is_day ()', 'PUBoroughID',
       'DOBoroughID', 'trip_time', 'month', 'weekday', 'hour'],
      dtype='object')

In [129]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

for name in names:
    scaler = MinMaxScaler()
    tip_label = [label for label in csv_dict[name]["train"].columns if "tip" in label][0]

    y_train = csv_dict[name]["train"].copy()[tip_label].to_numpy()
    y_val = csv_dict[name]["valid"].copy()[tip_label].to_numpy()

    x_train = csv_dict[name]["train"].copy().drop(columns=[tip_label]).to_numpy()
    x_val = csv_dict[name]["valid"].copy().drop(columns=[tip_label]).to_numpy()

    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)


    model = LinearRegression()

    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)

    print("====================")
    print(name)
    print("Mean Squared Error:", mse)
    print("Mean Absolute Error:", mae)
    print("Coefficients:", model.coef_)
    print("Intercept:", model.intercept_)
    print("====================")


fhv
Mean Squared Error: 4.634459390855498
Mean Absolute Error: 1.2076065665555729
Coefficients: [ 1.14763642e-02  3.04155523e-03 -7.50486512e+00 -1.08085120e+01
  1.61764647e+01  2.54430433e+00  1.45238411e+01  7.06464344e+00
  6.25838932e-01  1.10321964e+01  1.29037965e-01  1.84089379e-01
 -7.71150991e-02  8.17171175e-02 -7.36178179e-02  1.33669326e-01
 -8.62463779e-01  1.14822706e-02 -5.05785535e-01  1.33511294e+00
 -5.54391149e+01  5.54966662e+01  7.21090827e+00 -1.42565339e-01
 -3.13153731e-02  3.22218704e-02  1.04549289e-01  1.19423358e-01
 -1.07065984e-01 -1.14407463e-01 -3.27681050e-01  8.09519432e-02
 -7.37092096e-02  7.49998501e-02]
Intercept: -0.417857295368977
green
Mean Squared Error: 6.683244293988039
Mean Absolute Error: 1.3983482675412022
Coefficients: [ 5.78810666e-01  1.20580004e-01  1.16809719e+01  1.06099068e-01
 -9.70010736e-02  6.29694650e-01 -2.68496017e-01  3.81232313e+01
  4.07895032e+00 -6.23860520e+00  1.70673167e+00  2.53097891e+00
  1.34374734e-11 -1.1391137

In [50]:
tip_label

'tip_amount'

In [61]:
csv_dict[name]["valid"][tip_label]

0         18.55
1          2.96
2          1.66
3          2.76
4          1.76
          ...  
500013     4.45
500014     3.28
500015     3.35
500016     6.52
500017     4.54
Name: tip_amount, Length: 497139, dtype: float64

In [69]:
x_train.shape

(994120, 34)

In [68]:
scaler.fit_transform(x_train, y_train).shape

(994120, 34)

In [None]:
train_df = train_df.dropna()

In [None]:
import missingno as msno

msno.bar(train_df)