In [33]:
import os
from tqdm import tqdm

import numpy as np
import pandas as pd
from mpl_toolkits.basemap import Basemap

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Loading data

In [2]:
# CSV files read by the loading cells below; the paths are relative to the
# directory the notebook is run from.
TRAIN_PATH = "data/processed_data/train_df.csv"
TEST_PATH = "data/processed_data/test_df.csv"

In [5]:
def load_nyc_taxi_fare(path, col_types, chunksize=None, datetime_format="%Y-%m-%d %H:%M:%S UTC"):
    """Read selected columns of a taxi-fare CSV and parse ``pickup_datetime``.

    Parameters
    ----------
    path : str
        CSV file to read.
    col_types : dict
        Maps column name -> dtype. Only these columns are read, and the
        mapping is also passed to ``read_csv`` as ``dtype``. It must contain
        ``"pickup_datetime"``.
    chunksize : int or None
        ``None`` reads the file in one go. Otherwise the file is read in
        chunks of this many rows (with a tqdm progress display) and the
        chunks are concatenated.
    datetime_format : str
        ``strptime`` format used to parse ``pickup_datetime``.

    Returns
    -------
    pandas.DataFrame
        The requested columns, with ``pickup_datetime`` converted to
        timezone-aware (UTC) timestamps.
    """
    def _parse_pickup(frame):
        # Passing an explicit format matters: without it the conversion
        # would take an extremely long time on this data.
        frame["pickup_datetime"] = pd.to_datetime(frame["pickup_datetime"],
                                                  utc=True, format=datetime_format)
        return frame

    loaded = pd.read_csv(path, usecols=col_types.keys(), dtype=col_types, chunksize=chunksize)

    # Without a chunk size, read_csv hands back the complete DataFrame.
    if chunksize is None:
        return _parse_pickup(loaded)

    # With a chunk size it yields DataFrames one chunk at a time; tqdm
    # monitors progress while each chunk is parsed.
    return pd.concat([_parse_pickup(chunk) for chunk in tqdm(loaded)])

In [16]:
# dtype of every training column that is read from the CSV
train_types = dict(
    fare_amount="float32",
    pickup_datetime="str",
    pickup_longitude="float32",
    pickup_latitude="float32",
    dropoff_longitude="float32",
    dropoff_latitude="float32",
    passenger_count="uint8",
)

# Read the training file in 5M-row chunks; timestamps in this file carry a "+00:00" suffix.
X_train = load_nyc_taxi_fare(
    TRAIN_PATH,
    train_types,
    chunksize=5_000_000,
    datetime_format="%Y-%m-%d %H:%M:%S+00:00",
)
X_train.shape

11it [04:55, 26.21s/it]


(54254481, 7)

In [18]:
# The test file is read with the training dtypes minus the target column, plus the "key" column.
test_types = {col: dtype for col, dtype in train_types.items() if col != "fare_amount"}
test_types["key"] = "str"
print("test_types:", test_types)

X_test = load_nyc_taxi_fare(
    TEST_PATH,
    test_types,
    datetime_format="%Y-%m-%d %H:%M:%S+00:00",
)
X_test.shape

test_types: {'pickup_datetime': 'str', 'pickup_longitude': 'float32', 'pickup_latitude': 'float32', 'dropoff_longitude': 'float32', 'dropoff_latitude': 'float32', 'passenger_count': 'uint8', 'key': 'str'}


(9914, 7)

In [10]:
# Separate the target from the features.
target_col = "fare_amount"
y_train = X_train[target_col]
X_train = X_train.drop(columns=[target_col])

In [19]:
y_train.head()

0     4.5
1    16.9
2     5.7
3     7.7
4     5.3
Name: fare_amount, dtype: float32

In [14]:
# Preview the training features. The frame is bound to `X_train`; the name
# `x_train` is never defined in this notebook and raises NameError on a fresh kernel.
X_train.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21+00:00,-73.844315,40.721317,-73.841614,40.712276,1
1,2010-01-05 16:52:16+00:00,-74.016045,40.711304,-73.979271,40.782005,1
2,2011-08-18 00:35:00+00:00,-73.982735,40.761269,-73.991241,40.750561,2
3,2012-04-21 04:30:42+00:00,-73.987129,40.733143,-73.99157,40.758091,1
4,2010-03-09 07:51:00+00:00,-73.968094,40.768009,-73.956657,40.783764,1


In [21]:
X_test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.74614,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751637,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1


In [22]:
# Keep the "key" column aside and remove it from the test features.
key_col = "key"
test_key = X_test[key_col]
X_test = X_test.drop(columns=[key_col])

In [23]:
test_key.head()

0    2015-01-27 13:08:24.0000002
1    2015-01-27 13:08:24.0000003
2    2011-10-08 11:53:44.0000002
3    2012-12-01 21:12:12.0000002
4    2012-12-01 21:12:12.0000003
Name: key, dtype: object

In [24]:
X_test.head()

Unnamed: 0,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24+00:00,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24+00:00,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44+00:00,-73.982521,40.751259,-73.979652,40.74614,1
3,2012-12-01 21:12:12+00:00,-73.981163,40.767807,-73.990448,40.751637,1
4,2012-12-01 21:12:12+00:00,-73.966049,40.789776,-73.988564,40.744427,1


# Feature extraction

We collect the functions that generate new columns in `data_cleaning_and_EDA.ipynb` and combine them into a single function that works for both the train and test sets.

In [25]:
def distance(lon1, lat1, lon2, lat2):
    """Haversine distance, in kilometres, between two points given in degrees.

    Each argument may be a scalar, a NumPy array or a pandas Series; the usual
    NumPy broadcasting rules apply, so a single reference point can be compared
    against whole columns. Series are unwrapped to their underlying arrays, so
    the result is a NumPy array (or a NumPy scalar), never a Series.
    """
    def _to_radians(coord):
        # Work on the raw array behind a Series.
        if isinstance(coord, pd.Series):
            coord = coord.values
        # Arrays are widened to float64 so the trigonometry below is not done
        # at the (float32) storage precision of the input.
        if isinstance(coord, np.ndarray):
            coord = np.asarray(coord, dtype=np.float64)
        return np.radians(coord)

    lam1, phi1, lam2, phi2 = (_to_radians(c) for c in (lon1, lat1, lon2, lat2))

    # Haversine term written with cosines: (1 - cos x) / 2 equals sin^2(x / 2).
    lat_term = 0.5 - 0.5*np.cos(phi2 - phi1)
    lon_term = np.cos(phi1)*np.cos(phi2)*(1 - np.cos(lam2 - lam1))*0.5

    # 12742 km is twice 6371 km, the sphere radius this formula uses.
    return 12742 * np.arcsin(np.sqrt(lat_term + lon_term))

def add_distance_col(df):
    """Add a float32 ``distance`` column: pickup-to-dropoff distance in km.

    The column is written into ``df`` itself, and the same frame is returned.
    """
    trip_km = distance(df["pickup_longitude"], df["pickup_latitude"],
                       df["dropoff_longitude"], df["dropoff_latitude"])
    # Computed in float64, stored as float32 to keep the frame small.
    df["distance"] = np.asarray(trip_km, dtype=np.float32)
    return df

In [26]:
# Airport reference points in decimal degrees. The comment above each pair lists
# the coordinates as "latitude, longitude"; the constants store them separately.
# NOTE(review): presumably these are the JFK, Newark (EWR) and LaGuardia (LGA)
# airports — confirm the coordinate source.

# JFK: 40.639722, -73.778889
JFK_LON = -73.778889
JFK_LAT = 40.639722

# EWR: 40.6925, -74.168611
EWR_LON = -74.168611
EWR_LAT = 40.6925

# LGA: 40.77725, -73.872611
LGA_LON = -73.872611
LGA_LAT = 40.77725

def is_to_airport(df, airport_lon, airport_lat, thres=3):
    """Return a boolean array marking rows whose drop-off point is closer than
    ``thres`` kilometres to the given airport coordinates."""
    km_from_airport = distance(airport_lon, airport_lat,
                               df["dropoff_longitude"], df["dropoff_latitude"])
    is_near = km_from_airport < thres
    return is_near

def is_from_airport(df, airport_lon, airport_lat, thres=3.):
    """Return a boolean array marking rows whose pickup point is closer than
    ``thres`` kilometres to the given airport coordinates."""
    km_from_airport = distance(airport_lon, airport_lat,
                               df["pickup_longitude"], df["pickup_latitude"])
    is_near = km_from_airport < thres
    return is_near

def mark_airport_trip(df, 
                      jfk_lon=JFK_LON, jfk_lat=JFK_LAT, 
                      ewr_lon=EWR_LON, ewr_lat=EWR_LAT,
                      lga_lon=LGA_LON, lga_lat=LGA_LAT):
    """Add a ``from_to_airport`` column labelling trips that start or end near an airport.

    Every row starts as ``"No"``. A row is relabelled ``"JFK"``, ``"EWR"`` or
    ``"LGA"`` when its pickup or drop-off point is within the default
    threshold of ``is_to_airport`` / ``is_from_airport`` of that airport.
    The column is written into ``df`` itself, and the same frame is returned.
    """
    df["from_to_airport"] = "No"

    # Order matters: a later airport overwrites the label set by an earlier
    # one if a trip happens to be near both.
    airports = (
        ("JFK", jfk_lon, jfk_lat),
        ("EWR", ewr_lon, ewr_lat),
        ("LGA", lga_lon, lga_lat),
    )
    for label, lon, lat in airports:
        near_airport = is_to_airport(df, lon, lat) | is_from_airport(df, lon, lat)
        df.loc[near_airport, "from_to_airport"] = label

    return df

In [27]:
# Coordinates of New York City (taken from Google), in decimal degrees
NYC_LON = -74.006
NYC_LAT = 40.7128

def add_pickup_to_center_dist_col(df, nyc_lon=NYC_LON, nyc_lat=NYC_LAT):
    """Add a float32 ``pickup_to_center_dist`` column: distance in km from the
    reference point (``nyc_lon``, ``nyc_lat``) to each pickup location.

    The column is written into ``df`` itself, and the same frame is returned.
    """
    km_to_center = distance(nyc_lon, nyc_lat,
                            df["pickup_longitude"], df["pickup_latitude"])
    # Computed in float64, stored as float32.
    df["pickup_to_center_dist"] = np.asarray(km_to_center, dtype=np.float32)
    return df


def add_dropoff_to_center_dist_col(df, nyc_lon=NYC_LON, nyc_lat=NYC_LAT):
    """Add a float32 ``dropoff_to_center_dist`` column: distance in km from the
    reference point (``nyc_lon``, ``nyc_lat``) to each drop-off location.

    The column is written into ``df`` itself, and the same frame is returned.
    """
    km_to_center = distance(nyc_lon, nyc_lat,
                            df["dropoff_longitude"], df["dropoff_latitude"])
    # Computed in float64, stored as float32.
    df["dropoff_to_center_dist"] = np.asarray(km_to_center, dtype=np.float32)
    return df

In [28]:
def direction(lons_1, lats_1, lons_2, lats_2):
    """Angle, in degrees, of the displacement from point 1 to point 2.

    Both points are first mapped to x/y with a default-constructed Basemap.
    The angle is measured from the positive x axis towards the positive y
    axis and falls in (-180, 180]. Where both displacement components are
    zero the ratio below is 0/0 and the result is NaN.

    The inputs must be sized sequences (``len`` is taken of the displacement).
    NOTE(review): the x/y values are whatever ``Basemap()`` returns for its
    default projection — confirm that this is the intended projection.
    """
    projection = Basemap()

    x_start, y_start = projection(lons_1, lats_1)
    x_end, y_end = projection(lons_2, lats_2)

    dx = x_end - x_start
    dy = y_end - y_start

    # Sine of the angle: opposite side over the length of the displacement.
    sin_angle = dy / np.sqrt(dx*dx + dy*dy)

    dx_nonneg = dx >= 0
    dx_neg = dx < 0
    dx_neg_dy_nonneg = dx_neg & (dy >= 0)
    dx_neg_dy_neg = dx_neg & (dy < 0)

    angle = np.zeros(len(dx))

    # Right half-plane: arcsin already yields the angle in [-pi/2, pi/2].
    angle[dx_nonneg] = np.arcsin(sin_angle[dx_nonneg])
    # Upper-left quadrant: mirror around the y axis.
    angle[dx_neg_dy_nonneg] = np.pi - np.arcsin(sin_angle[dx_neg_dy_nonneg])
    # Lower-left quadrant: mirror and stay on the negative side.
    angle[dx_neg_dy_neg] = -np.pi - np.arcsin(sin_angle[dx_neg_dy_neg])

    # radians -> degrees
    return 180 / np.pi * angle

def add_direction_col(df):
    """Add a float32 ``direction`` column: the angle in degrees from the pickup
    point to the drop-off point, as computed by ``direction``.

    The column is written into ``df`` itself, and the same frame is returned.
    """
    heading_deg = direction(df["pickup_longitude"], df["pickup_latitude"],
                            df["dropoff_longitude"], df["dropoff_latitude"])
    df["direction"] = np.asarray(heading_deg, dtype=np.float32)
    return df

def impute_nan_direction(df):
    """Replace missing ``direction`` values with the median of that column.

    The median is taken over the frame that is passed in. The column is
    updated in ``df`` itself, and the same frame is returned.
    """
    median_direction = df["direction"].median()
    is_missing = df["direction"].isna()
    df.loc[is_missing, "direction"] = median_direction
    return df

In [29]:
def ad_time_cols(df):
    """Add ``year``, ``month``, ``weekday`` and ``hour`` columns from ``pickup_datetime``.

    ``pickup_datetime`` must be a datetime column. The parts are read through
    the vectorised ``.dt`` accessor rather than a per-row ``apply``: applied per
    element, ``t.weekday`` is a *method* of ``Timestamp`` (not a value), so the
    previous ``apply(lambda t: t.weekday)`` produced bound methods that cannot
    be cast to ``uint8``.

    ``weekday`` follows the pandas convention Monday=0 ... Sunday=6.
    The columns are written into ``df`` itself, and the same frame is returned.
    """
    pickup = df["pickup_datetime"].dt
    df["year"] = pickup.year.astype(np.int32)
    df["month"] = pickup.month.astype(np.uint8)
    df["weekday"] = pickup.weekday.astype(np.uint8)
    df["hour"] = pickup.hour.astype(np.uint8)
    return df


# Correctly spelled alias; the original name stays because
# extract_new_features calls it.
add_time_cols = ad_time_cols

Combine all of the functions above into a single feature-extraction function.

In [39]:
# Raw columns that are dropped once the engineered features have been derived from them.
COLS_TO_REMOVE = ["pickup_datetime", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude"]

def extract_new_features(df, cols_to_remove=COLS_TO_REMOVE):
    """Run every feature-engineering step on ``df`` and drop the raw columns.

    The steps run in a fixed order (the direction imputation must follow the
    step that creates ``direction``). Each step writes its columns into the
    frame it receives, so the caller's ``df`` gains the new columns too; the
    returned frame is the result of ``drop`` and no longer has ``cols_to_remove``.
    """
    steps = (
        add_distance_col,
        mark_airport_trip,
        add_pickup_to_center_dist_col,
        add_dropoff_to_center_dist_col,
        add_direction_col,
        impute_nan_direction,
        ad_time_cols,
    )
    for step in steps:
        df = step(df)

    return df.drop(cols_to_remove, axis=1)

In [40]:
# Build the engineered features for both splits with the same pipeline.
# extract_new_features drops the raw datetime/coordinate columns it reads from,
# so this cell cannot be re-run on its own output; reload the data first.
X_train = extract_new_features(X_train)
X_test = extract_new_features(X_test)



In [42]:
X_train.head()

Unnamed: 0,fare_amount,passenger_count,distance,from_to_airport,pickup_to_center_dist,dropoff_to_center_dist,direction,year,month,weekday,hour
0,4.5,1,1.030742,No,13.659573,13.855385,-73.367325,2009,6,0,17
1,16.9,1,8.450001,No,0.862809,8.01794,62.519924,2010,1,1,16
2,5.7,2,1.389632,No,5.734865,4.379088,-128.465118,2011,8,3,0
3,7.7,1,2.799211,No,2.765087,5.180817,100.091927,2012,4,5,4
4,5.3,1,1.999081,No,6.919971,8.918654,54.0238,2010,3,1,7


In [44]:
X_test.head()

Unnamed: 0,passenger_count,distance,from_to_airport,pickup_to_center_dist,dropoff_to_center_dist,direction,year,month,weekday,hour
0,1,2.32326,No,6.304552,4.024398,-112.102699,2015,1,1,13
1,1,2.425353,No,1.771281,2.996195,121.246758,2015,1,1,13
2,1,0.618412,No,4.711865,4.321139,-60.735573,2011,10,5,11
3,1,1.960778,No,6.46453,4.512865,-119.864113,2012,12,5,21
4,1,5.38728,No,9.197128,3.811321,-116.402885,2012,12,5,21


One-hot encoding.

In [58]:
# Preview of the categorical conversion on the (small) test frame.
# "year" is included so that this cell agrees with its `X_tmp.info()` output
# (where `year` is shown as category) and with the CAT_COLS used by `onehot_encode`.
CAT_COLS = ("from_to_airport", "year", "month", "weekday", "hour")
type_dict = {col: "category" for col in CAT_COLS}
X_tmp = X_test.astype(type_dict)

In [62]:
X_tmp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9914 entries, 0 to 9913
Data columns (total 10 columns):
passenger_count           9914 non-null uint8
distance                  9914 non-null float32
from_to_airport           9914 non-null category
pickup_to_center_dist     9914 non-null float32
dropoff_to_center_dist    9914 non-null float32
direction                 9914 non-null float32
year                      9914 non-null category
month                     9914 non-null category
weekday                   9914 non-null category
hour                      9914 non-null category
dtypes: category(5), float32(4), uint8(1)
memory usage: 215.2 KB


In [63]:
# Columns that are one-hot encoded; every other column is passed through unchanged.
CAT_COLS = ("from_to_airport", "year", "month", "weekday", "hour")

def onehot_encode(X_train, X_test, cat_cols=CAT_COLS):
    """One-hot encode ``cat_cols`` in both frames and keep only the shared columns.

    Each frame is encoded independently with ``pd.get_dummies``. The two
    results are then aligned on their columns with an inner join, so a dummy
    column that appears in only one of the frames is dropped from both.

    Returns the pair ``(X_train_ohe, X_test_ohe)``.
    """
    as_category = dict.fromkeys(cat_cols, "category")

    train_ohe, test_ohe = (
        pd.get_dummies(frame.astype(as_category)) for frame in (X_train, X_test)
    )

    train_aligned, test_aligned = train_ohe.align(test_ohe, join='inner', axis=1)
    return train_aligned, test_aligned

In [67]:
# Replace both frames with their one-hot encoded versions, aligned to the same columns.
X_train, X_test = onehot_encode(X_train, X_test)

In [69]:
X_train.shape

(54254481, 59)

In [53]:
# Scratch check of `drop_first=True`.
# NOTE(review): the saved output has execution count 53, i.e. it was produced
# before the one-hot cell (67) ran, so it does not reflect a top-to-bottom run
# in which X_train is already encoded at this point. Consider removing this cell.
pd.get_dummies(X_train, drop_first=True).head()

Unnamed: 0,fare_amount,passenger_count,distance,pickup_to_center_dist,dropoff_to_center_dist,direction,year,month,weekday,hour,from_to_airport_JFK,from_to_airport_LGA,from_to_airport_No
0,4.5,1,1.030742,13.659573,13.855385,-73.367325,2009,6,0,17,0,0,1
1,16.9,1,8.450001,0.862809,8.01794,62.519924,2010,1,1,16,0,0,1
2,5.7,2,1.389632,5.734865,4.379088,-128.465118,2011,8,3,0,0,0,1
3,7.7,1,2.799211,2.765087,5.180817,100.091927,2012,4,5,4,0,0,1
4,5.3,1,1.999081,6.919971,8.918654,54.0238,2010,3,1,7,0,0,1
