# New York Taxirides to Events

## Imports

In [1]:
import concurrent.futures
import itertools
import os
import os.path as op
import shutil
import time
import uuid
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyproj
import pytz
import requests
import tqdm

In [2]:
pd.set_option("max_columns", 100)
pd.set_option("max_rows", 1000)

In [3]:
%matplotlib inline

## Parameters

In [4]:
NOTEBOOK_NAME = "new_york_taxirides_to_events"
try:
    os.mkdir(NOTEBOOK_NAME)
except OSError:
    pass

In [5]:
URLS = [
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv",
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv",
]

In [6]:
try:
    %load_ext autoreload
    %autoreload 2
except Exception:
    print("No autoreload")

## Create events dataset

### Load New York taxirides data

In [7]:
def download_file(url, outfile):
    with requests.get(url, stream=True) as r:
        with open(outfile, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    return outfile

In [8]:
dfs = []
for url in URLS:
    output_file = os.path.join(NOTEBOOK_NAME, url.split("/")[-1])
    try:
        df = pd.read_csv(output_file)
    except IOError:
        download_file(URL, output_file)
        df = pd.read_csv(output_file)
    dfs.append(df)
        
new_york_taxirides = pd.concat(dfs, ignore_index=True)

In [9]:
new_york_taxirides.head(2)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,pickup_longitude,pickup_latitude,RatecodeID,store_and_fwd_flag,dropoff_longitude,dropoff_latitude,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount
0,2,2015-12-01 00:00:00,2015-12-01 00:05:16,5,0.96,-73.979942,40.765381,1,N,-73.966309,40.763088,1,5.5,0.5,0.5,1.0,0.0,0.3,7.8
1,2,2015-12-01 00:00:00,2015-12-01 00:00:00,2,2.69,-73.972336,40.762379,1,N,-73.993629,40.745998,1,21.5,0.0,0.5,3.34,0.0,0.3,25.64


### Validate data

In [10]:
assert len(new_york_taxirides) > (0.7 * len(new_york_taxirides[["pickup_longitude", "pickup_latitude"]].drop_duplicates()))

In [11]:
assert len(new_york_taxirides) > (0.7 * len(new_york_taxirides[["dropoff_longitude", "dropoff_latitude"]].drop_duplicates()))

### Create `events_df`

In [12]:
num_events = len(new_york_taxirides)

events = {
    "event_type": np.array(["start"] * num_events + ["stop"] * num_events),
    # TODO(ostrokach): this should be a pyarrow timestamp type.
    "timestamp": np.r_[
        new_york_taxirides["tpep_pickup_datetime"],
        new_york_taxirides["tpep_dropoff_datetime"],
    ],
    "longitude": np.r_[
        new_york_taxirides["pickup_longitude"], new_york_taxirides["dropoff_longitude"]
    ],
    "latitude": np.r_[
        new_york_taxirides["pickup_latitude"], new_york_taxirides["dropoff_latitude"]
    ],
    "trip_miles": np.r_[
        np.array([np.nan] * num_events), new_york_taxirides["trip_distance"]
    ],
    "trip_total": np.r_[
        np.array([np.nan] * num_events), new_york_taxirides["total_amount"]
    ],
}

In [13]:
events_df = pd.DataFrame(events)

### Add timestamp

In [14]:
def timestamp_to_seconds(timestamp_str):
    from datetime import datetime
    import pytz

    dt = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S")
    dt = dt.replace(tzinfo=pytz.UTC)  # .astimezone(pytz.timezone('America/Chicago'))
    unix_dt = datetime.utcfromtimestamp(0).replace(tzinfo=pytz.UTC)
    dt_delta = (dt - unix_dt).total_seconds()
    return dt_delta

timestamp_to_seconds("2016-01-01 00:00:00")

1451606400.0

In [15]:
with concurrent.futures.ProcessPoolExecutor() as p:
    events_df["timestamp_seconds"] = list(
        tqdm.tqdm_notebook(
            p.map(
                timestamp_to_seconds,
                (ts for ts in events_df["timestamp"].values),
                chunksize=1000,
            ),
            total=len(events_df),
        )
    )

HBox(children=(IntProgress(value=0, max=44734862), HTML(value='')))




### Add UTML coords

In [16]:
def geographic_to_utm(longitude, latitude, _cache={}):
    if (longitude, latitude) in _cache:
        return _cache[(longitude, latitude)]

    from pyproj import Proj, transform

    x, y = transform(
        Proj(init="epsg:4326"), Proj(init="epsg:3857"), longitude, latitude
    )

    _cache[(longitude, latitude)] = (x, y)
    return x, y


geographic_to_utm(-87.632746, 41.880994)

(-9755232.661536282, 5143169.558268635)

In [17]:
r_major = 6378137.000

events_df["utm_x"] = (r_major * 2 * np.pi / 360) * events_df["longitude"]
scale = events_df["utm_x"] / events_df["longitude"]
events_df["utm_y"] = (
    180.0 / np.pi * np.log(np.tan((np.pi / 4.0) + events_df["latitude"] * (np.pi / 180.0 / 2.0))) * scale
)
events_df["utm_y"] = events_df["utm_y"].fillna(0)

In [18]:
import unittest

for row in itertools.islice(events_df.itertuples(), 100):
    utm_x, utm_y = geographic_to_utm(row.longitude, row.latitude)
    np.testing.assert_almost_equal(utm_x, row.utm_x)
    np.testing.assert_almost_equal(utm_y, row.utm_y)

### Sort table by timestamp

In [19]:
events_df = events_df.sort_values("timestamp", ascending=True)

## Export to parquet

In [20]:
events_df.head()

Unnamed: 0,event_type,timestamp,longitude,latitude,trip_miles,trip_total,timestamp_seconds,utm_x,utm_y
28336433,stop,1900-01-01 00:00:00,0.0,0.0,0.0,0.0,-2208989000.0,0.0,0.0
0,start,2015-12-01 00:00:00,-73.979942,40.765381,,,1448928000.0,-8235410.0,4977797.0
22367433,stop,2015-12-01 00:00:00,-73.974548,40.791641,2.62,21.36,1448928000.0,-8234809.0,4981657.0
22367432,stop,2015-12-01 00:00:00,-73.993629,40.745998,2.69,25.64,1448928000.0,-8236933.0,4974948.0
1,start,2015-12-01 00:00:00,-73.972336,40.762379,,,1448928000.0,-8234563.0,4977356.0


In [21]:
events_df_filtered = events_df[
    (events_df["timestamp"] != "1900-01-01 00:00:00") &
    (events_df["longitude"] != 0) &
    (events_df["latitude"] != 0)
]

In [22]:
events_df_filtered.head()

Unnamed: 0,event_type,timestamp,longitude,latitude,trip_miles,trip_total,timestamp_seconds,utm_x,utm_y
0,start,2015-12-01 00:00:00,-73.979942,40.765381,,,1448928000.0,-8235410.0,4977797.0
22367433,stop,2015-12-01 00:00:00,-73.974548,40.791641,2.62,21.36,1448928000.0,-8234809.0,4981657.0
22367432,stop,2015-12-01 00:00:00,-73.993629,40.745998,2.69,25.64,1448928000.0,-8236933.0,4974948.0
1,start,2015-12-01 00:00:00,-73.972336,40.762379,,,1448928000.0,-8234563.0,4977356.0
2,start,2015-12-01 00:00:00,-73.968849,40.76453,,,1448928000.0,-8234175.0,4977672.0


In [23]:
output_file = op.join(NOTEBOOK_NAME, "new-york-taxi-events.parquet")
print(output_file)

new_york_taxirides_to_events/new-york-taxi-events.parquet


In [24]:
table = pa.Table.from_pandas(events_df_filtered, preserve_index=False)
pq.write_table(table, output_file, row_group_size=20000, flavor="spark", version="2.0")

In [25]:
pt = pq.ParquetFile(output_file)
pt.num_row_groups

2204

In [26]:
pt.read_row_group(0).to_pandas(integer_object_nulls=True).head()

Unnamed: 0,event_type,timestamp,longitude,latitude,trip_miles,trip_total,timestamp_seconds,utm_x,utm_y
0,start,2015-12-01 00:00:00,-73.979942,40.765381,,,1448928000.0,-8235410.0,4977797.0
1,stop,2015-12-01 00:00:00,-73.974548,40.791641,2.62,21.36,1448928000.0,-8234809.0,4981657.0
2,stop,2015-12-01 00:00:00,-73.993629,40.745998,2.69,25.64,1448928000.0,-8236933.0,4974948.0
3,start,2015-12-01 00:00:00,-73.972336,40.762379,,,1448928000.0,-8234563.0,4977356.0
4,start,2015-12-01 00:00:00,-73.968849,40.76453,,,1448928000.0,-8234175.0,4977672.0
