# New York Taxirides to Events

Parse the New York City yellow-taxi trip records into `start` and `stop` events (one of each per ride) for use in streaming Apache Beam demos.

## Imports

In [None]:
import concurrent.futures
import itertools
import os
import os.path as op
import shutil
import time
import uuid
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyproj
import pytz
import requests
import tqdm

In [None]:
# Show wide / long frames in full when previewing them in the notebook.
# The option names are fully qualified on purpose: the bare "max_columns" and
# "max_rows" patterns are ambiguous on pandas releases that also define
# "styler.render.max_columns" / "styler.render.max_rows", where they raise
# OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 1000)

In [None]:
# IPython magic: select matplotlib's inline backend for this notebook.
%matplotlib inline

## Parameters

In [None]:
# Directory (relative to the working directory) that holds every file this
# notebook downloads or writes.
NOTEBOOK_NAME = "new_york_taxirides_to_events"
# `exist_ok=True` makes re-runs a no-op, while genuine failures (for example a
# permission error) surface here instead of being silently swallowed.
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

In [None]:
# Monthly yellow-cab trip CSVs to process; the file name at the end of each
# URL is reused as the name of the local copy.
URLS = [
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv",
    "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2016-01.csv",
]

In [None]:
# Best-effort: enable IPython's autoreload extension (mode 2). If loading it
# raises, print a note and carry on without it.
try:
    %load_ext autoreload
    %autoreload 2
except Exception:
    print("No autoreload")

## Create events dataset

### Load New York taxirides data

In [None]:
def download_file(url, outfile):
    """Stream the resource at `url` into the local file `outfile`.

    The body is copied straight from the response stream to disk, so the
    whole payload is never held in memory at once.

    Args:
        url: Address of the file to fetch.
        outfile: Destination path; it is created or overwritten.

    Returns:
        `outfile`, unchanged.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
    """
    with requests.get(url, stream=True) as r:
        # Fail before touching the disk; otherwise an error page would be
        # written to `outfile` and later be parsed as if it were the dataset.
        r.raise_for_status()
        # Ask urllib3 to undo any gzip/deflate Content-Encoding so the file
        # contains the real payload rather than the compressed transfer bytes.
        r.raw.decode_content = True
        with open(outfile, "wb") as f:
            shutil.copyfileobj(r.raw, f)
    return outfile

In [None]:
# Load every monthly trip file, downloading it on first use, then stack the
# months into one frame with a fresh 0..n-1 index.
dfs = []
for url in URLS:
    # Local copy lives in the notebook directory under the remote file name.
    output_file = os.path.join(NOTEBOOK_NAME, url.split("/")[-1])
    try:
        df = pd.read_csv(output_file)
    except IOError:
        # No readable local copy: fetch the file for *this* iteration. The
        # loop variable is `url`; there is no module-level `URL`.
        download_file(url, output_file)
        df = pd.read_csv(output_file)
    dfs.append(df)

new_york_taxirides = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the first two rides to eyeball the column layout.
new_york_taxirides.head(2)

### Validate data

In [None]:
# Sanity check relating the ride count to the number of distinct pickup
# coordinate pairs.
# NOTE(review): `drop_duplicates()` never returns more rows than its input, so
# this holds for any non-empty frame -- confirm the comparison is the intended
# way round.
distinct_pickups = new_york_taxirides[["pickup_longitude", "pickup_latitude"]].drop_duplicates()
assert len(new_york_taxirides) > (0.7 * len(distinct_pickups))

In [None]:
# Sanity check relating the ride count to the number of distinct dropoff
# coordinate pairs.
# NOTE(review): `drop_duplicates()` never returns more rows than its input, so
# this holds for any non-empty frame -- confirm the comparison is the intended
# way round.
distinct_dropoffs = new_york_taxirides[["dropoff_longitude", "dropoff_latitude"]].drop_duplicates()
assert len(new_york_taxirides) > (0.7 * len(distinct_dropoffs))

### Create `events_df`

In [None]:
# Every ride contributes two events: a "start" row built from its pickup
# columns and a "stop" row built from its dropoff columns. All "start" rows
# come first, followed by all "stop" rows, both in ride order.
rides = new_york_taxirides
num_events = len(rides)

# Distance and fare are only attached to "stop" events; the "start" half of
# those columns is filled with NaN.
start_placeholder = np.full(num_events, np.nan)

events = {
    "event_type": np.array(["start"] * num_events + ["stop"] * num_events),
    # TODO(ostrokach): this should be a pyarrow timestamp type.
    "timestamp": np.r_[rides["tpep_pickup_datetime"], rides["tpep_dropoff_datetime"]],
    "longitude": np.r_[rides["pickup_longitude"], rides["dropoff_longitude"]],
    "latitude": np.r_[rides["pickup_latitude"], rides["dropoff_latitude"]],
    "trip_miles": np.r_[start_placeholder, rides["trip_distance"]],
    "trip_total": np.r_[start_placeholder, rides["total_amount"]],
}

In [None]:
# Turn the column dict into a DataFrame (one row per start/stop event).
events_df = pd.DataFrame(events)

### Add timestamp in milliseconds

In [None]:
def timestamp_to_milliseconds(timestamp_str):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string to Unix epoch milliseconds.

    The wall-clock value is taken to be UTC; no timezone conversion is applied.
    NOTE(review): the source timestamps are presumably New York local time --
    confirm that treating them as UTC is intended.

    Args:
        timestamp_str: Timestamp formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns:
        Integer number of milliseconds since 1970-01-01 00:00:00 UTC.

    Raises:
        ValueError: If `timestamp_str` does not match the expected format.
    """
    # Imports stay local, as in the original; presumably so the function is
    # self-contained when it runs inside worker processes.
    from datetime import datetime, timedelta, timezone

    dt = datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S").replace(
        tzinfo=timezone.utc
    )
    # Built explicitly rather than via the deprecated `utcfromtimestamp`.
    unix_epoch = datetime(1970, 1, 1, tzinfo=timezone.utc)
    # Dividing two timedeltas with `//` yields an exact int, with no float
    # round trip through `total_seconds()`.
    return (dt - unix_epoch) // timedelta(milliseconds=1)

# Spot-check the conversion on a fixed instant; as the cell's last expression
# its value is shown in the notebook output.
timestamp_to_milliseconds("2016-01-01 00:00:00")

In [None]:
# Convert every timestamp string in a pool of worker processes (inputs are
# shipped in chunks of 1000) and show a notebook progress bar while the
# results are collected into the new column.
with concurrent.futures.ProcessPoolExecutor() as p:
    timestamp_strings = events_df["timestamp"].values
    converted = p.map(
        timestamp_to_milliseconds, iter(timestamp_strings), chunksize=1000
    )
    progress = tqdm.tqdm_notebook(converted, total=len(events_df))
    events_df["timestamp_milliseconds"] = list(progress)

### Add Mercator coordinates

In [None]:
def geographic_to_utm(longitude, latitude, _cache={}):
    """Project a longitude/latitude pair from EPSG:4326 to EPSG:3857.

    Despite the function's name, the target CRS requested from pyproj is
    EPSG:3857, not a UTM zone.

    Results are memoised in `_cache`; the default dict is deliberately shared
    between calls so repeated coordinates are only projected once.

    Args:
        longitude: Longitude to project.
        latitude: Latitude to project.
        _cache: Memo mapping ``(longitude, latitude)`` to ``(x, y)``.

    Returns:
        Tuple ``(x, y)`` of projected coordinates.
    """
    key = (longitude, latitude)
    try:
        return _cache[key]
    except KeyError:
        pass  # Not projected yet; fall through and compute it.

    # Deferred import: only needed on a cache miss.
    from pyproj import Proj, transform

    source_crs = Proj(init="epsg:4326")
    target_crs = Proj(init="epsg:3857")
    x, y = transform(source_crs, target_crs, longitude, latitude)

    point = (x, y)
    _cache[key] = point
    return point


# Try the projection on one sample longitude/latitude pair; as the cell's last
# expression its value is shown in the notebook output.
geographic_to_utm(-87.632746, 41.880994)

In [None]:
# Vectorised spherical-Mercator projection of every event's coordinates.
# Sphere radius in metres.
r_major = 6378137.000

# Metres per degree along the equator. This is a constant, so it is computed
# directly instead of being recovered per row as `utm_x / longitude`: that
# division yields NaN wherever longitude is 0, which then turned a valid
# latitude into utm_y == 0 via the `fillna` below.
scale = r_major * 2 * np.pi / 360

events_df["utm_x"] = scale * events_df["longitude"]
events_df["utm_y"] = (
    180.0 / np.pi * np.log(np.tan((np.pi / 4.0) + events_df["latitude"] * (np.pi / 180.0 / 2.0))) * scale
)
# Rows whose projection is undefined (e.g. a missing latitude) get 0 rather
# than NaN, so the column never contains nulls.
events_df["utm_y"] = events_df["utm_y"].fillna(0)

In [None]:
import unittest

# Cross-check the vectorised columns against `geographic_to_utm` on the first
# 100 rows; each coordinate must agree to numpy's default 7 decimal places.
for row in events_df.head(100).itertuples():
    utm_x, utm_y = geographic_to_utm(row.longitude, row.latitude)
    for reference, vectorised in ((utm_x, row.utm_x), (utm_y, row.utm_y)):
        np.testing.assert_almost_equal(reference, vectorised)

### Sort table by timestamp

In [None]:
# Order events by their timestamp string, ascending (the default direction).
# Presumably the strings are zero-padded `YYYY-MM-DD HH:MM:SS`, in which case
# lexicographic order is chronological order.
events_df = events_df.sort_values(by="timestamp")

### Filter to New York

In [None]:
# x bounds (projected coordinates) for the histogram; values outside the
# bounds are clipped onto them and so pile up in the first / last bin.
MERCATOR_X_RANGE = (-8240000, -8220000)

x_low, x_high = MERCATOR_X_RANGE
clipped_x = np.clip(events_df["utm_x"], x_low, x_high)

fg, ax = plt.subplots()
_ = ax.hist(clipped_x, bins=100, range=MERCATOR_X_RANGE)

In [None]:
# y bounds (projected coordinates) for the histogram; values outside the
# bounds are clipped onto them and so pile up in the first / last bin.
MERCATOR_Y_RANGE = (4950000, 5000000)

y_low, y_high = MERCATOR_Y_RANGE
clipped_y = np.clip(events_df["utm_y"], y_low, y_high)

fg, ax = plt.subplots()
_ = ax.hist(clipped_y, bins=100, range=MERCATOR_Y_RANGE)

In [None]:
# events_df = events_df[
#     (MERCATOR_X_RANGE[0] <= events_df["utm_x"])
#     & (MERCATOR_X_RANGE[1] > events_df["utm_x"])
#     & (MERCATOR_Y_RANGE[0] <= events_df["utm_y"])
#     & (MERCATOR_Y_RANGE[1] > events_df["utm_y"])
# ]

## Export to parquet

In [None]:
# Preview the first rows of the sorted events table before filtering.
events_df.head()

In [None]:
# Keep only events that have a non-placeholder timestamp and non-zero
# coordinates; everything else is dropped from the export.
has_real_timestamp = events_df["timestamp"] != "1900-01-01 00:00:00"
has_longitude = events_df["longitude"] != 0
has_latitude = events_df["latitude"] != 0

events_df_filtered = events_df[has_real_timestamp & has_longitude & has_latitude]

In [None]:
# Preview the first rows that survived the filter.
events_df_filtered.head()

In [None]:
# Path of the exported Parquet file, inside the notebook's directory; printed
# so the location is visible in the notebook output.
output_file = op.join(NOTEBOOK_NAME, "new-york-taxi-events.parquet")
print(output_file)

In [None]:
# Convert the filtered frame to an Arrow table without its pandas index, then
# write it as Parquet in row groups of 20000 rows using the "spark" flavor.
# NOTE(review): newer pyarrow releases may no longer accept version="2.0" --
# confirm against the installed pyarrow.
table = pa.Table.from_pandas(events_df_filtered, preserve_index=False)
pq.write_table(table, output_file, row_group_size=20000, flavor="spark", version="2.0")

In [None]:
# Re-open the file that was just written and show how many row groups it has.
pt = pq.ParquetFile(output_file)
pt.num_row_groups

In [None]:
# Read the first row group back into pandas and preview it, as a check that
# the export can be read back.
pt.read_row_group(0).to_pandas(integer_object_nulls=True).head()