# StreamBasedCache Demo - New York Taxi Rides

**Limitations**

- Caches are not removed automatically when the interpreter exits.
  - Adding a `__del__` method to clean them up would conflict with the overwrite semantics of the cache.

## Install dependencies

In [None]:
# Detect whether the notebook runs inside Google Colab.  The flag gates the
# Colab-specific authentication and the plotting notice in later cells.
try:
    import google.colab
    GOOGLE_COLAB = True
except ImportError:
    # The google.colab package could not be imported.
    GOOGLE_COLAB = False

## Imports

In [None]:
from __future__ import division, print_function

import atexit
import itertools
import json
import os
import time
import uuid
from datetime import datetime

import apache_beam as beam
import dateutil
import pytz
from apache_beam import transforms
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions)
from apache_beam.runners.interactive.caching import streambasedcache
from apache_beam.transforms import combiners, window
from google.api_core import exceptions as gexc
from google.cloud import pubsub_v1

## Parameters

In [None]:
# Name shared by the local scratch directory, the Pub/Sub subscription and the
# temporary cache topic created by this notebook.
NOTEBOOK_NAME = "streambasedcache-new_york_taxirides"

# Create the scratch directory.  Only the "directory already exists" case is
# ignored; any other failure (e.g. a permission error, or a regular file
# occupying the path) is re-raised instead of being silently swallowed.
try:
    os.makedirs(NOTEBOOK_NAME)
except OSError:
    if not os.path.isdir(NOTEBOOK_NAME):
        raise

In [None]:
# { display-mode: "form" }
# `project_id` is used for the pipeline options and to build the subscription
# and cache-topic paths; `remote_temp_location` is passed to the pipeline
# options as `temp_location`.
project_id = "strokach-playground"  #@param {type:"string"}
remote_temp_location = "gs://strokach/dataflow_temp"  #@param {type:"string"}

In [None]:
# Pipeline options used by the pipeline built later in this notebook.
options = PipelineOptions(
    project=project_id,
    temp_location=remote_temp_location,
    streaming=True,
    runner="DirectRunner",
#     runner="DataflowRunner",
    # NOTE(review): points at a locally built SDK tarball under the user's
    # home directory - adjust for the environment the notebook runs in.
    sdk_location=os.path.expanduser(
        "~/workspace/beam/sdks/python/dist/apache-beam-2.15.0.dev0.tar.gz"
    ),
    setup_file="../setup.py"
)
# Show the resulting option values as the cell output.
options.display_data()

In [None]:
# Authenticate the user only when running inside Google Colab.
if GOOGLE_COLAB:
    from google.colab import auth
    auth.authenticate_user()

In [None]:
# Enable the IPython autoreload extension in mode 2 when it is available.
# The `%` lines are IPython magics, so this cell only runs in a notebook.
try:
    %load_ext autoreload
    %autoreload 2
except Exception:
    print("No autoreload")

## Function definitions

In [None]:
def create_cache(location, cache_class, *args, **kwargs):
    """Instantiate ``cache_class`` at a uniquified location, retrying on ``IOError``.

    A random 8-character hex suffix is appended to ``location`` for every
    attempt, and up to three attempts are made.

    Args:
        location: Base location; each attempt uses ``"<location>-<suffix>"``.
        cache_class: Callable invoked as
            ``cache_class(full_location, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``cache_class``.
        **kwargs: Extra keyword arguments forwarded to ``cache_class``.

    Returns:
        Whatever ``cache_class`` returns on the first successful attempt.

    Raises:
        IOError: The error from the last attempt, if all three attempts fail.
    """
    last_error = None
    for _ in range(3):
        full_location = "{}-{}".format(location, uuid.uuid4().hex[:8])
        try:
            return cache_class(full_location, *args, **kwargs)
        except IOError as e:
            # Python 3 unbinds ``e`` when the ``except`` block exits, so keep
            # a separate reference to re-raise after the final attempt.
            last_error = e
    raise last_error

### Create subscription

In [None]:
sub_client = pubsub_v1.SubscriberClient()

In [None]:
project_id

In [None]:
taxirides_topic_path = "projects/pubsub-public-data/topics/taxirides-realtime"
subscription_path = "projects/{}/subscriptions/{}".format(project_id, NOTEBOOK_NAME)


def create_subscription(subscription_path, taxirides_topic_path):
    """Create ``subscription_path`` on ``taxirides_topic_path`` via ``sub_client``."""
    sub_client.create_subscription(subscription_path, taxirides_topic_path)


def delete_subscription(subscription_path):
    """Delete the subscription at ``subscription_path`` via ``sub_client``."""
    sub_client.delete_subscription(subscription_path)


# Start from a fresh subscription: if one already exists, drop it and create
# it again.
try:
    create_subscription(subscription_path, taxirides_topic_path)
except gexc.AlreadyExists:
    delete_subscription(subscription_path)
    create_subscription(subscription_path, taxirides_topic_path)
# Reached only when one of the create calls above succeeded; remove the
# subscription again when the interpreter exits.
atexit.register(delete_subscription, subscription_path)

### Read data from subscription

In [None]:
class LoadPubSubMessage(beam.DoFn):
    """Decode a Pub/Sub message into a dict stamped with its event time."""

    def process(self, message):
        """Yield the JSON payload of ``message`` as a ``TimestampedValue``.

        The event time is parsed from the message's ``ts`` attribute and is
        also stored in the payload under ``timestamp_milliseconds``.
        """
        # Imports stay local to the method, as in the original cell.
        from datetime import datetime
        import json

        import arrow
        import pytz

        from apache_beam.transforms import window
        from apache_beam.utils import timestamp

        # Seconds between the Unix epoch and the (timezone-aware) event time.
        epoch = datetime.utcfromtimestamp(0).replace(tzinfo=pytz.UTC)
        event_dt = arrow.get(message.attributes["ts"]).datetime
        seconds_since_epoch = (event_dt.astimezone(pytz.UTC) - epoch).total_seconds()
        event_time = timestamp.Timestamp(seconds=seconds_since_epoch)

        payload = json.loads(message.data.decode())
        payload["timestamp_milliseconds"] = int(event_time.micros / 1000)
        yield window.TimestampedValue(payload, event_time)


# Sanity-check LoadPubSubMessage on a hand-built message: the payload gains a
# millisecond timestamp and the emitted value carries the parsed event time.
message = PubsubMessage(
    data=json.dumps({"x": "hello"}).encode("utf-8"),
    attributes={"ts": "2019-06-27T20:36:35.4972-04:00"},
)
assert next(LoadPubSubMessage().process(message)).value == {
    u"x": u"hello",
    "timestamp_milliseconds": 1561682195497,
}
assert next(LoadPubSubMessage().process(message)).timestamp.micros == 1561682195497200
# Display the decoded payload as the cell output.
next(LoadPubSubMessage().process(message)).value

In [None]:
class AddMercatorCoords(beam.DoFn):
    """Add projected ``utm_x`` / ``utm_y`` coordinates to each element."""

    def process(self, element):
        """Yield a copy of ``element`` with ``utm_x`` and ``utm_y`` added.

        Args:
            element: Mapping with ``longitude`` and ``latitude`` entries, in
                degrees (the formula converts them to radians).

        Yields:
            A new dict holding the entries of ``element`` plus ``utm_x`` and
            ``utm_y``.
        """
        import numpy as np

        r_major = 6378137.000
        # Projected units per degree.  This is a constant: deriving it from
        # ``utm_x / longitude`` breaks down at longitude 0, where it used to
        # fall back to 0 and force ``utm_y`` to 0 regardless of the latitude.
        scale = r_major * 2 * np.pi / 360

        # Work on a shallow copy so the input element is left untouched.
        element = dict(element)
        element["utm_x"] = scale * element["longitude"]
        element["utm_y"] = (
            180.0 / np.pi * np.log(np.tan((np.pi / 4.0) + element["latitude"] * (np.pi / 180.0 / 2.0))) * scale
        )
        yield element
#         events_df["utm_y"] = events_df["utm_y"].fillna(0)
        
        
next(AddMercatorCoords().process({"longitude": 0, "latitude": 0}))

In [None]:
class FilterByRegion(beam.DoFn):
    """Keep only elements whose projected coordinates lie inside a rectangle."""

    def __init__(self, utm_x_range, utm_y_range):
        """Store the bounds of the accepted region.

        Args:
            utm_x_range: Sequence whose first two items bound ``utm_x``.
            utm_y_range: Sequence whose first two items bound ``utm_y``.
        """
        # Initialise the DoFn base class, as FilterRideStatus does.
        super(FilterByRegion, self).__init__()
        self.utm_x_range = utm_x_range
        self.utm_y_range = utm_y_range

    def process(self, element):
        """Yield ``element`` when both coordinates fall inside the region.

        The lower bound of each range is inclusive, the upper bound exclusive.
        """
        if (self.utm_x_range[0] <= element["utm_x"] < self.utm_x_range[1]) and (
            self.utm_y_range[0] <= element["utm_y"] < self.utm_y_range[1]
        ):
            yield element

In [None]:
class FilterRideStatus(beam.DoFn):
    """Pass through only elements whose ``ride_status`` is an accepted value."""

    def __init__(self, ride_status):
        """Remember the collection of accepted ``ride_status`` values."""
        self._ride_status = ride_status
        super(FilterRideStatus, self).__init__()

    def process(self, element):
        """Yield ``element`` when its ``ride_status`` is accepted, else nothing."""
        accepted = self._ride_status
        if element["ride_status"] not in accepted:
            return
        yield element

In [None]:
class AddWindowRange(beam.DoFn):
    """Annotate each element with the bounds of the window it belongs to."""

    def process(self, element, window=beam.DoFn.WindowParam):
        """Yield a copy of ``element`` with window start/end information added.

        Adds ``window_start_est`` / ``window_end_est`` (US/Eastern timestamps
        formatted as ``YYYY-MM-DDTHH:MM:SS.ffffff+/-HH:MM``) and
        ``window_start_milliseconds`` / ``window_end_milliseconds``.

        Args:
            element: Mapping to annotate.
            window: Window of the element; its ``start`` and ``end`` are read.
        """
        import pytz

        eastern = pytz.timezone("US/Eastern")

        def to_eastern_str(ts):
            """Format ``ts`` in US/Eastern time with its actual UTC offset."""
            local_dt = ts.to_utc_datetime().replace(tzinfo=pytz.UTC).astimezone(eastern)
            # ``%z`` renders the real offset without a colon (e.g. ``-0400``
            # or ``-0500``).  Insert the colon to keep the ``-04:00`` layout
            # instead of hard-coding an offset that is wrong while US/Eastern
            # is on standard time.
            offset = local_dt.strftime("%z")
            return local_dt.strftime("%Y-%m-%dT%H:%M:%S.%f") + offset[:3] + ":" + offset[3:]

        # Work on a shallow copy so the input element is left untouched.
        element = dict(element)
        element["window_start_est"] = to_eastern_str(window.start)
        element["window_end_est"] = to_eastern_str(window.end)
        element["window_start_milliseconds"] = int(window.start.micros / 1000)
        element["window_end_milliseconds"] = int(window.end.micros / 1000)
        yield element

> ***Issues***
> 
> - If an error occurs, we usually have to restart the kernel.
> - The Python Beam SDK does not support publishing messages with a specific timestamp, so we always have to infer the timestamp from one of the message attributes.

In [None]:
# Bounds used both by the "Filter to New York" pipeline step and as the axis
# ranges of the map figure.
MERCATOR_X_RANGE = (-8240000, -8220000)
MERCATOR_Y_RANGE = (4950000, 5000000)

In [None]:
# Cancel the pipeline result left over from a previous run of this cell.
try:
    pr.cancel()
except NameError:
    # `pr` has not been assigned yet, so there is nothing to cancel.
    pass

# Pub/Sub-based cache that the pipeline below writes its results to; later
# cells read from it.
temp = streambasedcache.PubSubBasedCache(
    "projects/{}/topics/{}-temp".format(project_id, NOTEBOOK_NAME),
    mode="overwrite",
    with_attributes=["timestamp_milliseconds"],
    timestamp_attribute="timestamp_milliseconds",
)

options.view_as(GoogleCloudOptions).job_name = "test6"
p = beam.Pipeline(options=options)

# Read rides from the subscription, add coordinates, keep those inside the
# Mercator ranges, window them, and emit one record per window holding the
# window end and the number of events collected for it.
out = (
    p
    | "Read"
    >> beam.io.ReadFromPubSub(subscription=subscription_path, with_attributes=True)
    | "Decode message" >> beam.ParDo(LoadPubSubMessage())
    | "Add Mercator coords" >> beam.ParDo(AddMercatorCoords())
    | "Filter to New York"
    >> beam.ParDo(FilterByRegion(MERCATOR_X_RANGE, MERCATOR_Y_RANGE))
    | "Window" >> beam.WindowInto(window.FixedWindows(1))
    | "Add window info" >> beam.ParDo(AddWindowRange())
    | "Combine" >> beam.CombineGlobally(combiners.ToListCombineFn()).without_defaults()
    | "Reduce"
    >> beam.Map(
        lambda e: {
            "timestamp_milliseconds": e[0]["window_end_milliseconds"],
            "number_of_events": len(e),
        }
    )
    | "Write" >> temp.writer()
)

# Start the pipeline; the result handle is kept so later cells can cancel it.
pr = p.run()

In [None]:
# Take at most 100 items from the cache reader and display them.
data_source  = temp.read(from_start=False, timeout=10, return_timestamp=True)
out = list(itertools.islice(data_source, 100))
out

In [None]:
out

In [None]:
# try:
#     print(pr.cancel())
# except NameError:
#     pass

In [None]:
# `random` is not among the notebook's top-level imports, so import it here
# to keep this cell runnable on its own.
import random

random.randint(0, 10)

In [None]:
# This cell uses `random` and several bokeh names that are only imported in a
# later cell (or not at all), so import them here to keep it runnable when the
# notebook is executed top to bottom.
import random
import threading

from bokeh.io import output_notebook, push_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

output_notebook()

fg = figure(plot_width=400, plot_height=400)

source = ColumnDataSource(
    data=dict(x=[1, 2, 3, 4, 5], y=[random.randint(0, 10) for _ in range(5)])
)

# add a circle renderer with a size, color, and alpha
fg.circle("x", "y", size=20, color="navy", alpha=0.5, source=source)

# show the results
handle = show(fg, notebook_handle=True)


def update_plot():
    """Replace the plotted y values with random ones once per second.

    NOTE(review): the loop has no exit condition, so the thread running it
    keeps going until the kernel is stopped.
    """
    while True:
        time.sleep(1)
        source.data = {"x": [1, 2, 3, 4, 5], "y": [random.randint(0, 10) for _ in range(5)]}
        #     print(source.data)
        push_notebook(handle=handle)


# Run the updater in the background so the cell returns immediately.
t2 = threading.Thread(target=update_plot)
t2.start()

In [None]:
# NOTE(review): no `t1` is defined in the visible cells - the thread started
# in the previous cell is bound to `t2`; confirm which thread this waits for.
t1.join()

In [None]:
# Minimal bokeh demo: show a scatter plot and push an update every second.
from bokeh.io import output_notebook, push_notebook, show
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

output_notebook()

fg = figure(plot_width=400, plot_height=400)

source = ColumnDataSource(data=dict(x=[1, 2, 3, 4, 5], y=[6, 7, 2, 4, 5]))

# add a circle renderer with a size, color, and alpha
fg.circle("x", "y", size=20, color="navy", alpha=0.5, source=source)

# show the results
handle = show(fg, notebook_handle=True)

# The loop has no exit condition; the cell runs until it is interrupted.
# NOTE(review): on every pass the "x" column doubles in length (the old values
# plus a shifted copy) while "y" keeps its length - confirm this is intended.
while True:
    time.sleep(1)
    source.data = {
        "x": source.data["x"] + [x + 1 for x in source.data["x"]],
        "y": [y + 1 for y in source.data["y"]],
    }
    #     print(source.data)
    push_notebook(handle=handle)

In [None]:
# Live map: show the figure once, then push every item read from the cache
# into it.  `create_map` is defined in a later cell, so that cell has to be
# run first.
output_notebook()

fg, source = create_map()

# Updates
handle = show(fg, notebook_handle=True)

# Indexed by `datetime.weekday()`, where Monday is 0.
days_of_week = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

data_source = temp.read(timeout=10, return_timestamp=True)
time.sleep(5)

# NOTE(review): the pipeline visible in this notebook writes records holding
# only `timestamp_milliseconds` and `number_of_events`, whereas this loop
# reads `utm_x`, `utm_y` and `ride_status` - confirm which pipeline feeds it.
for timestamp, message in data_source:
    data = message.data.data
    dt = datetime.utcfromtimestamp(timestamp).replace(
        tzinfo=pytz.UTC
    )  # .astimezone(pytz.timezone('US/Eastern'))
    # The figure title shows the (UTC) time of the item being displayed.
    dt_str = days_of_week[dt.weekday()] + " " + dt.strftime("%b %d %Y %I:%M:%S %f")
    fg.title.text = dt_str
    fg.title.align = "center"
    source.data = {
        "x": [d["utm_x"] for d in data],
        "y": [d["utm_y"] for d in data],
        "ride_status": [d["ride_status"] for d in data],
    }
#     print(source.data)
    push_notebook(handle=handle)
    time.sleep(0.1)

In [None]:
import bokeh
from bokeh.core.properties import value
from bokeh.io import output_notebook, push_notebook, show
from bokeh.layouts import row
from bokeh.models import ColumnDataSource, Label, LabelSet, Legend, LegendItem, Range1d
from bokeh.models.annotations import Title
from bokeh.plotting import figure, show
from bokeh.tile_providers import Vendors, get_provider
from bokeh.transform import factor_cmap

# Tell Colab users up front that the interactive plot is not supported there.
if GOOGLE_COLAB:
    print("Interactive plot does not work on colab yet!")


def create_map():
    """Build the taxi-ride map figure and the data source that feeds it.

    Returns:
        A ``(figure, source)`` tuple.  ``source`` is an initially empty
        ``ColumnDataSource`` with ``x``, ``y`` and ``ride_status`` columns.
    """
    statuses = ["pickup", "enroute", "dropoff"]

    # Three colours picked from the 20-entry Category20b palette, one per
    # ride status.
    palette = bokeh.palettes.d3["Category20b"][20]
    status_colors = [palette[1], palette[9], palette[-2]]

    points = ColumnDataSource(data=dict(x=[], y=[], ride_status=[]))

    # Tile layer in the background, framed by the Mercator ranges.
    plot = figure(
        x_range=MERCATOR_X_RANGE,
        y_range=MERCATOR_Y_RANGE,
        x_axis_type="mercator",
        y_axis_type="mercator",
        title_location="above",
        plot_height=600,
    )
    plot.add_tile(get_provider(Vendors.CARTODBPOSITRON))

    # One small dot per ride event, coloured by its ride status.
    plot.circle(
        x="x",
        y="y",
        source=points,
        size=2,
        color=factor_cmap("ride_status", status_colors, statuses),
        fill_alpha=0.8,
    )

    # Empty glyphs that exist only to give the legend one entry per status.
    legend_items = [
        (status, [plot.circle(x=[], y=[], color=color)])
        for status, color in zip(statuses, status_colors)
    ]
    plot.add_layout(Legend(items=legend_items))
    return plot, points

In [None]:
# Take at most 100 items from the cache, then cancel the running pipeline
# whether or not the read succeeded.
try:
    data_source  = temp.read(timeout=10, return_timestamp=True)
#     time.sleep(3)
    out = list(itertools.islice(data_source, 100))
finally:
    pr.cancel()
out

In [None]:
# Variant of the earlier pipeline without the Mercator and region-filter steps.
# NOTE(review): this applies the transforms to the same pipeline object `p`
# and re-uses the step labels of the earlier cell - confirm this is intended.
out = (
    p
    | "Read"
    >> beam.io.ReadFromPubSub(subscription=subscription_path, with_attributes=True)
    | "Decode message" >> beam.ParDo(LoadPubSubMessage())
    | "Window" >> beam.WindowInto(window.FixedWindows(1))
    | "Add window info" >> beam.ParDo(AddWindowRange())
    | "Combine" >> beam.CombineGlobally(combiners.ToListCombineFn()).without_defaults()
    | "Reduce"
    >> beam.Map(
        lambda e: {
            "timestamp_milliseconds": e[0]["window_end_milliseconds"],
            "number_of_events": len(e),
        }
    )
    | "Write" >> temp.writer()
)

In [None]:
    # NOTE(review): this cell is a fragment of a pipeline expression (it
    # starts with a `|` continuation) and cannot run on its own.  It groups
    # elements by window end and counts the elements of each group.
    #     | "Filter to pickups and dropoff"
    #     >> beam.ParDo(FilterRideStatus(["pickup", "dropoff"]))
    | "Pair with end of window" >> beam.Map(lambda e: (e["window_end_milliseconds"], e))
    | "Group by end of window" >> beam.GroupByKey()
    | "Reduce"
    >> beam.Map(
        lambda e: {"timestamp_milliseconds": e[0], "number_of_events": len(e[1])}
    )


In [None]:
# [Timestamp(1562888333.275000),
#  Timestamp(1562888333.279000),
#  Timestamp(1562888333.286000),
#  Timestamp(1562888333.288000),
#  Timestamp(1562888333.294000)]

In [None]:
# try:
#     pr.cancel()
# except NameError:
#     pass

### Interactive dashboard

In [None]:
import bokeh
from bokeh.core.properties import value
from bokeh.io import output_notebook, push_notebook, show
from bokeh.layouts import row
from bokeh.models import ColumnDataSource, Label, LabelSet, Legend, LegendItem, Range1d
from bokeh.models.annotations import Title
from bokeh.plotting import figure, show
from bokeh.tile_providers import Vendors, get_provider
from bokeh.transform import factor_cmap

# Tell Colab users up front that the interactive plot is not supported there.
if GOOGLE_COLAB:
    print("Interactive plot does not work on colab yet!")


def create_map():
    """Build the taxi-ride map figure and the data source that feeds it.

    Returns:
        A ``(figure, source)`` tuple.  ``source`` is an initially empty
        ``ColumnDataSource`` with ``x``, ``y`` and ``ride_status`` columns.
    """
    statuses = ["pickup", "enroute", "dropoff"]

    # Three colours picked from the 20-entry Category20b palette, one per
    # ride status.
    palette = bokeh.palettes.d3["Category20b"][20]
    status_colors = [palette[1], palette[9], palette[-2]]

    points = ColumnDataSource(data=dict(x=[], y=[], ride_status=[]))

    # Tile layer in the background, framed by the Mercator ranges.
    plot = figure(
        x_range=MERCATOR_X_RANGE,
        y_range=MERCATOR_Y_RANGE,
        x_axis_type="mercator",
        y_axis_type="mercator",
        title_location="above",
        plot_height=600,
    )
    plot.add_tile(get_provider(Vendors.CARTODBPOSITRON))

    # One small dot per ride event, coloured by its ride status.
    plot.circle(
        x="x",
        y="y",
        source=points,
        size=2,
        color=factor_cmap("ride_status", status_colors, statuses),
        fill_alpha=0.8,
    )

    # Empty glyphs that exist only to give the legend one entry per status.
    legend_items = [
        (status, [plot.circle(x=[], y=[], color=color)])
        for status, color in zip(statuses, status_colors)
    ]
    plot.add_layout(Legend(items=legend_items))
    return plot, points

In [None]:
# Live map: show the figure once, then push every item read from the cache
# into it.
output_notebook()

fg, source = create_map()

# Updates
handle = show(fg, notebook_handle=True)

# Indexed by `datetime.weekday()`, where Monday is 0.
days_of_week = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]

data_source = temp.read(timeout=10, return_timestamp=True)
time.sleep(5)

# NOTE(review): the pipeline visible in this notebook writes records holding
# only `timestamp_milliseconds` and `number_of_events`, whereas this loop
# reads `utm_x`, `utm_y` and `ride_status` - confirm which pipeline feeds it.
for timestamp, message in data_source:
    data = message.data.data
    dt = datetime.utcfromtimestamp(timestamp).replace(
        tzinfo=pytz.UTC
    )  # .astimezone(pytz.timezone('US/Eastern'))
    # The figure title shows the (UTC) time of the item being displayed.
    dt_str = days_of_week[dt.weekday()] + " " + dt.strftime("%b %d %Y %I:%M:%S %f")
    fg.title.text = dt_str
    fg.title.align = "center"
    source.data = {
        "x": [d["utm_x"] for d in data],
        "y": [d["utm_y"] for d in data],
        "ride_status": [d["ride_status"] for d in data],
    }
#     print(source.data)
    push_notebook(handle=handle)
    time.sleep(0.1)