# Install packages

In [1]:
# !pip install apache-beam[gcp] google-cloud-pubsub bokeh

In [2]:
# !pip install -e "git+https://github.com/ostrokach/beam.git@develop#egg=apache-beam&subdirectory=sdks/python"

# Imports

Packages available by default:

In [3]:
import atexit
import os
import os.path as op
import string
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
from absl import flags
from google.cloud import bigquery, pubsub_v1
# from google.colab import auth

Packages that have to be installed:

In [4]:
import apache_beam as beam
import avro.schema
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.direct import direct_runner

In [5]:
beam.__version__

2.14.0.dev

# Parameters

In [6]:
project_id = 'strokach-playground'

os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
!gcloud config set project {project_id}

Updated property [core/project].


In [7]:
# auth.authenticate_user()
# print('Authenticated')

In [8]:
NOTEBOOK_NAME = "bokeh_examples"
NOTEBOOK_PATH = op.realpath(NOTEBOOK_NAME)

NOTEBOOK_PATH

/home/strokach/workspace/beam-notebooks/bokeh_examples

# Pipelines

## taxirides-realtime

In [9]:
underlying_runner = beam.runners.dataflow.DataflowRunner()

pipeline_options = beam.pipeline.PipelineOptions(
    project=project_id,
    temp_location="gs://strokach/temp",
    job_name="taxirides-realtime",
    streaming=True,
    # $ python setup.py sdist
    sdk_location=op.expanduser("~/workspace/beam/sdks/python/dist/apache-beam-2.14.0.dev0.tar.gz"),
)

# runner = interactive_runner.InteractiveRunner(underlying_runner=underlying_runner, render_option='graph')
runner = underlying_runner

In [10]:
taxirides_wf = (
    beam.Pipeline(runner=runner, options=pipeline_options)
    | "Read" >> beam.io.ReadFromPubSub(topic="projects/pubsub-public-data/topics/taxirides-realtime")
    | "Write" >> beam.io.WriteToPubSub(topic="projects/strokach-playground/topics/taxirides-realtime")
)

In [11]:
# taxirides_r = taxirides_wf.pipeline.run()
# atexit.register(taxirides_r.cancel)

# Dashboards

## Imports

In [12]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
output_notebook()

## Basic test of interactivity

In [13]:
p1 = figure(plot_width=250, plot_height=250)
r1 = p1.circle([1,2,3], [4,5,6], size=20)

p2 = figure(plot_width=250, plot_height=250)
r2 = p2.circle([1,2,3], [4,5,6], size=20)

t = show(row(p1, p2), notebook_handle=True)

In [14]:
r1.glyph.fill_color = "white"
push_notebook(handle=t)

## Map taxi trips

In [15]:
def geographic_to_utm(element):
    from pyproj import Proj, transform

    element["x"], element["y"] = transform(
        Proj(init='epsg:4326'),
        Proj(init='epsg:3857'),
        element["longitude"],
        element["latitude"],
    )

    return element

In [16]:
def xxx(element):
    from pyproj import Proj, transform

    x, y = transform(
        Proj(init='epsg:4326'),
        Proj(init='epsg:3857'),
        element["longitude"],
        element["latitude"],
    )

    return x, y

In [17]:
x_min, y_min = xxx({"longitude": -74.747, "latitude": 40.699})
x_max, y_max = xxx({"longitude": -73.969, "latitude": 40.720})

In [18]:
from bokeh.plotting import figure, show
from bokeh.tile_providers import get_provider, Vendors

# range bounds supplied in web mercator coordinates
p = figure(x_range=(x_min, x_max), y_range=(y_min, y_max),
           x_axis_type="mercator", y_axis_type="mercator")
p.add_tile(get_provider(Vendors.CARTODBPOSITRON))

source = ColumnDataSource(data=dict(x=[], y=[]))

p.circle(x="x", y="y", size=15, fill_color="blue", fill_alpha=0.8, source=source)

t = show(p, notebook_handle=True)

In [19]:
# source.stream({"x": [0,1,2], "y": [1,2,3]})
# push_notebook(handle=t)

### Explore

In [20]:
def tee_to_output(element):
    output.append(element)
    return element

In [21]:
def load_json(element):
    ju = json.loads(element)
    js = {}
    for k, v in ju.items():
        if isinstance(k, unicode):
            k = str(k)
        if isinstance(v, unicode):
            v = str(v)
        js[k] = v
    return js

In [22]:
output_folder = op.join(NOTEBOOK_PATH, "pipeline-output") 

try:
    os.makedirs(output_folder)
except OSError:
    pass

In [23]:
# ts = output[0]["timestamp"]
# ts

In [24]:
# date_time_obj = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%f-04:00')

In [25]:
class AverageFn(beam.CombineFn):

    def create_accumulator(self):
        return (0.0, 0.0, 0)

    def add_input(self, sum_count, input):
        from datetime import datetime

        (passenger_count_sum, timepoint_sum, count) = sum_count

        try:
            timestamp = datetime.strptime(input["timestamp"], '%Y-%m-%dT%H:%M:%S.%f-04:00')
        except ValueError:
            timestamp = datetime.strptime(input["timestamp"], '%Y-%m-%dT%H:%M:%S-04:00')

        passenger_count_sum += input["passenger_count"]
        timepoint_sum += float(timestamp.strftime("%s"))
        count += 1
        return passenger_count_sum, timepoint_sum, count
        
    def merge_accumulators(self, accumulators):
        passenger_count_sums, timepoint_sums, counts = zip(*accumulators)
        return sum(passenger_count_sums), sum(timepoint_sums), sum(counts)

    def extract_output(self, sum_count):
        (passenger_count_sum, timepoint_sum, count) = sum_count
        passenger_count_avg = passenger_count_sum / count if count else float("NaN")
        timepoint_avg = timepoint_sum / count if count else float("NaN")
        return passenger_count_avg, timepoint_avg

In [26]:
def filter_ny(element):
    if ((-74.747 < element["longitude"] < -73.969) and
        (40.699 < element["latitude"] < 40.720)):
        return [element]
    else:
        return []

In [27]:
def update_plot(values):
    x_lst = []
    y_lst = []
    count = 0
    for element in values:
        try:
            x_lst.append(element["x"])
            y_lst.append(element["y"])
            count += 1
            if count > 10:
                break
        except (KeyError, TypeError):
            output.append(element)
    source.stream({"x": x_lst, "y": y_lst})
    push_notebook(handle=t)

In [28]:
def increment_counter(element):
    global counter
    counter += 1
    return element

In [29]:
output = []
counter = 0

data = (
    beam.Pipeline(runner="direct", options=beam.pipeline.PipelineOptions(streaming=True))
    | "Read" >> beam.io.ReadFromPubSub(
        topic="projects/strokach-playground/topics/taxirides-realtime",
#         subscription="projects/strokach-playground/subscriptions/beam_1558483403_228577dd",
        timestamp_attribute="ts"
    )
    | "Load JSON" >> beam.Map(load_json)
    | "Window Into" >> beam.WindowInto(beam.window.FixedWindows(10))
    | "Filter non-NY" >> beam.FlatMap(filter_ny)
    | "Add UTM coords" >> beam.Map(geographic_to_utm)
#     | "Gather Statistics" >> beam.CombineGlobally(AverageFn()).without_defaults()
#     | "TeeToOutput" >> beam.Map(tee_to_output)
    | "Update plot" >> beam.CombineGlobally(update_plot).without_defaults()
    | "Update counter" >> beam.Map(increment_counter)
)

In [30]:
result = data.pipeline.run()
# atexit.register(result.cancel)

In [31]:
# result.cancel()

In [32]:
result.state

RUNNING

In [33]:
# result.cancel()

```python
Unhandled exception in thread started by 
Traceback (most recent call last):
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/threading.py", line 774, in __bootstrap
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/threading.py", line 814, in __bootstrap_inner
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/ipykernel/iostream.py", line 400, in write
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/ipykernel/iostream.py", line 203, in schedule
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/ipykernel/iostream.py", line 101, in _event_pipe
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/zmq/sugar/context.py", line 146, in socket
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/zmq/sugar/socket.py", line 59, in __init__
  File "zmq/backend/cython/socket.pyx", line 328, in zmq.backend.cython.socket.Socket.__init__
ZMQError: Too many open files
Unhandled exception in thread started by 
Traceback (most recent call last):
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/threading.py", line 774, in __bootstrap
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/threading.py", line 814, in __bootstrap_inner
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/ipykernel/iostream.py", line 400, in write
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/ipykernel/iostream.py", line 203, in schedule
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/ipykernel/iostream.py", line 101, in _event_pipe
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/zmq/sugar/context.py", line 146, in socket
  File "/home/strokach/miniconda3/envs/beam-dev/lib/python2.7/site-packages/zmq/sugar/socket.py", line 59, in __init__
  File "zmq/backend/cython/socket.pyx", line 328, in zmq.backend.cython.socket.Socket.__init__
ZMQError: Too many open files
```

In [34]:
output[0]['y']

IndexError: list index out of range

ERROR:root:Exception at bundle <apache_beam.runners.direct.bundle_factory._Bundle object at 0x7f4ff649de50>, due to an exception.
 Traceback (most recent call last):
  File "/home/strokach/workspace/beam/sdks/python/apache_beam/runners/direct/executor.py", line 343, in call
    finish_state)
  File "/home/strokach/workspace/beam/sdks/python/apache_beam/runners/direct/executor.py", line 380, in attempt_call
    evaluator.process_element(value)
  File "/home/strokach/workspace/beam/sdks/python/apache_beam/runners/direct/transform_evaluator.py", line 633, in process_element
    self.runner.process(element)
  File "/home/strokach/workspace/beam/sdks/python/apache_beam/runners/common.py", line 765, in process
    self._reraise_augmented(exn)
  File "/home/strokach/workspace/beam/sdks/python/apache_beam/runners/common.py", line 832, in _reraise_augmented
    raise_with_traceback(new_exn)
  File "/home/strokach/workspace/beam/sdks/python/apache_beam/runners/common.py", line 763, in process
  

In [None]:
counter

In [None]:
output

In [None]:
from bokeh.plotting import figure, show
from bokeh.tile_providers import get_provider, Vendors

# range bounds supplied in web mercator coordinates
p = figure(x_range=(-1000000, 6000000), y_range=(-1000000, 7000000),
           x_axis_type="mercator", y_axis_type="mercator")
p.add_tile(get_provider(Vendors.CARTODBPOSITRON))

source = ColumnDataSource(
    data=dict(lat=[ 30.29,  30.20,  30.29],
              lon=[-97.70, -97.74, -97.78])
)

p.circle(x="lon", y="lat", size=15, fill_color="blue", fill_alpha=0.8, source=source)

show(p)

In [None]:
counter