# Install packages

In [1]:
# !pip install apache-beam[gcp] google-cloud-pubsub bokeh

In [2]:
# !pip install -e "git+https://github.com/ostrokach/beam.git@develop#egg=apache-beam&subdirectory=sdks/python"

# Imports

Packages available by default:

In [None]:
import atexit
import os
import os.path as op
import string
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
from absl import flags
from google.cloud import bigquery, pubsub_v1
# from google.colab import auth

Packages that have to be installed:

In [2]:
import apache_beam as beam
import avro.schema
from apache_beam.runners.interactive import interactive_runner
from apache_beam.runners.direct import direct_runner

In [3]:
beam.__version__

2.14.0.dev

# Parameters

In [4]:
# GCP project used by this notebook; it is passed to the Dataflow pipeline
# options further down (`project=project_id`).
project_id = 'strokach-playground'

# Export the project through the environment as well.
# NOTE(review): presumably read by the Google Cloud client libraries for
# their default project -- confirm.
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
# IPython shell escape: `{project_id}` is interpolated from the Python variable
# above, so the gcloud CLI is pointed at the same project.
!gcloud config set project {project_id}

Updated property [core/project].


In [5]:
# auth.authenticate_user()
# print('Authenticated')

In [6]:
# Name of this notebook; also used as the name of its working directory.
NOTEBOOK_NAME = "bokeh_examples"
# Absolute path derived from the name. Because NOTEBOOK_NAME is relative, it is
# resolved against the kernel's current working directory.
NOTEBOOK_PATH = op.realpath(NOTEBOOK_NAME)

# Display the resolved path as the cell output.
NOTEBOOK_PATH

/usr/local/google/home/strokach/workspace/notebooks/bokeh_examples

# Pipelines

## taxirides-realtime

In [7]:
# Runner that submits the pipeline to Google Cloud Dataflow.
underlying_runner = beam.runners.dataflow.DataflowRunner()

# Options for the streaming relay job defined in the next cell.
pipeline_options = beam.pipeline.PipelineOptions(
    project=project_id,
    temp_location="gs://strokach/temp",
    job_name="debug1",
    streaming=True,
    # Locally built SDK tarball shipped with the job.
    # NOTE(review): presumably produced by running `python setup.py sdist`
    # inside beam/sdks/python -- the path is machine-specific, confirm before reuse.
    sdk_location="/usr/local/google/home/strokach/workspace/beam/sdks/python/dist/apache-beam-2.14.0.dev0.tar.gz",
)

# Alternative kept for reference: wrap the Dataflow runner in the interactive runner.
# runner = interactive_runner.InteractiveRunner(underlying_runner=underlying_runner, render_option='graph')
# The runner actually used by the pipeline cell below.
runner = underlying_runner

In [8]:
# Relay the public taxi-ride Pub/Sub stream into a topic of our own project so
# that the local pipelines below can read from it.
# The destination topic is derived from `project_id` (Parameters section), so
# changing the project there retargets this job too; for the current project
# the resulting string is identical to the previously hard-coded one.
taxirides_wf = (
    beam.Pipeline(runner=runner, options=pipeline_options)
    | "Read" >> beam.io.ReadFromPubSub(topic="projects/pubsub-public-data/topics/taxirides-realtime")
    | "Write" >> beam.io.WriteToPubSub(
        topic="projects/{}/topics/taxirides-realtime".format(project_id)
    )
)

In [9]:
taxirides_r = taxirides_wf.pipeline.run()



# Dashboards

In [24]:
from bokeh.io import push_notebook, show, output_notebook
from bokeh.layouts import row
# ColumnDataSource is used by the streaming cells further down but was never
# imported, which raises NameError on a fresh kernel.
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure

# Configure Bokeh to render plots inline in the notebook; required before
# `show(..., notebook_handle=True)` is called in the cells below.
output_notebook()

In [9]:
# Two small demo figures; `r1` / `r2` are the glyph renderers returned by
# `circle`, kept so that a later cell can restyle them (`r1.glyph.fill_color`).
p1 = figure(plot_width=250, plot_height=250)
r1 = p1.circle([1,2,3], [4,5,6], size=20)

p2 = figure(plot_width=250, plot_height=250)
r2 = p2.circle([1,2,3], [4,5,6], size=20)

# Show both figures side by side. `notebook_handle=True` makes `show` return a
# handle, which later cells pass to `push_notebook(handle=t)`.
# Note that `t` is rebound further down when the live plot is created.
t = show(row(p1, p2), notebook_handle=True)

In [10]:
t

In [11]:
r1.glyph.fill_color = "white"
push_notebook(handle=t)

In [44]:
source = ColumnDataSource(data=dict(foo=[], bar=[]))

In [45]:
# has new, identical-length updates for all columns in source
new_data = {
    'foo' : [10, 20],
    'bar' : [100, 200],
}

source.stream(new_data)

In [47]:
# has new, identical-length updates for all columns in source
new_data = {
    'foo' : [15],
    'bar' : [150],
}

source.stream(new_data)
push_notebook(handle=t)

In [127]:
# Data source behind the live plot. It starts empty; `update_plot` appends one
# (passenger_count, timepoint) point per call via `source.stream(...)`.
source = ColumnDataSource(data=dict(passenger_count=[], timepoint=[]))

# Line chart of passenger count against timepoint, backed by `source`.
p = figure()
p.line(x='timepoint', y='passenger_count', source=source)
# Rebinds `t`: from here on `push_notebook(handle=t)` targets this plot.
t = show(p, notebook_handle=True)

### Explore

In [78]:
def tee_to_output(element):
    """Record ``element`` in the notebook-level ``output`` list and pass it on.

    Intended for use with ``beam.Map`` as a pass-through tap: the element is
    appended to the global ``output`` list (reset by the pipeline cells with
    ``output = []``) and then returned unchanged.

    Args:
        element: Any pipeline element.

    Returns:
        The very same ``element`` object.
    """
    collected = output  # module-level list shared with the notebook
    collected.append(element)
    return element

In [14]:
def load_json(element):
    """Decode one JSON-object message into a dict using native ``str`` text.

    On Python 2, ``json.loads`` yields ``unicode`` keys and string values;
    those are converted to UTF-8 encoded ``str`` so that non-ASCII text no
    longer raises ``UnicodeEncodeError``. On Python 3 the decoded text is
    already ``str`` and is passed through untouched, so the function no longer
    fails with ``NameError`` on the missing ``unicode`` builtin.

    Only top-level keys and values are converted; nested containers are left
    exactly as ``json.loads`` produced them.

    Args:
        element: JSON document (text or bytes) whose top level is an object.

    Returns:
        A new ``dict`` with the top-level keys and values of the document.

    Raises:
        ValueError: If ``element`` is not valid JSON.
        AttributeError: If the top-level JSON value is not an object.
    """
    try:
        text_type = unicode  # Python 2 only
    except NameError:
        text_type = None  # Python 3: `str` is already the text type

    def to_native(value):
        # Only Python 2 text needs converting; everything else passes through.
        if text_type is not None and isinstance(value, text_type):
            return value.encode("utf-8")
        return value

    return {to_native(k): to_native(v) for k, v in json.loads(element).items()}

In [15]:
# Folder next to the notebook that receives the text output of the pipeline.
output_folder = op.join(NOTEBOOK_PATH, "pipeline-output")

try:
    os.makedirs(output_folder)
except OSError:
    # An already existing directory is fine (re-running the cell); any other
    # failure (permissions, a regular file in the way, ...) must not be hidden.
    if not op.isdir(output_folder):
        raise

In [16]:
# Start from an empty list so that `output` only holds elements captured by
# this pipeline run (filled by `tee_to_output`).
output = []


# Local streaming pipeline: read the relayed taxi rides, decode the JSON
# payload, capture every record in `output`, and dump the records as text.
data = (
    beam.Pipeline(runner="direct", options=beam.pipeline.PipelineOptions(streaming=True))
    | "Read" >> beam.io.ReadFromPubSub(
        # Built from `project_id` so the notebook follows the Parameters cell.
        topic="projects/{}/topics/taxirides-realtime".format(project_id),
        timestamp_attribute="ts"
    )
    | "Load JSON" >> beam.Map(load_json)
    | "TeeToOutput" >> beam.Map(tee_to_output)
    # WriteToText takes a file path *prefix*, not a directory. Passing the
    # folder itself produced shards named "pipeline-output-..." beside the
    # folder; join a file stem so the shards land inside `output_folder`.
    | "Write" >> beam.io.WriteToText(op.join(output_folder, "taxirides"))
)

In [17]:
result = data.pipeline.run()

In [18]:
result.state

RUNNING

In [75]:
result.cancel()

In [61]:
len(output)

31479

In [71]:
output[0]

{latitude: 40.701100000000004,
 longitude: -73.9873,
 meter_increment: 0.03368421,
 meter_reading: 10.374737,
 passenger_count: 2,
 point_idx: 308,
 ride_id: ee70d68a-c86b-45cf-9a0a-ba8df7a85c06,
 ride_status: enroute,
 timestamp: 2019-05-21T16:27:28.0828-04:00}

In [51]:
from datetime import datetime

In [55]:
ts = output[0]["timestamp"]
ts

2019-05-21T16:41:00.95953-04:00

In [67]:
date_time_obj = datetime.strptime(ts, '%Y-%m-%dT%H:%M:%S.%f-04:00')


In [111]:
class AverageFn(beam.CombineFn):
    """Combine taxi-ride records into ``(mean passenger count, mean timepoint)``.

    Each input is a dict with a numeric ``"passenger_count"`` and a string
    ``"timestamp"``. The accumulator is the tuple
    ``(passenger_count_sum, timepoint_sum, count)``, where timepoints are
    expressed in seconds since the Unix epoch.
    """

    @staticmethod
    def _to_epoch_seconds(raw):
        """Convert a timestamp string to seconds since the Unix epoch.

        Accepted shapes are ``YYYY-MM-DD HH:MM:SS[.ffffff]`` and the ISO-8601
        variant found in the taxi-ride records, e.g.
        ``2019-05-21T16:27:28.0828-04:00`` (``T`` separator and a trailing
        ``+HH:MM`` / ``-HH:MM`` offset or ``Z``).

        Raises:
            ValueError: If ``raw`` matches none of the accepted shapes.
        """
        text = raw
        offset_seconds = None  # None means "no offset given" (naive timestamp)
        if text.endswith("Z"):
            text, offset_seconds = text[:-1], 0
        elif len(text) > 6 and text[-6] in "+-" and text[-3] == ":":
            # Trailing "+HH:MM" / "-HH:MM"; parsed by hand because Python 2's
            # strptime has no usable "%z" directive.
            hours, minutes = int(text[-5:-3]), int(text[-2:])
            sign = -1 if text[-6] == "-" else 1
            offset_seconds = sign * (hours * 3600 + minutes * 60)
            text = text[:-6]
        text = text.replace("T", " ", 1)

        try:
            parsed = datetime.strptime(text, '%Y-%m-%d %H:%M:%S.%f')
        except ValueError:
            parsed = datetime.strptime(text, '%Y-%m-%d %H:%M:%S')

        if offset_seconds is None:
            # Legacy path, unchanged: a naive timestamp is interpreted in the
            # machine's local timezone through the platform-specific "%s"
            # strftime directive (whole seconds only).
            return float(parsed.strftime("%s"))
        # Offset-aware path: wall-clock time minus its UTC offset gives UTC.
        return (parsed - datetime(1970, 1, 1)).total_seconds() - offset_seconds

    def create_accumulator(self):
        """Return the empty accumulator ``(0.0, 0.0, 0)``."""
        return (0.0, 0.0, 0)

    def add_input(self, sum_count, input):
        """Fold one record into the accumulator and return the new accumulator."""
        (passenger_count_sum, timepoint_sum, count) = sum_count

        # Parse first, so a malformed record fails before anything is summed.
        timepoint = self._to_epoch_seconds(input["timestamp"])

        passenger_count_sum += input["passenger_count"]
        timepoint_sum += timepoint
        count += 1
        return passenger_count_sum, timepoint_sum, count

    def merge_accumulators(self, accumulators):
        """Sum accumulators component-wise; an empty iterable yields the empty accumulator."""
        passenger_count_total, timepoint_total, count_total = 0.0, 0.0, 0
        for passenger_count_sum, timepoint_sum, count in accumulators:
            passenger_count_total += passenger_count_sum
            timepoint_total += timepoint_sum
            count_total += count
        return passenger_count_total, timepoint_total, count_total

    def extract_output(self, sum_count):
        """Return ``(passenger_count_avg, timepoint_avg)``; both are NaN when no input was seen."""
        (passenger_count_sum, timepoint_sum, count) = sum_count
        passenger_count_avg = passenger_count_sum / count if count else float("NaN")
        timepoint_avg = timepoint_sum / count if count else float("NaN")
        return passenger_count_avg, timepoint_avg

In [136]:
def update_plot(element):
    """Append one aggregated point to the live plot and refresh the notebook.

    Relies on the notebook-level ``source`` (the plot's data source) and ``t``
    (the handle returned by ``show(..., notebook_handle=True)``).

    Args:
        element: Indexable pair ``(passenger_count, timepoint)``, as produced
            by ``AverageFn.extract_output``.

    Returns:
        None.
    """
    passenger_count_point = element[0]
    timepoint_point = element[1]
    source.stream({
        'passenger_count': [passenger_count_point],
        'timepoint': [timepoint_point],
    })
    # Send the updated data to the already-rendered plot.
    push_notebook(handle=t)

In [137]:
# Start from an empty list so that `output` only holds the window averages
# captured by this pipeline run (filled by `tee_to_output`).
output = []


# Local streaming pipeline feeding the live plot: decode the taxi rides, group
# them into 10-second fixed windows, average each window with AverageFn, then
# capture the result in `output` and push it to the plot.
data = (
    beam.Pipeline(runner="direct", options=beam.pipeline.PipelineOptions(streaming=True))
    | "Read" >> beam.io.ReadFromPubSub(
        # Built from `project_id` so the notebook follows the Parameters cell;
        # for the current project this is the same topic string as before.
        topic="projects/{}/topics/taxirides-realtime".format(project_id),
        timestamp_attribute="ts"
    )
    | "Load JSON" >> beam.Map(load_json)
    | "Window Into" >> beam.WindowInto(beam.window.FixedWindows(10))
    | "Gather Statistics" >> beam.CombineGlobally(AverageFn()).without_defaults()
    | "TeeToOutput" >> beam.Map(tee_to_output)
    | "Update plot" >> beam.Map(update_plot)
)

In [138]:
data_r = data.pipeline.run()

In [139]:
data_r.state

RUNNING

In [135]:
data_r.cancel()

In [142]:
len(output)

2

ERROR:root:Exception at bundle <apache_beam.runners.direct.bundle_factory._Bundle object at 0x7fc3b994af50>, due to an exception.
 Traceback (most recent call last):
  File "/usr/local/google/home/strokach/workspace/beam/sdks/python/apache_beam/runners/direct/executor.py", line 343, in call
    finish_state)
  File "/usr/local/google/home/strokach/workspace/beam/sdks/python/apache_beam/runners/direct/executor.py", line 380, in attempt_call
    evaluator.process_element(value)
  File "/usr/local/google/home/strokach/workspace/beam/sdks/python/apache_beam/runners/direct/transform_evaluator.py", line 633, in process_element
    self.runner.process(element)
  File "/usr/local/google/home/strokach/workspace/beam/sdks/python/apache_beam/runners/common.py", line 765, in process
    self._reraise_augmented(exn)
  File "/usr/local/google/home/strokach/workspace/beam/sdks/python/apache_beam/runners/common.py", line 832, in _reraise_augmented
    raise_with_traceback(new_exn)
  File "/usr/local/g

ERROR:root:Giving up after 4 attempts.
IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [124]:
output

[(1.2, 1558458525.8),
 (1.7, 1558458526.3),
 (1.3, 1558458525.5),
 (1.1, 1558458526.7),
 (1.3, 1558458525.9),
 (2.2, 1558458526.4),
 (1.5, 1558458526.4),
 (1.9, 1558458525.8),
 (1.4, 1558458524.1),
 (1.3, 1558458526.0),
 (1.9, 1558458526.5),
 (1.0, 1558458526.5),
 (1.6, 1558458526.6),
 (1.8, 1558458526.2),
 (1.4, 1558458526.6),
 (1.8, 1558458525.9),
 (2.3, 1558458526.6),
 (2.8, 1558458526.7),
 (1.4, 1558458526.5),
 (1.6, 1558458526.3),
 (1.5, 1558458526.5),
 (2.1, 1558458526.8),
 (2.4, 1558458526.2),
 (2.0, 1558458526.9),
 (1.4, 1558458525.6),
 (1.6, 1558458526.8),
 (2.6, 1558458526.5)]