## Install dependencies (Colab only)

In [None]:
# Detect whether the notebook runs on Google Colab by trying to import the
# `google.colab` package. The resulting flag gates the Colab-only setup cells.
try:
    import google.colab
    GOOGLE_COLAB = True
except ImportError:
    GOOGLE_COLAB = False

In [None]:
if GOOGLE_COLAB:
    # System package for Snappy; presumably needed to build `python-snappy`,
    # which a later cell installs with pip.
    !sudo apt-get -yqq install libsnappy-dev

In [None]:
if GOOGLE_COLAB:
    # Install python-snappy, bokeh and Apache Beam (with the `gcp` extra) built
    # from a pinned commit of the ostrokach/beam fork.
    !pip install -q \
        python-snappy bokeh \
        "git+https://github.com/ostrokach/beam.git@e2aa065f2717cfbf0490514cf164b69c0beb0fab#egg=apache_beam[gcp]&subdirectory=sdks/python"

In [None]:
if GOOGLE_COLAB:
    # Authenticate the Colab user; this call is only made when running on Colab.
    from google.colab import auth
    auth.authenticate_user()

In [None]:
# @title Google Cloud Project Info { display-mode: "form" }
import os
# Export the project id and temp location on Colab, or whenever no Pub/Sub
# emulator is configured (PUBSUB_EMULATOR_HOST unset). Otherwise the variables
# are left as they are, and the "Parameters" section falls back to
# "test-project" and a local temporary directory when they are missing.
# NOTE(review): these assignments overwrite any BEAM_PROJECT_ID /
# BEAM_TEMP_LOCATION value already present in the environment.
if GOOGLE_COLAB or "PUBSUB_EMULATOR_HOST" not in os.environ:
    os.environ["BEAM_PROJECT_ID"] = "strokach-playground"  # @param {type:"string"}
    os.environ["BEAM_TEMP_LOCATION"] = "gs://strokach/dataflow_temp"  # @param {type:"string"}

## Imports

In [None]:
import atexit
import contextlib
import gc
import itertools
import json
import logging
import math
import os
import sys
import tempfile
import threading
import time
import uuid
from contextlib import ExitStack
from datetime import datetime

import apache_beam as beam
import bokeh
import pytz
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions
from apache_beam.runners.direct.direct_runner import DirectRunner
from apache_beam.runners.interactive.cache_manager import CacheManager
from apache_beam.runners.interactive.caching import pubsub_utils, streambasedcache
from apache_beam.runners.interactive.datasets import streaming_dataset
from apache_beam.runners.interactive.display import data_server
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from bokeh import plotting
from bokeh.io import output_notebook, push_notebook, show
from bokeh.models import DatetimeTickFormatter
from google.cloud import pubsub

In [None]:
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

## Boilerplate

In [None]:
# Keep only the program name in sys.argv, dropping the arguments the kernel was
# started with — presumably so Beam's option parsing does not pick them up.
sys.argv = sys.argv[:1]
# Hide werkzeug log records below WARNING.
logging.getLogger("werkzeug").setLevel(logging.WARNING)

## Parameters

### Configurable

In [None]:
# Name of this notebook; not referenced by any other cell visible in this file.
NOTEBOOK_NAME = "streambasedcache"

In [None]:
# Project to run against; falls back to "test-project" when BEAM_PROJECT_ID is unset.
project_id = os.getenv("BEAM_PROJECT_ID", "test-project")
try:
    temp_location = os.environ["BEAM_TEMP_LOCATION"]
except KeyError:
    # No temp location configured: use a fresh local temporary directory. The
    # TemporaryDirectory object is bound to a module-level name so it is not
    # garbage-collected (which would remove the directory) while the notebook runs.
    # NOTE(review): the variable name contains a typo ("tempporary").
    _tempporary_directory = tempfile.TemporaryDirectory()
    temp_location = _tempporary_directory.name

### Derived

In [None]:
# Local Beam runner instance; not passed to any pipeline in the cells visible here.
runner = DirectRunner()

In [None]:
# Pipeline options shared by every pipeline built in this notebook; streaming
# mode is switched on.
options = PipelineOptions(
    project=project_id, temp_location=temp_location, streaming=True,
)
# Last expression of the cell, so the notebook shows the options' display data.
options.display_data()

In [None]:
# Factory for the named caches ("input", "sine", "cosine") created further down.
cache_manager = CacheManager(options)

In [None]:
# Best effort: enable IPython's autoreload extension (mode 2) and just print a
# note if it cannot be loaded.
try:
    %load_ext autoreload
    %autoreload 2
except Exception:
    print("No autoreload")

## Function definitions

### Plotting

In [None]:
def gen_plot(source, **circle_kwargs):
    """Build a circle-glyph figure of the ``"x"``/``"y"`` columns of ``source``.

    Args:
        source: Data source handed to ``plot.circle``; it has to provide
            ``"x"`` and ``"y"`` columns.
        **circle_kwargs: Extra keyword arguments forwarded unchanged to
            ``plot.circle`` (for example ``line_color`` or ``fill_color``).

    Returns:
        The configured Bokeh figure.
    """
    # NOTE(review): `plot_height`/`plot_width` were removed in Bokeh 3.0 in
    # favour of `height`/`width` — confirm the Bokeh version in use.
    figure_options = dict(
        plot_height=300,
        plot_width=800,
        background_fill_color="lightgrey",
        title="",
        y_range=(-1.1, 1.1),
    )
    plot = bokeh.plotting.figure(**figure_options)
    plot.circle("x", "y", source=source, **circle_kwargs)

    # Let the x range trail the newest data, showing a window of 100000 x-units
    # (presumably milliseconds, i.e. 100 s — confirm against the plotted data).
    x_range = plot.x_range
    x_range.follow = "end"
    x_range.follow_interval = 100000

    # Tilted tick labels, rendered as wall-clock time at every listed resolution.
    clock_format = "%H:%M:%S"
    tick_formats = {
        resolution: [clock_format]
        for resolution in ("milliseconds", "seconds", "minsec", "minutes")
    }
    plot.xaxis.major_label_orientation = math.pi / 4
    plot.xaxis.formatter = DatetimeTickFormatter(**tick_formats)
    return plot

### Pipeline-specific

In [None]:
def decode_pubsub_message(message):
    """Parse the UTF-8 encoded JSON payload stored in ``message.data``.

    Args:
        message: Object whose ``data`` attribute holds UTF-8 encoded JSON bytes.

    Returns:
        Whatever ``json.loads`` produces for that payload.
    """
    payload_text = message.data.decode("utf-8")
    return json.loads(payload_text)

In [None]:
def milliseconds_to_iso(milliseconds, timezone=None):
    """Convert milliseconds since the Unix epoch into an ISO 8601 string.

    Args:
        milliseconds: Number of milliseconds since 1970-01-01T00:00:00 UTC.
        timezone: Optional timezone name accepted by ``pytz.timezone`` (for
            example ``"US/Pacific"``). ``None`` (the default) means UTC.

    Returns:
        ``datetime.isoformat()`` of that instant expressed in the requested
        timezone, including the UTC offset.

    Raises:
        pytz.UnknownTimeZoneError: If ``timezone`` is not a known zone name.
    """
    # Imported locally, like the other helpers in this file do, so the function
    # does not rely on notebook-level imports. The alias avoids clashing with
    # the ``timezone`` parameter.
    import datetime as _datetime

    # Build an aware UTC datetime directly; this replaces the deprecated
    # ``utcfromtimestamp(...).replace(tzinfo=...)`` combination.
    moment = _datetime.datetime.fromtimestamp(milliseconds / 1000, tz=_datetime.timezone.utc)
    if timezone is None:
        return moment.isoformat()

    # pytz is only needed when a named timezone is requested.
    import pytz

    return moment.astimezone(pytz.timezone(timezone)).isoformat()


# Quick smoke check; the notebook displays the resulting string.
milliseconds_to_iso(12)

In [None]:
def custom_sin(x, period_degrees=360):
    """Return the sine of ``x`` for a wave with a full cycle every ``period_degrees`` units.

    Args:
        x: Position along the wave.
        period_degrees: Length of one full cycle, in the same units as ``x``.

    Returns:
        The sine value, between -1 and 1.
    """
    import math

    cycles = x / period_degrees
    return math.sin(cycles * 2 * math.pi)

In [None]:
def custom_cos(x, period_degrees=360):
    """Return the cosine of ``x`` for a wave with a full cycle every ``period_degrees`` units.

    Args:
        x: Position along the wave.
        period_degrees: Length of one full cycle, in the same units as ``x``.

    Returns:
        The cosine value, between -1 and 1.
    """
    import math

    cycles = x / period_degrees
    return math.cos(cycles * 2 * math.pi)

## Run pipeline

### Start publisher

In [None]:
# Pub/Sub publisher client handed to the streaming dataset in the next cell.
pub_client = pubsub.PublisherClient()

In [None]:
# Temporary topic the publisher writes to; the pipeline's subscription is
# attached to this topic further down.
timestamps_topic = pubsub_utils.TemporaryPubsubTopic(project=project_id)

# NOTE(review): presumably publishes one "timestamp" event every 0.5 s once
# started — confirm against streaming_dataset.StreamingDataset.
timestamp_publisher = streaming_dataset.StreamingDataset(
    "timestamp", pub_client, timestamps_topic.name, time_between_events=0.5
)

In [None]:
# Start the publisher created in the previous cell.
timestamp_publisher.start()

### Read from topic

In [None]:
# Temporary subscription on the timestamps topic; the "Read" step consumes from it.
input_subscription = pubsub_utils.TemporaryPubsubSubscription(project_id, timestamps_topic.name)

In [None]:
# Cache named "input": written by the ingest pipeline and read by the sine and
# cosine pipelines.
input_cache = cache_manager.create_cache_from_defaults("input")

In [None]:
# Ingest pipeline: Pub/Sub subscription -> decoded JSON dicts with an added
# "ts_iso" field -> "input" cache.
p = beam.Pipeline(options=options)
_ = (
    p
    | "Read"
    >> beam.io.ReadFromPubSub(subscription=input_subscription.name, with_attributes=True, timestamp_attribute="ts")
    # Use the decode_pubsub_message helper defined earlier in this notebook
    # instead of repeating the same JSON-decoding lambda inline.
    | "Decode" >> beam.Map(decode_pubsub_message)
    # Build a new dict rather than calling e.update(...) on the input element:
    # Beam requires that transforms do not mutate their inputs.
    | "Add timestamp"
    >> beam.Map(lambda e: dict(e, ts_iso=milliseconds_to_iso(e["ts"], timezone="US/Pacific")))
    | "Write" >> input_cache.writer()
)
# The result handle is kept so the running pipeline can be inspected or cancelled later.
pr = p.run()

In [None]:
# Print up to 10 elements from the input cache; each item exposes its payload
# as `.value`.
# NOTE(review): presumably `seek_to_start=False` skips already-cached elements
# and `timeout=5` stops waiting after 5 s — confirm against the cache's read().
for element in itertools.islice(input_cache.read(seek_to_start=False, timeout=5), 10):
    print(element.value)

### Plot a sine wave 

In [None]:
# Cache named "sine": written by the sine pipeline and read by the sine plot.
sine_cache = cache_manager.create_cache_from_defaults("sine")

In [None]:
# sin_pr.cancel()

In [None]:
# Sine pipeline: read elements from the input cache, attach plot coordinates
# (x = timestamp, y = sine with a period of 100000 / 2 timestamp units) and
# write them to the sine cache.
sin_pcoll = (
    beam.Pipeline(options=options)
    | "Read" >> input_cache.reader(seek_to_start=False)
    # Build a new dict rather than calling e.update(...) on the input element:
    # Beam requires that transforms do not mutate their inputs.
    | "Add coords" >> beam.Map(lambda e: dict(e, x=e["ts"], y=custom_sin(e["ts"], 100000 / 2)))
    | "Write" >> sine_cache.writer()
)

# The result handle is kept so the pipeline can be cancelled later (sin_pr.cancel()).
sin_pr = sin_pcoll.pipeline.run()

In [None]:
# Print up to 5 elements from the sine cache; each item exposes its payload as
# `.value`.
for element in itertools.islice(sine_cache.read(seek_to_start=False, timeout=5), 5):
    print(element.value)

In [None]:
# Live plot fed from the sine cache. The lambda turns each cached element into
# single-point "x"/"y" columns and gen_plot builds the figure.
# NOTE(review): rollover/delay/timeout semantics are defined by
# data_server.StreamingPlot — presumably max points kept, refresh delay and
# read timeout; confirm.
sin_sp = data_server.StreamingPlot(
    sine_cache,
    lambda e: {"x": [e.value["x"]], "y": [e.value["y"]]},
    gen_plot,
    rollover=100,
    delay=2,
    timeout=100,
    seek_to_start=False,
)

In [None]:
# Start the sine plot; the commented-out `sin_sp.stop()` cell below is its counterpart.
sin_sp.start()

In [None]:
# sin_sp.stop()

### Plot a cosine wave

In [None]:
# Cache named "cosine": written by the cosine pipeline and read by the cosine plot.
cosine_cache = cache_manager.create_cache_from_defaults("cosine")

In [None]:
# cos_pr.cancel()

In [None]:
# Cosine pipeline: read elements from the input cache, attach plot coordinates
# (x = timestamp, y = cosine with a period of 100000 / 2 timestamp units) and
# write them to the cosine cache.
cos_pcoll = (
    beam.Pipeline(options=options)
    | "Read" >> input_cache.reader(seek_to_start=False)
    # Build a new dict rather than calling e.update(...) on the input element:
    # Beam requires that transforms do not mutate their inputs.
    | "Add coords" >> beam.Map(lambda e: dict(e, x=e["ts"], y=custom_cos(e["ts"], 100000 / 2)))
    | "Write" >> cosine_cache.writer()
)

# The result handle is kept so the pipeline can be cancelled later (cos_pr.cancel()).
cos_pr = cos_pcoll.pipeline.run()

In [None]:
# Live plot fed from the cosine cache. The first lambda turns each cached
# element into single-point "x"/"y" columns, halving y; the second builds a
# red-styled figure from the first source it is given.
# NOTE(review): rollover/delay/timeout semantics are defined by
# data_server.StreamingPlot — presumably max points kept, refresh delay and
# read timeout; confirm.
cos_sp = data_server.StreamingPlot(
    cosine_cache,
    lambda e: {"x": [e.value["x"]], "y": [e.value["y"] * 0.5]},
    lambda *sources: gen_plot(sources[0], line_color="red", fill_color="red"),
    rollover=100,
    delay=2,
    timeout=100,
    seek_to_start=False,
#     source_type="ajax",
)

In [None]:
# Start the cosine plot; the commented-out `cos_sp.stop()` cell below is its counterpart.
cos_sp.start()

In [None]:
# cos_sp.stop()