## Install dependencies

In [None]:
# Detect Google Colab by attempting to import its helper package; the
# GOOGLE_COLAB flag gates the install and authentication cells below.
try:
    import google.colab
    GOOGLE_COLAB = True
except ImportError:
    # Package not importable: assume we are not running inside Colab.
    GOOGLE_COLAB = False

In [None]:
# Colab only: install the snappy development package with apt
# (presumably required to build python-snappy, installed in the next cell).
if GOOGLE_COLAB:
    !sudo apt-get -yqq install libsnappy-dev

In [None]:
# Colab only: install the Python dependencies; bokeh is upgraded (-U) to the
# latest available release.
if GOOGLE_COLAB:
    !pip install -q python-snappy Faker pyproj
    !pip install -q -U bokeh

In [None]:
# Colab only: install apache_beam[gcp] from the `feature/streambasedcache`
# branch of a fork; the imports below use
# `apache_beam.runners.interactive.caching.streambasedcache` from it.
if GOOGLE_COLAB:
    !pip install "git+https://github.com/ostrokach/beam.git@feature/streambasedcache#egg=apache_beam[gcp]&subdirectory=sdks/python"

## Imports

In [None]:
from __future__ import division, print_function

import atexit
import contextlib
import gc
import itertools
import json
import logging
import math
import threading
import time
import uuid
from datetime import datetime

import apache_beam as beam
import bokeh
import pytz
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions)
from apache_beam.runners.interactive.caching import streambasedcache
from apache_beam.runners.interactive.display import data_server
from bokeh import plotting
from google.cloud import pubsub_v1

try:
    from contextlib import ExitStack
except ImportError:
    from contextlib2 import ExitStack

## Parameters

### Configurable

In [None]:
# Used to build the "<NOTEBOOK_NAME>-temp" cache topic names passed to create_cache.
NOTEBOOK_NAME = "streambasedcache"

In [None]:
# True: the plot data server binds to and is addressed as "localhost".
# False: it binds to 0.0.0.0 and HOST_IP is taken from `hostname -I`.
LOCAL = True

In [None]:
# Colab only: run Colab's interactive user authentication
# (presumably needed for the Pub/Sub and GCS calls made later — confirm).
if GOOGLE_COLAB:
    from google.colab import auth
    auth.authenticate_user()

In [None]:
#@title Google Cloud Project Info { display-mode: "form" }
# Both values are passed to PipelineOptions below; project_id is also used to
# build Pub/Sub topic paths. Replace them with your own project and bucket.
project_id = "strokach-playground" #@param {type:"string"}
gcs_temp_location = "gs://strokach/dataflow_temp" #@param {type:"string"}

### Derived

In [None]:
# Streaming pipeline options shared by every beam.Pipeline created below.
options = PipelineOptions(
    project=project_id, temp_location=gcs_temp_location, streaming=True,
)
# Last expression of the cell, so the notebook renders its return value.
options.display_data()

In [None]:
# HOST_IP is the host name/address embedded in the plot's data URL.
if LOCAL:
    HOST_IP = "localhost"
else:
    # `subprocess` is not imported by the notebook's import cell, so bring it
    # into scope here; otherwise this branch fails with NameError.
    import subprocess

    # `hostname -I` prints the host's addresses separated by whitespace;
    # the first one is used.
    _host_addresses = subprocess.check_output(
        ["hostname", "-I"], universal_newlines=True
    ).strip().split()
    if not _host_addresses:
        raise RuntimeError("`hostname -I` returned no addresses; cannot determine HOST_IP")
    HOST_IP = _host_addresses[0]

In [None]:
# Enable IPython's autoreload extension (mode 2) when it is available;
# any failure is reported and the notebook carries on without it.
try:
    %load_ext autoreload
    %autoreload 2
except Exception:
    print("No autoreload")

In [None]:
# Only show WARNING and above from the "werkzeug" logger
# (presumably to hide per-request logs from the plot data server — confirm).
logging.getLogger("werkzeug").setLevel(logging.WARNING)

## Function definitions

### General

In [None]:
def current_time_milliseconds():
    """Return the current Unix time as an integer number of milliseconds.

    Uses ``time.time()`` rather than subtracting naive ``datetime`` objects:
    ``datetime.utcnow()`` and ``datetime.utcfromtimestamp()`` are deprecated
    since Python 3.12, while ``time.time()`` gives the same epoch-based value
    on both Python 2 and Python 3.

    Returns:
        int: Milliseconds elapsed since the Unix epoch.
    """
    # ReadFromPubSub expects timestamps to be in milliseconds
    return int(time.time() * 1000)


current_time_milliseconds()

In [None]:
def create_cache(location, cache_class, *args, **kwargs):
    """Instantiate ``cache_class`` at a randomly suffixed location, retrying on IOError.

    Up to three attempts are made. Each attempt appends a fresh 8-character
    hex suffix to ``location`` and calls
    ``cache_class(full_location, *args, **kwargs)``.

    Args:
        location (str): Base location; ``"-<8 hex chars>"`` is appended to it.
        cache_class (callable): Called with the full location as its first
            positional argument.
        *args: Extra positional arguments forwarded to ``cache_class``.
        **kwargs: Extra keyword arguments forwarded to ``cache_class``.

    Returns:
        Whatever ``cache_class`` returns on the first successful attempt.

    Raises:
        IOError: The error from the last attempt, if all three attempts fail.
    """
    last_error = None
    for _ in range(3):
        full_location = "{}-{}".format(location, uuid.uuid4().hex[:8])
        try:
            return cache_class(full_location, *args, **kwargs)
        except IOError as error:
            # Python 3 unbinds the `except ... as` name when the clause ends,
            # so keep our own reference to re-raise after the loop.
            last_error = error
    raise last_error

### Plotting

## Run pipeline

### Publisher and pipeline helpers

In [None]:
def close_all_contexts():
    """Exit every ExitStack instance currently tracked by the garbage collector.

    Each stack is printed before it is exited. Exceptions raised while exiting
    a stack are printed and do not stop the remaining stacks from being closed.
    """
    tracked_stacks = (
        candidate for candidate in gc.get_objects() if isinstance(candidate, ExitStack)
    )
    for stack in tracked_stacks:
        print(stack)
        try:
            stack.__exit__(None, None, None)
        except Exception as error:
            print(error)


# Clean up stacks left over from earlier runs now, and again at interpreter exit.
close_all_contexts()
atexit.register(close_all_contexts)

In [None]:
class EventPublisher(threading.Thread):
    """Thread that repeatedly publishes a timestamped JSON event to a Pub/Sub topic.

    Each message body is ``{"timestamp": <ms>}`` encoded as UTF-8 JSON, and the
    same value is attached as the ``timestamp`` message attribute. The class is
    also a context manager: entering starts the thread, exiting asks it to stop.
    """

    def __init__(self, topic_path, time_between_events):
        """
        Args:
            topic_path: Topic passed as the first argument to
                ``PublisherClient.publish``.
            time_between_events (float): Seconds to pause after each publish.
        """
        super(EventPublisher, self).__init__()
        self.topic_path = topic_path
        self.time_between_events = time_between_events
        self._stop_event = threading.Event()

    def run(self):
        """Publish events until ``stop()`` is called."""
        pub_client = pubsub_v1.PublisherClient()
        while not self.stopped():
            timestamp = current_time_milliseconds()
            element = {"timestamp": timestamp}
            pub_client.publish(
                self.topic_path,
                json.dumps(element).encode("utf-8"),
                timestamp=str(timestamp),
            )
            # Pause by waiting on the stop event rather than time.sleep(), so
            # stop() ends the loop immediately instead of after a full interval.
            self._stop_event.wait(self.time_between_events)

    def stop(self):
        """Ask the publishing loop to finish; does not wait for the thread."""
        self._stop_event.set()

    def stopped(self):
        """Return True once ``stop()`` has been called."""
        return self._stop_event.is_set()

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

In [None]:
@contextlib.contextmanager
def create_pubsub_topic(project_id, prefix):
    """Context manager that creates a uniquely named Pub/Sub topic and deletes it on exit.

    Args:
        project_id: Project component of the topic path.
        prefix: Leading part of the topic name; a random hex UUID is appended.

    Yields:
        str: The full topic path, ``projects/<project_id>/topics/<prefix>-<uuid hex>``.
    """
    unique_suffix = uuid.uuid4().hex
    topic_path = "projects/{}/topics/{}-{}".format(project_id, prefix, unique_suffix)
    publisher_client = pubsub_v1.PublisherClient()
    publisher_client.create_topic(topic_path)
    try:
        yield topic_path
    finally:
        # Delete the topic even if the body of the `with` block raised.
        publisher_client.delete_topic(topic_path)

In [None]:
@contextlib.contextmanager
def create_pubsub_subscription(topic_path, suffix=""):
    """Context manager that creates a subscription for ``topic_path`` and deletes it on exit.

    The subscription path is the topic path with ``/topics/`` replaced by
    ``/subscriptions/``, followed by ``-<suffix>`` when ``suffix`` is non-empty.

    Args:
        topic_path (str): Path of the topic to subscribe to.
        suffix: Optional value appended to the subscription name.

    Yields:
        str: The subscription path.
    """
    base_path = topic_path.replace("/topics/", "/subscriptions/")
    if suffix:
        subscription_path = base_path + "-{}".format(suffix)
    else:
        subscription_path = base_path
    subscriber_client = pubsub_v1.SubscriberClient()
    subscriber_client.create_subscription(subscription_path, topic_path)
    try:
        yield subscription_path
    finally:
        # Delete the subscription even if the body of the `with` block raised.
        subscriber_client.delete_subscription(subscription_path)

In [None]:
@contextlib.contextmanager
def run_pipeline(pipeline):
    """Context manager that runs ``pipeline`` and cancels its result on exit.

    Args:
        pipeline: Object whose ``run()`` returns a result exposing ``cancel()``.

    Yields:
        The object returned by ``pipeline.run()``.
    """
    pipeline_result = pipeline.run()
    try:
        yield pipeline_result
    finally:
        # Cancel even if the body of the `with` block raised.
        pipeline_result.cancel()

### Pipeline-specific

In [None]:
def decode_pubsub_message(message):
    """Parse the UTF-8 JSON payload stored in ``message.data``.

    Args:
        message: Object whose ``data`` attribute holds UTF-8 encoded JSON bytes.

    Returns:
        The decoded JSON value.
    """
    payload_text = message.data.decode("utf-8")
    return json.loads(payload_text)

In [None]:
def milliseconds_to_iso(milliseconds, timezone=None):
    """Format a Unix timestamp given in milliseconds as an ISO 8601 string.

    Args:
        milliseconds: Milliseconds since the Unix epoch.
        timezone (str, optional): Name passed to ``pytz.timezone``; UTC is
            used when it is None.

    Returns:
        str: ``isoformat()`` of the timezone-aware datetime.
    """
    import pytz

    if timezone is None:
        target_tz = pytz.UTC
    else:
        target_tz = pytz.timezone(timezone)

    # Interpret the timestamp as UTC first, then convert to the target zone.
    utc_moment = datetime.utcfromtimestamp(milliseconds / 1000).replace(tzinfo=pytz.UTC)
    return utc_moment.astimezone(target_tz).isoformat()


milliseconds_to_iso(12)

In [None]:
def custom_sin(x, period_degrees=360):
    """Return the sine of ``x``, where ``period_degrees`` units of ``x`` make one full cycle."""
    import math

    angle_radians = x / period_degrees * 2 * math.pi
    return math.sin(angle_radians)

In [None]:
def custom_cos(x, period_degrees=360):
    """Return the cosine of ``x``, where ``period_degrees`` units of ``x`` make one full cycle."""
    import math

    angle_radians = x / period_degrees * 2 * math.pi
    return math.cos(angle_radians)

In [None]:
def timestamp_element(element):
    """Wrap ``element`` in a Beam ``TimestampedValue`` built from ``element["timestamp"]``.

    Args:
        element: Mapping with a numeric ``"timestamp"`` entry (presumably
            milliseconds, as produced by the publisher — confirm).

    Returns:
        ``window.TimestampedValue`` carrying ``element``.
    """
    from apache_beam.transforms import window
    from apache_beam.utils import timestamp

    # The stored value is multiplied by 1000 before being passed as `micros`.
    event_time = timestamp.Timestamp(micros=element["timestamp"] * 1000)
    return window.TimestampedValue(element, event_time)

### Start publisher

In [None]:
# Close the stack left over from a previous run of this cell, if any, so
# re-running the cell tears down the old topic and publisher first.
try:
    publisher_stack.__exit__(None, None, None)
except NameError:
    # First run: publisher_stack does not exist yet.
    pass

publisher_stack = ExitStack()

# ExitStack unwinds in reverse order of entry: the publisher is asked to stop
# before the topic is deleted.
input_topic = publisher_stack.enter_context(create_pubsub_topic(project_id, "event-stream"))
publisher = publisher_stack.enter_context(EventPublisher(input_topic, time_between_events=0.9))

### Read from topic

In [None]:
# Close the stack left over from a previous run of this cell, if any, which
# cancels the old pipeline and deletes the old subscription.
try:
    pipeline_stack.__exit__(None, None, None)
except NameError:
    # First run: pipeline_stack does not exist yet.
    pass

pipeline_stack = ExitStack()
input_subscription = pipeline_stack.enter_context(
    create_pubsub_subscription(input_topic, uuid.uuid4().hex[:8])
)

input_cache = create_cache(
    "projects/{}/topics/{}-temp".format(project_id, NOTEBOOK_NAME),
    streambasedcache.PubSubBasedCache,
    #     with_attributes=["timestamp"],
    #     timestamp_attribute="timestamp",
)

p = beam.Pipeline(options=options)

# "Decode" reuses the decode_pubsub_message helper defined above instead of an
# inline copy of it. "Add ISO 8601 timestamp" builds a new dict rather than
# calling e.update(): Beam requires that user functions do not modify their
# input elements.
# fmt: off
out_pcoll = (
    p
    | "Read" >> beam.io.ReadFromPubSub(subscription=input_subscription, with_attributes=["timestamp"], timestamp_attribute="timestamp")
    | "Decode" >> beam.Map(decode_pubsub_message)
    | "Add ISO 8601 timestamp" >> beam.Map(lambda e: dict(e, timestamp_iso=milliseconds_to_iso(e["timestamp"], timezone="US/Pacific")))
    | "Write" >> input_cache.writer()
)
# fmt: on

# The pipeline is cancelled when pipeline_stack is closed.
pr = pipeline_stack.enter_context(run_pipeline(p))

In [None]:
# Sanity check: print at most 10 elements read from the input cache.
for element in itertools.islice(input_cache.read(from_start=False, timeout=5), 10):
    print(element)

### Make a sine wave 

In [None]:
# Reference time (ms) subtracted from each element's timestamp in the
# "Add coords" step of the sine pipeline below.
current_time = current_time_milliseconds()
print(current_time)

In [None]:
# Close the stack left over from a previous run of this cell, if any, which
# cancels the old sine pipeline.
try:
    sine_pipeline_stack.__exit__(None, None, None)
except NameError:
    # First run: sine_pipeline_stack does not exist yet.
    pass

sine_pipeline_stack = ExitStack()

sine_cache = create_cache(
    "projects/{}/topics/{}-temp".format(project_id, NOTEBOOK_NAME),
    streambasedcache.PubSubBasedCache,
    with_attributes=["timestamp"],
    timestamp_attribute="timestamp",
)

p = beam.Pipeline(options=options)

# Create a reference, so that we don't garbage collect ahead of time
input_ = input_cache

# "Add coords" builds a new dict rather than calling e.update(): Beam requires
# that user functions do not modify their input elements.
# x is the element timestamp; y is a sine of the time elapsed since
# `current_time`, with a period of 100000 / 2 timestamp units.
# fmt: off
_ = (
    p
    | "Read" >> input_.reader(from_start=False)
    | "Add coords" >> beam.Map(lambda e: dict(
        e,
        x=e["timestamp"],
        y=custom_sin((e["timestamp"] - current_time), (100000 / 2))))
    | "Write" >> sine_cache.writer()
)
# fmt: on

# The pipeline is cancelled when sine_pipeline_stack is closed.
pr = sine_pipeline_stack.enter_context(run_pipeline(p))

In [None]:
# Sanity check: print at most 10 elements read from the sine cache.
for element in itertools.islice(sine_cache.read(from_start=False, timeout=5), 10):
    print(element)

In [None]:
def parse_cache_data(messages):
    """Yield an ``(x, y)`` tuple for each ``(timestamp, message)`` pair in ``messages``.

    Args:
        messages: Iterable of 2-tuples whose second item has a ``data``
            mapping with ``"x"`` and ``"y"`` entries. The first item is ignored.

    Yields:
        tuple: ``(data["x"], data["y"])`` for each message.
    """
    for _timestamp, message in messages:
        payload = message.data
        yield payload["x"], payload["y"]

In [None]:
# Close the stack left over from a previous run of this cell, if any.
try:
    sine_plot_stack.__exit__(None, None, None)
except NameError:
    # First run: sine_plot_stack does not exist yet.
    pass

sine_plot_stack = ExitStack()

# NOTE(review): `_read_to_queue` is a private method of the cache class; it is
# used here as a context manager that provides the queue fed to the data app.
data_queue = sine_plot_stack.enter_context(sine_cache._read_to_queue(from_start=False))

# parse_cache_data turns the queued messages into (x, y) pairs for the plot.
app = data_server.create_data_publisher_app(data_queue, processors=[parse_cache_data], timeout=5)
# The port actually in use is read back later from `data_endpoint.server.port`
# (port=0 presumably asks for any free port — confirm).
data_endpoint = sine_plot_stack.enter_context(
    data_server.ServerThread(
        app, host=("localhost" if LOCAL else "0.0.0.0"), port=0, threaded=False
    )
)

In [None]:
def generate_plot(data_url):
    """Build a Bokeh scatter plot that streams its points from ``data_url``.

    The figure uses an ``AjaxDataSource`` in ``"append"`` mode with a polling
    interval of 200. The JavaScript adapter converts the response, a list of
    ``[x, y]`` pairs, into the ``{x: [...], y: [...]}`` column layout.

    Args:
        data_url (str): URL polled by the ``AjaxDataSource``.

    Returns:
        The configured Bokeh figure.
    """
    from bokeh.models import DatetimeTickFormatter

    # The loop variable is declared with `let`: an undeclared `i` is an
    # implicit global and raises a ReferenceError when the callback code is
    # evaluated in strict mode.
    adapter = bokeh.models.CustomJS(
        code="""
        const result = {x: [], y: []};
        const pts = cb_data.response;
        for (let i=0; i<pts.length; i++) {
            result.x.push(pts[i][0])
            result.y.push(pts[i][1])
        }
        return result;
    """
    )

    source = bokeh.models.AjaxDataSource(
        data_url=data_url, polling_interval=200, adapter=adapter, mode="append"
    )

    fig = plotting.figure(
        plot_height=300,
        plot_width=800,
        background_fill_color="lightgrey",
        title="",
        y_range=(-1, 1),
    )
    fig.circle("x", "y", source=source)

    # Keep the x-range following the newest data over a window of 100000 x units.
    fig.x_range.follow = "end"
    fig.x_range.follow_interval = 100000

    # Rotate the tick labels and render every listed resolution as HH:MM:SS.
    fig.xaxis.major_label_orientation = math.pi / 4
    fig.xaxis.formatter = DatetimeTickFormatter(
        milliseconds=["%H:%M:%S"],
        seconds=["%H:%M:%S"],
        minsec=["%H:%M:%S"],
        minutes=["%H:%M:%S"],
    )

    return fig

In [None]:
# Write the plot to a standalone HTML file, which the %%html cell below embeds.
bokeh.io.reset_output()
bokeh.io.output_file("sines.html")
# bokeh.io.output_notebook(hide_banner=True)

# The plot polls the data server started above; its port is read back from the server.
data_url = "http://{}:{}/data".format(HOST_IP, data_endpoint.server.port)
plot = generate_plot(data_url)
bokeh.io.save(plot)

In [None]:
%%html
<!-- First block: third-party clock widget showing the current time in America/Los_Angeles. -->
<div style="text-align:left;padding:1em 0;"> <h4><a style="text-decoration:none;" href="https://www.zeitverschiebung.net/en/city/5391959"><span style="color:gray;">Current local time in</span><br />San Francisco, United States</a></h4> <iframe src="https://www.zeitverschiebung.net/clock-widget-iframe-v2?language=en&size=small&timezone=America%2FLos_Angeles" width="400" height="90" frameborder="0" seamless></iframe> </div>

<!-- Second block: embeds the sines.html file written by bokeh.io.save in the previous cell. -->
<div style="text-align:left;padding:1em 0;"> 
<iframe src="sines.html" width="100%" height="350" frameborder="0" seamless></iframe>
</div>