## Imports

In [None]:
from __future__ import print_function

import contextlib
import gc
import json
import os
import shutil
import tempfile
import time
import uuid
from datetime import datetime

import apache_beam as beam
import pytz
from apache_beam.io.gcp.pubsub import PubsubMessage
from apache_beam.pipeline import PipelineOptions
from apache_beam.runners.interactive.caching import streambasedcache
from apache_beam.transforms import combiners, window
from apache_beam.utils.timestamp import Timestamp

try:
    from contextlib import ExitStack
except ImportError:
    from contextlib2 import ExitStack

In [None]:
# Try to load and enable the IPython ``autoreload`` extension. On any failure
# a notice is printed instead of stopping the notebook.
try:
    %load_ext autoreload
    %autoreload 2
except Exception:
    print("No autoreload")

## Parameters

In [None]:
#@title Google Cloud Project Info { display-mode: "form" }
# Project id and GCS temp location; both are handed to PipelineOptions and
# project_id is also used to build the Pub/Sub topic paths further down.
project_id = "strokach-playground" #@param {type:"string"}
gcs_temp_location = "gs://strokach/dataflow_temp" #@param {type:"string"}

In [None]:
# Streaming pipeline options shared by every pipeline in this notebook,
# built from the project settings defined in the parameters cell.
options = PipelineOptions(
    project=project_id,
    temp_location=gcs_temp_location,
    streaming=True,
    # Alternative kept for reference: runner="DirectRunner".
    runner="DataflowRunner",
    sdk_location=os.path.expanduser(
        "~/workspace/beam/sdks/python/dist/apache-beam-2.15.0.dev0.tar.gz"
    ),
    setup_file="../setup.py",
    job_name="test11",
)
# Kept as the last expression of the cell so the notebook shows its value.
options.display_data()

## Functions

In [None]:
def create_cache(location, cache_class, *args, **kwargs):
    """Instantiate ``cache_class`` at a uniquely suffixed ``location``.

    A random hex suffix (``uuid4``) is appended to ``location`` and the result
    is passed as the first argument to ``cache_class``. If construction raises
    ``IOError``, a fresh suffix is tried, up to three attempts in total.

    Args:
        location: Base location string; the location actually used is
            ``"<location>-<hex>"``.
        cache_class: Callable invoked as
            ``cache_class(full_location, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``cache_class``.
        **kwargs: Extra keyword arguments forwarded to ``cache_class``.

    Returns:
        Whatever ``cache_class`` returns on the first successful attempt.

    Raises:
        IOError: The error from the final attempt, if every attempt failed.
    """
    last_error = None
    for _ in range(3):
        full_location = "{}-{}".format(location, uuid.uuid4().hex)
        try:
            return cache_class(full_location, *args, **kwargs)
        except IOError as e:
            # Python 3 unbinds ``e`` when the ``except`` block ends, so keep
            # our own reference for re-raising after the loop.
            last_error = e
    raise last_error

In [None]:
class AddTimestampDoFn(beam.DoFn):
    """Attach a timestamp derived from the element's own value."""

    def process(self, element):
        """Yield ``element`` timestamped at ``int(element)`` seconds."""
        # NOTE(review): the imports are function-local, presumably so they
        # resolve wherever this DoFn executes - confirm before hoisting them.
        from apache_beam.transforms.window import TimestampedValue
        from apache_beam.utils.timestamp import Timestamp

        event_time = Timestamp(seconds=int(element))
        yield TimestampedValue(element, event_time)

In [None]:
class FormatDoFn(beam.DoFn):
    """Wrap each element in a dict together with its window's bounds."""

    def process(self, element, window=beam.DoFn.WindowParam):
        """Yield ``element`` annotated with its window start and end.

        Args:
            element: The value to wrap; emitted unchanged under ``"events"``.
            window: Window of the element, requested through
                ``beam.DoFn.WindowParam``.

        Yields:
            A dict with keys ``"events"``, ``"window_start_milliseconds"`` and
            ``"window_end_milliseconds"``; the two bounds are integers
            obtained from the window's ``micros`` values.
        """
        # Floor division keeps the arithmetic in integers. True division
        # would go through a float on Python 3, which cannot represent very
        # large microsecond counts exactly and disagrees with Python 2's
        # integer ``/``. For bounds that are whole milliseconds the result
        # is unchanged.
        yield {
            "events": element,
            "window_start_milliseconds": int(window.start.micros // 1000),
            "window_end_milliseconds": int(window.end.micros // 1000),
        }

In [None]:
@contextlib.contextmanager
def run_pipeline(p):
    """Run ``p`` and cancel the resulting job when the ``with`` block exits.

    Args:
        p: Object with a ``run()`` method whose return value has ``cancel()``.

    Yields:
        The value returned by ``p.run()``.
    """
    result = p.run()
    try:
        yield result
    finally:
        # Runs on normal exit and on exceptions alike, so the job is never
        # left running after the block ends.
        result.cancel()

### Absurd time / no groupby

In [None]:
# Streaming round trip without any grouping: write ten integers to a fresh
# input cache, timestamp/window/format them in a pipeline, and print whatever
# can be read back from the output cache.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = create_cache(
    "projects/{}/topics/input".format(project_id),
    streambasedcache.PubSubBasedCache,
    #     with_attributes=["timestamp_milliseconds"],
    #     timestamp_attribute="timestamp_milliseconds",
)

output = create_cache(
    "projects/{}/topics/output".format(project_id),
    streambasedcache.PubSubBasedCache,
    #     with_attributes=["timestamp_milliseconds"],
    #     timestamp_attribute="timestamp_milliseconds",
)


# The payload is the integers 0..9. AddTimestampDoFn (as defined at this point
# in the notebook) converts each element with int() and uses it as the
# timestamp in seconds.
start_time = 0
data = [(start_time + i) for i in range(10)]

input.write(data)

p = beam.Pipeline(options=options)

pcoll = (
    p
    # | beam.Create()
    | input.reader()
    # print() returns None, so `print(e) or e` prints the element and then
    # passes it through unchanged.
    | beam.Map(lambda e: print(e) or e)
    | beam.ParDo(AddTimestampDoFn())
    | beam.WindowInto(window.FixedWindows(1))
    | beam.ParDo(FormatDoFn())
    # | beam.io.WriteToText(os.path.join("/tmp", "pipeline-gc-test2"))
    | output.writer()
)

gc.collect()

# Poll the output forever; the loop has no exit condition, so the cell ends
# only when it is interrupted. Leaving the `with` block (including through an
# exception) makes run_pipeline cancel the job.
with run_pipeline(p):
    while True:
        for element in output.read(timeout=5):
            print(element)
        time.sleep(1)

### Absurd time / groupby

In [None]:
# Print two renderings of "now" for comparison: the local time formatted with
# strftime("%s"), and the seconds elapsed between utcfromtimestamp(0) and
# utcnow().
# NOTE(review): "%s" is not among the strftime directives Python documents;
# whether it works depends on the platform's C library - confirm before
# relying on it.
print(datetime.now().strftime("%s"))
print((datetime.utcnow() - datetime.utcfromtimestamp(0)).total_seconds())

In [None]:
class AddTimestampDoFn(beam.DoFn):
    """Attach a timestamp taken from the element's ``"index"`` entry."""

    def process(self, element):
        """Yield ``element`` timestamped at ``int(element["index"])`` seconds."""
        # NOTE(review): the imports are function-local, presumably so they
        # resolve wherever this DoFn executes - confirm before hoisting them.
        from apache_beam.transforms import window as beam_window
        from apache_beam.utils import timestamp as beam_timestamp

        seconds = int(element["index"])
        event_time = beam_timestamp.Timestamp(seconds=seconds)
        yield beam_window.TimestampedValue(element, event_time)

In [None]:
# Streaming round trip with a per-window combine: write ten Pub/Sub messages
# carrying a JSON payload and a "timestamp" attribute, window them, collect
# each window into a list, and print whatever can be read from the output.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = create_cache(
    "projects/{}/topics/input".format(project_id),
    streambasedcache.PubSubBasedCache,
    with_attributes=True,
    timestamp_attribute="timestamp",
)
print(input.location)

output = create_cache(
    "projects/{}/topics/output".format(project_id),
    streambasedcache.PubSubBasedCache,
    #     with_attributes=["timestamp_milliseconds"],
    #     timestamp_attribute="timestamp_milliseconds",
)
print(output.location)

# Each message body is the UTF-8 encoded JSON object {"index": i}; its
# "timestamp" attribute is the ISO-format string for i seconds after the
# epoch, tagged as UTC.
start_time = 0
data = [
    PubsubMessage(
        json.dumps({"index": (start_time + i)}).encode("utf-8"),
        {
            "timestamp": (
                datetime.utcfromtimestamp((start_time + i))
                .replace(tzinfo=pytz.UTC)
                .isoformat()
            )
        },
    )
    for i in range(10)
]

input.write(data)

p = beam.Pipeline(options=options)

pcoll = (
    p
    # | beam.Create()
    | input.reader()
    # print() returns None, so this prints the message's data and then passes
    # only the data downstream.
    | beam.Map(lambda e: print(e.data) or e.data)
    # Decode the bytes and parse them back into a dict.
    | beam.Map(lambda e: json.loads(e.decode("utf-8")))
    | beam.ParDo(AddTimestampDoFn())
    | beam.WindowInto(beam.window.FixedWindows(1))
    | beam.CombineGlobally(beam.combiners.ToListCombineFn()).without_defaults()
    | beam.ParDo(FormatDoFn())
    # | beam.io.WriteToText(os.path.join("/tmp", "pipeline-gc-test2"))
    | output.writer()
)

# Poll the output forever; the loop has no exit condition, so the cell ends
# only when it is interrupted. Leaving the `with` block (including through an
# exception) makes run_pipeline cancel the job.
with run_pipeline(p):
    while True:
        for element in output.read(timeout=5):
            print(element)
        time.sleep(1)

In [None]:
class Foo:
    """Named marker object that announces when its finalizer runs."""

    def __init__(self, name):
        """Remember ``name`` and start out as not removed."""
        self.removed = False
        self.name = name

    def __del__(self):
        """Print a removal notice and flag the instance as removed."""
        notice = "Removing Foo with name '{}.'".format(self.name)
        print(notice)
        self.removed = True

# Experiment: hang one Foo off the pipeline, a transform and the resulting
# PCollection, then drop references and watch for Foo.__del__'s message.
foo = Foo(uuid.uuid4().hex)
print("Foo name: '{}'.".format(foo.name))

with beam.Pipeline(options=options) as p:
    p.parent = foo
    create_pt = beam.Create(list(range(10)))
    create_pt.parent = foo
    pcoll = (
        p
        | create_pt
        # print() returns None, so this prints each element and passes it on.
        | beam.Map(lambda e: print(e) or e)
        | beam.io.WriteToText(os.path.join("/tmp", "pipeline-gc-test2"))
    )
    pcoll.parent = foo

del p
del pcoll
# create_pt is deliberately kept (its `del` is commented out), so the Foo stays
# reachable through create_pt.parent even after `del foo` below.
# del create_pt

gc.collect()
del foo

In [None]:
# Inspect the object still reachable through create_pt.parent: its name and
# its `removed` flag.
print(create_pt.parent.name)
print(create_pt.parent.removed)

In [None]:
class Foo:
    """Named marker object with a data dict and a finalizer announcement."""

    def __init__(self, name):
        """Remember ``name``, start as not removed, and create empty data."""
        self.data = {}
        self.removed = False
        self.name = name

    def __del__(self):
        """Print a removal notice and flag the instance as removed."""
        notice = "Removing Foo with name '{}.'".format(self.name)
        print(notice)
        self.removed = True

    def writer(self):
        """Return a ``Bar`` built from this instance's ``data`` dict."""
        shared_data = self.data
        return Bar(shared_data)


class Bar:
    """Minimal holder that keeps a reference to the data it is given."""

    def __init__(self, data):
        """Store ``data`` as-is (no copy is made)."""
        self.data = data


# Experiment: like attaching the Foo directly, except the transform gets
# foo.writer() - a Bar holding foo.data - rather than the Foo itself. The
# pipeline and PCollection still reference the Foo directly.
foo = Foo(uuid.uuid4().hex)
print("Foo name: '{}'.".format(foo.name))

with beam.Pipeline(options=options) as p:
    p.parent = foo
    create_pt = beam.Create(list(range(10)))
    create_pt.parent = foo.writer()
    pcoll = (
        p
        | create_pt
        # print() returns None, so this prints each element and passes it on.
        | beam.Map(lambda e: print(e) or e)
        | beam.io.WriteToText(os.path.join("/tmp", "pipeline-gc-test2"))
    )
    pcoll.parent = foo

del p
del pcoll
# create_pt is deliberately kept (its `del` is commented out); here its
# .parent is the Bar, which references foo.data but not the Foo.
# del create_pt

gc.collect()
del foo

In [None]:
# Pause for five seconds - presumably to give output from the previous cell
# time to appear; TODO confirm.
time.sleep(5)