# Overview



# Imports

In [None]:
import functools
import itertools
import multiprocessing
import os.path as op
import queue

import apache_beam as beam
import matplotlib.pyplot as plt
import pandas as pd
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive.cache_manager import (
    FileBasedCacheManager, ReadCache, WriteCache)
from apache_beam.runners.interactive.interactive_runner import \
    InteractiveRunner
from apache_beam.runners.portability.fn_api_runner import FnApiRunner

try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path

In [None]:
%matplotlib inline

# Parameters

In [None]:
PROJECT_ID = "strokach-playground"

In [None]:
NOTEBOOK_NAME = "interactive-beam-demo"
NOTEBOOK_PATH = Path(NOTEBOOK_NAME).resolve()
NOTEBOOK_PATH.mkdir(exist_ok=True)

NOTEBOOK_NAME

In [None]:
class InteractiveSession(object):
  """Notebook-displayable object that renders a link to a Dataflow job.

  Args:
    job_id: Identifier of the Dataflow job the link points to. The default
      is the job id that this demo previously hard-coded.
    location: Region segment of the dashboard URL.
  """

  # Template for the job-details page; filled in by ``_repr_html_``.
  _DASHBOARD_URL = (
      'https://pantheon.corp.google.com/dataflow/jobsDetail/'
      'locations/{location}/'
      'jobs/{job_id}'
  )

  def __init__(
      self,
      job_id='2019-05-30_14_39_33-6947614184630253675',
      location='us-central1'):
    self.job_id = job_id
    self.location = location

  def _repr_html_(self):
    """Return the HTML paragraph containing the "Dashboard" link."""
    url = self._DASHBOARD_URL.format(
        location=self.location, job_id=self.job_id)
    return (
        '<p style="padding-left: 1%; padding-right: 1%">'
        '<a href="{url}">Dashboard</a>'
        '</p>'
    ).format(url=url)


In [None]:
InteractiveSession()

In [None]:
class Foo:
    """Minimal context manager that records whether its block is active."""

    def __init__(self):
        # Define the flag up front so it can be read before the first
        # ``__enter__`` call without raising AttributeError.
        self._in_context = False

    def __enter__(self):
        """Mark the context as active and return this instance."""
        self._in_context = True
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Mark the context as inactive; exceptions are not suppressed."""
        self._in_context = False
        print("Exiting...!")

In [None]:
foo = Foo()
foo.__enter__()

raise Exception

In [None]:
foo._in_context

In [None]:
b"  ¡ ¢ £ ¤ ¥ ¦ § ¨ © ª « ¬ ­ ® ¯ ° ± ² ³ ´ µ ¶ · ¸ ¹ º » ¼ ".replace(b" ", b"")

In [None]:
import numpy as np

In [None]:
from parameterized import parameterized_class

parameterized_class?

In [None]:
from contextlib import closing

In [None]:
with closing(np.random.randn(5,5)) as a:
    print(a)

In [None]:
runner = beam.runners.dataflow.DataflowRunner(
    # interactive=True,
    # timeout="30min"
)

In [None]:
options = beam.pipeline.PipelineOptions(
    project=PROJECT_ID,
    job_name=NOTEBOOK_NAME,
    temp_location="gs://strokach/dataflow_temp",
    staging_location="gs://strokach/dataflow_staging",
    sdk_location=op.expanduser(
        "~/workspace/beam/sdks/python/dist/apache-beam-2.14.0.dev0.tar.gz"
    ),
#     timeout=600,
)
options.display_data()

In [None]:
cache_manager = FileBasedCacheManager(
    cache_dir="gs://strokach/tmp"
)
# atexit.register(cache_manager.cleanup) <- 

# Setup

In [None]:
runner.start_session(options)

# Workflow

### Running a pipeline

In [None]:
# Read the input text from GCS and write it to the cache under the label
# "temp".
# NOTE(review): no runner or options are passed to ``beam.Pipeline`` here --
# confirm how this pipeline is meant to pick up the session started in Setup.
with beam.Pipeline() as p:  # display=...
    _ = (
        #
        p
        | "Read" >> beam.io.ReadFromText("gs://strokach/inputs/winterstale")
        | "Write" >> WriteCache(cache_manager, "temp")
    )

#### Notes

- `p.run()` should start the pipeline in the background, even in interactive mode. This makes sense for streaming, or when we want to start reading results as they appear.

- There should be an option to display results while the pipeline is running. However, we should display results when we call `result.wait_until_finish()`, not when we call `p.run()`.

- `result.wait_until_finish()` should not wait for the VM to shut down.

- `WriteCache` should take an extra argument `format`, which allows the user to specify the format that will be used for writing the cache file. It should also be possible to pass additional arguments such as `schema=...`, etc.

### Reading from cache

In [None]:
# The argument should be just "temp", and it should not return "version"
data, _ = cache_manager.read("full", "temp")  

In [None]:
# Put the first element of the cached data into a single-column frame, then
# derive a stripped copy of each string and the length of that stripped copy.
df = pd.DataFrame(data[0], columns=["string"])
df = df.assign(string_clean=df["string"].str.strip())
df = df.assign(string_length=df["string_clean"].str.len())

In [None]:
fg, ax = plt.subplots()

_ = ax.hist(df["string_length"])
ax.set_xlabel("String length")
ax.set_ylabel("Number of strings")

#### Notes

- When we read from a `CacheManager`, we should not have to specify "full" or "sample". If the user so desires, they can provide that information in the name that they give to their cache object.

- `CacheManager.read(...)` should not return the cache version by default. It should be assumed that we are always reading the latest cache object.

- `CacheManager.read(...)` should have a `limit=...` argument.

### Querying cache using Beam SQL, BigQuery, etc.

Until we have a Python frontend for Beam SQL, 

#### Notes

### Processing cached objects using DirectRunner

For simple types of queries that are likely to be limited by I/O, it may be helpful to explore the data using `DirectRunner`. Furthermore, we might want to store the output directly in memory instead of saving it to a file. Both the `BundleBasedDirectRunner` and the `FnApiRunner` run workers inside Python threads, so the results can simply be passed into a `queue.Queue()`. If `DirectRunner` ever switches to using `multiprocessing` ([BEAM-1442](https://issues.apache.org/jira/browse/BEAM-1442)), a more sophisticated way of transferring data between processes may be required. For example, PyTorch creates memory-mapped files, and passes file handles to those files between processes.

In [None]:
from apache_beam.typehints import typehints


In [None]:
typehints.Union[[str, int, int, str]]

In [None]:
isinstance("A", typehints.TypeConstraint)

In [None]:
q = queue.Queue()

In [None]:
def add_to_queue(element, queue):
    """Hand ``element`` to ``queue`` by calling its ``put`` method.

    Note that the ``queue`` parameter shadows the ``queue`` module inside
    this function; the name is kept because callers pass it by keyword.
    """
    enqueue = queue.put
    enqueue(element)

In [None]:
cache = Par

## Pipeline

In [None]:
typehints.Union[[int, int, str, None]]

In [None]:
{
    typehints.Union[int, str, str]: 'a'
}[typehints.Union[int, str, int]]

In [None]:
typehints.normalize(b"abc")

In [None]:
from past.builtins import unicode

In [None]:
import pyarrow as pa

In [None]:
pa.array([[1,2,3], [4,5,6], [1.3, 4.5]]).type

In [None]:
x.type

In [None]:
pa.int64()

In [None]:
  # Lookup from a Beam type hint to an Avro type name (presumably for building
  # Avro schemas -- confirm against the intended caller).  Only the ``int``
  # entry is active; the other candidates remain disabled.  The closing brace
  # is restored so the literal is terminated.
  _typehint_to_avro_type = {
      typehints.Union[[int]]: "int",
      # typehints.Union[int, None]: ["int", "null"],
      # typehints.Union[long]: "long",
      # typehints.Union[long, None]: ["long", "null"],
      # typehints.Union[float]: "double",
      # typehints.Union[float, None]: ["double", "null"],
      # typehints.Union[str]: "string",
      # typehints.Union[str, None]: ["string", "null"],
      # typehints.Union[unicode]: "string",
      # typehints.Union[unicode, None]: ["string", "null"],
      # typehints.Union[np.ndarray]: "bytes",
      # typehints.Union[np.ndarray, None]: ["bytes", "null"],
      # typehints.Union[array.array]: "bytes",
      # typehints.Union[array.array, None]: ["bytes", "null"],
  }

In [None]:
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter


In [None]:
# Schema for a "User" record in the "example.avro" namespace, written as a
# plain dict.  "name" is a required string; the two "favorite_*" fields are
# unions that also admit "null".
schema_dict = {
    "namespace": "example.avro",
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["int", "null"]},
        {"name": "favorite_color", "type": ["string", "null"]},
    ],
}

In [None]:
import json

In [None]:
avro.schema.parse(json.dumps(schema_dict))

In [None]:
from mock import MagicMock

In [None]:
writer = MagicMock()

In [None]:
writer(a=1, b=2)

In [None]:
list(writer.call_args)

In [None]:
cache.writer._sink.write_record.side_effect = Exception

In [None]:
cache.writer._sink.write_record(123)

In [None]:
m.reader.call_count

In [None]:
range(0)

In [None]:
from apache_beam.typehints import trivial_inference

In [None]:
trivial_inference.instance_to_type(

In [None]:
u"±♠Ωℑ".encode('utf-8')

In [None]:
b"±♠Ωℑ"

In [None]:
import numpy as np

In [None]:
import array

In [None]:
array1 = array.array('i', [10,20,30,40,50])
array1

In [None]:
# Collect the pipeline output in memory.  As described in the notes for this
# section, the runner executes its workers in Python threads, so they can push
# results straight into a queue owned by the notebook process.
q = queue.Queue()

with beam.Pipeline(runner=BundleBasedDirectRunner()) as p:
    _ = (
        p
        | "Read" >> ReadCache(cache_manager, "temp")
        # Only newlines, tabs and "|" are stripped from both ends of a line.
        | "Remove whitespace" >> beam.Map(lambda element: element.strip("\n\t|"))
        # ``Filter`` keeps an element when the predicate is truthy, which for
        # strings means non-empty; this replaces the hand-written FlatMap.
        | "Remove empty lines" >> beam.Filter(lambda element: element)
        | "Write" >> beam.Map(lambda element: add_to_queue(element, queue=q))
    )

In [None]:
p = beam.Pipeline(runner=BundleBasedDirectRunner())

In [None]:
p | "Read" >> ReadCache(cache_manager, "temp")

In [None]:
p | "ass" >> beam.Map(lambda element: element.strip("\n\t|"))

In [None]:
p | "ass" >> beam.Map(lambda element: element.strip("\n\t|"))

In [None]:
p.run()

In [None]:
p.cancel()

In [None]:
data = list(q.queue)

In [None]:
fg, ax = plt.subplots()

_ = ax.hist([len(s) for s in data])
ax.set_title("Number of strings after removing whitespace")
ax.set_xlabel("String length")
ax.set_ylabel("Number of strings")

#### Notes

- It may be useful to have a well-defined class for outputting results to an object in the main process. That way, if the implementation of `DirectRunner` changes, this way of accessing results could remain backwards-compatible.

In [None]:
# Read the same input without constructing a pipeline by driving the text
# source by hand.
# NOTE(review): this goes through the private ``_source`` attribute of the
# ``ReadFromText`` transform, which may change between Beam releases.
reader = beam.io.textio.ReadFromText("gs://strokach/inputs/winterstale")
# ``None, None`` presumably requests a tracker over the whole input -- confirm.
range_tracker = reader._source.get_range_tracker(None, None)
data = list(reader._source.read(range_tracker))

# Cleanup

In [None]:
cache_manager.cleanup()

In [None]:
# runner.__exit__()
# runner.end_session() (?)