## Install dependencies

In [None]:
# Flag whether the `google.colab` package can be imported; later cells use it
# to decide whether to install dependencies.
try:
    import google.colab  # noqa: F401
except ImportError:
    GOOGLE_COLAB = False
else:
    GOOGLE_COLAB = True

In [None]:
# On Colab only: install the snappy development package via apt
# (presumably needed to build `python-snappy`, which is pip-installed in the
# next cell -- confirm).
if GOOGLE_COLAB:
    !sudo apt-get -yqq install libsnappy-dev

In [None]:
# On Colab only: install extra Python packages quietly (`-q`). Faker is used
# further down to generate the dataset; bokeh is upgraded (`-U`) to the
# latest release.
if GOOGLE_COLAB:
    !pip install -q python-snappy Faker pyproj
    !pip install -q -U bokeh

In [None]:
# On Colab only: install Apache Beam (with the `gcp` extra) from the
# `feature/streambasedcache` branch of the ostrokach/beam fork instead of a
# released version.
if GOOGLE_COLAB:
    !pip install "git+https://github.com/ostrokach/beam.git@feature/streambasedcache#egg=apache_beam[gcp]&subdirectory=sdks/python"

## Imports

In [None]:
import copy
import itertools
import logging
import os
import pickle
import tempfile
import uuid

import apache_beam as beam
import numpy as np
import tqdm
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions)
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive import caching

In [None]:
# logging.basicConfig(level=logging.INFO)

In [None]:
# logging.info("hello")

In [None]:
# Enable IPython's autoreload extension (mode 2) so that edited modules are
# re-imported before each cell execution.
try:
    %load_ext autoreload
    %autoreload 2
except ModuleNotFoundError:
    # NOTE(review): ModuleNotFoundError is Python 3.6+ only; ImportError would
    # also cover older interpreters -- confirm which versions must be supported.
    print("Not using autoreload")

## Functions

In [None]:
class MaterializedPCollection(beam.pvalue.PCollection):
    """A PCollection that carries the cache holding its materialized elements.

    Args:
        pipeline: Pipeline the PCollection belongs to.
        tag: Forwarded to ``PCollection.__init__``.
        element_type: Forwarded to ``PCollection.__init__``.
        windowing: Forwarded to ``PCollection.__init__``.
        cache: Cache object exposing ``read(**kwargs)`` and ``clear()``.
    """

    # TODO: Ideally, would want to be consistent with the PCollection API.
    def __init__(self, pipeline, tag=None, element_type=None, windowing=None, cache=None):
        super(MaterializedPCollection, self).__init__(
            pipeline, tag=tag, element_type=element_type, windowing=windowing)
        self._cache = cache

    def read(self, **reader_kwargs):
        """Return whatever the underlying cache's ``read`` returns.

        Args:
            **reader_kwargs: Passed through to ``cache.read``.
        """
        return self._cache.read(**reader_kwargs)

    def __del__(self):
        # Clear the underlying cache when there are no more references to this object.
        # `_cache` may be missing (if `__init__` raised before assigning it) or
        # None (the constructor default); neither case may raise from a finalizer.
        cache = getattr(self, "_cache", None)
        if cache is not None:
            cache.clear()

In [None]:
def persist(pcoll):
    """Materialize a PCollection by running its pipeline and caching the output.

    The elements of ``pcoll`` are written to a new TFRecord-based cache located
    at ``<temp_location>/cache/<random hex>``, where ``temp_location`` is read
    from the pipeline's ``GoogleCloudOptions``. The pipeline is run and waited
    on before this function returns.

    Args:
        pcoll: The PCollection to materialize.

    Returns:
        MaterializedPCollection: An object which can be used to access the
            materialized PCollection. If ``pcoll`` is already a
            ``MaterializedPCollection``, it is returned unchanged.

    Raises:
        ValueError: If the pipeline options do not define ``temp_location``.
        AssertionError: If ``pcoll`` has no producer.
    """
    if isinstance(pcoll, MaterializedPCollection):
        logging.info("The provided PCollection has already been materialized.")
        return pcoll

    temp_location = pcoll.pipeline._options.view_as(GoogleCloudOptions).temp_location
    if temp_location is None:
        raise ValueError(
            "Cannot persist a PCollection: the pipeline options do not set "
            "`temp_location`.")

    # Pick a random cache prefix and retry until nothing matches it.
    # `FileSystems.match` returns one result per pattern; a non-empty
    # `metadata_list` means the prefix is already in use.
    cache_location = FileSystems.join(temp_location, "cache", uuid.uuid4().hex)
    while FileSystems.match([cache_location + "*"], limits=[1])[0].metadata_list:
        cache_location = FileSystems.join(temp_location, "cache", uuid.uuid4().hex)

    cache = caching.TFRecordBasedCache(location=cache_location)

    pcoll_to_cache = (
        pcoll | "Write to cache" >> cache.writer()
    )
    # TODO: Get this working with `test_runner_api=True`
    pcoll_to_cache.pipeline.run(test_runner_api=False).wait_until_finish()

    # Build a new pipeline (same runner and options) that starts by reading
    # the cache; only the pipeline object itself is kept.
    new_pipeline = (
        beam.Pipeline(
            runner=pcoll_to_cache.pipeline.runner,
            options=pcoll_to_cache.pipeline._options)
        | "Read from cache" >> cache.reader()
    ).pipeline
    materialized_pcoll = MaterializedPCollection(
        pipeline=new_pipeline, tag=pcoll.tag, element_type=pcoll.element_type,
        windowing=pcoll.windowing, cache=cache)

    # Carry over the producer of the original PCollection.
    assert pcoll.producer
    materialized_pcoll.producer = pcoll.producer

    return materialized_pcoll

## Options

In [None]:
# Directory (relative to the working directory) where this notebook keeps its
# local artifacts, e.g. the pickled dataset.
NOTEBOOK_NAME = "session_api_batch"
try:
    os.mkdir(NOTEBOOK_NAME)
except OSError:
    # Best effort: any OSError is ignored here, including "already exists".
    pass

In [None]:
#@title Google Cloud Project Info { display-mode: "form" }
# GCP project id and GCS scratch location that are passed to PipelineOptions
# in the next cell; replace the defaults with your own project and bucket.
project_id = "strokach-playground" #@param {type:"string"}
gcs_temp_location = "gs://strokach/dataflow_temp" #@param {type:"string"}

In [None]:
# Pipeline options shared by every pipeline built in this notebook.
options = PipelineOptions(
    project=project_id,
    temp_location=gcs_temp_location,
    interactive=True,
)
# Last expression of the cell, so the notebook displays the resulting options.
options.display_data()

In [None]:
# Second attempt at enabling IPython's autoreload extension; unlike the
# earlier cell this one ignores any Exception raised while loading it.
try:
    %load_ext autoreload
    %autoreload 2
except Exception:
    print("No autoreload")

In [None]:
# Local scratch space: a shared root, a fresh per-run directory inside it, and
# a `cache` sub-directory used for the input cache created further down.
temp_root = "/tmp/beam-dev"
try:
    os.makedirs(temp_root)
except OSError:
    # Best effort: any OSError is ignored, including "already exists".
    pass

# mkdtemp creates a new, uniquely named directory under temp_root.
temp_location = tempfile.mkdtemp(dir=temp_root)

cache_location = FileSystems.join(temp_location, "cache")
try:
    os.makedirs(cache_location)
except OSError:
    pass

# Last expression of the cell: display the three paths.
temp_root, temp_location, cache_location

In [None]:
# Runner used for every pipeline in this notebook.
runner = BundleBasedDirectRunner()

In [None]:
# The pipeline that the cells below attach their transforms to.
p = beam.Pipeline(runner=runner, options=options)

## Generate a dataset

Create a dataset of 10,000 fake people profiles.

In [None]:
# Reuse the pickled dataset from a previous run when it can be opened;
# otherwise generate 10,000 profiles and pickle them for next time.
fake_people_path = os.path.join(NOTEBOOK_NAME, "fake_people.pkl")
try:
    with open(fake_people_path, "rb") as fin:
        fake_people = pickle.load(fin)
except IOError:
    # `Faker` is not imported with the other modules at the top of the
    # notebook, so bring it into scope where it is needed.
    from faker import Faker

    # NOTE(review): confirm that passing `seed=` to the constructor actually
    # seeds the generator in the installed Faker version.
    fake = Faker(seed=42)
    fake_people = [fake.profile(fields=None, sex=None) for _ in tqdm.tqdm_notebook(range(10000))]
    with open(fake_people_path, "wb") as fout:
        pickle.dump(fake_people, fout)

In [None]:
# Display the first generated profile.
fake_people[:1]

## Create a PCollection cache

Convert the generated dataset into a `PCollection` which can be accessed from within a Beam pipeline.

In [None]:
# Write the in-memory dataset to a text-based cache under `cache_location`;
# `input_cache.reader()` is then used as the source of the pipelines below.
# NOTE(review): mode="overwrite" presumably replaces an existing cache at the
# same location -- confirm against the caching module.
input_cache = caching.SafeTextBasedCache(FileSystems.join(cache_location, "fake_people"), mode="overwrite")
input_cache.write(fake_people)

In [None]:
# Attach a "count all elements" step to the pipeline. Nothing is executed
# here: `counts` is a PCollection that has not been materialized.
counts = (
    p
    | input_cache.reader()
    | beam.combiners.Count.Globally()
)

In [None]:
# Display the (not yet materialized) PCollection object.
counts

In [None]:
# Same counting pipeline, but passed through `persist`, which runs the
# pipeline and returns a MaterializedPCollection backed by a cache.
counts = persist(
    p
    | input_cache.reader()
    | beam.combiners.Count.Globally()
)

# Take the first item from the cache reader, i.e. the global count.
next(counts.read())

In [None]:
# Rebuild the counting step without persisting it; this rebinds `counts` to a
# PCollection that has not been materialized.
counts = (
    p
    | input_cache.reader()
    | beam.combiners.Count.Globally()
)

In [None]:
# Display the rebuilt PCollection object.
counts

In [None]:
# NOTE(review): `a` and `b` are not defined in the visible notebook and
# `persist` accepts a single PCollection, so this cell cannot run as written.
# It looks like a sketch of a multi-argument `persist` API -- confirm or remove.
a, b = persist(a, b)

In [None]:
# NOTE(review): this line is not valid Python (`cou ts`) and `persisit` is not
# defined in the visible notebook; presumably an unfinished draft built around
# `persist(counts)` -- confirm the intent or remove the cell.
count1, counts2 = persisit(cou ts )

In [None]:
# NOTE(review): `read` is defined on MaterializedPCollection. If the cells are
# run in order, `counts` was last rebound to a plain (non-persisted)
# PCollection, so this call presumably fails -- confirm.
next(counts.read())

## Basic transformations

### Add an age column

In [None]:
def calculate_age(birthdate):
    """Return the age in completed years, as of today, for ``birthdate``.

    Args:
        birthdate: Date of birth; must expose ``year``, ``month`` and ``day``
            (e.g. a ``datetime.date``).

    Returns:
        int: Number of birthdays reached up to and including today. Negative
            for birthdates in the future.
    """
    # Kept as a function-local import, as in the original -- presumably so the
    # function is self-contained when it is shipped to workers.
    import datetime

    today = datetime.date.today()
    # Compare calendar dates instead of dividing elapsed days by 365.25, which
    # is off by one around birthdays (exactly one year ago is 365 days -> 0).
    birthday_reached = (today.month, today.day) >= (birthdate.month, birthdate.day)
    return today.year - birthdate.year - (0 if birthday_reached else 1)

def add_age_colum(element):
    """Return a copy of ``element`` with an ``"age"`` entry added.

    Args:
        element: Mapping with a ``"birthdate"`` entry accepted by
            ``calculate_age``.

    Returns:
        A shallow copy of ``element`` with ``"age"`` set to the computed age.
    """
    # Work on a copy: functions applied with `beam.Map` should not modify the
    # element they are given.
    element = element.copy()
    element["age"] = calculate_age(element["birthdate"])
    return element

In [None]:
# Add an "age" entry to every profile and materialize the result so that it
# can be read back and reused by the cells below.
people_with_age = persist(
    p
    | input_cache.reader()
#     | beam.combiners.Sample.FixedSizeGlobally(5)
#     | beam.FlatMap(lambda e_lst: [e for e in e_lst])
    | beam.Map(add_age_colum)
)

# Take the first cached profile as a quick check.
next(people_with_age.read())

## Chain transformations

Select the 10 most popular occupations among people older than 30.

In [None]:
# Keep profiles with age > 30, key each one by its "job", count profiles per
# job, then keep the 10 (job, count) pairs with the largest counts.
most_popular_occupations = persist(
    people_with_age
    | beam.Filter(lambda e: e["age"] > 30)
    | beam.Map(lambda e: (e["job"], e))
    | beam.combiners.Count.PerKey()
    | beam.combiners.Top.Of(10, key=lambda x: x[1])
)

In [None]:
# Read everything stored in the cache for the result above.
list(most_popular_occupations.read())

## Store for future use

In [None]:
# Take the first cached profile from the materialized PCollection.
next(people_with_age.read())

In [None]:
# Folder under this run's temp directory that receives the JSON output files.
output_folder = FileSystems.join(temp_location, "json_output")
try:
    os.makedirs(output_folder)
except OSError:
    # Best effort: any OSError is ignored, including "already exists".
    pass


def remove_nonserializable(element):
    """Return a copy of ``element`` prepared for JSON serialization.

    The ``"birthdate"`` value is replaced by its ``isoformat()`` string and the
    ``"current_location"`` entry is dropped. The input is left untouched.

    Raises:
        KeyError: If ``"birthdate"`` or ``"current_location"`` is missing.
    """
    sanitized = element.copy()
    sanitized.update(birthdate=sanitized["birthdate"].isoformat())
    # `pop` without a default raises KeyError when the key is absent.
    sanitized.pop("current_location")
    return sanitized


def to_json_string(element):
    """Serialize ``element`` to a JSON string with ``json.dumps`` defaults."""
    # Function-local import, as in the original -- presumably so the function
    # is self-contained when it is shipped to workers.
    from json import dumps

    return dumps(element)


# Sample 5 profiles, flatten the sampled list back into individual elements,
# make them JSON-serializable, and write them as text files with a `.json`
# suffix into `output_folder`.
output = (
    people_with_age
    | beam.combiners.Sample.FixedSizeGlobally(5)
    | beam.FlatMap(lambda e_lst: [e for e in e_lst])
    | beam.Map(remove_nonserializable)
    | beam.Map(to_json_string)
    | beam.io.WriteToText(
        FileSystems.join(output_folder, "output-data"), file_name_suffix=".json"
    )
)
# Run the pipeline and block until it has finished.
output.pipeline.run(test_runner_api=False).wait_until_finish()

## Convert to DataFrame