## Summary

## Install dependencies

In [None]:
# Probe for the Colab runtime: the `google.colab` package is importable only there.
try:
    import google.colab
except ImportError:
    GOOGLE_COLAB = False
else:
    GOOGLE_COLAB = True

In [None]:
# Runs only when the Colab import probe in the earlier cell set GOOGLE_COLAB to True.
if GOOGLE_COLAB:
    # IPython shell escape: installs apache_beam with the `gcp` extra from the `develop`
    # branch of the ostrokach/beam repository (package located under `sdks/python`).
    !pip install "git+https://github.com/ostrokach/beam.git@develop#egg=apache_beam[gcp]&subdirectory=sdks/python"

## Imports

In [None]:
import itertools
import logging
import tempfile

import apache_beam as beam
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import (PipelineOptions,
                                                  StandardOptions)
from apache_beam.runners.interactive.cache_manager import CacheManager
from matplotlib import pyplot as plt

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Parameters

In [None]:
# Name of this notebook; not referenced by any other cell visible here.
NOTEBOOK_NAME = "demo_pipeline_cloning"

In [None]:
# Fresh scratch directory; handed to the pipeline options below as `temp_location`.
temp_dir = tempfile.mkdtemp(
    prefix="beam-temp-",
)

In [None]:
# Pipeline options shared by the pipeline and the cache manager created below.
options = PipelineOptions(
    runner="BundleBasedDirectRunner",
    temp_location=temp_dir,
    interactive=True,
)
# Show the resolved option values for reference.
print(options.display_data())

In [None]:
# Module-level cache manager; `persist` asks it for a new cache via
# `create_cache_from_defaults()` each time a PCollection is materialized.
cache_manager = CacheManager(options)

## Functions

In [None]:
class MaterializedPCollection(beam.pvalue.PCollection):
    """A PCollection paired with the cache that holds its materialized elements.

    Attributes are those of ``beam.pvalue.PCollection`` plus ``_cache``, the
    cache object used by ``read`` and cleared when this object is destroyed.
    """

    # TODO: Ideally, would want to be consistent with the PCollection API.
    def __init__(self, pipeline, tag=None, element_type=None, windowing=None, cache=None):
        """Initialize the PCollection and remember its backing cache.

        Args:
            pipeline: Forwarded to ``PCollection.__init__``.
            tag: Forwarded to ``PCollection.__init__``.
            element_type: Forwarded to ``PCollection.__init__``.
            windowing: Forwarded to ``PCollection.__init__``.
            cache: Object providing ``read(**kwargs)`` and ``clear()``. May be
                ``None``, in which case ``read`` is unusable.
        """
        super(MaterializedPCollection, self).__init__(
            pipeline, tag=tag, element_type=element_type, windowing=windowing)
        self._cache = cache

    def read(self, **reader_kwargs):
        """Return whatever the backing cache's ``read`` returns.

        Args:
            **reader_kwargs: Passed through unchanged to ``cache.read``.
        """
        return self._cache.read(**reader_kwargs)

    def __del__(self):
        # Clear the underlying cache when there are no more references to this object.
        # `__del__` also runs for objects whose `__init__` raised before `_cache` was
        # assigned, and `cache` defaults to None, so guard both cases instead of
        # letting the finalizer fail with AttributeError.
        cache = getattr(self, "_cache", None)
        if cache is not None:
            cache.clear()

In [None]:
def persist(pcoll):
    """Materialize PCollection.

    The pipeline owning ``pcoll`` is extended with a "Write to cache" step and
    run to completion. A new pipeline that starts with a "Read from cache"
    step is then created, and the returned object is attached to it.

    Args:
        pcoll: The PCollection to materialize.

    Returns:
        MaterializedPCollection: An object which can be used to access the
            materialized PCollection. If ``pcoll`` is already a
            ``MaterializedPCollection`` it is returned unchanged.

    Raises:
        ValueError: If ``pcoll`` has no producer.
    """
    if isinstance(pcoll, MaterializedPCollection):
        logging.info("The provided PCollection has already been materialized.")
        return pcoll

    # Validate before creating a cache or running the pipeline, and with an
    # explicit exception so the check is not stripped under `python -O`.
    if not pcoll.producer:
        raise ValueError("Cannot persist a PCollection that has no producer.")

    cache = cache_manager.create_cache_from_defaults()

    pcoll_to_cache = (
        pcoll | "Write to cache" >> cache.writer()
    )
    # TODO: Get this working with `test_runner_api=True`
    pcoll_to_cache.pipeline.run(test_runner_api=False).wait_until_finish()

    # Build a separate pipeline, with the same runner and options, whose
    # source is the cache that was just written.
    new_pipeline = (
        beam.Pipeline(
            runner=pcoll_to_cache.pipeline.runner,
            options=pcoll_to_cache.pipeline._options)
        | "Read from cache" >> cache.reader()
    ).pipeline
    materialized_pcoll = MaterializedPCollection(
        pipeline=new_pipeline, tag=pcoll.tag, element_type=pcoll.element_type,
        windowing=pcoll.windowing, cache=cache)

    # Keep the original producer on the materialized handle.
    materialized_pcoll.producer = pcoll.producer

    return materialized_pcoll

## Workflow

In [None]:
# Root pipeline for the demo, built from the options defined in the Parameters section.
p = beam.Pipeline(options=options)

### Populate input cache

In [None]:
# Source PCollection holding the integers 0..9.
input_ = p | beam.Create(range(10))

In [None]:
# Rebind `input_` to its materialized form so `.read()` is available below.
input_ = persist(input_)

In [None]:
# Preview at most the first five elements returned by the cache reader.
for i in itertools.islice(input_.read(), 5):
    print(i)

### Squares

In [None]:
# Derive the squares from the materialized input.
squares = input_ | 'Square' >> beam.Map(lambda x: x*x)

In [None]:
# Materialize the squares; `persist` runs the owning pipeline and waits for it to finish.
squares = persist(squares)

In [None]:
# Run the root pipeline `p`. NOTE(review): unlike `persist`, this does not call
# `wait_until_finish()` on the result — confirm that is intended.
result = p.run()

In [None]:
# Preview at most the first five materialized squares.
for i in itertools.islice(squares.read(), 5):
    print(i)

In [None]:
cubes = persist(cubes)

### Cubes

In [None]:
cubes = input_ | 'Cube' >> beam.Map(lambda x: x**3)

In [None]:
# Run the root pipeline `p` again. NOTE(review): the result is not waited on
# (no `wait_until_finish()`), unlike the run inside `persist` — confirm intended.
result = p.run()

In [None]:
# Preview at most the first five cubes. `.read()` is defined on
# MaterializedPCollection, so `cubes` must be the object returned by `persist`.
for i in itertools.islice(cubes.read(), 5):
    print(i)

### Plot

In [None]:
# Sort all three series so that position k in each list refers to the same
# input value. The order in which `read()` yields elements is not assumed;
# squaring and cubing are monotonic on the non-negative inputs used here, so
# sorting each list independently keeps x and y correctly paired.
init_list = sorted(input_.read())
squares_list = sorted(squares.read())
cubes_list = sorted(cubes.read())

# scatter() pairs x and y by position, hence the consistent ordering above.
plt.scatter(init_list, squares_list, label='squares', color='red')
plt.scatter(init_list, cubes_list, label='cubes', color='blue')
plt.legend(loc='upper left')
plt.show()