<a href="https://colab.research.google.com/github/ostrokach/beam-notebooks/blob/master/notebooks/beam_lazyevaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies

In [1]:
!pip install "git+https://github.com/ostrokach/beam.git@feature/filebasedcache#egg=apache_beam[gcp]&subdirectory=sdks/python"[gcp]



In [2]:
# Install the snappy development package via apt (-y: assume yes, -q: quiet output).
!sudo apt-get install libsnappy-dev -y -q

Reading package lists...
Building dependency tree...
Reading state information...
libsnappy-dev is already the newest version (1.1.7-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [3]:
# Install the python-snappy package from PyPI.
!pip install python-snappy



## Imports

In [0]:
import copy
import itertools
import logging
import tempfile
import uuid

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions
from apache_beam.io.filesystems import FileSystems
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive import caching

In [0]:
# Configure the root logger to emit records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [6]:
# Smoke test: an INFO record should show up in the cell output.
logging.info("hello")

INFO:root:hello


## Functions

In [0]:
class CustomRunner(BundleBasedDirectRunner):
    """BundleBasedDirectRunner with a helper that applies transforms to a copy of the input."""

    def patched_apply(self, transform, pvalueish, options):
        """Apply `transform` to a deep copy of `pvalueish` using the parent runner's `apply`.

        Args:
            transform: The transform to apply.
            pvalueish: The input value(s); deep-copied before being handed to the parent runner,
                so the caller's object is not the one passed on.
            options: Pipeline options forwarded unchanged to the parent runner.

        Returns:
            Whatever the parent runner's `apply` returns for the copied input.
        """
        # We should probably avoid deepcopy and copy the underlying pipeline explicitly, possibly
        # through serialization -> deserialization.
        pvalueish_copy = copy.deepcopy(pvalueish)
        return super(CustomRunner, self).apply(transform, pvalueish_copy, options)

In [0]:
class MaterializedPCollection(beam.pvalue.PCollection):
    """A PCollection whose elements are also available from a backing cache.

    The cache object is expected to provide `read(**kwargs)` and `clear()`; those are the only
    two methods this class calls on it.
    """

    # TODO: Ideally, would want to be consistent with the PCollection API.
    def __init__(self, pipeline, cache, tag=None, element_type=None, windowing=None):
        """Create the PCollection and remember the cache that holds its elements.

        Args:
            pipeline: Forwarded to `PCollection.__init__`.
            cache: Cache object backing this PCollection.
            tag: Forwarded to `PCollection.__init__`.
            element_type: Forwarded to `PCollection.__init__`.
            windowing: Forwarded to `PCollection.__init__`.
        """
        super(MaterializedPCollection, self).__init__(
            pipeline, tag=tag, element_type=element_type, windowing=windowing)
        self._cache = cache

    def read(self, **reader_kwargs):
        """Return the result of `cache.read(**reader_kwargs)` for the backing cache."""
        return self._cache.read(**reader_kwargs)

    def __del__(self):
        # Clear the underlying cache when there are no more references to this object.
        # `_cache` is missing when `__init__` failed before assigning it; the finalizer still
        # runs in that case, so look the attribute up defensively instead of raising here.
        # NOTE(review): copies of this object (e.g. via copy.deepcopy) get their own finalizer;
        # confirm that clearing from a copy cannot drop data the original still relies on.
        cache = getattr(self, "_cache", None)
        if cache is not None:
            cache.clear()

In [0]:
def persist(pcoll):
    """Materialize PCollection.

    Writes `pcoll` to a text-based cache below `<temp_location>/cache/`, runs the pipeline
    that `pcoll` belongs to and blocks until it finishes, then returns an object backed by
    that cache.

    Args:
        pcoll: The PCollection to materialize.

    Returns:
        MaterializedPCollection: An object which can be used to access the materialized
            PCollection. If `pcoll` already is a MaterializedPCollection it is returned as is.

    Raises:
        ValueError: If the pipeline options do not define a `temp_location`.
    """
    if isinstance(pcoll, MaterializedPCollection):
        logging.info("The provided PCollection has already been materialized.")
        return pcoll

    temp_location = pcoll.pipeline._options.view_as(GoogleCloudOptions).temp_location
    if not temp_location:
        raise ValueError(
            "Cannot persist a PCollection: the pipeline options do not set 'temp_location'.")

    def _new_cache_location():
        # A random hex name keeps separate persist() calls from sharing a cache prefix.
        return FileSystems.join(temp_location, "cache", uuid.uuid4().hex)

    cache_location = _new_cache_location()
    # FileSystems.match returns one result per pattern; a non-empty `metadata_list` on the first
    # (and only) result means something already exists under this prefix, so draw a new name.
    while FileSystems.match([cache_location + "*"], limits=[1])[0].metadata_list:
        cache_location = _new_cache_location()

    cache = caching.TextBasedCache(location=cache_location)

    pcoll_to_cache = (
        pcoll | "Write to cache" >> cache.writer()
    )
    # Execute the pipeline now so that the cache is populated before returning.
    pcoll_to_cache.pipeline.run().wait_until_finish()

    # NOTE(review): `new_pipeline` is bound to the result of `Pipeline | transform`, i.e. the
    # output of the read step rather than the Pipeline object itself. Confirm that this is the
    # intended value for the `pipeline=` argument below; it may be why applying further
    # transforms to the returned object fails.
    new_pipeline = (
        beam.Pipeline(
            runner=pcoll_to_cache.pipeline.runner,
            options=pcoll_to_cache.pipeline._options)
        | "Read from cache" >> cache.reader()
    )
    materialized_pcoll = MaterializedPCollection(
        pipeline=new_pipeline, cache=cache, tag=pcoll.tag, element_type=pcoll.element_type,
        windowing=pcoll.windowing)

    return materialized_pcoll

## Demo

### Download data

In [0]:
# Download the input text quietly (-q) and store it at /content/winterstale.txt (-O).
!wget https://storage.googleapis.com/strokach/inputs/winterstale.txt -q -O /content/winterstale.txt

### Setup

In [44]:
# Create a fresh temporary directory; it is passed to the pipeline options as temp_location.
temp_location = tempfile.mkdtemp()
temp_location

'/tmp/tmpPdnbVl'

In [45]:
# Build pipeline options that set only temp_location, then display them for inspection.
options = PipelineOptions(temp_location=temp_location)
options.display_data()



{'temp_location': '/tmp/tmpPdnbVl'}

In [0]:
# Instantiate the runner subclass defined in the Functions section.
runner = CustomRunner()

In [0]:
# The pipeline that the demo transforms below are attached to.
pipeline = beam.Pipeline(runner=runner, options=options)

### Analyze Shakespeare

In [0]:
# Attach a text-file read step, labelled "Read", for the downloaded file.
raw_text = (
    pipeline | "Read" >> beam.io.ReadFromText("/content/winterstale.txt")
)

In [49]:
# Materialize the text. Re-running this cell does not redo the work: persist() returns its
# argument unchanged when it is already a MaterializedPCollection.
raw_text = persist(raw_text)

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 5 (skipped: 0), batches: 5, num_threads: 5
INFO:root:Renamed 5 shards in 0.12 seconds.


In [50]:
# Peek at the first 10 elements returned by the materialized PCollection's read().
list(itertools.islice(raw_text.read(), 10))

[u'\tWill bear up with this exercise, so long',
 u'\tI daily vow to use it. Come and lead me',
 u'\tUnto these sorrows.',
 u'',
 u'\t[Exeunt]',
 u'',
 u'',
 u'',
 u'',
 u"\tTHE WINTER'S TALE"]

In [56]:
# Strip each line and drop the ones that are empty after stripping.
# NOTE(review): the recorded output of this cell is a TypeError; the result is also not
# assigned to a name, so nothing downstream can use it.
raw_text | ("Remove empty lines" >> beam.FlatMap(lambda e: [e.strip()] if e.strip() else []))

TypeError: ignored

In [58]:

# Inspect the label that `"name" >> transform` attaches to the transform.
# (The expression previously ended in a dangling `.`, which is a syntax error.)
("hello" >> beam.FlatMap(lambda e: [e.strip()] if e.strip() else [])).label

'hello'

In [54]:
# Scratch check: str.join raises TypeError when an item is not a string.
"/".join([1])

TypeError: ignored

In [77]:
# NOTE(review): unfinished cell. persist() requires a `pcoll` argument and none is passed
# here; presumably the cleaned (non-empty, stripped) lines were meant to go in. Confirm.
cleaned_text = persist(
    
)

NameError: ignored

In [40]:
# Scratch check: length of a 32-character hex string (presumably a uuid4().hex value like
# the ones persist() uses for cache directory names).
len("3dd6568b81ee4ac1b8497def164e9f9c")

32

In [44]:
# Scratch check: inspect the name generated for a PCollection constructed without a pipeline.
beam.pvalue.PCollection(None)._unique_name()

'PCollection139861118106640'