<a href="https://colab.research.google.com/github/ostrokach/beam-notebooks/blob/master/feature/lazyevaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies

In [1]:
# Install the Beam fork with the lazy-evaluation branch; the `gcp` extra is requested through the `egg=` fragment.
!pip install "git+https://github.com/ostrokach/beam.git@feature/lazyevaluation#egg=apache_beam[gcp]&subdirectory=sdks/python"

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m


In [None]:
!sudo apt-get install libsnappy-dev -y -q

[sudo] password for strokach: 

In [3]:
!pip install python-snappy



## Imports

In [1]:
import copy
import itertools
import logging
import os
import tempfile
import uuid

import apache_beam as beam
import numpy as np
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive import caching

from faker import Faker

In [2]:
logging.basicConfig(level=logging.INFO)

In [3]:
logging.info("hello")

INFO:root:hello


In [4]:
%load_ext autoreload
%autoreload 2

## Functions

In [7]:
class MaterializedPCollection(beam.pvalue.PCollection):
    """A PCollection whose elements are backed by a cache object.

    The cache is expected to expose ``read(**kwargs)`` and ``clear()``; these
    are the only two cache methods this class calls.
    """

    # TODO: Ideally, would want to be consistent with the PCollection API.
    def __init__(self, pipeline, tag=None, element_type=None, windowing=None, cache=None):
        """Create the PCollection and remember the cache that backs it.

        Args:
            pipeline: Forwarded to the ``PCollection`` initializer.
            tag: Forwarded to the ``PCollection`` initializer.
            element_type: Forwarded to the ``PCollection`` initializer.
            windowing: Forwarded to the ``PCollection`` initializer.
            cache: Cache object holding the materialized elements (may be None).
        """
        super(MaterializedPCollection, self).__init__(
            pipeline, tag=tag, element_type=element_type, windowing=windowing)
        self._cache = cache

    def read(self, **reader_kwargs):
        """Read the materialized elements back from the cache.

        Args:
            **reader_kwargs: Passed through unchanged to ``cache.read``.

        Returns:
            Whatever ``cache.read`` returns.

        Raises:
            ValueError: If this object was created without a cache.
        """
        if self._cache is None:
            raise ValueError("This MaterializedPCollection has no cache to read from.")
        return self._cache.read(**reader_kwargs)

    def __del__(self):
        # Clear the underlying cache when there are no more references to this object.
        # `getattr` guards against a partially initialised instance (``__init__``
        # raised before ``_cache`` was bound) and against ``cache=None``.
        cache = getattr(self, "_cache", None)
        if cache is not None:
            cache.clear()

In [30]:
def persist(pcoll):
    """Materialize PCollection.

    The pipeline that owns ``pcoll`` is run immediately (this call blocks until
    it finishes) with an extra step that writes the elements to a uniquely
    named cache under ``<temp_location>/cache``. A new pipeline that reads from
    that cache is then created and wrapped in a ``MaterializedPCollection``.

    Args:
        pcoll: The PCollection to materialize.

    Returns:
        MaterializedPCollection: An object which can be used to access the
            materialized PCollection. If ``pcoll`` is already materialized it
            is returned unchanged.

    Raises:
        ValueError: If the pipeline options do not define ``temp_location``.
    """
    if isinstance(pcoll, MaterializedPCollection):
        logging.info("The provided PCollection has already been materialized.")
        return pcoll

    temp_location = pcoll.pipeline._options.view_as(GoogleCloudOptions).temp_location
    if not temp_location:
        raise ValueError(
            "Cannot persist a PCollection without a `temp_location` pipeline option.")

    def new_cache_location():
        # A random hex name keeps caches of different PCollections apart.
        return FileSystems.join(temp_location, "cache", uuid.uuid4().hex)

    cache_location = new_cache_location()
    # Draw a new name for as long as something already matches the chosen prefix.
    # `match` takes a list of patterns and returns one result per pattern, so the
    # first result's `metadata_list` tells us whether the prefix is taken.
    while FileSystems.match([cache_location + "*"], limits=[1])[0].metadata_list:
        cache_location = new_cache_location()

    cache = caching.TFRecordBasedCache(location=cache_location)

    pcoll_to_cache = (
        pcoll | "Write to cache" >> cache.writer()
    )
    # TODO: Get this working with `test_runner_api=True`
    pcoll_to_cache.pipeline.run(test_runner_api=False).wait_until_finish()

    # Later transforms hang off a fresh pipeline that starts from the cache, reusing
    # the original runner and options.
    new_pipeline = (
        beam.Pipeline(
            runner=pcoll_to_cache.pipeline.runner,
            options=pcoll_to_cache.pipeline._options)
        | "Read from cache" >> cache.reader()
    ).pipeline
    materialized_pcoll = MaterializedPCollection(
        pipeline=new_pipeline, tag=pcoll.tag, element_type=pcoll.element_type,
        windowing=pcoll.windowing, cache=cache)

    # Carry over the producer of the original PCollection.
    assert pcoll.producer
    materialized_pcoll.producer = pcoll.producer

    return materialized_pcoll

## Options

In [53]:
# Parent directory for all runs of this notebook; each run gets its own
# sub-directory from `mkdtemp` below.
temp_root = "/tmp/beam-dev"
# Create the directory only when it is missing, so that real failures
# (permissions, path is a regular file) surface instead of being swallowed.
if not os.path.isdir(temp_root):
    os.makedirs(temp_root)

temp_location = tempfile.mkdtemp(dir=temp_root)

# Directory that holds the input cache created later in the notebook.
cache_location = FileSystems.join(temp_location, "cache")
if not os.path.isdir(cache_location):
    os.makedirs(cache_location)

temp_root, temp_location, cache_location

('/tmp/beam-dev', '/tmp/beam-dev/tmpFtu4pa', '/tmp/beam-dev/tmpFtu4pa/cache')

In [54]:
# Options shared by the pipelines in this notebook; `persist` reads
# `temp_location` from them to decide where its cache files go.
options = PipelineOptions(temp_location=temp_location, interactive=True)
options.display_data()



{'interactive': True, 'temp_location': '/tmp/beam-dev/tmpFtu4pa'}

In [55]:
# Runner instance handed to the pipeline created in the next cell.
runner = BundleBasedDirectRunner()

In [56]:
# Root pipeline: later cells attach transforms to `p` and execute them via `persist`.
p = beam.Pipeline(runner=runner, options=options)

## Generate a dataset

Create a dataset of 10,000 fake people profiles.

In [13]:
# Generate the dataset here so that `fake_people` exists after a kernel restart
# (the following cells read it).
fake = Faker()
fake_people = [fake.profile(fields=None, sex=None) for _ in range(10000)]

In [14]:
fake_people[:1]

[{'address': u'3914 Amanda Ford Apt. 103\nEast Dustin, MO 63196',
  'birthdate': datetime.date(1926, 9, 26),
  'blood_group': 'O-',
  'company': u'Rhodes, Wood and Bailey',
  'current_location': (Decimal('54.9184985'), Decimal('-117.877115')),
  'job': 'Pension scheme manager',
  'mail': u'browndaniel@gmail.com',
  'name': u'Jennifer Smith',
  'residence': u'4211 Amanda Mountains\nTomport, CO 07160',
  'sex': 'F',
  'ssn': u'272-21-0647',
  'username': u'wfoley',
  'website': [u'http://haynes-robinson.com/', u'http://thomas.com/']}]

## Create a PCollection cache

Convert the generated dataset into a `PCollection` which can be accessed from within a Beam pipeline.

In [57]:
# Store the in-memory profiles in a cache under `cache_location`; pipelines
# read them back through `input_cache.reader()`.
# NOTE(review): `if_exists="overwrite"` presumably replaces a cache already
# present at this location - confirm against the `caching` module.
input_cache = caching.SafeTextBasedCache(FileSystems.join(cache_location, "fake_people"), if_exists="overwrite")
input_cache.write(fake_people)

In [58]:
# Count the cached profiles. `persist` runs the pipeline right away, so the
# result can be read back as soon as the call returns.
temp = persist(
    p
    | input_cache.reader()
    | beam.combiners.Count.Globally()
)

# `read()` yields the materialized elements; the count is the first one.
num_people = next(temp.read())
num_people

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


10000

## Basic transformations

### Add an age column

In [59]:
def calculate_age(birthdate, today=None):
    """Return the age in completed years for a given birth date.

    Args:
        birthdate: A ``datetime.date`` with the person's date of birth.
        today: Optional ``datetime.date`` to measure the age against.
            Defaults to the current date.

    Returns:
        int: Number of birthdays that have passed up to and including ``today``.
    """
    # Imported locally; `datetime` is not imported at notebook level.
    # NOTE(review): presumably also keeps the function self-contained when Beam
    # serializes it - confirm.
    import datetime

    if today is None:
        today = datetime.date.today()
    # Compare (month, day) instead of dividing days by 365.25, which reports
    # one year too few on the birthday itself (e.g. 365 days -> 0).
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - int(birthday_pending)


def add_age_colum(element):
    """Return a copy of ``element`` with an added ``"age"`` entry.

    Args:
        element: Dict-like profile containing a ``"birthdate"`` date.

    Returns:
        dict: Shallow copy of ``element`` plus ``"age"`` in completed years.
    """
    # Work on a copy so the input element is left untouched, matching what
    # `remove_nonserializable` does later in this notebook.
    with_age = dict(element)
    with_age["age"] = calculate_age(element["birthdate"])
    return with_age

In [60]:
# Add the age to every profile and materialize the result.
people_with_age = persist(
    p
    | input_cache.reader()
    # Uncomment the next two transforms to run on a 5-element sample instead.
#     | beam.combiners.Sample.FixedSizeGlobally(5)
#     | beam.FlatMap(lambda e_lst: [e for e in e_lst])
    | beam.Map(add_age_colum)
)

# Peek at the first materialized record.
next(people_with_age.read())

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 10 (skipped: 0), batches: 10, num_threads: 10
INFO:root:Renamed 10 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


{'address': u'1920 James Row\nNew Jodi, IL 35246',
 'age': 4,
 'birthdate': datetime.date(2014, 9, 7),
 'blood_group': 'O-',
 'company': u'Perkins Ltd',
 'current_location': (Decimal('-23.7394845'), Decimal('-90.853769')),
 'job': 'Electrical engineer',
 'mail': u'mccormickjessica@gmail.com',
 'name': u'Savannah Lopez',
 'residence': u'27264 Brad Glens\nDavidchester, UT 19497',
 'sex': 'F',
 'ssn': u'853-79-8887',
 'username': u'mmaldonado',
 'website': [u'https://sullivan.com/', u'https://www.yates-rivas.com/']}

## Chain transformations

Select the 10 most popular occupations among people older than 30.

In [34]:
# Keep people older than 30, key each one by job, count per job, and keep the
# 10 (job, count) pairs with the highest count.
most_popular_occupations = persist(
    people_with_age
    | beam.Filter(lambda person: person["age"] > 30)
    | beam.Map(lambda person: (person["job"], person))
    | beam.combiners.Count.PerKey()
    | beam.combiners.Top.Of(10, key=lambda job_and_count: job_and_count[1])
)

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


In [35]:
list(most_popular_occupations.read())

[[('Music therapist', 24),
  ('Conference centre manager', 23),
  ('Civil Service fast streamer', 22),
  ('Planning and development surveyor', 22),
  ('Product manager', 21),
  ('Consulting civil engineer', 20),
  ('Print production planner', 20),
  ('Geneticist, molecular', 20),
  ('Sport and exercise psychologist', 19),
  ('Therapist, occupational', 19)]]

## Store for future use

In [44]:
next(people_with_age.read())

{'address': u'19805 John Gateway Suite 890\nSchroederside, AK 54542',
 'age': 115,
 'birthdate': datetime.date(1903, 9, 21),
 'blood_group': 'AB+',
 'company': u'Simmons-Olson',
 'current_location': (Decimal('15.3911005'), Decimal('-104.896245')),
 'job': 'Accountant, chartered management',
 'mail': u'monica22@gmail.com',
 'name': u'Maria Brown',
 'residence': u'04023 Collins Corners Apt. 561\nLake Darlene, UT 50391',
 'sex': 'F',
 'ssn': u'473-05-3972',
 'username': u'allendouglas',
 'website': [u'https://www.jensen.biz/', u'https://www.elliott.com/']}

In [68]:
# Destination folder for the JSON export below.
output_folder = FileSystems.join(temp_location, "json_output")
# Create it only when missing so that genuine failures (permissions, path is
# a regular file) are reported instead of being silently ignored.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)


def remove_nonserializable(element):
    """Return a copy of ``element`` prepared for JSON encoding.

    The ``"birthdate"`` value is replaced by its ISO-format string and the
    ``"current_location"`` entry is dropped. The input is not modified.
    """
    cleaned = element.copy()
    cleaned.update(birthdate=cleaned["birthdate"].isoformat())
    # `pop` without a default raises KeyError when the key is absent.
    cleaned.pop("current_location")
    return cleaned


def to_json_string(element):
    """Serialize ``element`` to a JSON string using ``json.dumps`` defaults."""
    from json import dumps

    return dumps(element)


# Export a 5-element sample of the aged profiles as JSON lines.
output_path_prefix = FileSystems.join(output_folder, "output-data")
output = (
    people_with_age
    | beam.combiners.Sample.FixedSizeGlobally(5)
    # The sample arrives as one list; emit its members as individual elements.
    | beam.FlatMap(lambda sampled: list(sampled))
    | beam.Map(remove_nonserializable)
    | beam.Map(to_json_string)
    | beam.io.WriteToText(output_path_prefix, file_name_suffix=".json")
)
# Run the pipeline and block until it has finished.
output.pipeline.run(test_runner_api=False).wait_until_finish()

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


'DONE'

## Convert to DataFrame