## Install dependencies

In [0]:
# Install Apache Beam (with the `gcp` extra) from the `feature/lazyevaluation`
# branch of the ostrokach/beam fork; `subdirectory` points pip at the Python SDK
# inside that repository.
!pip install "git+https://github.com/ostrokach/beam.git@feature/lazyevaluation#egg=apache_beam[gcp]&subdirectory=sdks/python"

Collecting apache_beam[gcp] from git+https://github.com/ostrokach/beam.git@feature/lazyevaluation#egg=apache_beam[gcp]&subdirectory=sdks/python
  Cloning https://github.com/ostrokach/beam.git (to revision feature/lazyevaluation) to /tmp/pip-install-KPkJ_D/apache-beam
  Running command git clone -q https://github.com/ostrokach/beam.git /tmp/pip-install-KPkJ_D/apache-beam
  Running command git checkout -b feature/lazyevaluation --track origin/feature/lazyevaluation
  Switched to a new branch 'feature/lazyevaluation'
  Branch 'feature/lazyevaluation' set up to track remote branch 'feature/lazyevaluation' from 'origin'.
Collecting fastavro<0.22,>=0.21.4 (from apache_beam[gcp])
[?25l  Downloading https://files.pythonhosted.org/packages/15/e3/5956c75f68906b119191ef30d9acff661b422cf918a29a03ee0c3ba774be/fastavro-0.21.24-cp27-cp27mu-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 5.0MB/s 
Collecting hdfs<3.0.0,>=2.1.0 (from apache_beam[gcp])
[?25l  Downloading 

In [0]:
# Install the Snappy development package (-y: answer prompts automatically,
# -q: quieter output). Presumably needed to build python-snappy, which is
# installed in the next cell -- TODO confirm.
!sudo apt-get install libsnappy-dev -y -q

Reading package lists...
Building dependency tree...
Reading state information...
libsnappy-dev is already the newest version (1.1.7-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 16 not upgraded.


In [0]:
# Install the Python bindings for Snappy.
# NOTE(review): the version is not pinned; consider pinning for reproducibility.
!pip install python-snappy



In [0]:
!pip install faker

Collecting faker
[?25l  Downloading https://files.pythonhosted.org/packages/52/1a/930431923062857520bae512101a648ef528cd327583fda38d9e76fab5ce/Faker-1.0.7-py2.py3-none-any.whl (874kB)
[K     |████████████████████████████████| 880kB 4.9MB/s 
Installing collected packages: faker
Successfully installed faker-1.0.7


## Imports

In [0]:
import copy
import itertools
import logging
import os
import tempfile
import uuid

import apache_beam as beam
import numpy as np
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive import caching

from faker import Faker

In [0]:
# Configure the root logger at INFO level so that INFO messages logged while
# the pipelines run appear in the cell output.
logging.basicConfig(level=logging.INFO)

In [0]:
# Smoke test: confirm that INFO-level messages are visible in the notebook.
logging.info("hello")

INFO:root:hello


In [0]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Functions

In [0]:
class MaterializedPCollection(beam.pvalue.PCollection):
    """A PCollection whose elements are backed by a cache object.

    Besides behaving like a regular PCollection, instances expose `read()` to
    fetch elements from the cache, and clear the cache when the instance is
    garbage-collected.
    """

    # TODO: Ideally, would want to be consistent with the PCollection API.
    def __init__(self, pipeline, tag=None, element_type=None, windowing=None, cache=None):
        """Initialize the PCollection and remember the cache that backs it.

        Args:
            pipeline: Forwarded to the PCollection constructor.
            tag: Forwarded to the PCollection constructor.
            element_type: Forwarded to the PCollection constructor.
            windowing: Forwarded to the PCollection constructor.
            cache: Object providing `read(**kwargs)` and `clear()`; may be None.
        """
        super(MaterializedPCollection, self).__init__(
            pipeline, tag=tag, element_type=element_type, windowing=windowing)
        self._cache = cache

    def read(self, **reader_kwargs):
        """Return whatever the backing cache's `read` returns.

        Args:
            **reader_kwargs: Forwarded unchanged to the cache's `read` method.
        """
        return self._cache.read(**reader_kwargs)

    def __del__(self):
        # Clear the underlying cache when there are no more references to this object.
        # `getattr` guards against `__init__` having failed before `_cache` was
        # assigned, and the None check covers the `cache=None` default; without
        # them the finalizer itself would raise AttributeError.
        cache = getattr(self, "_cache", None)
        if cache is not None:
            cache.clear()

In [0]:
def persist(pcoll):
    """Materialize a PCollection by running its pipeline and caching the result.

    The PCollection is written to a uniquely named cache under the pipeline's
    `temp_location`, the pipeline is run to completion, and a new PCollection
    that reads from that cache is returned.

    Args:
        pcoll: The PCollection to materialize.

    Returns:
        MaterializedPCollection: An object which can be used to access the
            materialized PCollection. If `pcoll` is already a
            MaterializedPCollection, it is returned unchanged.

    Raises:
        ValueError: If the pipeline options do not define `temp_location`.
    """
    if isinstance(pcoll, MaterializedPCollection):
        logging.info("The provided PCollection has already been materialized.")
        return pcoll

    # Checked up front (rather than after the run) so that nothing is executed
    # or written when the producer that is copied onto the result is missing.
    assert pcoll.producer

    temp_location = pcoll.pipeline._options.view_as(GoogleCloudOptions).temp_location
    if not temp_location:
        raise ValueError(
            "The pipeline options must define `temp_location` to persist a PCollection.")

    # Draw random names until no existing file matches the chosen prefix.
    # `limits=[1]` stops the match at the first hit, which is all that is needed.
    cache_location = FileSystems.join(temp_location, "cache", uuid.uuid4().hex)
    while FileSystems.match([cache_location + "*"], limits=[1])[0].metadata_list:
        cache_location = FileSystems.join(temp_location, "cache", uuid.uuid4().hex)

    cache = caching.TFRecordBasedCache(location=cache_location)

    pcoll_to_cache = (
        pcoll | "Write to cache" >> cache.writer()
    )
    # TODO: Get this working with `test_runner_api=True`
    pcoll_to_cache.pipeline.run(test_runner_api=False).wait_until_finish()

    # The materialized PCollection lives in a fresh pipeline that reuses the
    # runner and options of the original one and starts from the cache reader.
    new_pipeline = (
        beam.Pipeline(
            runner=pcoll_to_cache.pipeline.runner,
            options=pcoll_to_cache.pipeline._options)
        | "Read from cache" >> cache.reader()
    ).pipeline
    materialized_pcoll = MaterializedPCollection(
        pipeline=new_pipeline, tag=pcoll.tag, element_type=pcoll.element_type,
        windowing=pcoll.windowing, cache=cache)

    materialized_pcoll.producer = pcoll.producer

    return materialized_pcoll

## Options

In [0]:
temp_root = "/tmp/beam-dev"
# Create the scratch root only when it is missing, so that genuine failures
# (for example a permission error) surface here instead of being swallowed.
if not os.path.isdir(temp_root):
    os.makedirs(temp_root)

# Every run of this cell gets its own freshly created directory under the root.
temp_location = tempfile.mkdtemp(dir=temp_root)

cache_location = FileSystems.join(temp_location, "cache")
if not os.path.isdir(cache_location):
    os.makedirs(cache_location)

# Display the resolved locations.
temp_root, temp_location, cache_location

('/tmp/beam-dev', '/tmp/beam-dev/tmpC48SkB', '/tmp/beam-dev/tmpC48SkB/cache')

In [0]:
# Pipeline options shared by every pipeline in this notebook; persist() reads
# `temp_location` from them to decide where cache files are written.
# NOTE(review): the option values are reconstructed from the recorded output of
# this cell -- confirm that `interactive` is the option the installed Beam
# branch expects.
options = PipelineOptions(temp_location=temp_location, interactive=True)
options.display_data()



{'interactive': True, 'temp_location': '/tmp/beam-dev/tmpC48SkB'}

In [0]:
# Instantiate the bundle-based direct runner imported above; it is handed to
# the pipeline created in the next cell.
runner = BundleBasedDirectRunner()

In [0]:
# Root pipeline to which the cells below attach transforms before passing the
# result to persist().
p = beam.Pipeline(runner=runner, options=options)

## Generate a dataset

Create a dataset of 10,000 fake people profiles.

In [0]:
# Generate the synthetic profiles; `fake_people` is used by the cells below, so
# this must run for the notebook to work on a fresh kernel.
fake = Faker()
fake_people = [fake.profile(fields=None, sex=None) for _ in range(10000)]

In [0]:
# Peek at the first generated profile; slicing keeps the displayed output short.
fake_people[:1]

[{'address': u'58653 Logan Points\nNew Meganshire, NH 49518',
  'birthdate': datetime.date(1995, 12, 16),
  'blood_group': 'O-',
  'company': u'Jones, Graves and Johnson',
  'current_location': (Decimal('-54.6011605'), Decimal('99.342496')),
  'job': 'Clinical psychologist',
  'mail': u'jeffreywilliams@gmail.com',
  'name': u'Mrs. Amanda Sutton MD',
  'residence': u'162 Vega Flat Suite 682\nEast Amyton, PA 61239',
  'sex': 'F',
  'ssn': u'880-56-8343',
  'username': u'nshaffer',
  'website': [u'https://velasquez-burke.org/',
   u'https://jones.com/',
   u'https://www.smith-flores.org/',
   u'https://wright.org/']}]

## Create a PCollection cache

Convert the generated dataset into a `PCollection` which can be accessed from within a Beam pipeline.

In [0]:
# Store the in-memory profiles in a text-based cache named "fake_people" under
# `cache_location`, so that pipelines can read them through `input_cache.reader()`.
# NOTE(review): `if_exists="overwrite"` presumably replaces a cache left over
# from an earlier run -- confirm against the caching module.
input_cache = caching.SafeTextBasedCache(FileSystems.join(cache_location, "fake_people"), if_exists="overwrite")
input_cache.write(fake_people)

In [0]:
# Count the cached profiles inside a pipeline; persist() runs the pipeline and
# caches the result of the global count.
temp = persist(
    p
    | input_cache.reader()
    | beam.combiners.Count.Globally()
)

# Take the first element read back from the cache, i.e. the total count.
num_people = next(temp.read())
num_people

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


10000

## Basic transformations

### Add an age column

In [0]:
def calculate_age(birthdate, today=None):
    """Return the age in completed years for the given birthdate.

    The age is derived from calendar fields rather than from a day count
    divided by 365.25, which is off by one around birthdays (for example, 365
    days after a birthday in a non-leap span yields 0 instead of 1).

    Args:
        birthdate: A `datetime.date` holding the date of birth.
        today: Optional `datetime.date` to measure the age at. Defaults to the
            current date.

    Returns:
        int: The number of full years between `birthdate` and `today`.
    """
    # Imported locally, as in the original; presumably so that the function
    # stays self-contained when it is shipped to workers -- TODO confirm.
    import datetime

    if today is None:
        today = datetime.date.today()
    # Subtract one year when this year's birthday has not been reached yet.
    birthday_pending = (today.month, today.day) < (birthdate.month, birthdate.day)
    return today.year - birthdate.year - (1 if birthday_pending else 0)

def add_age_colum(element):
    """Return a copy of `element` with an "age" entry derived from "birthdate".

    A shallow copy is modified instead of the input, because Beam requires
    that user functions do not mutate their input elements.

    Args:
        element: A dict with a "birthdate" entry accepted by `calculate_age`.

    Returns:
        dict: A shallow copy of `element` with the additional "age" key.
    """
    element = element.copy()
    element["age"] = calculate_age(element["birthdate"])
    return element

In [0]:
# Add an "age" entry to every profile and cache the result for the cells below.
people_with_age = persist(
    p
    | input_cache.reader()
    # Optional: uncomment the next two transforms to work on a 5-element sample.
#     | beam.combiners.Sample.FixedSizeGlobally(5)
#     | beam.FlatMap(lambda e_lst: [e for e in e_lst])
    | beam.Map(add_age_colum)
)

# Show the first element read back from the cache as a sanity check.
next(people_with_age.read())

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 10 (skipped: 0), batches: 10, num_threads: 10
INFO:root:Renamed 10 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 10 (skipped: 0), batches: 10, num_threads: 10
INFO:root:Renamed 10 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


{'address': u'5101 Scott Vista\nMelissafurt, MD 99782',
 'age': 106,
 'birthdate': datetime.date(1913, 3, 28),
 'blood_group': 'B-',
 'company': u'Peterson-Smith',
 'current_location': (Decimal('-71.3423775'), Decimal('101.865327')),
 'job': 'Engineer, manufacturing systems',
 'mail': u'igomez@hotmail.com',
 'name': u'Eric Atkinson',
 'residence': u'52856 Saunders Forge\nLake Johnport, NJ 80734',
 'sex': 'M',
 'ssn': u'892-64-8917',
 'username': u'williamsonchristine',
 'website': [u'http://www.wilson-rich.org/',
  u'https://leonard-torres.net/',
  u'http://chavez.com/']}

## Chain transformations

Select the 10 most popular occupations among people older than 30.

In [0]:
# Starting from the cached profiles: keep people older than 30, key each
# profile by its job, count the profiles per job, and keep the 10 (job, count)
# pairs with the largest counts.
most_popular_occupations = persist(
    people_with_age
    | beam.Filter(lambda e: e["age"] > 30)
    | beam.Map(lambda e: (e["job"], e))
    | beam.combiners.Count.PerKey()
    # Rank the (job, count) pairs by their count.
    | beam.combiners.Top.Of(10, key=lambda x: x[1])
)

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


In [0]:
# Read the cached result into the notebook; as the output below shows, it holds
# a single element: the list of top (job, count) pairs.
list(most_popular_occupations.read())

[[('Community education officer', 21),
  ('Theatre manager', 21),
  ('Scientific laboratory technician', 20),
  ('Sport and exercise psychologist', 20),
  ('Recycling officer', 20),
  ('Make', 19),
  ('Speech and language therapist', 19),
  ('Plant breeder/geneticist', 19),
  ('Medical technical officer', 19),
  ('Youth worker', 18)]]

## Store for future use

In [0]:
# Read one element from the cache again to show that the materialized
# PCollection is still available.
next(people_with_age.read())

{'address': u'19805 John Gateway Suite 890\nSchroederside, AK 54542',
 'age': 115,
 'birthdate': datetime.date(1903, 9, 21),
 'blood_group': 'AB+',
 'company': u'Simmons-Olson',
 'current_location': (Decimal('15.3911005'), Decimal('-104.896245')),
 'job': 'Accountant, chartered management',
 'mail': u'monica22@gmail.com',
 'name': u'Maria Brown',
 'residence': u'04023 Collins Corners Apt. 561\nLake Darlene, UT 50391',
 'sex': 'F',
 'ssn': u'473-05-3972',
 'username': u'allendouglas',
 'website': [u'https://www.jensen.biz/', u'https://www.elliott.com/']}

In [0]:
output_folder = FileSystems.join(temp_location, "json_output")
# Create the folder only when it is missing, so that genuine failures (for
# example a permission error) surface here instead of being swallowed.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)


def remove_nonserializable(element):
    """Return a shallow copy of a profile with its non-JSON-friendly fields handled.

    The "birthdate" value is replaced by the result of its `isoformat()` call
    and the "current_location" entry is dropped. The input is left untouched.

    Args:
        element: A dict with "birthdate" and "current_location" entries.

    Returns:
        dict: The adjusted shallow copy.

    Raises:
        KeyError: If "birthdate" or "current_location" is missing.
    """
    cleaned = element.copy()
    cleaned["birthdate"] = cleaned["birthdate"].isoformat()
    # `pop` without a default raises KeyError for a missing key, like `del`.
    cleaned.pop("current_location")
    return cleaned


def to_json_string(element):
    """Serialize `element` to a JSON string using the standard `json` module.

    Args:
        element: Any object that `json.dumps` can serialize.

    Returns:
        str: The JSON text produced by `json.dumps` with default settings.
    """
    from json import dumps

    return dumps(element)


# Export a small sample of the cached profiles as JSON text files.
output = (
    people_with_age
    # Draw 5 elements; the sample arrives as a single list element ...
    | beam.combiners.Sample.FixedSizeGlobally(5)
    # ... which is flattened back into individual elements here.
    | beam.FlatMap(lambda e_lst: [e for e in e_lst])
    | beam.Map(remove_nonserializable)
    | beam.Map(to_json_string)
    # Files are written under `output_folder` with the "output-data" prefix and
    # a ".json" suffix.
    | beam.io.WriteToText(
        FileSystems.join(output_folder, "output-data"), file_name_suffix=".json"
    )
)
# Run the pipeline and block until it has finished.
output.pipeline.run(test_runner_api=False).wait_until_finish()

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


'DONE'

## Convert to DataFrame