<a href="https://colab.research.google.com/github/ostrokach/beam-notebooks/blob/master/feature/filebasedcache.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install dependencies

In [1]:
# Install the Beam Python SDK from the ostrokach/beam fork (feature/filebasedcache branch).
# NOTE(review): the trailing [gcp] after the closing quote looks redundant with the
# [gcp] extra already present in the egg fragment — confirm and drop it.
!pip install "git+https://github.com/ostrokach/beam.git@feature/filebasedcache#egg=apache_beam[gcp]&subdirectory=sdks/python"[gcp]

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m


In [None]:
# System snappy headers — presumably required to build python-snappy; verify.
# NOTE(review): sudo prompts for a password here, so this cell cannot run unattended.
!sudo apt-get install libsnappy-dev -y -q

[sudo] password for strokach: 

In [3]:
# Python bindings for snappy — presumably needed by the cache implementations; TODO confirm.
!pip install python-snappy



## Imports

In [1]:
import copy
import itertools
import logging
import os
import tempfile
import uuid

import apache_beam as beam
import numpy as np
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions)
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive import caching

from faker import Faker

In [2]:
# Log at INFO level so the pipeline progress messages show up in the cell output.
logging.basicConfig(level=logging.INFO)

In [3]:
# Enable IPython's autoreload extension in mode 2 so edited modules are
# re-imported before each cell executes.
%load_ext autoreload
%autoreload 2

## Functions

## Options

In [8]:
# Root folder shared by every run of this notebook.
temp_root = "/tmp/beam-dev"
# Create the folder only when it is missing; any other failure (e.g. a
# permission error) now surfaces instead of being silently swallowed.
if not os.path.isdir(temp_root):
    os.makedirs(temp_root)

# Fresh, uniquely named working directory for this run.
temp_location = tempfile.mkdtemp(dir=temp_root)

# Folder that holds the PCollection caches created below.
cache_location = FileSystems.join(temp_location, "cache")
if not os.path.isdir(cache_location):
    os.makedirs(cache_location)

temp_root, temp_location, cache_location

('/tmp/beam-dev', '/tmp/beam-dev/tmpoxIcDL', '/tmp/beam-dev/tmpoxIcDL/cache')

In [9]:
# Pipeline options shared by every pipeline in this notebook.
option_values = {"temp_location": temp_location, "interactive": False}
options = PipelineOptions(**option_values)
options.display_data()



{'interactive': True, 'temp_location': '/tmp/beam-dev/tmpoxIcDL'}

In [10]:
# Runner instance passed to every beam.Pipeline created below.
runner = BundleBasedDirectRunner()

## Generate a dataset

Create a dataset of 10,000 fake people profiles.

In [12]:
# Generate the profiles here so that `fake_people` exists after a fresh
# "Restart Kernel -> Run All"; later cells read this variable.
fake = Faker()
fake_people = [fake.profile(fields=None, sex=None) for _ in range(10000)]

In [13]:
# Preview the first generated profile.
fake_people[:1]

[{'address': u'81628 Howard Meadows\nChandlerville, TX 44380',
  'birthdate': datetime.date(1942, 8, 27),
  'blood_group': 'B-',
  'company': u'Mcdonald Group',
  'current_location': (Decimal('-50.510297'), Decimal('120.086476')),
  'job': 'Product designer',
  'mail': u'laura32@hotmail.com',
  'name': u'Alexander Harrison',
  'residence': u'USNS Mills\nFPO AE 72127',
  'sex': 'M',
  'ssn': u'833-23-5384',
  'username': u'prestonjessica',
  'website': [u'https://www.carter.com/']}]

## Create a PCollection cache

Convert the generated dataset into a `PCollection` which can be accessed from within a Beam pipeline.

In [19]:
# Persist the in-memory profiles to a file-based cache that pipelines can read.
fake_people_path = FileSystems.join(cache_location, "fake_people")
input_cache = caching.SafeTextBasedCache(fake_people_path, if_exists="overwrite")
input_cache.write(fake_people)

In [20]:
# Make sure we have the correct number of people
count_cache_path = FileSystems.join(cache_location, "temp")
temp = caching.TFRecordBasedCache(count_cache_path, if_exists="overwrite")

with beam.Pipeline(runner=runner, options=options) as p:
    cached_people = p | input_cache.reader()
    people_count = cached_people | beam.combiners.Count.Globally()
    _ = people_count | temp.writer()

# The cache holds a single element: the global count.
num_people = next(temp.read())
num_people

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


10000

## Basic transformations

### Add an age column

In [21]:
def calculate_age(birthdate, today=None):
    """Return a person's age in completed calendar years.

    Args:
        birthdate: A ``datetime.date`` holding the date of birth.
        today: Optional ``datetime.date`` to measure the age against.
            Defaults to ``datetime.date.today()``.

    Returns:
        int: The number of full years between ``birthdate`` and ``today``.
    """
    # Kept function-local, as in the original cell (presumably so the name
    # resolves wherever Beam executes the function — verify).
    import datetime

    if today is None:
        today = datetime.date.today()
    # Compare (month, day) instead of dividing a day count by 365.25: the
    # division reports one year too few on (and shortly after) some birthdays.
    had_birthday = (today.month, today.day) >= (birthdate.month, birthdate.day)
    return today.year - birthdate.year - (0 if had_birthday else 1)

def add_age_colum(element):
    """Return a copy of ``element`` with an ``"age"`` entry added.

    Args:
        element: A dict-like profile containing a ``"birthdate"`` entry.

    Returns:
        dict: A shallow copy of ``element`` whose ``"age"`` value is computed
        from ``element["birthdate"]`` by ``calculate_age``.
    """
    # Work on a shallow copy: Beam user functions should not modify the
    # input element they are handed.
    with_age = dict(element)
    with_age["age"] = calculate_age(element["birthdate"])
    return with_age

In [22]:
# Store the age-augmented profiles under their own cache name. The previous
# "temp" path is also used by the people-count cache, and with
# if_exists="overwrite" the two caches would clobber each other.
people_with_age = caching.TFRecordBasedCache(
    FileSystems.join(cache_location, "people_with_age"), if_exists="overwrite"
)

with beam.Pipeline(runner=runner, options=options) as p:
    _ = (
        p
        | input_cache.reader()
        | beam.Map(add_age_colum)
        | people_with_age.writer()
    )

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 10 (skipped: 0), batches: 10, num_threads: 10
INFO:root:Renamed 10 shards in 0.12 seconds.


In [24]:
# Read back one cached record to check that the "age" field was added.
next(people_with_age.read())

{'address': u'256 Kelly Causeway Suite 231\nNew Howardshire, MI 07575',
 'age': 112,
 'birthdate': datetime.date(1907, 4, 27),
 'blood_group': 'B+',
 'company': u'Lee LLC',
 'current_location': (Decimal('-55.825322'), Decimal('105.693991')),
 'job': 'Research scientist (medical)',
 'mail': u'matthew27@hotmail.com',
 'name': u'Andrea Baker',
 'residence': u'7060 Robert Hollow\nPort Zachary, MI 09904',
 'sex': 'F',
 'ssn': u'755-07-9888',
 'username': u'randalldavis',
 'website': [u'https://mora.net/',
  u'https://rivera.info/',
  u'http://phillips-rodgers.com/']}

## Chain transformations

Select the 10 most popular occupations among people older than 30.

In [25]:
# Cache that receives the ranked (job, count) pairs.
occupations_cache_path = FileSystems.join(cache_location, "most_popular_occupations")
most_popular_occupations = caching.TFRecordBasedCache(
    occupations_cache_path, if_exists="overwrite"
)

with beam.Pipeline(runner=runner, options=options) as p:
    aged_people = p | people_with_age.reader()
    over_thirty = aged_people | beam.Filter(lambda e: e["age"] > 30)
    keyed_by_job = over_thirty | beam.Map(lambda e: (e["job"], e))
    job_counts = keyed_by_job | beam.combiners.Count.PerKey()
    # Rank the (job, count) pairs by their count.
    top_jobs = job_counts | beam.combiners.Top.Of(10, key=lambda x: x[1])
    _ = top_jobs | most_popular_occupations.writer()

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 1 (skipped: 0), batches: 1, num_threads: 1
INFO:root:Renamed 1 shards in 0.12 seconds.


In [26]:
# Materialize the cached result as a list for display.
list(most_popular_occupations.read())

[[('Engineer, water', 22),
  ('Radiographer, diagnostic', 22),
  ('Higher education lecturer', 22),
  ('Corporate investment banker', 20),
  ('Licensed conveyancer', 19),
  ('Sound technician, broadcasting/film/video', 19),
  ('Osteopath', 19),
  ('Chief Marketing Officer', 19),
  ('Fitness centre manager', 18),
  ('Air cabin crew', 18)]]

## Store for future use

In [32]:
# Destination folder for the JSON export written below.
output_folder = FileSystems.join(temp_location, "json_output")
# Create the folder only when it is missing; any other failure (e.g. a
# permission error) now surfaces instead of being silently swallowed.
if not os.path.isdir(output_folder):
    os.makedirs(output_folder)


def remove_nonserializable(element):
    """Return a copy of ``element`` that ``json.dumps`` can serialize.

    The ``"birthdate"`` value is replaced by its ISO-8601 string and the
    ``"current_location"`` entry is dropped. The input is left untouched.

    Args:
        element: A dict-like profile with a ``"birthdate"`` entry exposing
            ``isoformat()``.

    Returns:
        dict: A new dict without ``"current_location"`` and with a string
        ``"birthdate"``.
    """
    # Build a new dict rather than editing in place: Beam user functions
    # should not modify the input element they are handed.
    cleaned = {
        key: value for key, value in element.items() if key != "current_location"
    }
    cleaned["birthdate"] = element["birthdate"].isoformat()
    return cleaned
    
def to_json_string(element):
    """Serialize ``element`` to a JSON-formatted string."""
    from json import dumps

    encoded = dumps(element)
    return encoded


# File name prefix for the exported JSON shards.
json_output_prefix = FileSystems.join(output_folder, "output-data")

with beam.Pipeline(runner=runner, options=options) as p:
    aged_people = p | people_with_age.reader()
    serializable_people = aged_people | beam.Map(remove_nonserializable)
    json_lines = serializable_people | beam.Map(to_json_string)
    _ = json_lines | beam.io.WriteToText(json_output_prefix, file_name_suffix=".json")

INFO:root:Running pipeline with DirectRunner.
INFO:root:Starting finalize_write threads with num_shards: 10 (skipped: 0), batches: 10, num_threads: 10
INFO:root:Renamed 10 shards in 0.13 seconds.


## Convert to DataFrame