# StreamBasedCache

## Install dependencies

In [0]:
try:
    import google.colab
    GOOGLE_COLAB = True
except ImportError:
    GOOGLE_COLAB = False

In [0]:
if GOOGLE_COLAB:
    !sudo apt-get -yqq install libsnappy-dev

In [0]:
if GOOGLE_COLAB:
    !pip install -q python-snappy Faker

In [4]:
if GOOGLE_COLAB:
    !pip install "git+https://github.com/ostrokach/beam.git@feature/streambasedcache#egg=apache_beam[gcp]&subdirectory=sdks/python"



## Imports

In [0]:
import copy
import itertools
import logging
import os
import tempfile
import uuid
import pickle

import apache_beam as beam
import fastavro
import numpy as np
import pandas as pd
import tqdm
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions)
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive import caching
from apache_beam.transforms.ptransform import ptransform_fn
from faker import Faker

## Parameters

In [6]:
temp_location = tempfile.mkdtemp(prefix="beam-dev-")

cache_location = tempfile.mkdtemp(dir=temp_location, prefix="cache-")

temp_location, cache_location

('/tmp/beam-dev-TK0fV5', '/tmp/beam-dev-TK0fV5/cache-07v3mr')

In [7]:
options = PipelineOptions(runner="direct", temp_location=temp_location)
options.display_data()



{'runner': 'direct', 'temp_location': '/tmp/beam-dev-TK0fV5'}

In [8]:
raise Exception("Breakpoint")

Exception: ignored

## Load data

### Generate a dataset of fake people profiles

In [0]:
try:
    with open("fake_people.pkl", "rb") as f:
        fake_people = pickle.load(f)
except IOError:
    fake = Faker(42)
    fake_people = [fake.profile(fields=None, sex=None) for _ in tqdm.tqdm_notebook(range(10000))]
    with open("fake_people.pkl", "wb") as f:
        pickle.dump(fake_people,f )

In [0]:
fake_people[0]

## Analyse data

### Create a PCollection cache

Convert the generated dataset into a `PCollection` which can be accessed from within a Beam pipeline.

In [0]:
input_cache = caching.SafeTextBasedCache(FileSystems.join(cache_location, "fake_people"), if_exists="overwrite")
input_cache.write(fake_people)

In [0]:
next(input_cache.read())

### Validate the PCollection cache

In [0]:
# Make sure we have 10_000 fake profiles, as expected
temp = caching.TFRecordBasedCache(
    FileSystems.join(cache_location, "temp"), if_exists="overwrite"
)

with beam.Pipeline(options=options) as p:
    _ = (
        p
        | input_cache.reader()
        | beam.combiners.Count.Globally()
        | temp.writer()
    )

assert next(temp.read()) == 10000

### Select people with duplicate usernames

In [0]:
# Make sure every username is distinct
temp = caching.TFRecordBasedCache(
    FileSystems.join(cache_location, "temp"), if_exists="overwrite"
)

with beam.Pipeline(options=options) as p:
    _ = (
        p
        | input_cache.reader()
        | "Extract username" >> beam.WithKeys(lambda e: e["username"])
        | "Group people by username" >> beam.GroupByKey(lambda e: e)
        | beam.Values()
        | "Select groups with at least two people" >> beam.Filter(lambda vs: len(vs) >= 2)
        | "Ungroup" >> beam.FlatMap(lambda gp: [e for e in gp])
        | temp.writer()
    )

In [0]:
pd.DataFrame(temp.read()).sort_values("username")

### Find most popular occupations

Select top 10 most popular occupations in the `> 30` age group.

In [0]:
def calculate_age(birthdate):
    import datetime
    td = datetime.date.today() - birthdate
    return int(round(td.days // 365.25))

def add_age_colum(element):
    element["age"] = calculate_age(element["birthdate"])
    return element

In [0]:
most_popular_occupations = caching.TFRecordBasedCache(
    FileSystems.join(cache_location, "most_popular_occupations"), if_exists="overwrite"
)

with beam.Pipeline(options=options) as p:
    _ = (
        p
        | input_cache.reader()
        | beam.Map(add_age_colum)
        | beam.Filter(lambda e: e["age"] > 30)
        | beam.Map(lambda e: (e["job"], e))
        | beam.combiners.Count.PerKey()
        | beam.combiners.Top.Of(10, key=lambda x: x[1])
        | most_popular_occupations.writer()
    )

In [0]:
next(most_popular_occupations.read())

## Process and export data

In [0]:
def normalize_data(element):
    return {
        "username": element["username"],
        "first_name": element["name"].split()[0],
        "last_name": " ".join(element["name"].split()[1:]),
        "mail": element["name"],
        "job": element["name"],
        "company": element["name"],
        "age": calculate_age(element["birthdate"]),
        "current_location_lat": float(element["current_location"][0]),
        "current_location_long": float(element["current_location"][1]),
        "website": element["website"] or [],
    }

normalize_data(element)

In [0]:
avro_schema = fastavro.parse_schema({
    "namespace": "example.avro",
    "name": "User",
    "type": "record",
    "fields": [
        {"name": "username", "type": "string"},
        {"name": "first_name", "type": "string"},
        {"name": "last_name", "type": "string"},
        {"name": "mail", "type": "string"},
        {"name": "job", "type": "string"},
        {"name": "company", "type": "string"},
        {"name": "age", "type": "int"},
        {"name": "current_location_lat", "type": "double"},
        {"name": "current_location_long", "type": "double"},
        {"name": "website", "type": {"type": "array", "items": "string"}},
    ],
})
avro_schema

In [0]:
output = caching.AvroBasedCache(
    FileSystems.join(cache_location, "temp"), if_exists="overwrite", schema=avro_schema,
    use_fastavro=True,
)

with beam.Pipeline(options=options) as p:
    _ = (
        p
        | input_cache.reader()
        | beam.Map(normalize_data)
        | output.writer()
    )

In [0]:
next(output.read())

## Write SQL queries against the result

### Google Cloud

## Export data to Google Cloud

In [0]:
#@title Google Cloud Project Info { display-mode: "form" }
project_id = "asdfdsf" #@param {type:"string"}
gcs_temp_location = "" #@param {type:"string"}

In [0]:
from google.colab import auth
auth.authenticate_user()

In [0]:
gcs_output = caching.AvroBasedCache(
    FileSystems.join(gcs_temp_location, "filebasedcache-output"), if_exists="overwrite",
    schema=avro_schema, use_fastavro=True)

gcs_output.write(output.read())

In [0]:
next(gcs_output.read())

### Query data using BigQuery

In [0]:
from google.cloud import bigquery

In [0]:
options = PipelineOptions(runner="direct", temp_location=gc_temp_location, staging_location=gsc_temp_location, project=project_id)
options.display_data()

In [0]:
FileSystems.mkdirs(gsc_temp_location)
FileSystems.mkdirs(FileSystems.join(gsc_temp_location, "filebasedcache-demo"))

In [0]:
gcs_output = caching.AvroBasedCache(
    FileSystems.join(gcs_temp_location), if_exists="overwrite",
    schema=avro_schema, use_fastavro=True)

# gce_output.write(output.read())


with beam.Pipeline(options=options) as p:
    _ = (
        p
        | input_cache.reader()
        | beam.Map(normalize_data)
        | output.writer()
    )

In [0]:
gcs_output._existing_file_paths()

In [0]:
# Configure the external data source and query job
table_id = 'fake_people'
external_config = bigquery.ExternalConfig('AVRO')
external_config.source_uris = gcs_output._existing_file_paths()

# external_config.schema = [
#     bigquery.SchemaField('name', 'STRING'),
#     bigquery.SchemaField('post_abbr', 'STRING')
# ]
# external_config.options.skip_leading_rows = 1  # optionally skip header row

In [0]:
external_config.to_api_repr()

In [0]:
client = bigquery.Client(project=project_id)

In [0]:
# Example query to find states starting with 'W'
sql_query = """
SELECT *
FROM `{table_id}`
WHERE AGE = (
    SELECT MAX(age)
    FROM `{table_id}`
)
""".format(table_id=table_id)

job_config = bigquery.QueryJobConfig()
job_config.table_definitions = {table_id: external_config}

query_job = client.query(sql_query, job_config=job_config)

In [0]:
query_job.to_dataframe()

## Cleanup

In [0]:
shutil.rmtree(temp_location)