# Flink Demo

## Install dependencies

In [None]:
# Detect whether the notebook is running on Google Colab by probing for the
# `google.colab` package; the install cells below are gated on this flag.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
except ImportError:
    GOOGLE_COLAB = False
else:
    GOOGLE_COLAB = True

GOOGLE_COLAB

### Install Java and Flink

In [None]:
if GOOGLE_COLAB:
    !apt-get install openjdk-8-jdk-headless -qq > /dev/null
    !wget -q https://www-eu.apache.org/dist/flink/flink-1.8.0/flink-1.8.0-bin-scala_2.12.tgz -O flink-1.8.0-bin-scala_2.12.tgz
    !tar xf flink-1.8.0-bin-scala_2.12.tgz
    !rm flink-1.8.0-bin-scala_2.12.tgz

### Start a Flink cluster

In [None]:
if GOOGLE_COLAB:
    # Run the cluster start script found under ./flink-1.8.0/bin (relative to
    # the notebook's working directory). Only executed on Colab.
    !./flink-1.8.0/bin/start-cluster.sh

### (Optional) Create a tunnel to the Flink dashboard

In [None]:
if GOOGLE_COLAB:
    # Download the ngrok Linux/amd64 archive into the working directory.
    !wget -q https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip -O ngrok-stable-linux-amd64.zip
    # Unpack it in place; `-o` overwrites existing files without prompting,
    # so re-running this cell does not block on a confirmation.
    !unzip -o ngrok-stable-linux-amd64.zip

In [None]:
if GOOGLE_COLAB:
    import json
    import os
    import time
    import subprocess
    import sys
    from urllib import request

    # Make sure the working directory (where ngrok was unpacked) is on the PATH
    # handed to the child process. Compare against the individual PATH entries:
    # a plain substring test would wrongly treat "/content" as present when
    # PATH only contains e.g. "/content/bin".
    env = os.environ.copy()
    cwd = os.getcwd()
    search_path = env.get("PATH", "")
    if cwd not in search_path.split(os.pathsep):
        env["PATH"] = "{}{}{}".format(cwd, os.pathsep, search_path) if search_path else cwd

    # Start ngrok in the background, forwarding HTTP traffic to local port 8081.
    ngrok = subprocess.Popen(["ngrok", "http", "8081"], env=env)

    # Give ngrok a moment to start before querying its local API on port 4040.
    time.sleep(1)
    link = "http://localhost:4040/api/tunnels"

    # Query the tunnel list once, closing the HTTP response when done, and
    # reuse the parsed result instead of issuing a second identical request.
    with request.urlopen(link) as response:
        ngrok_dashboard = json.loads(response.read())['tunnels']
    for tunnel in ngrok_dashboard:
        print(tunnel["name"], tunnel["public_url"])

### Install Apache Beam

In [None]:
if GOOGLE_COLAB:
    # Clone the Apache Beam repository into ./beam.
    !git clone https://github.com/apache/beam.git
    # Run the `:sdks:python:container:docker` Gradle task from inside the
    # checkout (presumably builds the Python SDK container image -- confirm
    # against the Beam build docs). Standard output is redirected to a log
    # file created inside the beam/ directory.
    !cd beam && ./gradlew :sdks:python:container:docker > sdks-python-container-docker-build.log

## Imports

In [None]:
import copy
import itertools
import logging
import os
import tempfile
import uuid
import pickle

import apache_beam as beam
import fastavro
import numpy as np
import pandas as pd
import tqdm
from apache_beam.io.filesystems import FileSystems
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions)
from apache_beam.runners.direct.direct_runner import BundleBasedDirectRunner
from apache_beam.runners.interactive import caching
from apache_beam.transforms.ptransform import ptransform_fn
from faker import Faker

## Parameters

In [None]:
# Location passed to the pipeline options as `temp_location`.
# The commented-out line is a local alternative that uses a fresh temp directory.
# NOTE(review): both active paths point at a specific GCS bucket; change them
# to a bucket you can write to before running.
# temp_location = tempfile.mkdtemp(prefix="beam-dev-")
temp_location = "gs://strokach/dataflow_temp/"

# Base location under which the PCollection caches below are stored.
# cache_location = tempfile.mkdtemp(dir=temp_location, prefix="cache-")
cache_location = "gs://strokach/dataflow_cache/"

# Display the chosen locations as the cell output.
temp_location, cache_location

In [None]:
# Pipeline options shared by every pipeline in this notebook. The commented-out
# lines are alternative runner choices; the active one selects "PortableRunner".
# options = PipelineOptions(runner="direct", temp_location=temp_location)
# options = PipelineOptions(runner="portable", temp_location=temp_location)
options = PipelineOptions(runner="PortableRunner", temp_location=temp_location)
# Show the options' display data as the cell output.
options.display_data()

## Load data

### Generate a dataset of fake people profiles

In [None]:
# Reuse the locally cached profiles when available; otherwise generate 10000
# fake profiles and cache them so later runs see the same dataset.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load a fake_people.pkl that this notebook produced itself.
try:
    with open("fake_people.pkl", "rb") as f:
        fake_people = pickle.load(f)
except IOError:
    # Seed through Faker's documented API. The `seed=` constructor keyword is
    # not documented as seeding the generator, so `Faker(seed=42)` cannot be
    # relied on to make the generated profiles reproducible.
    fake = Faker()
    fake.seed_instance(42)
    fake_people = [fake.profile(fields=None, sex=None) for _ in range(10000)]
    with open("fake_people.pkl", "wb") as f:
        pickle.dump(fake_people, f)

In [None]:
# Show the first profile to inspect the record structure.
fake_people[0]

## Analyse data

### Create a PCollection cache

Convert the generated dataset into a `PCollection` which can be accessed from within a Beam pipeline.

In [None]:
# Create a text-based cache named "fake_people" under `cache_location`
# (opened in "overwrite" mode) and write the generated profiles into it.
input_cache = caching.SafeTextBasedCache(
    FileSystems.join(cache_location, "fake_people"),
    mode="overwrite",
)
input_cache.write(fake_people)

In [None]:
# Read back the first element from the cache as a quick round-trip check.
next(input_cache.read())

### Validate the PCollection cache

In [None]:
# Count the cached profiles inside a Beam pipeline and store the single
# resulting count in a scratch cache, so the next cell can check it is 10_000.
temp = caching.TFRecordBasedCache(FileSystems.join(cache_location, "temp"), mode="overwrite")

with beam.Pipeline(options=options) as p:
    # Read the cached profiles, reduce them to one global count, and write
    # that count out through the scratch cache.
    profiles = p | input_cache.reader()
    profile_count = profiles | beam.combiners.Count.Globally()
    _ = profile_count | temp.writer()

In [None]:
# The first element read back from `temp` must equal 10000, the number of
# profiles generated earlier in the notebook.
assert next(temp.read()) == 10000