## Start the beam-flink job server

In [4]:
from hops import beam as hops_beam

# Heap size handed to the job server's taskmanager
# (presumably in MB -- TODO confirm against the hops documentation).
taskmanager_heap_size = 8192

# Start Beam jobservice; kept as the last expression so the cell still shows
# whatever the call returns.
hops_beam.start(taskmanager_heap_size=taskmanager_heap_size)


Job server log is available at:/srv/hops/hadoop/logs/userlogs/6cNHepU-KQfhT5sP9H_vEYwpMAgs7wZjeSs0hav8V-U/application_1566305877532_0005/container_e05_1566305877532_0005_01_000001/beamjobserver-flink_tutorial-runner-23537.log

Job host:ip-172-31-10-231.eu-north-1.compute.internal
Job port:23537
{'job_port': 23537, 'job_host': 'ip-172-31-10-231.eu-north-1.compute.internal', 'jobserver.pid': 7753, 'artifact_port': 26353, 'expansion_port': 13938}

## TFMA Notebook example

This notebook describes how to export your model for TFMA and demonstrates the analysis tooling it offers.

Note: Please make sure to follow the instructions in [README.md](https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi/README.md) when running this notebook.

## Setup

Import necessary packages.

In [5]:
import apache_beam as beam
from hops import beam as hops_beam
import os
from tfx.examples.chicago_taxi import preprocess
import shutil
import tensorflow as tf
import tensorflow_data_validation as tfdv
import tensorflow_model_analysis as tfma
from google.protobuf import text_format 
from tensorflow.python.lib.io import file_io
from tensorflow_transform.beam.tft_beam_io import transform_fn_io
from tensorflow_transform.coders import example_proto_coder
from tensorflow_transform.saved import saved_transform_io
from tensorflow_transform.tf_metadata import dataset_schema
from tensorflow_transform.tf_metadata import schema_utils
from tfx.examples.chicago_taxi.trainer import task
from tfx.examples.chicago_taxi.trainer import taxi
from hops import hdfs as hopsfs
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions, SetupOptions, HadoopFileSystemOptions, PortableOptions, WorkerOptions, DebugOptions

Download input dataset

In [6]:
import zipfile
from io import BytesIO

import requests

DATASET_URL = "http://snurran.sics.se/hops/beam/chicago_taxi_data.zip"

# Fetch the zipped dataset. The (connect, read) timeout keeps the cell from
# hanging forever on an unresponsive server, and raise_for_status() surfaces
# an HTTP error here instead of as a confusing BadZipFile further down.
response = requests.get(DATASET_URL, timeout=(10, 300))
response.raise_for_status()

# Unpack into the current working directory; the context manager closes the
# archive once extraction is done.
with zipfile.ZipFile(BytesIO(response.content)) as archive:
    archive.extractall()

# Copy data into resources
hopsfs.copy_to_hdfs('data', 'Resources')

Started copying local path data to hdfs path hdfs://172.31.10.231:8020/Projects/flink_tutorial/Resources

Finished copying

Helper functions and constants defining the HopsFS input and output paths used throughout the notebook.

In [7]:
# Root path of the current project as reported by hopsfs
# (exclude_nn_addr=True presumably drops the namenode address -- confirm in the hops docs).
BASE_DIR = hopsfs.project_path(exclude_nn_addr=True)

# Input data, pipeline output and scratch space, all under the project's Resources.
DATA_DIR, OUTPUT_DIR, TMP_DIR = (
    os.path.join(BASE_DIR, subdir)
    for subdir in ('Resources/data', 'Resources/taxi_out', 'Resources/taxi_tmp')
)

# Base dir containing train and eval data
TRAIN_DATA_DIR, EVAL_DATA_DIR = (
    os.path.join(DATA_DIR, split) for split in ('train', 'eval')
)

# Base dir where TFT writes training data, and the prefix of the files it writes
TFT_TRAIN_FILE_PREFIX = 'train_transformed'
TFT_TRAIN_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tft_train')

# Base dir where TFT writes eval data, and the prefix of the files it writes
TFT_EVAL_FILE_PREFIX = 'eval_transformed'
TFT_EVAL_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tft_eval')

# Base dir for the 'tf' outputs
TF_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tf')

# Base dir where TFMA writes eval data
TFMA_OUTPUT_BASE_DIR = os.path.join(OUTPUT_DIR, 'tfma')

# Directory names for the serving and eval models
# (presumably relative to a run's output dir -- confirm at the use site).
SERVING_MODEL_DIR = 'serving_model_dir'
EVAL_MODEL_DIR = 'eval_model_dir'


def get_tft_train_output_dir(run_id):
    """Return the per-run directory under TFT_TRAIN_OUTPUT_BASE_DIR.

    Args:
        run_id: Identifier of the run; it is passed on to _get_output_dir.

    Returns:
        The path produced by _get_output_dir for the TFT training base dir.
    """
    base_dir = TFT_TRAIN_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)


def get_tft_eval_output_dir(run_id):
    """Return the per-run directory under TFT_EVAL_OUTPUT_BASE_DIR.

    Args:
        run_id: Identifier of the run; it is passed on to _get_output_dir.

    Returns:
        The path produced by _get_output_dir for the TFT eval base dir.
    """
    base_dir = TFT_EVAL_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)


def get_tf_output_dir(run_id):
    """Return the per-run directory under TF_OUTPUT_BASE_DIR.

    Args:
        run_id: Identifier of the run; it is passed on to _get_output_dir.

    Returns:
        The path produced by _get_output_dir for the TF base dir.
    """
    base_dir = TF_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)

def get_tfma_output_dir(run_id):
    """Return the per-run directory under TFMA_OUTPUT_BASE_DIR.

    Args:
        run_id: Identifier of the run; it is passed on to _get_output_dir.

    Returns:
        The path produced by _get_output_dir for the TFMA base dir.
    """
    base_dir = TFMA_OUTPUT_BASE_DIR
    return _get_output_dir(base_dir, run_id)

def _get_output_dir(base_dir, run_id):
    return os.path.join(base_dir, 'run_' + str(run_id))

def get_schema_file():
    """Return the path of ``schema.pbtxt`` directly under OUTPUT_DIR."""
    schema_filename = 'schema.pbtxt'
    return os.path.join(OUTPUT_DIR, schema_filename)


Clean up output directories.

In [8]:
# Remove leftovers from earlier runs: the scratch dir first, then the output dir.
# Each directory is only removed when it already exists.
for stale_dir in (TMP_DIR, OUTPUT_DIR):
    if hopsfs.exists(stale_dir):
        hopsfs.rmr(stale_dir)

Configure the Beam pipeline

In [9]:
# Command-line style flags for the portable runner, as provided by the hops helper.
pipeline_args = hops_beam.get_portable_runner_config()

# Wrap the flags in Beam pipeline options.
options = PipelineOptions(
    flags=pipeline_args,
)

## Compute and visualize descriptive data statistics

In [10]:
# Compute stats over training data.
train_csv_path = os.path.join(TRAIN_DATA_DIR, 'data.csv')
train_stats_path = os.path.join(TMP_DIR, 'train_stats')

# Runs with the Beam options configured earlier; the result is kept in train_stats.
train_stats = tfdv.generate_statistics_from_csv(
    data_location=train_csv_path,
    output_path=train_stats_path,
    pipeline_options=options,
)