In [1]:
import apache_beam as beam
from apache_beam.runners import DataflowRunner
from apache_beam.runners.interactive.interactive_runner import InteractiveRunner
from apache_beam.runners.interactive import interactive_beam as ib
from apache_beam.options import pipeline_options
from apache_beam.options.pipeline_options import GoogleCloudOptions

import google.auth

In [2]:
# Build the Apache Beam pipeline options from an empty flag list.
options = pipeline_options.PipelineOptions(flags=[])

# All Google Cloud settings below are written through this single view of `options`.
gcp_options = options.view_as(GoogleCloudOptions)

# Use the default project of the current Google Cloud environment.
# google.auth.default() yields a (credentials, project) pair; the credentials are
# bound to a named throwaway rather than `_`, which IPython also uses for the
# last cell output.
_credentials, gcp_options.project = google.auth.default()

# Google Cloud region in which Cloud Dataflow runs.
gcp_options.region = 'us-central1'

# Adjust the following to choose a Cloud Storage location.
dataflow_gcs_location = 'gs://ningk-test/dataflow'

# Dataflow staging location: where the Dataflow pipeline and SDK binary are staged.
gcp_options.staging_location = '%s/staging' % dataflow_gcs_location

# Dataflow temp location: holds temporary files or intermediate results before
# they are finally written to the sink.
gcp_options.temp_location = '%s/temp' % dataflow_gcs_location

# A locally built SDK tarball is only needed for a dev version of the SDK, so
# sdk_location is only overridden in that case. Setting it unconditionally would
# point a release SDK at a tarball name that is unlikely to exist.
# NOTE(review): the '0' after %s presumably completes a '.dev' version into the
# '.dev0' suffix of the built tarball -- confirm against the file in dist/.
sdk_version = beam.version.__version__
if 'dev' in sdk_version:
    options.view_as(pipeline_options.SetupOptions).sdk_location = (
        '/Users/ningk/beam/sdks/python/dist/apache-beam-%s0.tar.gz' % sdk_version)

In [3]:
# Recording duration applied through the Interactive Beam options,
# kept in a named constant so it is easy to find and tune.
RECORDING_DURATION = '15m'
ib.options.recording_duration = RECORDING_DURATION

In [4]:
from apache_beam.runners.interactive import interactive_environment as ie
from apache_beam.runners.interactive.cache_manager import FileBasedCacheManager

# Wrap the Dataflow runner in the interactive runner and build the pipeline
# with the options configured in the earlier cell.
interactive_dataflow_runner = InteractiveRunner(DataflowRunner())
p = beam.Pipeline(interactive_dataflow_runner, options=options)

# Register a file-based cache manager for this pipeline, rooted at a
# Cloud Storage directory.
interactive_env = ie.current_env()
interactive_env.set_cache_manager(
    FileBasedCacheManager(cache_dir='gs://ningk-test/interactive_cache'),
    p,
)

In [5]:
# Feed the pipeline from an in-memory source holding the integers 0 through 9.
elements = range(10)
pcoll = p | beam.Create(elements)

In [6]:
# Collect the PCollection through Interactive Beam, passing n=10 as the
# element limit (presumably a DataFrame comes back, per the variable name --
# confirm against the ib.collect documentation).
max_elements = 10
df_pcoll = ib.collect(pcoll, n=max_elements)

1611701504.287724




1611701515.881569
d0ecef52d4-5158551616-5158806480-5158603408
[]


In [7]:
# Look up the cache manager associated with pipeline `p` in the current
# interactive environment.
current_interactive_env = ie.current_env()
cm = current_interactive_env.get_cache_manager(p)

In [9]:
# Load the coder stored under the 'full' label for one cached PCollection.
# NOTE(review): this hard-coded key differs from the identifier printed under
# the collect cell (d0ecef52d4-5158551616-5158806480-5158603408) and looks
# session-specific -- confirm it refers to the intended cache entry.
cache_key = 'd0ecef52d4-140147735962000-140147735826512-140147747245520'
coder = cm.load_pcoder('full', cache_key)

In [12]:
# Open a tailing read on the 'full' cache entry; only the first element of the
# returned pair (the reader) is used afterwards.
# NOTE(review): this hard-coded key differs from the identifier printed under
# the collect cell (d0ecef52d4-5158551616-5158806480-5158603408) and looks
# session-specific -- confirm it refers to the intended cache entry.
cache_key = 'd0ecef52d4-140147735962000-140147735826512-140147747245520'
read_result = cm.read('full', cache_key, tail=True)
reader, _ = read_result

In [13]:
# Display the reader obtained from cm.read. The recorded output is only a
# list_iterator repr, so this shows the object's type, not the cached elements.
reader

<list_iterator at 0x7f76ab116e10>