# Imports

In [1]:
from __future__ import print_function

import multiprocessing
import os
import os.path as op
import subprocess
import time

import apache_beam as beam

try:
    from pathlib import Path
except ImportError:
    from pathlib2 import Path

In [2]:
# Record the Apache Beam version this notebook was executed with.
print(beam.__version__)

2.14.0.dev


# Functions

In [3]:
def count_open_files():
    """Count the number of files opened by the current process.

    Runs ``lsof -p <pid>`` for this process and counts the lines of its
    output, subtracting one for the header row.

    Returns:
        int: Number of entries reported by ``lsof`` (never negative).

    Raises:
        subprocess.CalledProcessError: If ``lsof`` exits with a non-zero status.
        OSError: If the ``lsof`` executable cannot be launched.
    """
    pid = multiprocessing.current_process().pid
    lsof_out = subprocess.check_output(["lsof", "-p", str(pid)])
    # check_output returns bytes on Python 3 and str on Python 2.
    # splitlines() works on both, whereas split("\n") raises TypeError on bytes.
    num_lines = len(lsof_out.strip().splitlines())
    # Drop the header row; clamp so that empty output yields 0 rather than -1.
    return max(num_lines - 1, 0)


# Baseline: number of open files before any pipeline has been started.
count_open_files()

205

# Parameters

In [4]:
# GCP project id shared by the environment variable and the gcloud CLI below.
project_id = 'strokach-playground'

# Export the project through the environment and make it the active gcloud project.
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
!gcloud config set project {project_id}

Updated property [core/project].


In [5]:
# Directory named by NOTEBOOK_NAME, resolved to an absolute path.
NOTEBOOK_NAME = "directrunner_streaming_tmof"
NOTEBOOK_PATH = Path(NOTEBOOK_NAME).resolve()
# exist_ok=True keeps a re-run of this cell from failing once the directory exists.
NOTEBOOK_PATH.mkdir(exist_ok=True)

NOTEBOOK_PATH

PosixPath('/home/strokach/workspace/beam-notebooks/directrunner_streaming_tmof')

# Pipeline

In [6]:
# Instantiate Beam's DirectRunner and display it to see which concrete class is created.
runner = beam.runners.direct.DirectRunner()
runner

<apache_beam.runners.direct.direct_runner.SwitchingDirectRunner at 0x7f1076d742d0>

In [7]:
# Keyword options for the streaming job, gathered in one mapping before being
# handed to PipelineOptions.
option_kwargs = dict(
    streaming=True,
    job_name="taxirides-realtime",
    project=project_id,
)
pipeline_options = beam.pipeline.PipelineOptions(**option_kwargs)
# Show the options as Beam recorded them.
pipeline_options.display_data()



{'job_name': 'taxirides-realtime',
 'project': 'strokach-playground',
 'streaming': True}

In [8]:
# Build the topic path from project_id instead of repeating the project name as
# a literal, so the topic cannot drift from the project used in pipeline_options.
taxirides_topic = "projects/{}/topics/taxirides-realtime".format(project_id)

# Create the pipeline and attach a single Pub/Sub read step; the pipeline itself
# stays reachable through taxirides_pc.pipeline.
taxirides_pc = beam.Pipeline(
    runner=runner, options=pipeline_options
) | "Read" >> beam.io.ReadFromPubSub(topic=taxirides_topic)

In [9]:
# Start the pipeline and keep the result handle so its state can be inspected later.
results = taxirides_pc.pipeline.run()

In [10]:
# Stop polling once the process holds more than this many open files.
MAX_OPEN_FILES = 1000
# Seconds to wait between successive polls.
POLL_INTERVAL_S = 5

start_time = time.time()
while True:
    num_open_files = count_open_files()
    print(
        "Time elapsed: {:3.0f}s, State: {:s}; Number of open files: {}".format(
            time.time() - start_time, results.state, num_open_files
        )
    )
    if num_open_files > MAX_OPEN_FILES:
        break
    # Without this check the loop never ends if the pipeline stops (done,
    # failed, cancelled, ...) before the open-file threshold is reached.
    if beam.runners.PipelineState.is_terminal(results.state):
        break
    time.sleep(POLL_INTERVAL_S)

Time elapsed:   0s, State: RUNNING; Number of open files: 218
Time elapsed:   5s, State: RUNNING; Number of open files: 225
Time elapsed:  10s, State: RUNNING; Number of open files: 241
Time elapsed:  15s, State: RUNNING; Number of open files: 325
Time elapsed:  20s, State: RUNNING; Number of open files: 452
Time elapsed:  25s, State: RUNNING; Number of open files: 589
Time elapsed:  31s, State: RUNNING; Number of open files: 735
Time elapsed:  36s, State: RUNNING; Number of open files: 880
Time elapsed:  41s, State: RUNNING; Number of open files: 1013


NameError: name 'breakd' is not defined

In [1]:
from google.cloud import pubsub

In [2]:
# Create a Pub/Sub subscriber client directly, without going through Beam.
sub_client = pubsub.SubscriberClient()

In [4]:
# Display the channel object held by the subscriber client's transport.
# (The trailing "." left over from tab-completion was a syntax error.)
sub_client.api.transport.channel

<grpc._channel.Channel at 0x7ff9462fe0d0>