In [1]:
# !pip install 'apache-beam[gcp]'

In [2]:
import apache_beam as beam

In [3]:
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from google.cloud import pubsub_v1

In [4]:
from apache_beam import DoFn, GroupByKey, io, ParDo, Pipeline, PTransform, WindowInto, WithKeys
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms.window import FixedWindows

In [5]:
from datetime import datetime


In [13]:
class GroupMessagesByFixedWindows(PTransform):
    """A composite transform that windows Pub/Sub messages by publish time and
    groups them under a random shard key.

    Each input element is placed in a fixed window, converted by `AddTimestamp`
    into a `(message, publish_time)` tuple, tagged with a random key in
    `[0, num_shards - 1]`, and finally grouped by that key within its window.

    Args:
        window_size: Window length in minutes (fractions allowed); it is
            converted to whole seconds for `FixedWindows`.
        num_shards: Number of distinct random keys used to spread the elements
            of one window across groups.
    """

    def __init__(self, window_size, num_shards=5):
        # Initialise the PTransform base class before adding our own state.
        super().__init__()
        # `window_size` is given in minutes; FixedWindows expects seconds.
        self.window_size = int(window_size * 60)
        self.num_shards = num_shards

    def expand(self, pcoll):
        """Applies windowing, timestamp extraction, sharding and grouping to `pcoll`."""
        num_shards = self.num_shards

        def assign_shard_key(_):
            # Imported at function scope: the notebook never imports `random`
            # globally, and a local import is also resolved on the worker that
            # actually executes this function.
            import random

            return random.randint(0, num_shards - 1)

        return (
            pcoll
            # Bind window info to each element using element timestamp (or publish time).
            | "Window into fixed intervals"
            >> WindowInto(FixedWindows(self.window_size))
            | "Add timestamp to windowed elements" >> ParDo(AddTimestamp())
            # Assign a random key to each windowed element based on the number of shards.
            | "Add key" >> WithKeys(assign_shard_key)
            # Group windowed elements by key. All the elements in the same window must fit
            # memory for this. If not, you need to use `beam.util.BatchElements`.
            | "Group by key" >> GroupByKey()
        )

In [14]:
class AddTimestamp(DoFn):
    """Pairs each message body with a human-readable form of its timestamp."""

    def process(self, element, publish_time=DoFn.TimestampParam):
        """Yields `(body, timestamp_text)` for one windowed element.

        Args:
            element: The raw message payload as bytes; decoded as UTF-8.
            publish_time: The element's timestamp, supplied through
                `DoFn.TimestampParam`.

        Yields:
            A 2-tuple of the decoded body and the timestamp rendered as
            "%Y-%m-%d %H:%M:%S.%f" (naive UTC).
        """
        body = element.decode("utf-8")
        published_at = datetime.utcfromtimestamp(float(publish_time))
        yield (body, published_at.strftime("%Y-%m-%d %H:%M:%S.%f"))

In [15]:
def run(input_topic, output_path, window_size=1.0, num_shards=5, pipeline_args=None):
    """Builds and runs a streaming pipeline that windows Pub/Sub messages.

    Args:
        input_topic: Pub/Sub topic to read from.
        output_path: Currently unused; the step that would consume it is
            commented out below.
        window_size: Window length in minutes, forwarded to
            `GroupMessagesByFixedWindows`.
        num_shards: Number of shard keys, forwarded to
            `GroupMessagesByFixedWindows`.
        pipeline_args: Extra arguments passed through to `PipelineOptions`.
    """
    # `save_main_session=True` lets DoFns access globally imported modules.
    options = PipelineOptions(pipeline_args, streaming=True, save_main_session=True)

    with Pipeline(options=options) as pipeline:
        # Because `timestamp_attribute` is unspecified in `ReadFromPubSub`, Beam
        # binds the publish time returned by the Pub/Sub server for each message
        # to the element's timestamp parameter, accessible via `DoFn.TimestampParam`.
        # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
        raw_messages = pipeline | "Read from Pub/Sub" >> io.ReadFromPubSub(
            topic=input_topic
        )
        raw_messages | "Window into" >> GroupMessagesByFixedWindows(
            window_size, num_shards
        )
        # Disabled step: | "Write to GCS" >> ParDo(WriteToGCS(output_path))


In [None]:
# parser = argparse.ArgumentParser()
# parser.add_argument(
#     "--input_topic",
#     help="The Cloud Pub/Sub topic to read from."
#     '"projects/<PROJECT_ID>/topics/<TOPIC_ID>".',
# )
# parser.add_argument(
#     "--window_size",
#     type=float,
#     default=1.0,
#     help="Output file's window size in minutes.",
# )
# parser.add_argument(
#     "--output_path",
#     help="Path of the output GCS file including the prefix.",
# )
# parser.add_argument(
#     "--num_shards",
#     type=int,
#     default=5,
#     help="Number of shards to use when writing windowed elements to GCS.",
# )
# known_args, pipeline_args = parser.parse_known_args()

In [6]:
class ProcessMessage(beam.DoFn):
    """Extracts the value stored under the 'data' key of each message."""

    def process(self, message):
        """Yields `message.get('data')`.

        Args:
            message: A mapping-like object exposing `.get`.

        Yields:
            The value under 'data', or None when the key is absent.

        Errors are deliberately left to propagate unchanged: re-wrapping them
        in a bare `Exception` would hide the real exception type.
        """
        yield message.get('data')

# Test

In [7]:
# Small in-memory sample used to try Beam transforms without a pipeline.
var1 = list(range(1, 5))

In [8]:
# Apply a transform directly to the plain list: every value is doubled.
var1 | beam.Map(lambda value: value * 2)





[2, 4, 6, 8]

In [9]:
# Pair every value with its remainder modulo 2.
var1 | beam.Map(lambda value: [value, value % 2])




[[1, 1], [2, 0], [3, 1], [4, 0]]

# Pub/Sub

In [33]:
import json
import typing

In [34]:
# Environment-specific identifiers for the Google Cloud resources used below.
# NOTE(review): these are hard-coded; consider reading them from configuration
# or environment variables so the notebook is portable.
project_number = 294601891609
project_id = "crypto-busting-374123"  # interpolated into `input_topic`
location = "europe-central2"  # not referenced in the visible cells
subscription_id = "bda-coinbase-topic-sub"  # not referenced in the visible cells
topic_id = "bda-coinbase-topic"  # interpolated into `input_topic`

# Bigtable identifiers; only referenced by the commented-out WriteToBigTable cell.
bigtable_instance_id = "bda-reddit-bigtable"
bigtable_table_id = "coinbase-db"
timeout = 5.0  # presumably seconds; not referenced in the visible cells — TODO confirm

In [35]:
# Topic path built from the project and topic ids; passed to `run` and from
# there to `ReadFromPubSub(topic=...)`.
input_topic = f"projects/{project_id}/topics/{topic_id}"

In [13]:
# WriteToBigTable(
#             project_id=project_id,
#             instance_id=bigtable_instance_id,
#             table_id=bigtable_table_id)

In [36]:
class MyMessage(typing.NamedTuple):
    """Single-field record used to carry the needed information from a Pub/Sub
    message between pipeline steps.
    """

    # Value taken from the message's 'data' key by `ProcessMessage`.
    # NOTE(review): annotated as dict, but `ReadFile` passes it to
    # `FileSystems.open` — confirm the intended type.
    data: dict

In [37]:
class ProcessMessage(beam.DoFn):
    """Wraps the 'data' entry of a parsed Pub/Sub message in a `MyMessage`."""

    def process(self, message):
        """Yields one `MyMessage` built from `message.get('data')`.

        Example of the Pub/Sub message, as recorded by the original author
        (note that it shows no top-level 'data' key — TODO confirm the schema):
        {
            "file_path": "gs://my-bucket/file_to_process.csv",
            "transformations": {
                "col_1": "to_upper",
                "col_2": "to_lower"
            }
        }
        """
        payload = message.get('data')
        yield MyMessage(data=payload)

In [38]:
class ReadFile(beam.DoFn):
    """Opens the file referenced by a message and emits its rows."""

    def process(self, element: MyMessage):
        """Yields each row of the ';'-delimited CSV file named by `element.data`.

        Args:
            element: The `MyMessage` whose `data` field is handed to
                `FileSystems.open`.

        Yields:
            One dict per CSV row, keyed by the header line.
        """
        # Function-scope imports: `csv` is not imported anywhere else in the
        # notebook, and local imports are also resolved on the worker.
        import csv
        import io as io_file

        from apache_beam.io.filesystems import FileSystems

        # Read the path from the *instance*; `MyMessage.data` is only the
        # class-level field descriptor, not a value.
        # NOTE(review): assumes `element.data` is a path/URI string that
        # FileSystems can open, although MyMessage annotates it as dict — confirm.
        raw_stream = FileSystems.open(element.data)

        # Closing the text wrapper also closes the underlying stream.
        # `newline=''` is what the csv module recommends for file objects.
        with io_file.TextIOWrapper(
            raw_stream, encoding='utf-8', newline=''
        ) as text_stream:
            yield from csv.DictReader(text_stream, delimiter=';')

In [40]:
def run(input_topic, pipeline_args=None):
    """Builds and runs a streaming pipeline that reads files named in Pub/Sub messages.

    Messages are read from `input_topic`, parsed as JSON and converted by
    `ProcessMessage`. The resulting collection is then used by two independent
    branches: one prints each message, the other reads the referenced file with
    `ReadFile` and prints every row.

    Args:
        input_topic: Pub/Sub topic to read from.
        pipeline_args: Extra arguments passed through to `PipelineOptions`.
    """
    # Set `save_main_session` to True so DoFns can access globally imported modules.
    pipeline_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True
    )

    with Pipeline(options=pipeline_options) as pipeline:
        messages = (
            pipeline
            # Because `timestamp_attribute` is unspecified in `ReadFromPubSub`, Beam
            # binds the publish time returned by the Pub/Sub server for each message
            # to the element's timestamp parameter, accessible via `DoFn.TimestampParam`.
            # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
            | "Read from Pub/Sub" >> io.ReadFromPubSub(topic=input_topic)
            | "Parse JSON" >> beam.Map(json.loads)
            | "Process message" >> beam.ParDo(ProcessMessage())
        )

        # Printing is a side branch. `print` returns None, so keeping it inside
        # the main chain would replace every message with None before it
        # reached `ReadFile`.
        messages | "Print Message" >> beam.Map(print)

        (
            messages
            | "Read file" >> beam.ParDo(ReadFile())
            | "Print row" >> beam.Map(print)
        )

In [None]:
# Start the streaming pipeline on the configured topic. The recorded output of
# this cell is a KeyboardInterrupt, i.e. the run was stopped manually.
run(input_topic)

KeyboardInterrupt: 