## Summary

Collect one day's worth of events from the `projects/pubsub-public-data/topics/taxirides-realtime` topic.

## Imports

In [None]:
import json
import time

import pyarrow as pa
import pyarrow.parquet as pq
import tqdm
from apache_beam.io.gcp.tests import utils as gcp_utils
from google.api_core import exceptions as gexc
from google.cloud import pubsub

## Parameters

In [None]:
# Identifier for this notebook; no visible cell reads it.
NOTEBOOK_NAME = "collect_pubsub_events"

In [None]:
# Public topic that publishes the taxi ride events (not referenced by the
# visible cells; the subscription below is what gets read).
topic_path = "projects/pubsub-public-data/topics/taxirides-realtime"
# Subscription the events are pulled from.
# NOTE(review): presumably attached to `topic_path` - confirm.
subscription_path = "projects/strokach-playground/subscriptions/taxirides-realtime-5a2de6b6587b4d788e665c6722e267ff"
# Snapshot of the subscription, used to seek back before collecting.
snapshot_path = "projects/strokach-playground/snapshots/taxirides-realtime-5a2de6b6587b4d788e665c6722e267ff"

## Functions

In [None]:
def read_from_pubsub(
    sub_client,
    subscription_path,
    with_attributes=False,
    number_of_elements=None,
    timeout=None,
):
    """Yield messages pulled from a Pub/Sub subscription, acknowledging each batch.

    Args:
        sub_client: Subscriber client exposing ``pull`` and ``acknowledge``.
        subscription_path: Full path of the subscription to pull from.
        with_attributes: Accepted for interface compatibility; currently unused.
        number_of_elements: Stop once this many messages have been pulled, or
            ``None`` for no count limit.
        timeout: Stop issuing new pulls once this many seconds have elapsed
            since the first iteration, or ``None`` for no time limit.

    Yields:
        The received-message objects of each pull response, in response order.

    Raises:
        ValueError: If both ``number_of_elements`` and ``timeout`` are ``None``.
            Because this is a generator, the error surfaces on first iteration.
    """
    if number_of_elements is None and timeout is None:
        raise ValueError("Either number_of_elements or timeout must be specified.")
    max_pull_size = 10000
    # Running count of pulled messages; this is what enforces
    # `number_of_elements`.
    num_received = 0
    start_time = time.time()

    while (number_of_elements is None or num_received < number_of_elements) and (
        timeout is None or (time.time() - start_time) < timeout
    ):
        # Never request more than is still needed: every pulled message is
        # acknowledged, so any surplus would be consumed and lost.
        if number_of_elements is None:
            pull_size = max_pull_size
        else:
            pull_size = min(max_pull_size, number_of_elements - num_received)
        try:
            response = sub_client.pull(
                subscription_path, max_messages=pull_size, retry=None, timeout=30
            )
        except (gexc.RetryError, gexc.DeadlineExceeded):
            # The pull timed out without a response; try again while the
            # loop limits still allow it.
            continue
        received = response.received_messages
        if not received:
            # Nothing to acknowledge or yield; avoid an acknowledge request
            # that carries no ack IDs.
            continue
        ack_ids = [msg.ack_id for msg in received]
        sub_client.acknowledge(subscription_path, ack_ids)
        num_received += len(received)
        yield from received

## Workflow

In [None]:
# Subscriber client shared by the snapshot, seek, and pull cells below.
sub_client = pubsub.SubscriberClient()

In [None]:
# Create a snapshot of the subscription so collection can later seek back to
# it. Re-running this cell is harmless: an existing snapshot is kept.
try:
    sub_client.create_snapshot(snapshot_path, subscription_path)
except gexc.AlreadyExists:
    # A snapshot with this name already exists; reuse it.
    pass

In [None]:
# Arrow schema of the output Parquet file: one row per Pub/Sub message, with
# the publish timestamp split into seconds and nanoseconds and the payload and
# attributes stored as raw bytes.
schema = pa.schema(
    [
        pa.field("publish_time_secs", pa.int64()),
        pa.field("publish_time_nanos", pa.int64()),
        pa.field("message_id", pa.string()),
        pa.field("message_data", pa.binary()),
        pa.field("message_attributes", pa.binary()),
    ]
)

In [None]:
# Show the size of the existing output file (IPython shell escape).
!ls -lSh taxirides.parquet

In [None]:
# Keep a backup copy: the collection cell below opens a new writer on
# taxirides.parquet.
!cp taxirides.parquet taxirides.parquet.bak

In [None]:
# Open the existing Parquet file, report how many row groups it holds, and
# load only the first row group into a DataFrame for inspection.
pfile = pq.ParquetFile("taxirides.parquet")
print(pfile.num_row_groups)
df = pfile.read_row_group(0).to_pandas()

In [None]:
# Preview the first rows of the loaded row group.
df.head()

In [None]:
def create_new_data_chunk():
    """Return an empty column buffer with a fresh list for every output column."""
    column_names = (
        "publish_time_secs",
        "publish_time_nanos",
        "message_id",
        "message_data",
        "message_attributes",
    )
    return {column: [] for column in column_names}


# Seek the subscription to the snapshot before collecting.
sub_client.seek(subscription_path, snapshot=snapshot_path)


# Number of messages buffered in memory before they are written to the file.
batch_size = 1_000_000
with pq.ParquetWriter('taxirides.parquet', schema) as writer:
    data_chunk = create_new_data_chunk()
    # NOTE(review): number_of_elements=10 does not match the progress-bar
    # total of 7_000_000 - confirm which limit is intended for a full run.
    for msg in tqdm.tqdm_notebook(read_from_pubsub(sub_client, subscription_path, number_of_elements=10), total=7_000_000):
        data_chunk["publish_time_secs"].append(msg.message.publish_time.seconds)
        data_chunk["publish_time_nanos"].append(msg.message.publish_time.nanos)
        data_chunk["message_id"].append(msg.message.message_id)
        data_chunk["message_data"].append(msg.message.data)
        # Attributes are serialized as UTF-8 JSON so they fit the binary column.
        data_chunk["message_attributes"].append(json.dumps(dict(msg.message.attributes.items())).encode("utf-8"))
        if len(data_chunk["publish_time_secs"]) >= batch_size:
            writer.write_table(pa.Table.from_pydict(data_chunk, schema=schema))
            data_chunk = create_new_data_chunk()
    # Write whatever is left in the buffer; otherwise the last partial chunk
    # (fewer than batch_size rows) would be silently dropped.
    if data_chunk["publish_time_secs"]:
        writer.write_table(pa.Table.from_pydict(data_chunk, schema=schema))

In [None]:
# NOTE(review): scratch assignment; no visible cell reads `preserve_index`.
# Looks like a leftover keyword argument - confirm or delete.
preserve_index=False

In [None]:
# NOTE(review): `table` is not defined in any visible cell; unless it is
# defined elsewhere this raises NameError.
df = table.to_pandas()

In [None]:
# Number of rows in the DataFrame.
len(df)

In [None]:
# Inspect the last message left bound by the collection loop.
msg

In [None]:
# Publish timestamp of that message (has `seconds` and `nanos` fields).
msg.message.publish_time

In [None]:
# NOTE(review): these bare names match the schema column names but are not
# defined as variables in any visible cell; evaluating this raises NameError.
publish_time_secs, publish_time_nanos, message_id, message_data, message_attributes

In [None]:
# Value stored in the `publish_time_secs` column.
msg.message.publish_time.seconds

In [None]:
# Value stored in the `publish_time_nanos` column.
msg.message.publish_time.nanos

In [None]:
# Value stored in the `message_id` column.
msg.message.message_id

In [None]:
# Raw payload stored in the `message_data` column.
msg.message.data

In [None]:
# Attribute mapping that the collection loop serializes to JSON bytes.
msg.message.attributes