In [29]:
import ast
import json
import os
import threading
from concurrent.futures._base import TimeoutError

from google.cloud.pubsublite.cloudpubsub import SubscriberClient
from google.cloud.pubsublite.types import (
    CloudRegion,
    CloudZone,
    FlowControlSettings,
    MessageMetadata,
    SubscriptionPath,
)
from google.pubsub_v1 import PubsubMessage

In [59]:
# --- Pub/Sub Lite subscription settings ---
project_number = 1_072_423_212_419
cloud_region, zone_id = "europe-central2", "a"
subscription_id = "spark-coinbase-lite-sub"

# True -> the location is the region alone; False -> region plus zone_id.
regional = True

# Passed as the timeout of the streaming pull's result() call.
timeout = 90

In [60]:
# Pick the location object for the subscription path: the bare region when
# `regional` is set, otherwise the zone built from the region and `zone_id`.
location = (
    CloudRegion(cloud_region)
    if regional
    else CloudZone(CloudRegion(cloud_region), zone_id)
)

In [61]:
# Path object identifying the subscription to pull from.
subscription_path = SubscriptionPath(
    project_number,
    location,
    subscription_id,
)

In [62]:
# Per-partition limits passed to the subscriber.
per_partition_flow_control_settings = FlowControlSettings(
    # At most 1,000 messages outstanding (must be > 0).
    messages_outstanding=1_000,
    # At most 10 MiB outstanding; must exceed the largest allowed message (1 MiB).
    bytes_outstanding=10 * 1024**2,
)

In [63]:
# Accumulators filled by the subscriber callback: decoded payloads and the
# ordering key of each message.
messages, timestamps = [], []

In [64]:
# Guards the paired appends in `callback` so both lists stay index-aligned.
_append_lock = threading.Lock()


def callback(message: PubsubMessage):
    """Print, acknowledge and record one received message.

    The UTF-8 decoded payload is appended to the global ``messages`` list and
    the message's ordering key to the global ``timestamps`` list, so that the
    same index in both lists refers to the same message.

    Args:
        message: The message handed over by the subscriber client.
    """
    message_data = message.data.decode("utf-8")
    ordering_key = message.ordering_key
    metadata = MessageMetadata.decode(message.message_id)
    print(
        f"Received {message_data} of ordering key {ordering_key} with id {metadata}."
    )
    message.ack()
    # The subscriber may invoke callbacks from several threads at once. Without
    # the lock, another callback could append between the two statements below
    # and leave a payload paired with the wrong ordering key.
    with _append_lock:
        messages.append(message_data)
        timestamps.append(ordering_key)

In [65]:
# Pull messages for at most `timeout` seconds (or until interrupted); the
# client is closed when the `with` block exits.
with SubscriberClient() as subscriber_client:

    streaming_pull_future = subscriber_client.subscribe(
        subscription_path,
        callback=callback,
        per_partition_flow_control_settings=per_partition_flow_control_settings,
    )

    print(f"Listening for messages on {str(subscription_path)}...")

    try:
        streaming_pull_future.result(timeout=timeout)
    # A tuple is required here: `TimeoutError or KeyboardInterrupt` evaluates to
    # just `TimeoutError`, so an interrupt would skip the cancellation below.
    except (TimeoutError, KeyboardInterrupt):
        streaming_pull_future.cancel()
        assert streaming_pull_future.done()

Listening for messages on projects/1072423212419/locations/europe-central2/subscriptions/spark-coinbase-lite-sub...
Received {"product_id":"SOL-USD","price":24.48} of ordering key 2023-01-28 00:02:31.227526 with id MessageMetadata(partition=Partition(value=0), cursor=offset: 3916
).
Received {"product_id":"ETH-USD","price":1598.96} of ordering key 2023-01-28 00:02:31.134715 with id MessageMetadata(partition=Partition(value=0), cursor=offset: 3917
).
Received {"product_id":"DOGE-USD","price":0.0874} of ordering key 2023-01-28 00:02:32.050079 with id MessageMetadata(partition=Partition(value=0), cursor=offset: 3918
).
Received {"product_id":"BTC-USD","price":23090.61} of ordering key 2023-01-28 00:02:32.255333 with id MessageMetadata(partition=Partition(value=0), cursor=offset: 3919
).
Received {"product_id":"ADA-USD","price":0.3944} of ordering key 2023-01-28 00:02:32.937742 with id MessageMetadata(partition=Partition(value=0), cursor=offset: 3920
).
Received {"product_id":"SOL-USD","pr

In [66]:
# Number of payloads collected in `messages`.
len(messages)

25

In [67]:
# Peek at the first collected payload (a decoded string, not yet parsed).
messages[0]

'{"product_id":"ETH-USD","price":1598.96}'

In [68]:
# The payloads are JSON documents, so parse them with a JSON parser.
# `ast.literal_eval` only understands Python literals and would fail on valid
# JSON tokens such as `true`, `false` or `null`.
response = [json.loads(payload) for payload in messages]

In [69]:
# Spot-check one parsed payload.
response[2]

{'product_id': 'DOGE-USD', 'price': 0.0874}

## Bigtable: store the received messages and read them back

In [70]:
import datetime

In [71]:
from google.cloud import bigtable
from google.cloud.bigtable import column_family
from google.auth.credentials import Credentials
from google.oauth2 import service_account
from google.cloud.bigtable.row_set import RowSet
import google.cloud.bigtable.row_filters as row_filters

In [72]:
# Location of the service-account key file. Set the BIGTABLE_KEY_PATH
# environment variable to run the notebook on another machine; the previous
# hard-coded path remains the default.
bigtable_key_path = os.environ.get(
    "BIGTABLE_KEY_PATH",
    "/home/bda_crypto_busters/repos/BigDataAnalytics/2_data_preprocessing/reddit/bigtable_key.json",
)
credentials = service_account.Credentials.from_service_account_file(bigtable_key_path)

In [73]:
# Bigtable client created with admin=True and read_only=False, authenticated
# with the service-account credentials loaded above.
client = bigtable.Client(
    project="crypto-busting-375023",
    credentials=credentials,
    admin=True,
    read_only=False,
)

# Handles for the instance and for the table the messages are written to.
instance = client.instance(instance_id="bda-bigtable-instance")
table = instance.table(table_id="coinbase-table")

In [74]:
# Show the column families of the table.
table.list_column_families()

{'coinbase-cf': <google.cloud.bigtable.column_family.ColumnFamily at 0x7fdf7e17c220>,
 'coinbase': <google.cloud.bigtable.column_family.ColumnFamily at 0x7fdf7e17cbe0>}

In [75]:
# Column-family object for "coinbase".
# NOTE(review): `cf` is not referenced by any later cell visible here — confirm it is still needed.
cf = table.column_family("coinbase")


In [76]:
def print_row(row):
    """Print every cell of one row in a human-readable layout.

    Output starts with the UTF-8 decoded row key, followed by each column
    family in sorted order. Under a family, columns are listed in sorted order
    and every cell of a column gets one tab-indented line holding the decoded
    column name, the decoded value, the cell timestamp and, if the cell has
    labels, the comma-joined labels in square brackets. An empty line closes
    the row.

    Args:
        row: Row object exposing ``row_key`` (bytes) and ``cells``, a mapping
            of family name -> {column qualifier (bytes) -> list of cells}.
    """
    row_key = row.row_key.decode("utf-8")
    print("Reading data for {}:".format(row_key))

    for family_name in sorted(row.cells):
        print("Column Family {}".format(family_name))
        columns = row.cells[family_name]
        for qualifier in sorted(columns):
            for cell in columns[qualifier]:
                # Only cells that carry labels get the bracketed suffix.
                if len(cell.labels):
                    label_suffix = " [{}]".format(",".join(cell.labels))
                else:
                    label_suffix = ""
                line = "\t{}: {} @{}{}".format(
                    qualifier.decode("utf-8"),
                    cell.value.decode("utf-8"),
                    cell.timestamp,
                    label_suffix,
                )
                print(line)

    print("")


def write_simple(table, keys, messages):
    """Write one row per message into the "coinbase" column family.

    Each key is paired with the message at the same position; every field of
    the message becomes a cell whose column is the field name and whose value
    is ``str(field_value)``. All cells written by one call share a single
    timestamp. Rows are committed one at a time and a confirmation line is
    printed after each commit.

    Args:
        table: Table object providing ``direct_row(key)``.
        keys: Iterable of row keys.
        messages: Iterable of dict-like messages (field name -> value).
            Pairing uses ``zip``, so surplus items in the longer iterable
            are ignored.
    """
    # Timezone-aware "now" in UTC. `datetime.utcnow()` is deprecated and
    # returns a naive datetime that is easy to misread as local time.
    timestamp = datetime.datetime.now(datetime.timezone.utc)
    column_family_id = "coinbase"

    for key, message in zip(keys, messages):
        row = table.direct_row(key)
        for column, value in message.items():
            row.set_cell(column_family_id, column, str(value), timestamp)

        row.commit()
        print("Successfully wrote row {}.".format(key))

    
def read_prefix(table, prefix):
    """Read and print every row whose key starts with ``prefix``.

    The prefix is turned into the half-open key range ``[start, end)``, where
    ``start`` is the UTF-8 encoded prefix and ``end`` is the same byte string
    with its last byte incremented. An empty prefix matches every key, so the
    range is left unbounded on both sides.

    Args:
        table: Table object providing ``read_rows(row_set=...)``.
        prefix: Row-key prefix as a string.
    """
    start_key = prefix.encode("utf-8")
    if start_key:
        # Work on bytes, not code points: row keys are compared bytewise.
        # UTF-8 never produces the byte 0xFF, so the increment cannot overflow.
        end_key = start_key[:-1] + bytes([start_key[-1] + 1])
    else:
        # Nothing to bound the scan with.
        start_key = None
        end_key = None

    row_set = RowSet()
    row_set.add_row_range_from_keys(
        start_key, end_key, start_inclusive=True, end_inclusive=False
    )

    for row in table.read_rows(row_set=row_set):
        print_row(row)

In [77]:
# First collected ordering key; these values are passed as the row keys below.
timestamps[0]

'2023-01-28 00:02:31.134715'

In [78]:
# Write each parsed payload as a row keyed by its ordering key.
write_simple(table, timestamps, response)

Successfully wrote row 2023-01-28 00:02:31.134715.
Successfully wrote row 2023-01-28 00:02:31.227526.
Successfully wrote row 2023-01-28 00:02:32.050079.
Successfully wrote row 2023-01-28 00:02:32.255333.
Successfully wrote row 2023-01-28 00:02:32.937742.
Successfully wrote row 2023-01-28 00:07:32.385047.
Successfully wrote row 2023-01-28 00:07:32.397823.
Successfully wrote row 2023-01-28 00:07:27.498972.
Successfully wrote row 2023-01-28 00:07:28.5509.
Successfully wrote row 2023-01-28 00:12:32.366283.
Successfully wrote row 2023-01-28 00:12:32.59445.
Successfully wrote row 2023-01-28 00:12:32.407586.
Successfully wrote row 2023-01-28 00:07:29.931375.
Successfully wrote row 2023-01-28 00:12:30.989022.
Successfully wrote row 2023-01-28 00:12:32.980291.
Successfully wrote row 2023-01-28 00:17:27.295456.
Successfully wrote row 2023-01-28 00:17:31.715343.
Successfully wrote row 2023-01-28 00:17:32.275102.
Successfully wrote row 2023-01-28 00:17:29.297682.
Successfully wrote row 2023-01-28 

In [79]:
# Read back and print the rows whose key starts with this date.
read_prefix(table, "2023-01-28")

Reading data for 2023-01-28 00:02:31.134715:
Column Family coinbase
	price: 1598.96 @2023-01-28 00:26:07.518000+00:00
	product_id: ETH-USD @2023-01-28 00:26:07.518000+00:00

Reading data for 2023-01-28 00:02:31.227526:
Column Family coinbase
	price: 24.48 @2023-01-28 00:26:07.518000+00:00
	product_id: SOL-USD @2023-01-28 00:26:07.518000+00:00

Reading data for 2023-01-28 00:02:32.050079:
Column Family coinbase
	price: 0.0874 @2023-01-28 00:26:07.518000+00:00
	product_id: DOGE-USD @2023-01-28 00:26:07.518000+00:00

Reading data for 2023-01-28 00:02:32.255333:
Column Family coinbase
	price: 23090.61 @2023-01-28 00:26:07.518000+00:00
	product_id: BTC-USD @2023-01-28 00:26:07.518000+00:00

Reading data for 2023-01-28 00:02:32.937742:
Column Family coinbase
	price: 0.3944 @2023-01-28 00:26:07.518000+00:00
	product_id: ADA-USD @2023-01-28 00:26:07.518000+00:00

Reading data for 2023-01-28 00:07:27.498972:
Column Family coinbase
	price: 1598.63 @2023-01-28 00:26:07.518000+00:00
	product_id: E