In [9]:
# --- Google Cloud project ---
project_id = "crypto-busting-375023"  # used to build the Pub/Sub subscription path
project_number = 1072423212419  # numeric project number, used in the Pub/Sub Lite paths
location = "europe-central2"  # region name later wrapped in CloudRegion

# --- Pub/Sub resources ---
topic_id = "bda-coinbase-topic"
subscription_id = "bda-coinbase-topic-sub"

# Seconds to wait on the streaming pull future before shutting it down.
timeout = 5.0

In [10]:
from concurrent.futures import TimeoutError
from google.cloud import pubsub_v1

In [11]:
# Pub/Sub subscriber client used for the streaming pull further down.
# NOTE(review): the later `with subscriber:` block closes this client, so this
# cell presumably has to be re-run before listening a second time — confirm.
subscriber = pubsub_v1.SubscriberClient()

In [12]:
# Fully qualified subscription name, of the form
# projects/{project_id}/subscriptions/{subscription_id}.
subscription_path = subscriber.subscription_path(project_id, subscription_id)

In [13]:
# Buffer that the subscriber callback fills with the raw payload (`message.data`)
# of every received message. Re-running this cell discards what was collected.
messages = []

In [14]:
def callback(message: "pubsub_v1.subscriber.message.Message") -> None:
    """Store the payload of a received Pub/Sub message, then acknowledge it.

    The payload is appended to the notebook-level ``messages`` list *before*
    the message is acknowledged, so a message is only acked once its data has
    actually been kept.

    Args:
        message: The message handed over by the streaming pull.
    """
    print(f"Received {message}.")
    # Keep the data first: once acked, the message is not expected to be redelivered.
    messages.append(message.data)
    message.ack()

In [15]:
# Start the streaming pull in the background; `callback` is passed as the
# handler for received messages.
streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
print(f"Listening for messages on {subscription_path}..\n")

# The subscriber is used as a context manager, so it is released when the block exits.
with subscriber:
    try:
        # When `timeout` is not set, result() will block indefinitely,
        # unless an exception is encountered first.
        streaming_pull_future.result(timeout=timeout)
    except TimeoutError:
        # Expected path: the listening window of `timeout` seconds has elapsed.
        streaming_pull_future.cancel()  # Trigger the shutdown.
        streaming_pull_future.result()  # Block until the shutdown is complete.

Listening for messages on projects/crypto-busting-375023/subscriptions/bda-coinbase-topic-sub..



In [8]:
import ast

In [9]:
# Decode every payload to text (raises UnicodeDecodeError on invalid UTF-8),
# keep the decoded strings, and preview the first few as the cell output.
decoded_messages = [mes.decode("UTF-8") for mes in messages]
decoded_messages[:3]

In [10]:
def _parse_payload(raw):
    """Decode a UTF-8 payload and evaluate the text as a Python literal."""
    return ast.literal_eval(raw.decode("UTF-8"))


# One parsed record per collected message; show the first one as a sanity check.
response = list(map(_parse_payload, messages))
response[0]

{'side': '',
 'price': '1549.4',
 'product_id': 'ETH-USD',
 'time': '2023-01-20T12:04:14.537578Z'}

# Pub/Sub Lite

In [11]:
!pip install google-cloud-pubsub==2.10

[0m

In [12]:
from google.cloud.pubsublite.cloudpubsub import PublisherClient
from google.cloud.pubsublite.types import (
    CloudRegion,
    CloudZone,
    MessageMetadata,
    TopicPath,
)

In [13]:
# Pub/Sub Lite resource names. These rebind the `topic_id` / `subscription_id`
# names that were set earlier for standard Pub/Sub.
subscription_id = "bda-coinbase-sub-lite"
topic_id = "bda-coinbase-topic-lite"

In [14]:
# Wrap the region name in the Pub/Sub Lite location type used to build the topic path.
loc = CloudRegion(location)

In [15]:
# Inspection only: display the region of the location object as the cell output.
loc.region

CloudRegion(name='europe-central2')

In [16]:
# Pub/Sub Lite topic path, built from the numeric project number, the location
# and the Lite topic id (projects/{number}/locations/{region}/topics/{topic}).
topic_path = TopicPath(project_number, loc, topic_id)

In [17]:
with PublisherClient() as publisher_client:
    for payload in messages:
        # publish() hands back a future; result() blocks until it resolves.
        # To resolve API futures asynchronously, use add_done_callback() instead.
        encoded_id = publisher_client.publish(topic_path, payload).result()
        # The returned id encodes the partition and offset of the published message.
        message_metadata = MessageMetadata.decode(encoded_id)
        print(
            f"Published a message to {topic_path} with partition {message_metadata.partition.value} and offset {message_metadata.cursor.offset}."
        )

Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 10.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 11.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 12.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 13.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 14.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 15.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 16.
Published a message to projects/1072423212419/locations

# Pub/Sub Lite to Spark Structured Streaming

In [19]:
import os

In [20]:
# Identifiers of the Pub/Sub Lite subscription read by Spark.
# `project_number` and `location` repeat the values set at the top of the notebook.
lite_subscription_id = "bda-coinbase-sub-lite"
location = "europe-central2"
project_number = 1072423212419

In [21]:
# Service-account key file exposed through GOOGLE_APPLICATION_CREDENTIALS.
# A value that is already exported in the environment wins, so the notebook
# also runs where the key lives elsewhere; the absolute path below is only
# the fallback for the original machine.
credentials_path = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/home/bda_crypto_busters/keys/cloud_compute_key.json"
)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

In [22]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

In [23]:
# Get the active session, or create one on YARN if none exists yet.
spark = (
    SparkSession.builder
    .appName("Read Pub/Sub Stream")
    .master("yarn")
    .getOrCreate()
)

In [24]:
# Register the Pub/Sub Lite Spark connector, bundled with its dependencies, from GCS.
# Earlier attempts that added individual local jars (spark-streaming-pubsub,
# spark-streaming, pubsublite-spark-sql-streaming 0.4.2, spark-catalyst) are
# superseded by this single bundle.
# NOTE(review): "LATEST" is an unpinned artifact — consider pinning a version.
spark.sql("add jar gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar")

23/01/21 14:58:38 WARN org.apache.spark.SparkContext: The jar gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar has been added already. Overwriting of added jars is not supported in the current version.


DataFrame[result: int]

In [21]:
# Disabled: manually adding the gax-grpc jar from a local path.
# NOTE(review): presumably a leftover dependency experiment — confirm and delete this cell.
#spark.sql("add jar file:////home/bda_reddit/dependancies/gax-grpc-1.53.1.jar")

In [25]:
# Fully qualified Pub/Sub Lite subscription that the stream reads from.
lite_subscription_path = (
    f"projects/{project_number}/locations/{location}/subscriptions/{lite_subscription_id}"
)

# Streaming DataFrame backed by the "pubsublite" source.
pubsublite_reader = spark.readStream.format("pubsublite")
sdf = pubsublite_reader.option("pubsublite.subscription", lite_subscription_path).load()

In [26]:
# Replace the `data` column with its string cast, then show the resulting schema.
data_as_text = sdf["data"].cast(StringType())
sdf = sdf.withColumn("data", data_as_text)
sdf.printSchema()

root
 |-- subscription: string (nullable = false)
 |-- partition: long (nullable = false)
 |-- offset: long (nullable = false)
 |-- key: binary (nullable = false)
 |-- data: string (nullable = false)
 |-- publish_timestamp: timestamp (nullable = false)
 |-- event_timestamp: timestamp (nullable = true)
 |-- attributes: map (nullable = true)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = true)
 |    |    |-- element: binary (containsNull = true)



In [27]:
# Console sink in append mode, with a micro-batch triggered every second.
console_writer = (
    sdf.writeStream
    .format("console")
    .outputMode("append")
    .trigger(processingTime="1 second")
)
# start() launches the streaming query and returns its handle.
query = console_writer.start()

23/01/21 14:58:50 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-79dd53f0-a78f-495b-a190-f7ab2d1cac5c. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/21 14:58:50 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/01/21 14:58:52 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 0.0 in stage 2.0 (TID 8) (bda-hdfs-w-0.europe-central2-a.c.crypto-busting-375023.internal executor 2): org.apache.spark.util.TaskCompletionListenerException: 

Previous exception in task: Failed to retrieve messages.
	com.google.cloud.pubsublite.spark.PslMicroBatchInputPartitionReader.next(PslMicroBatchInputPartitionReader.java:7