In [1]:
# Google Cloud project identifiers and Pub/Sub resource names shared by the cells below.
project_number = 294601891609  # numeric project number; used when building Pub/Sub Lite paths
project_id = "crypto-busting-374123"  # used when building the standard Pub/Sub subscription path
location = "europe-central2"  # wrapped in CloudRegion for the Pub/Sub Lite cells
# NOTE: subscription_id and topic_id are rebound to the Lite resource names further down.
subscription_id = "bda-coinbase-topic-sub"
topic_id = "bda-coinbase-topic"
timeout = 5.0  # passed to streaming_pull_future.result(timeout=...); seconds

In [2]:
import json
from concurrent.futures import TimeoutError

from google.cloud import pubsub_v1

In [3]:
# Subscriber client for standard Pub/Sub; it is used as a context manager by
# the streaming-pull cell below.
subscriber = pubsub_v1.SubscriberClient()

In [4]:
# Fully qualified subscription name of the form
# projects/{project_id}/subscriptions/{subscription_id}.
subscription_path = subscriber.subscription_path(project_id, subscription_id)

In [5]:
# Accumulates the raw payload (message.data) of every message handled by
# `callback`; re-running this cell discards what was collected so far.
messages = []

In [6]:
def callback(message: pubsub_v1.subscriber.message.Message) -> None:
    """Record the payload of a received Pub/Sub message, then acknowledge it.

    Args:
        message: The message delivered by the streaming pull.

    Side effects:
        Prints the message, appends ``message.data`` to the module-level
        ``messages`` list, and acknowledges the message.
    """
    print(f"Received {message}.")
    # Store the payload before acknowledging: an acked message is not
    # redelivered, so acking first would lose it if storing ever failed.
    messages.append(message.data)
    message.ack()

In [7]:
# Start the background streaming pull; `callback` is invoked for each message.
streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
print(f"Listening for messages on {subscription_path}..\n")

# NOTE(review): leaving the `with` block presumably closes `subscriber`, so
# re-running this cell would first require re-creating the client — confirm.
with subscriber:
    try:
        # When `timeout` is not set, result() will block indefinitely,
        # unless an exception is encountered first.
        streaming_pull_future.result(timeout=timeout)
    except TimeoutError:
        # Nothing failed within `timeout` seconds: stop listening.
        streaming_pull_future.cancel()  # Trigger the shutdown.
        streaming_pull_future.result()  # Block until the shutdown is complete.

Listening for messages on projects/crypto-busting-374123/subscriptions/bda-coinbase-topic-sub..

Received Message {
  data: b'{"side":"","price":"0.0858","product_id":"DOGE-USD...'
  ordering_key: ''
  attributes: {}
}.
Received Message {
  data: b'{"side":"","price":"23.15","product_id":"SOL-USD",...'
  ordering_key: ''
  attributes: {}
}.
Received Message {
  data: b'{"side":"","price":"23.15","product_id":"SOL-USD",...'
  ordering_key: ''
  attributes: {}
}.
Received Message {
  data: b'{"side":"","price":"0.0858","product_id":"DOGE-USD...'
  ordering_key: ''
  attributes: {}
}.
Received Message {
  data: b'{"side":"","price":"23.15","product_id":"SOL-USD",...'
  ordering_key: ''
  attributes: {}
}.
Received Message {
  data: b'{"side":"","price":"0.0858","product_id":"DOGE-USD...'
  ordering_key: ''
  attributes: {}
}.
Received Message {
  data: b'{"side":"","price":"0.3507","product_id":"ADA-USD"...'
  ordering_key: ''
  attributes: {}
}.
Received Message {
  data: b'{"side":"","p

In [10]:
import ast

In [11]:
# Decode each raw payload (bytes) to text and keep the results so they can be
# inspected; a bare decode() call inside a loop would discard them.
decoded_messages = [mes.decode("UTF-8") for mes in messages]
# Show a small sample only; slicing is safe even when nothing was received.
decoded_messages[:3]

In [12]:
# The payloads are JSON documents, so parse them with the JSON parser.
# ast.literal_eval only understands Python literals and would raise on valid
# JSON tokens such as true, false or null.
response = [json.loads(raw.decode("UTF-8")) for raw in messages]
response[0]

{'side': '',
 'price': '0.0858',
 'product_id': 'DOGE-USD',
 'time': '2023-01-15T21:33:12.111026Z'}

# Pub/Sub Lite

In [13]:
from google.cloud.pubsublite.cloudpubsub import PublisherClient
from google.cloud.pubsublite.types import (
    CloudRegion,
    CloudZone,
    MessageMetadata,
    TopicPath,
)

In [26]:
# Pub/Sub Lite resource names.
# NOTE: these rebind topic_id / subscription_id from the first config cell, so
# any earlier cell re-run after this point will see the Lite names instead.
topic_id = "bda-coinbase-topic-lite"
subscription_id = "bda-coinbase-sub-lite"

In [27]:
# Wrap the region name in the Pub/Sub Lite location type expected by TopicPath.
loc = CloudRegion(location)

In [28]:
# Sanity check: display the region held by the location object.
loc.region

CloudRegion(name='europe-central2')

In [29]:
# Pub/Sub Lite topic path built from the numeric project number, the location
# and the Lite topic id; it renders as
# projects/{project_number}/locations/{location}/topics/{topic_id}.
topic_path = TopicPath(project_number, loc, topic_id)

In [30]:
# Re-publish every collected payload to the Pub/Sub Lite topic, one at a time.
with PublisherClient() as publisher_client:
    for payload in messages:
        # publish() returns a future; result() blocks until it resolves.
        # To resolve futures asynchronously, use add_done_callback() instead.
        published_id = publisher_client.publish(topic_path, payload).result()
        metadata = MessageMetadata.decode(published_id)
        print(
            f"Published a message to {topic_path} "
            f"with partition {metadata.partition.value} "
            f"and offset {metadata.cursor.offset}."
        )

Published a message to projects/294601891609/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 3188.
Published a message to projects/294601891609/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 3189.
Published a message to projects/294601891609/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 3190.
Published a message to projects/294601891609/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 3191.
Published a message to projects/294601891609/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 3192.
Published a message to projects/294601891609/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 3193.
Published a message to projects/294601891609/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 3194.
Published a message to projects/294601891609/loc

# Pub/Sub Lite to Spark Structured Streaming

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

In [20]:
# Create (or reuse) the Spark session for this notebook, running on YARN.
spark = (
    SparkSession.builder
    .appName("Read Pub/Sub Stream")
    .master("yarn")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/15 23:28:03 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/01/15 23:28:03 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/01/15 23:28:03 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/01/15 23:28:03 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [47]:
# Register the Pub/Sub Lite connector and supporting jars with the running
# Spark session.
# NOTE(review): the jar names mix Scala suffixes (_2.11 and _2.13) — confirm
# they match the Scala version of the cluster's Spark build.
# NOTE(review): the streaming queries below fail with NoClassDefFoundError for
# com/google/api/gax/rpc/ApiException, which suggests the connector's own
# dependencies are not all on the classpath — presumably a "with-dependencies"
# build of the connector is needed; verify.
spark.sql('add jar file:////home/bda_reddit/dependancies/spark-streaming-pubsub_2.11.jar')
spark.sql('add jar file:////home/bda_reddit/dependancies/spark-streaming_2.13-3.3.1.jar')
spark.sql('add jar file:////home/bda_reddit/repos/BigDataAnalytics/2_data_preprocessing/pubsublite-spark-sql-streaming-0.4.2.jar')
spark.sql('add jar file:////home/bda_reddit/dependancies/spark-catalyst_2.13-3.3.1.jar')


23/01/15 23:38:57 WARN org.apache.spark.SparkContext: The jar file:////home/bda_reddit/dependancies/spark-streaming-pubsub_2.11.jar has been added already. Overwriting of added jars is not supported in the current version.
23/01/15 23:38:57 WARN org.apache.spark.SparkContext: The jar file:////home/bda_reddit/dependancies/spark-streaming_2.13-3.3.1.jar has been added already. Overwriting of added jars is not supported in the current version.
23/01/15 23:38:57 WARN org.apache.spark.SparkContext: The jar file:////home/bda_reddit/repos/BigDataAnalytics/2_data_preprocessing/pubsublite-spark-sql-streaming-0.4.2.jar has been added already. Overwriting of added jars is not supported in the current version.


DataFrame[result: int]

In [34]:
# Additionally register the gax-grpc jar.
# NOTE(review): the class reported missing by the streaming query lives in the
# com.google.api.gax.rpc package — confirm whether this jar actually provides it.
spark.sql("add jar file:////home/bda_reddit/dependancies/gax-grpc-1.53.1.jar")

23/01/15 23:33:12 WARN org.apache.spark.SparkContext: The jar file:////home/bda_reddit/dependancies/gax-grpc-1.53.1.jar has been added already. Overwriting of added jars is not supported in the current version.


DataFrame[result: int]

In [48]:
# Full resource name of the Pub/Sub Lite subscription to read from.
lite_subscription_path = (
    f"projects/{project_number}/locations/{location}/subscriptions/{subscription_id}"
)
# Streaming DataFrame backed by the "pubsublite" source.
reader = spark.readStream.format("pubsublite")
sdf = reader.option("pubsublite.subscription", lite_subscription_path).load()

In [49]:
# Start a streaming query that writes the stream to the console sink.
# The returned StreamingQuery is only echoed as the cell output, not stored in
# a variable.
sdf.writeStream.format("console").start()

23/01/15 23:39:03 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6b4a85a2-1cc0-4062-b5c7-be0af51a5825. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/15 23:39:03 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7fe3bee86fd0>

23/01/15 23:39:04 ERROR org.apache.spark.sql.execution.streaming.MicroBatchExecution: Query [id = c149575b-da6e-404d-beb9-d3eaef8d1463, runId = 4b5d0352-f21d-464a-847e-068a1ba6dc92] terminated with error
java.lang.NoClassDefFoundError: com/google/api/gax/rpc/ApiException
	at com.google.cloud.pubsublite.spark.PslTable.newScanBuilder(PslTable.java:33)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$1.$anonfun$applyOrElse$4(MicroBatchExecution.scala:103)
	at scala.collection.mutable.HashMap.getOrElseUpdate(HashMap.scala:86)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$1.applyOrElse(MicroBatchExecution.scala:97)
	at org.apache.spark.sql.execution.streaming.MicroBatchExecution$$anonfun$1.applyOrElse(MicroBatchExecution.scala:82)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDown$1(TreeNode.scala:318)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74)
	at org.apache.spark.sql.catalys

In [39]:
# Keep the StreamingQuery handle itself: awaitTermination() called without a
# timeout returns None, so chaining it onto start() would bind `query` to None
# (or leave it unbound if the query fails).
query = sdf.writeStream.format("console").outputMode("append").start()
# Block until the query stops; a failed query surfaces here as a
# StreamingQueryException, and `query` stays available for inspection.
query.awaitTermination()

23/01/15 23:34:35 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6306b912-13ee-4f74-b1b5-d41dd285ccfc. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/15 23:34:35 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/01/15 23:34:36 ERROR org.apache.spark.sql.execution.streaming.MicroBatchExecution: Query [id = 5f2e1f9c-3874-4ebc-947d-0e205b866d47, runId = 32bccd49-734e-4ae2-a647-b4b468d85e24] terminated with error
java.lang.NoClassDefFoundError: com/google/api/gax/rpc/ApiException
	at com.google.cloud.pubsublite.spark.PslTable.newScanBuilder(PslTable.java:33)
	at org.apache.spark.sql.execution.streaming.MicroBatch

StreamingQueryException: com/google/api/gax/rpc/ApiException
=== Streaming Query ===
Identifier: [id = 5f2e1f9c-3874-4ebc-947d-0e205b866d47, runId = 32bccd49-734e-4ae2-a647-b4b468d85e24]
Current Committed Offsets: {}
Current Available Offsets: {}

Current State: INITIALIZING
Thread State: RUNNABLE