In [9]:
# --- Google Cloud project -------------------------------------------------
project_id = "crypto-busting-375023"  # string id, used for the Pub/Sub subscription path
project_number = 1072423212419  # numeric id, used for the Pub/Sub Lite paths
location = "europe-central2"  # region name, wrapped in CloudRegion further down

# --- Pub/Sub resources ----------------------------------------------------
# Both names are assigned again in the Pub/Sub Lite section of this notebook.
topic_id = "bda-coinbase-topic"
subscription_id = "bda-coinbase-topic-sub"

# Seconds the streaming pull is allowed to run before it is cancelled.
timeout = 5.0

In [10]:
from concurrent.futures import TimeoutError
from google.cloud import pubsub_v1

In [11]:
# Pub/Sub subscriber client; the cells below use it to build the subscription
# path and to open the streaming pull.
subscriber = pubsub_v1.SubscriberClient()

In [12]:
# Subscription resource path built from the project id and the subscription id
# (printed further down as projects/<project_id>/subscriptions/<subscription_id>).
subscription_path = subscriber.subscription_path(project_id, subscription_id)

In [13]:
# Buffer that `callback` fills with the `data` payload of every received message.
# Re-running this cell discards whatever has been collected so far.
messages = []

In [14]:
def callback(message: pubsub_v1.subscriber.message.Message) -> None:
    """Store one received Pub/Sub message and then acknowledge it.

    The payload (``message.data``) is appended to the notebook-level
    ``messages`` list *before* the message is acknowledged, so a message is
    only acked once its payload has actually been kept.

    Args:
        message: The message handed over by the streaming pull.
    """
    print(f"Received {message}.")
    messages.append(message.data)
    # Ack last: acknowledging tells Pub/Sub the message has been handled.
    message.ack()

In [None]:
# Open a streaming pull on the subscription; each delivered message is passed to `callback`.
streaming_pull_future = subscriber.subscribe(subscription_path, callback=callback)
print(f"Listening for messages on {subscription_path}..\n")

# The client is used as a context manager here.
# NOTE(review): it is presumably closed when the block exits, in which case the
# cell that creates `subscriber` must be re-run before this cell can run again
# -- confirm.
with subscriber:
    try:
        # When `timeout` is not set, result() will block indefinitely,
        # unless an exception is encountered first.
        streaming_pull_future.result(timeout=timeout)
    except TimeoutError:
        # Nothing failed within `timeout` seconds: stop listening.
        streaming_pull_future.cancel()  # Trigger the shutdown.
        streaming_pull_future.result()  # Block until the shutdown is complete.

Listening for messages on projects/crypto-busting-375023/subscriptions/bda-coinbase-topic-sub..



In [8]:
import ast

In [9]:
# Decode every raw payload as UTF-8 text (this also verifies that all payloads
# are valid UTF-8). Keep the decoded strings and show a short preview as the
# cell output.
decoded_messages = [mes.decode("UTF-8") for mes in messages]
decoded_messages[:3]

In [10]:
# Parse each decoded payload as a Python literal; ast.literal_eval only accepts
# literals and never executes code. The sample record shown below is a dict.
response = [ast.literal_eval(mes.decode("UTF-8")) for mes in messages]
# Preview the first parsed record. Show None instead of raising IndexError when
# the pull collected no messages before the timeout.
response[0] if response else None

{'side': '',
 'price': '1549.4',
 'product_id': 'ETH-USD',
 'time': '2023-01-20T12:04:14.537578Z'}

# Pub/Sub Lite

In [11]:
!pip install google-cloud-pubsub==2.10

[0m

In [12]:
from google.cloud.pubsublite.cloudpubsub import PublisherClient
from google.cloud.pubsublite.types import (
    CloudRegion,
    CloudZone,
    MessageMetadata,
    TopicPath,
)

In [13]:
# Pub/Sub Lite resource names. These overwrite the `topic_id` / `subscription_id`
# values assigned in the configuration cell at the top of the notebook, so any
# earlier cell re-run after this one will see the Lite names.
subscription_id = "bda-coinbase-sub-lite"
topic_id = "bda-coinbase-topic-lite"

In [14]:
# Wrap the configured region name in a Pub/Sub Lite CloudRegion; it is passed
# to TopicPath below.
loc = CloudRegion(location)

In [15]:
# Inspection only: display the region held by `loc` (see the cell output).
loc.region

CloudRegion(name='europe-central2')

In [16]:
# Pub/Sub Lite topic path. It is built from the numeric project number (not the
# string project id), the region and the Lite topic id.
topic_path = TopicPath(project_number, loc, topic_id)

In [17]:
# Publish every payload collected in `messages` to the Pub/Sub Lite topic, one
# message at a time, printing the partition and offset reported for each.
with PublisherClient() as publisher_client:
    for msg in messages:
        api_future = publisher_client.publish(topic_path, msg)
        # result() blocks. To resolve API futures asynchronously, use add_done_callback().
        message_id = api_future.result()
        # Decode the returned id into metadata exposing the partition and cursor offset.
        message_metadata = MessageMetadata.decode(message_id)
        print(
            f"Published a message to {topic_path} with partition {message_metadata.partition.value} and offset {message_metadata.cursor.offset}."
        )

Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 10.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 11.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 12.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 13.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 14.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 15.
Published a message to projects/1072423212419/locations/europe-central2/topics/bda-coinbase-topic-lite with partition 0 and offset 16.
Published a message to projects/1072423212419/locations

# Pub/Sub Lite to Spark Structured Streaming

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType

In [19]:
# Get (or reuse) the Spark session for this notebook, running against YARN.
spark = (
    SparkSession.builder
    .appName("Read Pub/Sub Stream")
    .master("yarn")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/20 12:13:06 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/01/20 12:13:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/01/20 12:13:06 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/01/20 12:13:06 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [20]:
# Earlier attempts that registered individual jars from absolute local paths;
# they are disabled and kept only for reference.
# spark.sql('add jar file:////home/bda_reddit/dependancies/spark-streaming-pubsub_2.11.jar')
# spark.sql('add jar file:////home/bda_reddit/dependancies/spark-streaming_2.13-3.3.1.jar')
# spark.sql('add jar file:////home/bda_reddit/repos/BigDataAnalytics/2_data_preprocessing/pubsublite-spark-sql-streaming-0.4.2.jar')
# spark.sql('add jar file:////home/bda_reddit/dependancies/spark-catalyst_2.13-3.3.1.jar')
# Register the Pub/Sub Lite Spark connector jar (with bundled dependencies) from
# GCS; presumably this is what provides the "pubsublite" format used below.
# NOTE(review): `LATEST` is not a pinned version, so a re-run may pick up a
# different connector build -- consider pinning.
spark.sql("add jar gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar")

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
23/01/20 12:13:20 WARN org.apache.hadoop.hive.ql.session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


DataFrame[result: int]

In [21]:
#spark.sql("add jar file:////home/bda_reddit/dependancies/gax-grpc-1.53.1.jar")

In [22]:
# Full resource path of the Pub/Sub Lite subscription the stream reads from.
lite_subscription_path = (
    f"projects/{project_number}/locations/{location}/subscriptions/{subscription_id}"
)

# Streaming DataFrame backed by the "pubsublite" source.
sdf = (
    spark.readStream
    .format("pubsublite")
    .option("pubsublite.subscription", lite_subscription_path)
    .load()
)

In [23]:
# Start a console sink and keep the returned StreamingQuery in a variable, so
# the query can be inspected or stopped later (console_query.stop()) instead of
# running unreferenced in the background.
console_query = sdf.writeStream.format("console").start()
console_query

23/01/20 12:13:31 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-d51b647d-9358-48e8-ba4b-d6418be07e95. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/20 12:13:31 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7f1c208a6c10>

In [24]:
# Bind `query` to the StreamingQuery itself. Chaining .awaitTermination() onto
# .start() would store that call's return value in `query` instead of the
# query handle, leaving nothing to call .stop() on.
query = sdf.writeStream.format("console").outputMode("append").start()
# Blocks until the query stops or fails; interrupt the kernel to regain control.
query.awaitTermination()

23/01/20 12:13:35 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-b3a8385d-8212-4e27-b3ea-de012952e870. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/20 12:13:35 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/01/20 12:13:48 WARN org.apache.spark.scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0) (bda-hdfs-w-1.europe-central2-a.c.crypto-busting-375023.internal executor 1): org.apache.spark.util.TaskCompletionListenerException: 

Previous exception in task: Failed to retrieve messages.
	com.google.cloud.pubsublite.spark.PslMicroBatchInputPartitionReader.next(PslMicroBatchInputPartitionReader.java:7

StreamingQueryException: Writing job aborted.
=== Streaming Query ===
Identifier: [id = d353d59b-0eea-4674-b890-377699fcbb2b, runId = 067333a9-571f-45a2-be0e-e31980e9c551]
Current Committed Offsets: {}
Current Available Offsets: {com.google.cloud.pubsublite.spark.PslMicroBatchStream@1d4d79c3: {"0":24}}

Current State: ACTIVE
Thread State: RUNNABLE

Logical Plan:
WriteToMicroBatchDataSource ConsoleWriter[numRows=20, truncate=true]
+- StreamingDataSourceV2Relation [subscription#5, partition#6L, offset#7L, key#8, data#9, publish_timestamp#10, event_timestamp#11, attributes#12], com.google.cloud.pubsublite.spark.PslScanBuilder@ed1422, com.google.cloud.pubsublite.spark.PslMicroBatchStream@1d4d79c3


In [26]:
query = (
    sdf.writeStream.format("console")
    .outputMode("append")
    .trigger(processingTime="1 second")
    .start()
)

try:
    # Wait 120 seconds (must be >= 60 seconds) to start receiving messages.
    # awaitTermination takes the timeout in seconds and returns earlier if the
    # query terminates on its own.
    query.awaitTermination(120)
finally:
    # Always stop the query, even if waiting raised, so it does not keep
    # running in the background.
    query.stop()

23/01/20 12:14:56 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-cd23a0ba-91fa-4141-bd45-9c7aeb52b09b. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/20 12:14:56 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/01/20 12:14:57 ERROR com.google.cloud.pubsublite.repackaged.io.grpc.internal.ManagedChannelOrphanWrapper: *~*~*~ Channel ManagedChannelImpl{logId=529, target=europe-central2-pubsublite.googleapis.com:443} was not shutdown properly!!! ~*~*~*
    Make sure to call shutdown()/shutdownNow() and wait until awaitTermination() returns true.
java.lang.RuntimeException: ManagedChannel allocation site
	at com.g