In [1]:
import pyspark 
from pyspark.sql import SparkSession
import base64

In [2]:
from pyspark.sql.types import StringType, TimestampType

In [3]:
# Read the service-account JSON key and base64-encode it as a UTF-8 string.
# The resulting `key` is passed to the connector as the "gcp.credentials.key"
# option by the read and write streams in this notebook.
# The `with` block guarantees the file handle is closed once it has been read
# (the previous version left it open for the lifetime of the kernel).
with open("/home/bda_crypto_busters/repos/BigDataAnalytics/2_data_preprocessing/crypto/stream/crypto-busting-375023-6722d6967eca.json", "rb") as key_file:
    key = base64.b64encode(key_file.read()).decode("utf-8")

In [4]:
# Create (or reuse) the Spark session that runs on YARN.
# The Pub/Sub Lite connector jar is attached through "spark.jars", and
# dynamic executor allocation is switched off.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("Spark Streaming to Pub/Sub Lite")
    .config(
        "spark.jars",
        "/home/bda_crypto_busters/repos/BigDataAnalytics/2_data_preprocessing/crypto/stream/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar",
    )
    .config("spark.dynamicAllocation.enabled", "false")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/01/27 23:03:51 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
23/01/27 23:03:51 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
23/01/27 23:03:52 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
23/01/27 23:03:52 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [5]:
# Identifiers of the Pub/Sub Lite subscription to read from.
project_number = 1072423212419
location = "europe-central2"
subscription_id = "bda-coinbase-sub-lite"

# Open a streaming source on the subscription; both connector options
# (subscription path and base64-encoded credentials) are set in one call.
df = spark.readStream.format("pubsublite").options(
    **{
        "pubsublite.subscription": f"projects/{project_number}/locations/{location}/subscriptions/{subscription_id}",
        "gcp.credentials.key": key,
    }
).load()

In [6]:
# The payload should be in this column, but it appears to be encoded as JSON?
# Keep only `data`, cast to a string so it can be parsed downstream.
df = df.select(df.data.cast(StringType()).alias('data'))

In [9]:
from pyspark.sql.functions import from_json, to_json
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, FloatType

In [10]:
# Schema used to parse the JSON payload held in the `data` column.
# `price` is declared as a string here and cast to a float in a later cell.
# NOTE(review): the schema printed after parsing reports every field as
# nullable, so the nullable=False flags below do not appear to be enforced.
JSONschema = (
    StructType()
    .add("side", StringType(), True)
    .add("price", StringType(), False)
    .add("product_id", StringType(), False)
    .add("time", TimestampType(), False)
)

In [11]:
# Parse the JSON string in `data` with JSONschema and flatten the resulting
# struct so that each schema field becomes a top-level column.
sdf = df.select(from_json(col("data"), JSONschema).alias("JSONData")).select("JSONData.*")

In [12]:
# Replace the string `price` column with its float representation.
sdf = sdf.withColumn("price", col("price").cast(FloatType()))

In [13]:
# Projection of the stream that contains only the `price` column.
sdf_price = sdf.select(col("price"))

In [14]:
# Sanity check: display whether `sdf` is a streaming DataFrame.
sdf.isStreaming

True

In [15]:
# Print the column names and types of the parsed stream for inspection.
sdf.printSchema()

root
 |-- side: string (nullable = true)
 |-- price: float (nullable = true)
 |-- product_id: string (nullable = true)
 |-- time: timestamp (nullable = true)



In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, create_map, col, lit, when, struct
from pyspark.sql.types import BinaryType, StringType
import uuid

In [17]:
# Shape the parsed stream into the columns written to Pub/Sub Lite:
#   event_timestamp - the original `time` column
#   key             - `time` rendered as a string, then as bytes
#   data            - JSON of (product_id, price), as bytes
#   attributes      - map with the single key "key1" whose value is a
#                     one-element array: b"huge" when price > 100, else b"low"
message = sdf.select(
    col("time").alias("event_timestamp"),
    col("time").cast(StringType()).cast(BinaryType()).alias("key"),
    to_json(struct("product_id", "price")).cast(BinaryType()).alias("data"),
    create_map(
        lit("key1"),
        array(when(col("price") > 100, b"huge").otherwise(b"low")),
    ).alias("attributes"),
)

In [18]:
# Print the schema of the outgoing message frame to verify its column types.
message.printSchema()

root
 |-- event_timestamp: timestamp (nullable = true)
 |-- key: binary (nullable = true)
 |-- data: binary (nullable = true)
 |-- attributes: map (nullable = false)
 |    |-- key: string
 |    |-- value: array (valueContainsNull = false)
 |    |    |-- element: binary (containsNull = false)



In [19]:
# ID of the Pub/Sub Lite topic that the write stream in this notebook publishes to.
save_topic_id = "spark-coinbase-lite-topic"

In [20]:
# Start the streaming query that publishes `message` to the Pub/Sub Lite
# topic in append mode, triggering a micro-batch every second.
query = (
    message.writeStream
    .format("pubsublite")
    .outputMode("append")
    .trigger(processingTime="1 second")
    .options(
        **{
            "pubsublite.topic": f"projects/{project_number}/locations/{location}/topics/{save_topic_id}",
            "gcp.credentials.key": key,
            # Required. Use a unique checkpoint location for each job.
            "checkpointLocation": "/tmp/app" + uuid.uuid4().hex,
        }
    )
    .start()
)

23/01/27 23:09:39 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/01/27 23:09:48 WARN org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 7865 milliseconds
23/01/27 23:12:46 WARN org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 2030 milliseconds
23/01/27 23:13:46 WARN org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 1730 milliseconds
23/01/27 23:17:50 WARN org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 1413 milliseconds
23/01/27 23:18:51 WARN org.apache.spark.sql.execution.st

In [21]:
# Stop the streaming query started in the previous cell.
query.stop()