## Loading Pub/Sub Lite to Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, LongType, FloatType, DateType
from pyspark.sql.functions import from_json, col
import pyspark.sql.functions as f

import base64

In [2]:
project_number = 1072423212419
location = "europe-central2"
lite_subscription_id = "bda-reddit-sub-lite"

key_file = open("/home/bda_crypto_busters/repos/BigDataAnalytics/2_data_preprocessing/crypto/stream/crypto-busting-375023-6722d6967eca.json", "rb")
key = base64.b64encode(key_file.read())
key = key.decode("utf-8")

In [3]:
spark = (
    SparkSession
    .builder
    .config("spark.jars", "/home/bda_crypto_busters/repos/BigDataAnalytics/2_data_preprocessing/crypto/stream/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar")
    .config("spark.dynamicAllocation.enabled", "false")
    .appName("Read Pub/Sub Lite Stream")
    .master("yarn")
    .getOrCreate()
)

In [4]:
spark.sql("add jar gs://spark-lib/pubsublite/pubsublite-spark-sql-streaming-LATEST-with-dependencies.jar")

ivysettings.xml file not found in HIVE_HOME or HIVE_CONF_DIR,/etc/hive/conf.dist/ivysettings.xml will be used
23/01/22 19:06:06 WARN org.apache.hadoop.hive.ql.session.SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.


DataFrame[result: int]

In [5]:
df = (
    spark.readStream.format("pubsublite")
    .option(
        "pubsublite.subscription",
        f"projects/{project_number}/locations/{location}/subscriptions/{lite_subscription_id}",
    )
    .option("gcp.credentials.key", key)
    .load()
)

In [6]:
df = df.withColumn("data", df.data.cast(StringType())).select("data")

In [7]:
# query = (
#     df.writeStream.format("console")
#     .outputMode("append")
#     .trigger(processingTime="1 second")
#     .start()
# )

In [8]:
JSONschema = StructType([ 
    StructField("id", StringType(), False),
    StructField("title", StringType(), False),
    StructField("text", StringType(), True),
    StructField("time", FloatType(), False),
    StructField("upvotes", IntegerType(), False),
    StructField("comments", IntegerType(), False),
    StructField("subreddit", StringType(), False),
])

In [9]:
sdf = df.withColumn("JSONData", from_json(col("data"), JSONschema)).select("JSONData.*")

In [10]:
sdf = sdf.withColumn("subreddit", f.lower(col("subreddit")))\
.withColumn("time", sdf.time.cast(IntegerType()))\
.withColumn("time", f.from_unixtime(col("time")))\
.withColumn("year", f.year(col("time")))\
.withColumn("month", f.month(col("time")))\
.withColumn("day", f.dayofmonth(col("time")))\
.withColumn("hour", f.hour(col("time")))

In [11]:
query = (
    sdf.writeStream.format("console")
    .outputMode("append")
    .trigger(processingTime="1 second")
    .start()
)

23/01/22 19:06:19 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-934b2785-8753-4378-a555-7e342cecac7c. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/22 19:06:19 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
[Stage 0:>                                                          (0 + 1) / 1]

## Counting words

In [12]:
sdf_words = sdf.withColumn("word_count", f.size(f.split(f.concat_ws(" ", col("title"), col("text")), "\\s+")))

In [None]:
query = (
    sdf_words.writeStream.format("console")
    .outputMode("append")
    .trigger(processingTime="1 second")
    .start()
)

## Assigning sentiment

In [13]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.functions import vector_to_array

import argparse
from datetime import datetime



In [14]:
model_path = '/user/bda_reddit_pw/models/sentiment_model'
day_shift = 37

In [15]:
loaded_model = PipelineModel.load(model_path)

                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+-------+--------------------+--------------------+-------------------+-------+--------+---------+----+-----+---+----+
|     id|               title|                text|               time|upvotes|comments|subreddit|year|month|day|hour|
+-------+--------------------+--------------------+-------------------+-------+--------+---------+----+-----+---+----+
|10iheu5|Bill Gates WARNS ...|                    |2023-01-22 11:16:16|      1|       0|  bitcoin|2023|    1| 22|  11|
|10ii6lj|              Noice!|                    |2023-01-22 12:03:12|      1|       0|  bitcoin|2023|    1| 22|  12|
|10ijgv0|               DM Me|I looking for peo...|2023-01-22 13:17:52|      1|       0|  bitcoin|2023|    1| 22|  13|
|10ijtny|            good boi|                    |2023-01-22 13:37:04|      1|       0| dogecoin|2023|    1| 22|  13|
|10ihwtj|Calling all crypt...|                    |2023-01-22 11:48:16

23/01/22 19:06:51 WARN org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 19402 milliseconds
23/01/22 19:06:55 WARN org.apache.hadoop.util.concurrent.ExecutorHelper: Thread (Thread[GetFileInfo #1,5,main]) interrupted: 
java.lang.InterruptedException
	at com.google.common.util.concurrent.AbstractFuture.get(AbstractFuture.java:510)
	at com.google.common.util.concurrent.FluentFuture$TrustedFuture.get(FluentFuture.java:88)
	at org.apache.hadoop.util.concurrent.ExecutorHelper.logThrowableFromAfterExecute(ExecutorHelper.java:48)
	at org.apache.hadoop.util.concurrent.HadoopThreadPoolExecutor.afterExecute(HadoopThreadPoolExecutor.java:90)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1157)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)
23/01/22 19:06:55 WARN org.apache.hadoop.util.concu

In [16]:
prediction_raw = loaded_model.transform(sdf_words)

In [21]:
prediction = prediction_raw\
.withColumnRenamed("subreddit", "cryptocurrency")\
.withColumnRenamed("time", "created_utc")\
.withColumn("created_utc", f.date_format(col("created_utc"), "yyyy-MM-dd HH:00:00"))

columns = [f.col("xs")[1], 'created_utc', 'cryptocurrency', 'word_count', 'upvotes', 'comments']
prediction = (prediction.withColumn("xs", vector_to_array("probability"))).select(columns)
prediction = prediction.withColumnRenamed('xs[1]', 'sentiment')
prediction

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:46023)
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 977, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1115, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [Errno 111] Connection refused


Py4JNetworkError: An error occurred while trying to connect to the Java server (127.0.0.1:46023)

In [20]:
query = (
    prediction.writeStream.format("console")
    .outputMode("append")
    .trigger(processingTime="1 second")
    .start()
)

23/01/22 19:09:32 WARN org.apache.spark.sql.streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-ed9e9618-b201-4bf7-ba68-ee8bfd9b5d12. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/01/22 19:09:32 WARN org.apache.spark.sql.streaming.StreamingQueryManager: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
[Stage 12:>                                                         (0 + 1) / 1]

-------------------------------------------
Batch: 0
-------------------------------------------


                                                                                

+----------+--------------------+-------------------+--------------+----------+-------+--------+
|prediction|         probability|        created_utc|cryptocurrency|word_count|upvotes|comments|
+----------+--------------------+-------------------+--------------+----------+-------+--------+
|       1.0|  0.6202467984518677|2023-01-22 11:00:00|       bitcoin|         7|      1|       0|
|       1.0|  0.6202467984518677|2023-01-22 12:00:00|       bitcoin|         2|      1|       0|
|       1.0|  0.9999936424766556|2023-01-22 13:00:00|       bitcoin|        90|      1|       0|
|       1.0|  0.6202467984518677|2023-01-22 13:00:00|      dogecoin|         3|      1|       0|
|       1.0|  0.6202467984518677|2023-01-22 11:00:00|       bitcoin|        24|      1|       0|
|       1.0|  0.8683240493020469|2023-01-22 13:00:00|       bitcoin|        82|      1|       0|
|       1.0|  0.9999804915117282|2023-01-22 13:00:00|       cardano|       105|      1|       1|
|       1.0|  0.62024679845186

23/01/22 19:09:37 WARN org.apache.spark.sql.execution.streaming.ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 1000 milliseconds, but spent 3577 milliseconds
