In [158]:
import os

from typing import Tuple

os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-10_2.12:3.3.0,org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0 pyspark-shell'

from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, element_at, when
from pprint import pprint
from json import loads

In [159]:
TOPIC_NAME = "topic_1"
HOST_IP = "192.168.20.6"

In [160]:
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('[Demo] Spark Streaming from Kafka into MongoDB')
    .getOrCreate()
)

In [161]:
p1_topic_name = "producer_1"
p1_stream_df = (
    spark.readStream.format('kafka')
    .option('kafka.bootstrap.servers', f'{HOST_IP}:9092')
    .option('subscribe', p1_topic_name)
    .load()
)
p1_stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [162]:
p1_output_stream_df = (
    p1_stream_df
    .select(p1_stream_df.columns[:2])   # get column of key (producer_id, date) and value (data)
)
p1_output_stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)



In [195]:
from json import JSONDecodeError


class ClimateWriter:

    def __init__(self):
        self.client = None
        self.db = None
        self.col = None
        self.producer = None
        self.date = None
        self.data = None

    # called at the start of processing each partition in each output micro-batch
    def open(self, partition_id, epoch_id):
        print("Opening Mongo Client")
        self.client = MongoClient(
            host=f'{"localhost"}',
            port=27017
        )
        self.db = self.client.fit3182_assignment_db
        self.col = self.db.dates
        return True

    # called once per row of the result dataframe
    def process(self, row):
        print("Processing")
        key = row["key"].decode()
        value = row["value"].decode()
        try:
            key = dict(loads(key.replace("\'", "\"")))      # dict-in-str -> json -> dict
            self.producer = key.get("producer")
            self.date = key.get("date")
        except JSONDecodeError as e:
            print("Process skipped: \n" + str(e) + " in decoding key (Don't worry about it, it works 50% of the time)")
        try:
            value = dict(loads(value.replace("\'", "\"")))  # dict-in-str -> json -> dict
            self.data = value
        except JSONDecodeError as e:
            print("Process skipped: \n" + str(e) + " in decoding key (Don't worry about it, it works 50% of the time)")

        if self.producer and self.date and self.data:       # insert stream data to db
            self.write_to_db()

    # called once all rows have been processed (possibly with error)
    def close(self, err):
        if err:
            print("Error: " + str(err))
        print("Closing Mongo Client")
        self.client.close()

    def write_to_db(self):
        climate_db_obj = {
            "_id": self.date,
            "climate": {    # date-climate is 1-1
                "air_temperature": int(self.data.get("air_temperature_celcius")),
                "relative_humidity": float(self.data.get("relative_humidity")),
                "windspeed_knots": float(self.data.get("windspeed_knots")),
                "max_wind_speed": float(self.data.get("max_wind_speed")),
                "precipitation": {
                        "flag": self.data.get("precipitation")[-1],
                        "value": float(self.data.get("precipitation")[:-1])
                },
                "ghi": int(self.data.get("GHI_w/m2"))
            },
            "hotspots": []  # init empty hotspots
        }



In [196]:
class HotspotWriter:

    def __init__(self):
        self.db = None
        self.mongo_client = None

    # called at the start of processing each partition in each output micro-batch
    def open(self, partition_id, epoch_id):
        self.mongo_client = MongoClient(
            host=f'{"localhost"}',
            port=27017
        )
        self.db = self.mongo_client.fit3182_assignment_db
        return True

    # called once per row of the result dataframe
    def process(self, row):

        self.db[TOPIC_NAME].insert_one(row.asDict())

    # called once all rows have been processed (possibly with error)
    def close(self, err):
        self.mongo_client.close()

In [199]:
climate_writer = (
    p1_output_stream_df
    .writeStream
    .outputMode('append')
    .foreach(ClimateWriter())
)

console_logger = (
    p1_output_stream_df
    .writeStream
    .outputMode('append')
    .format('console')
)

writer = climate_writer

ConnectionRefusedError: [Errno 61] Connection refused

In [210]:
from typing import Tuple

"""
    Convert lat-lng from tuple of floating points to tuple of binary string presentations of the values
    !!!IMPORTANT!!!
    To avoid converting floating points to binary directly, all values are x100 and converted to integer
    THE FINAL VALUE IS 100X THE ORIGINAL VALUE (but it doesnt matter for our use case)
"""
def latlng_to_binstr(lat_lng: Tuple[float, float]) -> Tuple[str, str]:
    # float(25.125), float(10.13) -> int(25125), int(10130) -> '0b10101100`', '0b11011010' -> '10101100', '11011010'
    return bin(int(lat_lng[0]*100))[2:], bin(int(lat_lng[1]*100))[2:]

"""
    Return True if two given latitude-longitude pairs are close together
                                                          within 3 precision
"""
def are_close(lat_lng_1: Tuple[float, float], lat_lng_2: Tuple[float, float]) -> bool:
    pass

"""
    Return True if two given latitude-longitude pairs are the same
                                                          within 5 precision
"""
def are_same(lat_lng_1: Tuple[float, float], lat_lng_2: Tuple[float, float]) -> bool:
    pass

latlng_to_binstr((12.561, 32.12))

('10011101000', '110010001011')

In [198]:
from time import sleep
from pyspark.errors import StreamingQueryException

try:
    query = writer.start()
    query.awaitTermination()
    sleep(10)
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopped query')
except StreamingQueryException as exc:
    print(exc)
finally:
    query.stop()


23/06/04 17:10:21 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /private/var/folders/nk/pq_9ypcs6_x5jdx99mrtszc80000gn/T/temporary-68706a2d-50ff-47ee-9dbf-f4d29d3563b0. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/06/04 17:10:21 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/06/04 17:10:21 WARN AdminClientConfig: The configuration 'key.deserializer' was supplied but isn't a known config.
23/06/04 17:10:21 WARN AdminClientConfig: The configuration 'value.deserializer' was supplied but isn't a known config.
23/06/04 17:10:21 WARN AdminClientConfig: The configuration 'enable.auto.commit' was supplied but isn't a known config.
23/06/04 17:10:21 WARN AdminClientConfig: The configuration '

Interrupted by CTRL-C. Stopped query
