In [6]:
from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#, col, split, element_at, when, to_json, transform
from pyspark.sql.types import *
from json import loads

In [2]:
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('Streaming Application')
    .getOrCreate()
)

In [3]:
df = (
    spark.readStream.format('kafka') # specify source
    .option('kafka.bootstrap.servers', 'localhost:9092')
    .option('subscribe', 'Climate, Aqua, Terra')
    .load() # creates streaming dataframe
)

In [4]:
df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [7]:
schema = (
    StructType()
    .add('latitude', FloatType())
    .add('longitude', FloatType())
    .add('air_temperature_celcius', FloatType())
    .add('relative_humidity', FloatType())
    .add('windspeed_knots', FloatType())
    .add('max_wind_speed', FloatType())
    .add('precipitation', FloatType())
    .add('precipitation_flag', StringType())
    .add('GHI_w/m2', FloatType())
    .add('date', StringType())
    .add('station', StringType())
    .add('confidence', FloatType())
    .add('surface_temperature_celcius', FloatType()))

df2 = df.selectExpr('CAST(value AS STRING)').select(from_json('value', schema).alias('temp')).select('temp.*')

# query = df.writeStream.format('console').option('truncate', 'False').start()

In [8]:
df2.printSchema()

root
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- air_temperature_celcius: float (nullable = true)
 |-- relative_humidity: float (nullable = true)
 |-- windspeed_knots: float (nullable = true)
 |-- max_wind_speed: float (nullable = true)
 |-- precipitation: float (nullable = true)
 |-- precipitation_flag: string (nullable = true)
 |-- GHI_w/m2: float (nullable = true)
 |-- date: string (nullable = true)
 |-- station: string (nullable = true)
 |-- confidence: float (nullable = true)
 |-- surface_temperature_celcius: float (nullable = true)



In [46]:
df2 = (
    df.select(
        df.value.cast('string')
        .alias('data')
    ).rdd.flatMap(lambda x: loads(x.decode('utf-8')))
    .withColumn(
        'data', (
            when( col('data') == '', '*')
            .otherwise(col('data'))
        )
    )
)

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka

In [5]:
df2 = (
    df.select(
        df.value.cast('string')
        .alias('data')
    )
    .
    .withColumn(
        'data', (
            when( col('data') == '', '*')
            .otherwise(col('data'))
        )
    )
)

In [6]:
df2.printSchema()

root
 |-- data: string (nullable = true)



In [43]:
ads = spark.createDataFrame([(1, [1, 2, 3, 4])], ("key", "values"))
ads.printSchema()
ads.rdd.map(lambda x: x)

root
 |-- key: long (nullable = true)
 |-- values: array (nullable = true)
 |    |-- element: long (containsNull = true)



PythonRDD[23] at RDD at PythonRDD.scala:53

In [7]:
class DbWriter:
    # called at the start of processing each partition in each output micro-batch
    def open(self, partition_id, epoch_id):
        self.mongo_client = MongoClient(
            host='localhost',
            port=27017
        )
        self.db = self.mongo_client.fit_3182_assignment_db
        return True
    
    # called once per row of the result dataframe
    # the current code DOES NOT handle duplicate processing
    #   e.g., query fails and restarts just before current micro-batch was fully inserted
    def process(self, row):
        # insert code here!!!!!
#         row.collect()
        print(row)
#         self.db[topic_name].insert_one(row.asDict())
    
    # called once all rows have been processed (possibly with error)
    def close(self, err):
        self.mongo_client.close()

In [None]:
def process(df, epoch_id):
    

In [8]:
db_writer = (
    df2
    .writeStream
    .outputMode('complete')
#     .foreachBatch(DbWriter())
    .foreachBatch(process)
    .trigger(processingTime='10 seconds')
)

In [10]:
console_logger = (
    df2
    .writeStream
    .outputMode('complete')
    .foreachBatch(process)
    .format('console')
    .trigger(processingTime='10 seconds')
)

In [21]:
from json import loads

# db_writer = (
#     df
#     .writeStream
#     .outputMode('append')
#     .foreachBatch(DbWriter())
#     .trigger(processingTime='10 seconds')
# )

fields = (df
          .select(
              split(df.value.cast('string'), ',').alias('data'))
          .withColumn('latitude', element_at('data', 2))
#           .withColumn(
#               'data', (when( col('data') == '', '*')
#                        .otherwise(col('data'))
#                       )
#           )
         )
# groupBy("latitude").count()
fields.printSchema

<bound method DataFrame.printSchema of DataFrame[data: array<string>, latitude: string]>

In [19]:
def process_data(_iter):
    data = _iter.collect()

lines = df.foreachBatch(process_data)

AttributeError: 'DataFrame' object has no attribute 'foreachBatch'

In [11]:
writer = console_logger
# writer = db_writer

In [12]:
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopped query')
except StreamingQueryException as exc:
    print(exc)
finally:
    query.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/student/.local/lib/python3.8/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/student/.local/lib/python3.8/site-packages/py4j/clientserver.py", line 475, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


Interrupted by CTRL-C. Stopped query
