In [None]:
import pprint
from json import loads

from pymongo import MongoClient
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#, col, split, element_at, when, to_json, transform
from pyspark.sql.types import * #StructType #StringType
from pyspark.sql.types import *
from pyspark.sql.utils import StreamingQueryException

In [None]:
# Obtain (or reuse) a SparkSession that runs locally on every available core.
spark = (
    SparkSession.builder
    .appName('Streaming Application')
    .master('local[*]')
    .getOrCreate()
)

In [None]:
# Streaming DataFrame backed by the Kafka broker on localhost:9092,
# subscribed to the Climate, Aqua and Terra topics.
df = (
    spark.readStream
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': 'localhost:9092',
        'subscribe': 'Climate, Aqua, Terra',
    })
    .load()
)

In [None]:
# Show the column layout of the raw Kafka streaming DataFrame.
df.printSchema()

In [None]:
# Expected layout of the JSON payload carried in each Kafka message value.
# Every field is nullable (the StructField default), so a key missing from
# a given message simply becomes null.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in (
        ('latitude', FloatType),
        ('longitude', FloatType),
        ('air_temperature_celcius', FloatType),
        ('relative_humidity', FloatType),
        ('windspeed_knots', FloatType),
        ('max_wind_speed', FloatType),
        ('precipitation', FloatType),
        ('precipitation_flag', StringType),
        ('GHI_w/m2', FloatType),
        ('date', StringType),
        ('station', StringType),
        ('confidence', FloatType),
        ('surface_temperature_celcius', FloatType),
    )
])

# Decode the Kafka value as text, parse it against `schema`, then flatten
# the parsed struct so each field becomes a top-level column.
df2 = (
    df
    .selectExpr('CAST(value AS STRING)')
    .select(from_json('value', schema).alias('temp'))
    .select('temp.*')
)

# query = df2.writeStream.format('console').option('truncate', 'False').trigger(processingTime='10 seconds').start()

In [None]:
# Show the column layout of the parsed (flattened) streaming DataFrame.
df2.printSchema()

In [None]:
def process(df, epoch_id):
    """Write one streaming micro-batch to MongoDB and echo what was stored.

    Meant to be used as a ``foreachBatch`` callback, where ``df`` is the
    whole micro-batch rather than a single row. Each row is converted to a
    dict and the batch is inserted into the ``test`` collection of the
    ``fit_3182_assignment_db`` database on ``localhost:27017``.

    Args:
        df: The micro-batch. Must provide ``collect()`` returning rows that
            each provide ``asDict()``.
        epoch_id: Batch identifier passed by the caller; not used.
    """
    documents = [row.asDict(recursive=True) for row in df.collect()]
    if not documents:
        # Nothing arrived in this trigger interval; avoid handing an empty
        # list to insert_many and skip opening a connection at all.
        return

    mongo_client = MongoClient(host='localhost', port=27017)
    try:
        collection = mongo_client.fit_3182_assignment_db.test
        collection.insert_many(documents)
        pprint.pprint(documents)
    finally:
        # Release the connection even when the insert fails.
        mongo_client.close()

In [None]:
# MongoDB sink: every 10 seconds the current micro-batch is handed to
# `process`. The mode is 'append' because df2 contains no streaming
# aggregation, and Spark rejects 'complete' mode for such queries.
db_writer = (
    df2
    .writeStream
    .outputMode('append')
    .foreachBatch(process)
    .trigger(processingTime='10 seconds')
)

In [None]:
# Console sink: every 10 seconds the newly arrived rows are printed.
# The mode is 'append' because df2 contains no streaming aggregation,
# and Spark rejects 'complete' mode for such queries.
console_logger = (
    df2
    .writeStream
    .outputMode('append')
    .format('console')
    .trigger(processingTime='10 seconds')
)

In [None]:
# Select which sink definition gets started below; swap the comment
# marker to run the MongoDB sink instead of the console sink.
writer = console_logger
# writer = db_writer

In [None]:
# Start the selected sink and block until it terminates.
# `query` is pre-set to None so that a failure inside `writer.start()`
# surfaces as itself instead of a NameError raised from the `finally`.
query = None
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopped query')
except StreamingQueryException as exc:
    print(exc)
finally:
    if query is not None:
        query.stop()

In [None]:
import pprint

def process(df, epoch_id):
    """Write one streaming micro-batch to MongoDB and echo what was stored.

    Meant to be used as a ``foreachBatch`` callback, where ``df`` is the
    whole micro-batch rather than a single row. Each row is converted to a
    dict and the batch is inserted into the ``test`` collection of the
    ``fit_3182_assignment_db`` database on ``localhost:27017``.

    Args:
        df: The micro-batch. Must provide ``collect()`` returning rows that
            each provide ``asDict()``.
        epoch_id: Batch identifier passed by the caller; not used.
    """
    documents = [row.asDict(recursive=True) for row in df.collect()]
    if not documents:
        # Nothing arrived in this trigger interval; avoid handing an empty
        # list to insert_many and skip opening a connection at all.
        return

    mongo_client = MongoClient(
        host='localhost',
        port=27017
    )
    try:
        collection = mongo_client.fit_3182_assignment_db.test
        collection.insert_many(documents)
        pprint.pprint(documents)
    finally:
        # Release the connection even when the insert fails.
        mongo_client.close()
    
# MongoDB sink: every 10 seconds the current micro-batch is handed to
# `process`. The mode is 'append' because df2 contains no streaming
# aggregation, and Spark rejects 'complete' mode for such queries.
db_writer = (
    df2
    .writeStream
    .outputMode('append')
    .foreachBatch(process)
    .trigger(processingTime='10 seconds')
)

# Console sink: prints the rows that arrived since the previous trigger,
# firing once every 10 seconds.
console_logger = (
    df2.writeStream
    .format('console')
    .trigger(processingTime='10 seconds')
    .outputMode('append')
)

# Select which sink definition gets started below; swap the comment
# marker to run the MongoDB sink instead of the console sink.
writer = console_logger
# writer = db_writer

# Start the selected sink and block until it terminates.
# `query` is pre-set to None so that a failure inside `writer.start()`
# surfaces as itself instead of a NameError raised from the `finally`.
query = None
try:
    query = writer.start()
    query.awaitTermination()
except KeyboardInterrupt:
    print('Interrupted by CTRL-C. Stopped query')
except StreamingQueryException as exc:
    print(exc)
finally:
    if query is not None:
        query.stop()

In [None]:
# Raw message text in a single `data` column, with empty-string payloads
# replaced by '*'. Null payloads are left as null.
df_old = (
    df
    .select(df['value'].cast('string').alias('data'))
    .withColumn(
        'data',
        when(col('data') == '', '*').otherwise(col('data')),
    )
)

In [None]:
# Parsed stream that also carries the message key.
# The key is split on '_' and its first two parts become `producer` and
# `seq` -- presumably keys look like '<producer>_<seq>'; confirm against
# the Kafka producers.
# The value is parsed with `from_json` against `schema` (defined in an
# earlier cell) because 'temp.*' can only expand a struct column; the
# plain string cast used previously cannot be star-expanded.
df2 = (
    df.select(
        split(df.key.cast('string'), '_').alias('producer'),
        from_json(df.value.cast('string'), schema).alias('temp'),
    )
    .select('producer', 'temp.*')
    # `seq` must be extracted before `producer` is overwritten with its
    # own first element (element_at is 1-based for arrays).
    .withColumn('seq', element_at('producer', 2))
    .withColumn('producer', element_at('producer', 1))
)