In [5]:

#     kafka-topics  --create --bootstrap-server localhost:9092 --replication-factor 1 --partitions 1 --topic stock-ticks 
#     kafka-console-consumer --bootstrap-server localhost:9092 --topic  stock-ticks   --from-beginning

In [7]:
// Streaming source: subscribe to the `stock-ticks` topic on the local Kafka broker.
// The resulting frame carries the standard Kafka source columns (key, value, topic,
// partition, offset, timestamp, timestampType).
//
// The previous `.option("group-id", ...)` was dropped: it is not an option the Kafka
// source recognises, so it never influenced the consumer group.
// NOTE(review): if a fixed consumer group is really required, the integration guide
// documents `kafka.group.id` / `groupIdPrefix` (newer Spark releases only) - confirm
// against the Spark version this notebook runs on before adding either.
val readfromkafkaDf = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "stock-ticks")
  .load()

readfromkafkaDf: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [8]:
readfromkafkaDf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [9]:
// Keep two columns from the Kafka frame: the binary `value` payload cast to a string,
// and the `timestamp` column supplied by the Kafka source.
val stockDf = readfromkafkaDf.selectExpr(
  "CAST(value as STRING)",
  "timestamp"
)
stockDf.printSchema()

root
 |-- value: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



stockDf: org.apache.spark.sql.DataFrame = [value: string, timestamp: timestamp]


In [10]:

import org.apache.spark.sql.types.{StructField, StructType, DoubleType, StringType, LongType, TimestampType}

// Schema of the JSON tick carried in the Kafka message value. All fields are nullable.
// `timestamp` is parsed as a long; a later cell divides it by 1000 before converting
// it to a timestamp (presumably epoch milliseconds - confirm with the producer).
val schema = new StructType()
  .add("symbol", StringType, nullable = true)
  .add("volume", LongType, nullable = true)
  .add("price", DoubleType, nullable = true)
  .add("timestamp", LongType, nullable = true)

import org.apache.spark.sql.types.{StructField, StructType, DoubleType, StringType, LongType, TimestampType}
schema: org.apache.spark.sql.types.StructType = StructType(StructField(symbol,StringType,true), StructField(volume,LongType,true), StructField(price,DoubleType,true), StructField(timestamp,LongType,true))


In [11]:
// Replace the string `value` column with a struct parsed from its JSON text using
// `schema`; the Kafka `timestamp` column is carried along unchanged.
val stringToJsonStockDf = {
  val parsedTick = from_json($"value", schema)
  stockDf.withColumn("value", parsedTick)
}
stringToJsonStockDf.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- symbol: string (nullable = true)
 |    |-- volume: long (nullable = true)
 |    |-- price: double (nullable = true)
 |    |-- timestamp: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)



stringToJsonStockDf: org.apache.spark.sql.DataFrame = [value: struct<symbol: string, volume: bigint ... 2 more fields>, timestamp: timestamp]


In [12]:
// Flatten the parsed struct: promote every field of `value` to a top-level column.
// The outer Kafka `timestamp` column is not selected, so only the tick's own fields remain.
val valueDf = stringToJsonStockDf.select($"value.*")
valueDf.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- price: double (nullable = true)
 |-- timestamp: long (nullable = true)



valueDf: org.apache.spark.sql.DataFrame = [symbol: string, volume: bigint ... 2 more fields]


In [14]:

// Derive the event-time column `time` from the tick's numeric `timestamp` field.
// The value is divided by 1000 and the quotient (a double) is cast to a timestamp,
// i.e. interpreted as seconds since the epoch with the fractional part preserved.
//
// An explicit cast is used instead of `to_timestamp(...)`: that function is documented
// for string input, and relying on how it coerces a numeric column is fragile.
// The original long `timestamp` column is dropped once `time` has been added.
val timeDf = valueDf
  .withColumn("time", (col("timestamp") / 1000).cast("timestamp"))
  .drop("timestamp")
timeDf.printSchema()

root
 |-- symbol: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- price: double (nullable = true)
 |-- time: timestamp (nullable = true)



timeDf: org.apache.spark.sql.DataFrame = [symbol: string, volume: bigint ... 2 more fields]


In [31]:
import org.apache.spark.sql.functions._
// Per-symbol aggregates over tumbling 60-second windows of `time`:
//   volume - sum of tick volumes
//   high   - maximum price
//   low    - minimum price
//   first  - price of the earliest tick in the window
//   last   - price of the latest tick in the window
//
// `first`/`last` are computed as min/max over struct(time, price) rather than with the
// first()/last() aggregates: those depend on row arrival order, which is not guaranteed
// after a shuffle. Structs compare field by field, so `time` decides and `price` only
// breaks ties between ticks that share the same time.
// NOTE(review): no watermark is defined on `time`, so aggregation state is never
// evicted - consider adding withWatermark once an acceptable lateness is agreed.
val stock1minDf = timeDf
  .groupBy($"symbol", window($"time", "60 Seconds"))
  .agg(
    sum("volume").alias("volume"),
    max("price").alias("high"),
    min("price").alias("low"),
    min(struct($"time", $"price")).getField("price").alias("first"),
    max(struct($"time", $"price")).getField("price").alias("last")
  )
stock1minDf.printSchema()


//    import org.apache.spark.sql.functions._
//    df.groupBy("department").agg(max("age"), sum("expense"))

 // val echoOnConsole = stock1minDf
 //                 .writeStream
 //                 .outputMode("update")
 //                 .format("console")
 //                 .option("truncate", false)
 //                 .start()
                              

root
 |-- symbol: string (nullable = true)
 |-- window: struct (nullable = false)
 |    |-- start: timestamp (nullable = true)
 |    |-- end: timestamp (nullable = true)
 |-- volume: long (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- first: double (nullable = true)
 |-- last: double (nullable = true)



import org.apache.spark.sql.functions._
stock1minDf: org.apache.spark.sql.DataFrame = [symbol: string, window: struct<start: timestamp, end: timestamp> ... 5 more fields]


In [None]:
// Per-symbol aggregates over tumbling 3-minute windows of `time`
// (volume, high, low, first, last).
// Ported from the PySpark form (`F.` prefix, backslash continuations, no `val`), which
// cannot compile in this Scala notebook.
//
// `first`/`last` use min/max over struct(time, price) so they are the prices of the
// earliest/latest tick in the window; the first()/last() aggregates depend on row
// arrival order, which is not guaranteed after a shuffle.
val stock3minDf = timeDf
  .groupBy($"symbol", window($"time", "3 minutes"))
  .agg(
    sum("volume").alias("volume"),
    max("price").alias("high"),
    min("price").alias("low"),
    min(struct($"time", $"price")).getField("price").alias("first"),
    max(struct($"time", $"price")).getField("price").alias("last")
  )
stock3minDf.printSchema()

In [None]:
// Per-symbol aggregates over tumbling 5-minute windows of `time`
// (volume, high, low, first, last).
// Ported from the PySpark form (`F.` prefix, backslash continuations, no `val`), which
// cannot compile in this Scala notebook.
//
// `first`/`last` use min/max over struct(time, price) so they are the prices of the
// earliest/latest tick in the window; the first()/last() aggregates depend on row
// arrival order, which is not guaranteed after a shuffle.
val stock5minDf = timeDf
  .groupBy($"symbol", window($"time", "5 minutes"))
  .agg(
    sum("volume").alias("volume"),
    max("price").alias("high"),
    min("price").alias("low"),
    min(struct($"time", $"price")).getField("price").alias("first"),
    max(struct($"time", $"price")).getField("price").alias("last")
  )

stock5minDf.printSchema()


In [33]:
// Shape the 1-minute aggregates for the Kafka sink: remove the `window` struct, then
// pack all remaining columns into one JSON string column named `value`.
// NOTE(review): without `window` the emitted JSON no longer says which minute a row
// belongs to - confirm downstream consumers do not need the bucket start/end.
val stock1minDfTojsonDf = stock1minDf
  .drop("window")
  .selectExpr("to_json(struct(*)) AS value")
stock1minDfTojsonDf.printSchema()

// val stock3minDfTojsonDf = stock3minDf.drop("window").selectExpr("to_json(struct(*)) AS value")
// stock3minDfTojsonDf.printSchema()

// val stock5minDfTojsonDf = stock5minDf.drop("window").selectExpr("to_json(struct(*)) AS value")
// stock5minDfTojsonDf.printSchema()

root
 |-- value: string (nullable = true)



stock1minDfTojsonDf: org.apache.spark.sql.DataFrame = [value: string]


In [35]:
// Start the streaming query that publishes the JSON `value` column to the Kafka topic
// `scala-stock-ticks-1min` on the local broker, in "update" output mode, with the
// checkpoint kept under file:///tmp/spark6. The StreamingQuery handle returned by
// start() is the cell's result.
stock1minDfTojsonDf.writeStream
  .format("kafka")
  .outputMode("update")
  .options(Map(
    "kafka.bootstrap.servers" -> "localhost:9092",
    "topic" -> "scala-stock-ticks-1min",
    "checkpointLocation" -> "file:///tmp/spark6"
  ))
  .start()


// stock3minDfTojsonDf.writeStream
//              .format("kafka")
//             .outputMode("update")
//              .option("kafka.bootstrap.servers", "localhost:9092")
//             .option("topic", "stock-ticks-3min")
//             .option("checkpointLocation", "file:///tmp/spark7")
//             .start()

// stock5minDfTojsonDf.writeStream
//              .format("kafka")
//             .outputMode("update")
//              .option("kafka.bootstrap.servers", "localhost:9092")
//             .option("topic", "stock-ticks-5min")
//             .option("checkpointLocation", "file:///tmp/spark8")
//             .start()

res19: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@567170d6
