In [1]:
// Streaming source: subscribe to the `stock-ticks` topic on the local Kafka broker.
// NOTE(review): Spark's Kafka source forwards consumer settings that carry a
// `kafka.` prefix; `group.id` below has no prefix — confirm it takes effect.
val kafkareadDf = spark.readStream
  .format("kafka")
  .options(Map(
    "kafka.bootstrap.servers" -> "localhost:9092",
    "subscribe" -> "stock-ticks",
    "group.id" -> "stock-ticks-group3-na123"
  ))
  .load()

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.80.128:4041
SparkContext available as 'sc' (version = 3.1.3, master = local[*], app id = local-1647892491148)
SparkSession available as 'spark'


kafkareadDf: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [2]:
// Print the column layout of the raw Kafka source DataFrame.
kafkareadDf.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [3]:
// Keep only the Kafka payload, converted from binary to a string column
// that is still named `value`.
val stockDf = kafkareadDf.select(col("value").cast("string"))
stockDf.printSchema()

root
 |-- value: string (nullable = true)



stockDf: org.apache.spark.sql.DataFrame = [value: string]


In [4]:

import org.apache.spark.sql.types.{StructField, StructType, DoubleType, StringType, LongType, TimestampType}

// Layout of one JSON tick message; every field is declared nullable.
// NOTE(review): `timestamp` is divided by 1000 further down, so it is presumably
// epoch milliseconds — confirm against the producer.
val schema = new StructType()
  .add("symbol", StringType, true)
  .add("volume", LongType, true)
  .add("price", DoubleType, true)
  .add("timestamp", LongType, true)

import org.apache.spark.sql.types.{StructField, StructType, DoubleType, StringType, LongType, TimestampType}
schema: org.apache.spark.sql.types.StructType = StructType(StructField(symbol,StringType,true), StructField(volume,LongType,true), StructField(price,DoubleType,true), StructField(timestamp,LongType,true))


In [5]:
// Replace the JSON text in `value` with a struct parsed according to `schema`.
val jsonStockDf = stockDf
  .withColumn("value", from_json(col("value"), schema))
jsonStockDf.printSchema()

root
 |-- value: struct (nullable = true)
 |    |-- symbol: string (nullable = true)
 |    |-- volume: long (nullable = true)
 |    |-- price: double (nullable = true)
 |    |-- timestamp: long (nullable = true)



jsonStockDf: org.apache.spark.sql.DataFrame = [value: struct<symbol: string, volume: bigint ... 2 more fields>]


In [6]:
// Promote the fields of the parsed `value` struct to top-level columns.
val valueDf = jsonStockDf.select(col("value.*"))
valueDf.printSchema()


root
 |-- symbol: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- price: double (nullable = true)
 |-- timestamp: long (nullable = true)



valueDf: org.apache.spark.sql.DataFrame = [symbol: string, volume: bigint ... 2 more fields]


In [7]:
// Target layout: Year=2022/Month=03/Day=18/Hour=01/Symbol=TSLA   for every minute, 1 file
//
// Adds a copy of `symbol` (SYMBOL1), a `time` timestamp column and string
// date-part columns derived from it, then drops the numeric `timestamp`.
val timeDf = {
  // Output column name -> date_format pattern, in the order the columns are appended.
  val dateParts = Seq(
    "YEAR"  -> "yyyy",
    "MONTH" -> "MM",
    "DAY"   -> "dd",
    "HOUR"  -> "HH",
    "MIN"   -> "mm"
  )

  // NOTE(review): the division by 1000 presumably scales epoch milliseconds
  // to seconds before the timestamp conversion — confirm against the producer.
  val withEventTime = valueDf
    .withColumn("SYMBOL1", col("symbol"))
    .withColumn("timestamp", col("timestamp") / 1000)
    .withColumn("time", to_timestamp(col("timestamp")))

  dateParts
    .foldLeft(withEventTime) { case (df, (name, pattern)) =>
      df.withColumn(name, date_format(col("time"), pattern))
    }
    .drop("timestamp")
}
timeDf.printSchema()



root
 |-- symbol: string (nullable = true)
 |-- volume: long (nullable = true)
 |-- price: double (nullable = true)
 |-- SYMBOL1: string (nullable = true)
 |-- time: timestamp (nullable = true)
 |-- YEAR: string (nullable = true)
 |-- MONTH: string (nullable = true)
 |-- DAY: string (nullable = true)
 |-- HOUR: string (nullable = true)
 |-- MIN: string (nullable = true)



timeDf: org.apache.spark.sql.DataFrame = [symbol: string, volume: bigint ... 8 more fields]


In [8]:

// val echoOnConsole = timeDf.writeStream.outputMode("update").option("truncate", false).format("console").start()



In [11]:
import org.apache.spark.sql.streaming.Trigger

// Continuously append the enriched ticks to HDFS as CSV (with header rows),
// firing one micro-batch every 60 seconds. Output is partitioned by the
// date-part columns and SYMBOL1, so `symbol` itself stays in the data files.
//
// Changes from the previous version:
//  - dropped option("truncate", false): it is a console-sink display option
//    and has no meaning for the CSV file sink;
//  - partition column names now match the DataFrame's column casing exactly,
//    so they no longer depend on case-insensitive name resolution;
//  - the StreamingQuery handle is bound to a val so the query can be
//    monitored or stopped later (e.g. csvTickQuery.stop()).
val csvTickQuery = timeDf.writeStream
  .queryName("Write ticks to csv2")
  .trigger(Trigger.ProcessingTime("60 seconds"))
  .format("csv")
  .option("path", "hdfs://localhost:9000/scala/layers/raw/csv/")
  .option("header", true)
  .option("checkpointLocation", "file:///tmp/spark43")
  .partitionBy("YEAR", "MONTH", "DAY", "HOUR", "SYMBOL1")
  .start()

import org.apache.spark.sql.streaming.Trigger
res7: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@45ebd5f9


In [15]:
import org.apache.spark.sql._
/**
 * Writes one streaming micro-batch to HDFS as CSV.
 *
 * Logs the batch id and row count on its own line, then appends the rows
 * under `hdfs://localhost:9000/Testing71/`, partitioned by
 * symbol/YEAR/MONTH/DAY/HOUR/MIN.
 *
 * The batch is used twice (count, then write), so it is cached for the
 * duration of the call and always released afterwards, following the
 * persist/unpersist pattern shown for `foreachBatch` in the Structured
 * Streaming guide.
 *
 * @param candleBatchDf rows of the current micro-batch
 * @param batch_id      identifier of the micro-batch, as supplied by `foreachBatch`
 */
def processBatchData(candleBatchDf: DataFrame, batch_id: Long): Unit = {
  candleBatchDf.persist()
  try {
    // println with an interpolated string: the former print(a, b, c, d) call
    // printed an auto-tupled value with no trailing newline.
    println(s"process batch called, batch $batch_id, writing ${candleBatchDf.count()} rows")

    candleBatchDf.write
      .mode("append")
      .format("csv")
      .partitionBy("symbol", "YEAR", "MONTH", "DAY", "HOUR", "MIN")
      .save("hdfs://localhost:9000/Testing71/")
  } finally {
    // Release the cached batch even when the write fails.
    candleBatchDf.unpersist()
  }
}


// Start a second streaming query over timeDf that hands every micro-batch
// to processBatchData. The method is passed as `processBatchData _`.
timeDf.writeStream
  .foreachBatch(processBatchData _)
  .option("checkpointLocation", "file:///tmp/spark371")
  .outputMode("append")
  .start()
        
        
  

import org.apache.spark.sql._
processBatchData: (candleBatchDf: org.apache.spark.sql.DataFrame, batch_id: Long)Unit
res11: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@2354e33d


(process batch called,0,writing ,0)(process batch called,1,writing ,1)(process batch called,2,writing ,1)(process batch called,3,writing ,1)(process batch called,4,writing ,1)(process batch called,5,writing ,1)(process batch called,6,writing ,1)(process batch called,7,writing ,1)(process batch called,8,writing ,1)(process batch called,9,writing ,1)(process batch called,10,writing ,1)(process batch called,11,writing ,1)(process batch called,12,writing ,1)(process batch called,13,writing ,1)(process batch called,14,writing ,1)(process batch called,15,writing ,1)(process batch called,16,writing ,1)(process batch called,17,writing ,1)(process batch called,18,writing ,1)(process batch called,19,writing ,1)(process batch called,20,writing ,1)(process batch called,21,writing ,1)(process batch called,22,writing ,1)(process batch called,23,writing ,1)(process batch called,24,writing ,1)(process batch called,25,writing ,1)(process batch called,26,writing ,1)(process batch called,27,writing ,1)(p