# Acceleration Example

### Imports

In [1]:
import org.apache.spark.streaming.{Seconds, Minutes, StreamingContext}
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

import org.apache.kafka.common.serialization.{BytesDeserializer, StringDeserializer}
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord}

import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.scala.DefaultScalaModule
import com.fasterxml.jackson.module.scala.experimental.ScalaObjectMapper

import collection.JavaConverters.mapAsJavaMapConverter

In [2]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession


// Comma-separated Maven coordinates for optional connectors.
// NOTE(review): this value is not passed to the session builder below, and it mixes
// _2.11/2.4.5 and _2.12/3.0.1 artifacts — confirm which Scala/Spark version the kernel runs.
val spark_jars = "org.apache.hadoop:hadoop-aws:3.2.0,org.postgresql:postgresql:42.2.18,org.apache.spark:spark-avro_2.12:3.0.1,org.apache.spark:spark-streaming-kafka-0-10_2.11:2.4.5,org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1,org.apache.kafka:kafka-clients:2.6.0,com.databricks:spark-xml_2.12:0.12.0"

// Obtain the SparkSession, requesting the Kafka SQL connector through spark.jars.packages.
// NOTE(review): getOrCreate() may hand back a session the kernel already started, in which
// case this package setting presumably has no effect — verify how the kernel loads packages.
val spark = SparkSession
  .builder
  .appName("StructuredNetworkWordCount")
  .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5")
  .getOrCreate()

// Implicit encoders/conversions used by later cells (e.g. `.as[(String, String)]`).
import spark.implicits._

In [3]:
// Subscribe to a single Kafka topic and expose it as a streaming DataFrame.
// Needs the Kafka SQL source on the classpath, e.g.
// org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 (artifact must match the kernel's Scala/Spark version).
// The topic name uses a hyphen so that it matches the topic used by every other cell
// and by the console-consumer command in this notebook.
val inputDF = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "sample-topic")
  .load()


Name: java.lang.ClassNotFoundException
Message: Failed to find data source: kafka. Please find packages at https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects
StackTrace:   at org.apache.spark.sql.execution.datasources.DataSource.lookupDataSource(DataSource.scala:148)
  at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:79)
  at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:79)
  at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:218)
  at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:80)
  at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:80)
  at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:30)
  at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:124)
  ... 46 elided
Caused by: java.lang.Cl

### Create Streaming Context

In [None]:
// Streaming context with a one-second batch interval.
// `sc` is not defined in this notebook; presumably the kernel provides the SparkContext.
val ssc = {
  val batchInterval = Seconds(1)
  new StreamingContext(sc, batchInterval)
}

// Remember duration of one minute for the context's streams.
ssc.remember(Minutes(1))

### Setup Kafka input stream

In [None]:
// Kafka consumer configuration for the direct stream.
// The key deserializer is StringDeserializer so that the runtime key type agrees with the
// [String, String] type parameters below (and with the StringSerializer used by the producer
// cell); BytesDeserializer would yield Bytes keys and fail with ClassCastException on access.
val consumerParams = Map[String, Object](
  "bootstrap.servers" -> "localhost:9092",
  "key.deserializer" -> classOf[StringDeserializer],
  "value.deserializer" -> classOf[StringDeserializer],
  "group.id" -> "spark-notebook",
  // Start from the earliest offset when the group has no committed offset.
  "auto.offset.reset" -> "earliest",
  // Offsets are not auto-committed by the Kafka consumer.
  "enable.auto.commit" -> (false: java.lang.Boolean)
)

// Topics consumed by the stream.
val topics = Array("sample-topic")

// Direct stream of ConsumerRecord[String, String] over the subscribed topics.
val stream = KafkaUtils.createDirectStream[String, String](
  ssc,
  PreferConsistent,
  Subscribe[String, String](topics, consumerParams)
)

### Expected Input Schema

{"time":1990,"type":"DOTA_COMBATLOG_DAMAGE","value":4,"attackername":"npc_dota_hero_huskar","targetname":"npc_dota_hero_huskar","sourcename":"npc_dota_hero_huskar","targetsourcename":"npc_dota_hero_huskar","attackerhero":true,"targethero":true,"attackerillusion":false,"targetillusion":false,"inflictor":"item_armlet"}

In [None]:
// Schema applied when parsing the JSON log records: only these four fields of the
// expected input are read, each declared as (column name, Spark SQL type name).
val schema = Seq(
  "time" -> "long",
  "type" -> "string",
  "value" -> "long",
  "attackername" -> "string"
).foldLeft(new StructType()) { case (acc, (name, dataType)) => acc.add(name, dataType) }

### Stream Processing

In [None]:
// For each micro-batch, parse the record values as JSON using `schema` and
// (re)register the result as the temp view "logs".
stream.foreachRDD { batch =>
    val payloads = batch.map(record => record.value())
    val logs = spark.read.schema(schema).json(payloads)
    logs.createOrReplaceTempView("logs")
}

In [None]:
// For each micro-batch:
//   1. parse the record values as JSON using `schema` and register them as the view "logs";
//   2. query the view, serialize every row back to JSON and publish it to Kafka.
// NOTE(review): the rows are written to "sample-topic", which is also the topic the stream
// subscribes to — this looks like a feedback loop; confirm the intended output topic.
stream.foreachRDD { rdd =>
  spark.read.schema(schema).json(rdd.map(_.value())).createOrReplaceTempView("logs")

  // The explicit parameter type avoids overload ambiguity of foreachPartition on Scala 2.12.
  spark.sql("select * FROM logs").toJSON.foreachPartition { (partition: Iterator[String]) =>

    val producerParams = Map[String, Object](
      ProducerConfig.BOOTSTRAP_SERVERS_CONFIG -> "localhost:9092",
      ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringSerializer",
      ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringSerializer"
    )

    // The producer is built inside the partition closure, one per partition.
    val producer = new KafkaProducer[String, String](producerParams.asJava)

    try {
      // Skip rows that serialize to the empty JSON object.
      partition
        .filter(_ != "{}")
        .foreach(s => producer.send(new ProducerRecord[String, String]("sample-topic", s)))
    } finally {
      // Always release the producer, even when a send fails part-way through the partition.
      producer.close()
    }
  }
}

### Start stream

In [None]:
// Start the StreamingContext `ssc` created earlier in this notebook.
ssc.start()

### Let's see what we actually read

In [None]:
%%SQL
select * from logs

### Stop stream

In [None]:
// Stop the active StreamingContext, if there is one, without stopping the SparkContext.
StreamingContext.getActive.foreach(active => active.stop(stopSparkContext = false))

#### Verify the contents in Kafka using the console consumer

The following command-line tool prints the contents of the topic to the console:
```sh
./bin/kafka-console-consumer.sh --topic sample-topic --bootstrap-server localhost:9092
```

In [None]:
// Subscribe to a single Kafka topic and expose it as a streaming DataFrame.
// Needs the Kafka SQL source on the classpath, e.g.
// org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 (artifact must match the kernel's Scala/Spark version).
val inputDF = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "sample-topic")
  .load()

// Cast key and value to strings and view the result as a typed Dataset[(String, String)].
// Uses `inputDF`: the previous reference to `df` named a value that is never defined.
inputDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").as[(String, String)]

### Stop stream

In [None]:
// Stop the active StreamingContext, if there is one, without stopping the SparkContext.
StreamingContext.getActive.foreach(active => active.stop(stopSparkContext = false))

#### Verify the contents in Kafka using the console consumer

The following command-line tool prints the contents of the topic to the console:
```sh
./bin/kafka-console-consumer.sh --topic sample-topic --bootstrap-server localhost:9092
```

In [7]:
// Subscribe to a single Kafka topic and expose it as a streaming DataFrame.
// Needs the Kafka SQL source on the classpath, e.g.
// org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5 (artifact must match the kernel's Scala/Spark version).
val inputDF = spark.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("subscribe", "sample-topic")
  .load()

// Cast key and value to strings and view the result as a typed Dataset[(String, String)].
// Uses `inputDF`: the previous reference to `df` named a value that is never defined.
inputDF.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)").as[(String, String)]

Name: java.lang.ClassNotFoundException
Message: Failed to find data source: kafka. Please find packages at https://cwiki.apache.org/confluence/display/SPARK/Third+Party+Projects
StackTrace:   at org.apache.spark.sql.execution.datasources.DataSource.lookupDataSource(DataSource.scala:148)
  at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:79)
  at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:79)
  at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:218)
  at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:80)
  at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:80)
  at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:30)
  at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:124)
  ... 50 elided
Caused by: java.lang.Cl