# Streaming
Read socket stream and count words.

## Reference
* [Exploring Spark Structured Streaming](https://dzone.com/articles/exploring-spark-structured-streaming)

# Setup

In [None]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.streaming.Trigger

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

In [None]:
%%html
<!-- Float every rendered table to the left so tables in Markdown cells are left-aligned -->
<style>
table {float:left}
</style>

### Spark partition control based on core availability

In [None]:
// Sizing knobs: the number of shuffle partitions is derived as NUM_CORES * NUM_PARTITIONS.
val NUM_CORES = 2
val NUM_PARTITIONS = 2

// SparkSession is not part of the notebook's import cell; import it here so this
// cell compiles on its own (a duplicate import is harmless if the kernel provides it).
import org.apache.spark.sql.SparkSession

// spark.default.parallelism is a core (SparkContext-level) setting, so it is handed to
// the builder instead of being set through spark.conf.set on a live session, where it
// is not applied (and newer Spark versions may reject the call with an AnalysisException).
// NOTE(review): it only takes effect if no SparkContext exists yet when getOrCreate runs,
// and its value (8) differs from NUM_CORES * NUM_PARTITIONS (4) -- confirm which is intended.
lazy val spark: SparkSession = SparkSession.builder()
    .appName("stream_console")
    .config("spark.default.parallelism", "8")
    .getOrCreate()

// Runtime SQL setting: may be changed on an existing session.
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
Alternative settings kept for reference (not active).
NOTE(review): memory and master settings normally have to be supplied before the
application starts (spark-submit / spark-defaults.conf), not via spark.conf.set -- confirm.
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/
// Brings the Dataset encoders and the $"col" syntax into scope for the cells below.
import spark.implicits._

In [None]:
// Capture the session's runtime configuration as a Map[String, String] first, then print
// each (key, value) entry. Binding the result of foreach would leave configMap as Unit.
val configMap = spark.conf.getAll
configMap.foreach(println)

# Constant

In [None]:
// URI scheme prefix for paths on the local filesystem.
val PROTOCOL: String = "file://"
// Absolute, machine-specific directory for this project's data.
val DATA_DIR: String = "/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe"

# Main

# Stream

## Input Stream
Before starting the stream, open a listening socket on port 9999 with netcat.

```
nc -lk 9999
```

Read the lines typed into the netcat session from the socket.

In [None]:
// Streaming DataFrame fed by the text socket source on localhost:9999.
val lines = {
    // Connection settings for the socket source, passed in a single options(...) call.
    val socketOptions = Map("host" -> "localhost", "port" -> "9999")
    spark.readStream
        .format("socket")
        .options(socketOptions)
        .load()
}

// Split each line into words on runs of whitespace. Empty tokens are dropped so that
// blank lines, leading whitespace and repeated separators are not counted as a "" word.
val words = lines.as[String].flatMap(_.split("\\s+").filter(_.nonEmpty))

// Running count per distinct word; the words are grouped on the "value" column.
val wordCounts = words.groupBy("value").count()

## Output Stream
Write the word count.

In [None]:
// Launch the streaming query: every trigger prints the complete, updated table of
// word counts to the console.
val query = {
  val consoleSink = wordCounts.writeStream
    .format("console")
    .outputMode("complete")
  consoleSink.start()
}

// Block this cell until the query stops or fails.
query.awaitTermination()