# Spark Structured Streaming from Files

Here we use a Spark session to read a stream from a folder, continuously picking up files from that folder as streaming input.

In [None]:
from pyspark.sql import SparkSession

# Get the existing Spark session or create one, using the 'local[*]' master
# and a descriptive application name.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("Structured Streaming CWL")
    .getOrCreate()
)

To read from a stream, a schema has to be defined. You can pre-define the schema based on the data. Here we first put some files in the folder, infer the schema from those files, and then apply the same schema when reading the stream.

In [None]:
# Folder holding the JSON files; it is read once here as a static (batch)
# DataFrame purely to obtain a schema.
file_dir = './spark_streaming_cwl/'

# The JSON reader infers the schema on its own when none is supplied, so no
# 'inferSchema' option is needed (that option belongs to the CSV reader).
# 'maxFilesPerTrigger' is a streaming-source option and has no effect on a
# batch read, so it is omitted here as well.
static_df = spark.read.json(file_dir)

In [None]:
# Streaming read of the same folder, reusing the schema taken from the static
# DataFrame, with maxFilesPerTrigger set to 1.
streaming_df = (
    spark.readStream
    .schema(static_df.schema)
    .option("maxFilesPerTrigger", 1)
    .json(file_dir)
)

# Structured Dataframe API Operation
Here we simply count the number of records for each value of the `map` column.

In [None]:
# Group the stream by its "map" column and count the rows in each group.
streaming_count_df = streaming_df.groupBy(streaming_df["map"]).count()

# Cell output: whether the aggregated DataFrame is a streaming DataFrame.
streaming_count_df.isStreaming

# Output through query

In [None]:
# Start the streaming query that writes the counts to an in-memory table:
#   format="memory"       -> the sink is an in-memory table
#   queryName="counts"    -> name of that in-memory table
#   outputMode="complete" -> all the counts should be in the table
query = streaming_count_df.writeStream.start(
    format="memory",
    queryName="counts",
    outputMode="complete",
)

# Check the query results

In [None]:
# Query the table named "counts" with SQL and print the rows returned.
spark.sql("select * from counts").show()

In [None]:
# Stop the streaming query started earlier in this notebook.
query.stop()