## Data Lab: Spark Streaming
> Streaming demo

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sqlfn
from pyspark.sql import types as sqlt

In [None]:
# Create (or reuse) the Spark session for this lab; 'local[2]' runs Spark
# locally with two worker threads.
spark = (
    SparkSession.builder
    .appName('SparkStreamingLab1')
    .master('local[2]')
    .getOrCreate()
)

In [None]:
# Declare the schema explicitly rather than inferring it, so the column names
# and types of the stream cannot change from one input file to the next.
schema_employee = sqlt.StructType([
    sqlt.StructField(column_name, column_type, True)  # True => nullable
    for column_name, column_type in (
        ('employee_id', sqlt.IntegerType()),
        ('department_name', sqlt.StringType()),
        ('name', sqlt.StringType()),
        ('last_name', sqlt.StringType()),
        ('hire_timestamp', sqlt.TimestampType()),
    )
])

In [None]:
# Read stream: treat the CSV directory as a streaming source, applying the
# explicit employee schema to every file that is picked up.
df_employees = (
    spark.readStream
    .format('csv')
    .schema(schema_employee)
    .option('header', True)            # first line of each file is a header
    .option('maxFilesPerTrigger', 1)   # at most one new file per trigger
    .load(r'datasets/csv/')
)

In [None]:
# Is my stream activated? Check that df_employees is a streaming DataFrame
# (expected to display True, since it was created through spark.readStream).
df_employees.isStreaming

In [None]:
# Show schema: print the column names and types of df_employees so they can
# be compared against the schema that was enforced on the reader.
df_employees.printSchema()

In [None]:
# Add aggregation: for every (department_name, hire_timestamp) group, count
# the non-null employee_id values and keep only groups with a count above 1.
# The watermark tolerates events arriving up to 10 minutes late, measured on
# hire_timestamp.
# NOTE(review): hire_timestamp is both a grouping key and the input of max(),
# so the max column always equals the key -- confirm this is intended.
df_large_teams = (
    df_employees
    .withWatermark("hire_timestamp", "10 minutes")
    .groupBy('department_name', 'hire_timestamp')
    .agg(
        sqlfn.count('employee_id').alias('count'),
        sqlfn.max('hire_timestamp'),
    )
    .where('count > 1')
)

In [None]:
# Start the streaming query: in 'complete' mode the whole aggregated result
# is written to the console sink after every trigger.
df_stream_large_teams = (
    df_large_teams.writeStream
    .format('console')
    .outputMode('complete')
    .start()
)

Example of appending streamed data to storage:

```python
# Write the aggregated stream to CSV files in append mode; the checkpoint
# directory stores the query's progress.
df_stream_large_teams = (
    df_large_teams.writeStream
    .format('csv')
    .outputMode('append')
    .option("path", "output/large_depts/")
    .option("checkpointLocation", "datasets/checkpoints/")
    .start()
)
```

In [None]:
# Stop stream: end the running streaming query referenced by
# df_stream_large_teams (the object returned by writeStream...start()).
df_stream_large_teams.stop()