# Window operation on event time
1. This document is mostly based on the 
<a href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#window-operations-on-event-time"> [link] </a>

2. Run the generator from data_generator.ipynb in the current folder. 

# Load SparkSession and Data

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

from pyspark.sql.functions import to_timestamp
from pyspark.sql import types

spark = SparkSession \
    .builder \
    .appName("Window Operations on Event Time") \
    .getOrCreate()

static_df = spark.read.json("./data/sample.txt")

schema = static_df.schema

df = spark.readStream.format("json").schema(schema).option("maxFilesPerTrigger", 1).load("./data/")

print(df)
static_df.show(5, truncate=False)

DataFrame[created_at: string, sentiment_level: string]
+-------------------+---------------+
|created_at         |sentiment_level|
+-------------------+---------------+
|2021-10-29T18:10:09|2              |
+-------------------+---------------+



# Window operation with water mark
### Conditions for watermarking to clean aggregation state <a href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#quick-example">[link]</a> 

In [2]:
from pyspark.sql.types import TimestampType
tsDF = df.select(col("created_at").cast(TimestampType()), col("sentiment_level"))

In [3]:
# If you set a WaterMark with a time, it blocks not to be data included x hours/minutes/seconds passed.

from pyspark.sql.functions import window

windowedCounts = tsDF \
    .withWatermark("created_at", "10 seconds") \
    .groupBy(
        window(col("created_at"), "10 seconds", "5 seconds").alias("created_at"),
        col("sentiment_level")) \
    .count()

# Save your calculated data into the folder.

In [5]:
launch = windowedCounts \
    .writeStream \
    .format("parquet") \
    .option("path", "./windowsedCounts_output/") \
    .option("checkpointLocation", "./windowsedCounts_checkpoint/") \
    .format("memory") \
    .queryName("df") \

.start()

In [6]:
spark.sql("select * from df")

AnalysisException: Table or view not found: df; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [df], [], false


In [None]:
import time

for _ in range(3):
    spark.sql("select * from df").sort("created_at", ascending=False).show(5, truncate=False)
    time.sleep(5)