# Window operation on event time
This document is mostly based on the 
<a href="https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#window-operations-on-event-time"> [link] </a>

# Load SparkSession and Data

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType
from pyspark.sql.functions import udf
from pyspark.sql.functions import col

from pyspark.sql.functions import to_timestamp
from pyspark.sql import types

spark = SparkSession \
    .builder \
    .appName("Window Operations on Event Time") \
    .getOrCreate()

static_df = spark.read.json("./data/")
schema = static_df.schema
df = spark.readStream.json("./data/", schema=schema)
print(df)
static_df.take(5)

DataFrame[created_at: string, sentiment_level: bigint]


[Row(created_at='12:3:2 +0009', sentiment_level=1),
 Row(created_at='3:11:43 +0000', sentiment_level=1),
 Row(created_at='0:1:12 +0000', sentiment_level=1),
 Row(created_at='16:2:27 +0000', sentiment_level=2),
 Row(created_at='18:15:24 +0000', sentiment_level=0)]

# Preprocessing for window operation

In [32]:
import datetime

def from_created_at(x):
    """
    parsing format : "https://docs.python.org/3/library/datetime.html#datetime.date"
    
    The valuable of 'x' has a form of '07:02:44 +0000' 
    """
    dt = datetime.datetime.strptime(x, "%H:%M:%S %z")
    return dt.isoformat()

from_created_at_udf = udf(lambda x: from_created_at(x))

tsDF = df.select(
    to_timestamp(from_created_at_udf(col("created_at"))).alias("created_at"),
    col("sentiment_level").cast(types.IntegerType())
)
tsDF

DataFrame[created_at: timestamp, sentiment_level: int]

In [33]:
from_created_at("12:3:2 +0009")

'1900-01-01T12:03:02+00:09'

In [34]:
from_created_at("12:3:2 +0000")


'1900-01-01T12:03:02+00:00'

# Window operation

In [35]:
from pyspark.sql.functions import window

In [36]:
windowsedCounts = tsDF.groupBy(
    window(tsDF.created_at, "4 hours", "2 hours").alias("created_at"),
    tsDF.sentiment_level
).count()

In [37]:
windowsedCounts02 = tsDF.groupBy(
    window(tsDF.created_at, "10 minutes", "5 minutes").alias("created_at"),
    tsDF.sentiment_level
).count()

In [38]:
launch = windowsedCounts \
    .writeStream \
    .outputMode("complete") \
    .queryName("df") \
    .format("memory") \
    .start()

In [39]:
launch = windowsedCounts \
    .writeStream \
    .outputMode("complete") \
    .queryName("df02") \
    .format("memory") \
    .start()

In [42]:
spark.sql("select * from df").sort("created_at").take(10)

[Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 6, 27, 52), end=datetime.datetime(1900, 1, 1, 10, 27, 52)), sentiment_level=2, count=3),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 6, 27, 52), end=datetime.datetime(1900, 1, 1, 10, 27, 52)), sentiment_level=1, count=2),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 6, 27, 52), end=datetime.datetime(1900, 1, 1, 10, 27, 52)), sentiment_level=0, count=1),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 8, 27, 52), end=datetime.datetime(1900, 1, 1, 12, 27, 52)), sentiment_level=1, count=4),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 8, 27, 52), end=datetime.datetime(1900, 1, 1, 12, 27, 52)), sentiment_level=0, count=2),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 8, 27, 52), end=datetime.datetime(1900, 1, 1, 12, 27, 52)), sentiment_level=2, count=4),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 10, 27, 52), end=datetime.datetime(1900, 1, 1, 14, 27, 52)), sentiment_lev

In [50]:
spark.sql("select * from df02").sort("created_at").take(10)

[Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 6, 27, 52), end=datetime.datetime(1900, 1, 1, 10, 27, 52)), sentiment_level=0, count=1),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 6, 27, 52), end=datetime.datetime(1900, 1, 1, 10, 27, 52)), sentiment_level=1, count=2),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 6, 27, 52), end=datetime.datetime(1900, 1, 1, 10, 27, 52)), sentiment_level=2, count=3),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 8, 27, 52), end=datetime.datetime(1900, 1, 1, 12, 27, 52)), sentiment_level=2, count=4),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 8, 27, 52), end=datetime.datetime(1900, 1, 1, 12, 27, 52)), sentiment_level=0, count=2),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 8, 27, 52), end=datetime.datetime(1900, 1, 1, 12, 27, 52)), sentiment_level=1, count=4),
 Row(created_at=Row(start=datetime.datetime(1900, 1, 1, 10, 27, 52), end=datetime.datetime(1900, 1, 1, 14, 27, 52)), sentiment_lev

In [30]:
spark.stop()