# Exercise 2

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession, Window, Row, DataFrame
import pyspark.sql.types as T
import pyspark.sql.functions as F

## Initialize Spark

In [2]:
# Reuse the active session if one exists (getOrCreate), otherwise start a new
# one registered under the application name 'StructuredStreaming'.
spark: SparkSession = SparkSession.builder.appName('StructuredStreaming').getOrCreate()

23/06/03 13:09:59 WARN Utils: Your hostname, martinho-SATELLITE-L50-B resolves to a loopback address: 127.0.1.1; using 192.168.1.66 instead (on interface enp8s0)
23/06/03 13:09:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/06/03 13:10:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Prepare the data

In [132]:
# TCP port on localhost that the socket source below connects to.
PORT = 9999

In [141]:
# Streaming DataFrame built from text lines read off a local socket.
# Each line is split on ',' into at most three fields, used as:
#   field 0 -> 'item', field 1 -> 'n' (integer), field 2 -> 'timestamp'.
items = (
    spark.readStream
    .format('socket')
    .options(host='localhost', port=PORT)
    .load()
    # Treat each incoming line as a CSV record
    .select(F.split(F.col('value'), ',', limit=3).alias('split_cols'))
    .select(
        F.to_timestamp(F.col('split_cols').getItem(2)).alias('timestamp'),
        # field 1 interpreted as seconds since the epoch
        F.timestamp_seconds(
            F.col('split_cols').getItem(1).cast(T.IntegerType())
        ).alias('timestamp_n'),
        F.col('split_cols').getItem(1).cast(T.IntegerType()).alias('n'),
        F.col('split_cols').getItem(0).alias('item'),
    )
)

23/06/03 16:14:13 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


In [5]:
# Sanity check: `items` must be a streaming (unbounded) DataFrame, not a batch one.
assert items.isStreaming

## DGIM

In [7]:
from typing import List
from pyspark.accumulators import AccumulatorParam
from itertools import groupby

class DGIMBuckets(AccumulatorParam):
    """Accumulator parameter that maintains DGIM buckets for a stream of bits.

    The accumulated value is a list of ``(size, end_timestamp)`` tuples, where
    ``size`` is the number of 1-bits summarised by the bucket and
    ``end_timestamp`` is the timestamp of the most recent bit it covers.
    After every update at most two buckets of any given size remain, and the
    list is ordered by ascending ``end_timestamp``.

    Timestamps only have to be mutually orderable (e.g. ``int`` or
    ``datetime``); no arithmetic is performed on them.
    """

    def zero(self, value):
        """Return the empty bucket list used as the initial value.

        ``value`` (the initial value handed over by Spark) is ignored.
        """
        return []

    def addInPlace(self, buckets, new_bit_timestamp):
        """Fold a new 1-bit, or another bucket list, into ``buckets``.

        Args:
            buckets: current list of ``(size, end_timestamp)`` tuples; it is
                modified in place.
            new_bit_timestamp: either the timestamp of a newly observed 1-bit,
                or a list of buckets. Spark calls this method with a list when
                it merges the partial result of a task into the driver value,
                so both shapes must be supported.

        Returns:
            The same ``buckets`` list, normalised so that no size occurs more
            than twice and sorted by ascending ``end_timestamp``.
        """
        if isinstance(new_bit_timestamp, list):
            # Merging partial results: take over the other side's buckets
            # (an empty list is a no-op apart from re-normalising).
            buckets.extend(new_bit_timestamp)
        else:
            buckets.append((1, new_bit_timestamp))

        # Index the end timestamps by bucket size so sizes can be processed
        # from smallest to largest, carrying merged buckets upwards.
        stamps_by_size = {}
        for bucket_size, end_timestamp in buckets:
            stamps_by_size.setdefault(bucket_size, []).append(end_timestamp)

        normalized = []
        while stamps_by_size:
            # Merged buckets are always filed under a strictly larger size,
            # so taking the minimum each round visits every size exactly once.
            bucket_size = min(stamps_by_size)
            end_timestamps = sorted(stamps_by_size.pop(bucket_size))

            while len(end_timestamps) > 2:
                # Merge the two oldest buckets of this size; the merged bucket
                # keeps the later of their two end timestamps.
                merged_end = end_timestamps[1]
                del end_timestamps[:2]
                stamps_by_size.setdefault(bucket_size * 2, []).append(merged_end)

            normalized.extend((bucket_size, ts) for ts in end_timestamps)

        normalized.sort(key=lambda bucket: bucket[1])

        # Keep the caller's list object: replace its contents in place.
        buckets[:] = normalized
        return buckets

In [103]:
# Accumulator holding the DGIM bucket list; it starts empty and is combined
# through DGIMBuckets.addInPlace.
dgim_buckets = spark.sparkContext.accumulator([], DGIMBuckets())

In [9]:
def dgim_process(row: Row):
    """Feed one streamed row into the ``dgim_buckets`` accumulator.

    Rows whose ``item`` field equals the string ``'1'`` contribute their
    ``timestamp`` field to the accumulator; every other row is ignored.

    Args:
        row: a row exposing the ``'timestamp'`` and ``'item'`` fields.
    """
    timestamp = row['timestamp']
    bit = row['item']
    if bit == '1':
        # Call .add() instead of using ``+=``: an augmented assignment inside
        # the function would turn ``dgim_buckets`` into a local name and raise
        # UnboundLocalError before the accumulator is ever touched.
        dgim_buckets.add(timestamp)

In [142]:
N = 1000    # N determines whether we should halve the counts for the last bucket (if end_timestamp with bucket_size surpasses N)
t = 5       # t determines the frequency with which the counts are computed
k = 900     # k determines the window size

# The same quantities as interval strings ('<x> seconds') for Spark's time-based APIs
Ns = f'{N} seconds'
ks = f'{k} seconds'
ts = f'{t} seconds'

# NOTE(review): `w` is not used by the query below. partitionBy() treats every
# string as a column name, so '1 day' refers to a (non-existent) column rather
# than a time interval -- confirm the intent before using this spec.
w = (Window
    .partitionBy('timestamp', '1 day')
    .orderBy(F.desc('timestamp'))                       # descending order so that assignment of integer timestamp starts at the most recent entries
    # .rowsBetween(Window.unboundedPreceding, Window.currentRow)                 # we want to consider the last k rows only
)

# Streaming query: tag every row with bucket = floor(log2(n)) and append it to
# the in-memory table 'outputterMem', triggering a micro-batch every `ts`.
outputter = (items
    # .withColumn('timestamp', F.current_timestamp())     # add the processing timestamp to the event
    # .withWatermark('timestamp', Ns)                # discard events that are at least 1 day old
    .withColumn('bucket', F.floor(F.log2('n')))
    # Windowed aggregation is disabled: groupby() alone returns GroupedData,
    # which has no writeStream. To enable it, finish it with an aggregate
    # (e.g. .count() / .sum()) and, for 'append' output mode, enable the
    # watermark above.
    # .groupby(F.window('timestamp', ks), F.floor(F.log2('n')))
    # .sum()

    .writeStream
    .trigger(processingTime=ts)
    .outputMode('append')
    .format('memory')
    .queryName('outputterMem')
    .start()
)

23/06/03 16:14:16 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-f426ee2e-63ce-42a5-bc5b-38bebcef0b92. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/06/03 16:14:16 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


                                                                                

In [151]:
# Show the last 20 rows collected so far in the in-memory sink table 'outputterMem'.
spark.sql('SELECT * FROM outputterMem').tail(20)

[Row(timestamp=datetime.datetime(2023, 6, 3, 16, 14, 41, 823579), timestamp_n=datetime.datetime(1970, 1, 1, 1, 0, 20), n=20, item='0', bucket=4),
 Row(timestamp=datetime.datetime(2023, 6, 3, 16, 14, 42, 444707), timestamp_n=datetime.datetime(1970, 1, 1, 1, 0, 21), n=21, item='1', bucket=4),
 Row(timestamp=datetime.datetime(2023, 6, 3, 16, 14, 43, 905381), timestamp_n=datetime.datetime(1970, 1, 1, 1, 0, 22), n=22, item='1', bucket=4),
 Row(timestamp=datetime.datetime(2023, 6, 3, 16, 14, 45, 703734), timestamp_n=datetime.datetime(1970, 1, 1, 1, 0, 23), n=23, item='0', bucket=4),
 Row(timestamp=datetime.datetime(2023, 6, 3, 16, 14, 49, 89299), timestamp_n=datetime.datetime(1970, 1, 1, 1, 0, 27), n=27, item='0', bucket=4),
 Row(timestamp=datetime.datetime(2023, 6, 3, 16, 14, 47, 73319), timestamp_n=datetime.datetime(1970, 1, 1, 1, 0, 24), n=24, item='0', bucket=4),
 Row(timestamp=datetime.datetime(2023, 6, 3, 16, 14, 48, 18699), timestamp_n=datetime.datetime(1970, 1, 1, 1, 0, 25), n=25, it

Run this cell to stop the streaming query.

In [139]:
# Stop the running streaming query held in `outputter`.
outputter.stop()

23/06/03 16:11:54 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@36c0d5c3] is aborting.
23/06/03 16:11:54 ERROR WriteToDataSourceV2Exec: Data source write support MicroBatchWrite[epoch: 4, writer: org.apache.spark.sql.execution.streaming.sources.MemoryStreamingWrite@36c0d5c3] aborted.
23/06/03 16:11:54 WARN Shell: Interrupted while joining on: Thread[Thread-33787,5,main]
java.lang.InterruptedException
	at java.base/java.lang.Object.wait(Native Method)
	at java.base/java.lang.Thread.join(Thread.java:1305)
	at java.base/java.lang.Thread.join(Thread.java:1379)
	at org.apache.hadoop.util.Shell.joinThread(Shell.java:1042)
	at org.apache.hadoop.util.Shell.runCommand(Shell.java:1002)
	at org.apache.hadoop.util.Shell.run(Shell.java:900)
	at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:1212)
	at org.apache.hadoop.util.Shell.execCommand(Shell.java:1306)
	at or

### Old

In [None]:
# Earlier, abandoned attempt based on a row-based window (this cell was never executed).
# The window covers all rows from the start of the partition up to the current row,
# ordered from the most recent timestamp to the oldest.
w = (Window
     .partitionBy(F.dayofyear(F.col('timestamp')))  # TODO: wrong, choose better partitioning (if at all)
     .orderBy(F.desc(F.col('timestamp')))           # descending order so that assignment of timestamp and cumulative sum start at the most recent entries
     .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# NOTE(review): `odf` is not defined anywhere in the visible notebook -- presumably an
# earlier name for the input DataFrame; confirm before re-running this cell.
# NOTE(review): 'n' is row_number() % N, so for N > 0 the filter `n < N` below never
# removes a row.
(odf
    .withWatermark('timestamp', '5 minutes')
    .withColumn('n', F.row_number().over(w) % N)
    .withColumn('bitsum', F.sum('item').over(w))
    .filter(F.col('n') < N)
    .withColumn('bucket', F.floor(F.log2(F.col('bitsum'))))
    .sort('timestamp')
    .tail(30)
)