# Exercise 2

## Imports

In [1]:
import pyspark
from pyspark.sql import SparkSession, Window, Row
import pyspark.sql.types as T
import pyspark.sql.functions as F

## Initialize Spark

In [2]:
# Obtain the notebook's Spark session (getOrCreate reuses an existing one),
# registered under the application name 'StructuredStreaming'.
spark: SparkSession = SparkSession.builder.appName('StructuredStreaming').getOrCreate()

23/05/31 20:37:16 WARN Utils: Your hostname, martinho-MS-7B86 resolves to a loopback address: 127.0.1.1; using 192.168.1.67 instead (on interface enp34s0)
23/05/31 20:37:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/31 20:37:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Prepare the data

In [3]:
# Port on localhost that the socket streaming source reads from.
PORT = 9999

In [4]:
# Streaming DataFrame built from the text socket. Each incoming line is
# treated as a two-field CSV record: it is split once on the first comma, the
# left part is parsed as a timestamp and the remainder is kept as `item`.
items = (
    spark.readStream
    .format('socket')
    .option('host', 'localhost')
    .option('port', PORT)
    .load()
    .select(F.split(F.col('value'), ',', limit=2).alias('split_cols'))
    .select(
        F.to_timestamp(F.col('split_cols').getItem(0)).alias('timestamp'),
        F.col('split_cols').getItem(1).alias('item'),
    )
)

23/05/31 20:37:18 WARN TextSocketSourceProvider: The socket source should not be used for production applications! It does not support recovery.


In [None]:
# Sanity check: `items` must be a streaming DataFrame, not a static one.
assert items.isStreaming

## DGIM

In [5]:
# NOTE(review): in this notebook N is only read by the cell under the "Old"
# heading; the DGIM accumulator never expires buckets, so it does not enforce
# this limit — confirm whether that is intended.
N = 1000    # maximum number of bits to consider

In [6]:
from typing import List
from pyspark.accumulators import AccumulatorParam
from itertools import groupby

class DGIMBuckets(AccumulatorParam):
    """Accumulator parameter that maintains DGIM buckets of observed 1-bits.

    The accumulated value is a list of ``(size, end_timestamp)`` tuples kept
    sorted by ``end_timestamp`` (oldest first). ``size`` is the number of
    1-bits summarised by the bucket and ``end_timestamp`` is the timestamp of
    the most recent of those bits. At most two buckets of any size are kept;
    a third one causes the two oldest to be merged into a single bucket of
    twice the size, which may cascade to larger sizes.

    NOTE(review): buckets are never dropped, so the summary covers the whole
    stream rather than a bounded window — confirm whether expiry is needed.
    """

    def zero(self, value):
        """Return the neutral element: an empty bucket list."""
        return []

    def addInPlace(self, buckets, new_bit_timestamp):
        """Record a new 1-bit (or merge another bucket list) and re-normalise.

        Args:
            buckets: the current bucket list; it is updated in place.
            new_bit_timestamp: either the timestamp of a newly seen 1-bit, or
                a list of ``(size, end_timestamp)`` buckets. The list form is
                needed because Spark also calls ``addInPlace`` to combine two
                accumulator values (e.g. a task's partial result with the
                driver's value).

        Returns:
            The same ``buckets`` list object, sorted by end timestamp, with at
            most two buckets per size.
        """
        if isinstance(new_bit_timestamp, list):
            incoming = list(new_bit_timestamp)
        else:
            incoming = [(1, new_bit_timestamp)]

        # Index the end timestamps by bucket size. Timestamps are only ever
        # compared with each other (never negated), so datetime values work.
        ends_by_size = {}
        for size, end_timestamp in [*buckets, *incoming]:
            ends_by_size.setdefault(size, []).append(end_timestamp)

        normalised = []
        sizes = sorted(ends_by_size)
        position = 0
        # Walk the sizes from smallest to largest so that merged buckets are
        # carried into the group of exactly twice the size before that group
        # is examined.
        while position < len(sizes):
            size = sizes[position]
            position += 1
            ends = sorted(ends_by_size[size])

            while len(ends) > 2:
                # Merge the two oldest buckets; the merged bucket ends where
                # the more recent of the two ended.
                merged_end = ends[1]
                del ends[:2]
                doubled = 2 * size
                if doubled not in ends_by_size:
                    ends_by_size[doubled] = []
                    # `doubled` is larger than every size handled so far, so
                    # re-sorting only affects the part still to be visited.
                    sizes.append(doubled)
                    sizes.sort()
                ends_by_size[doubled].append(merged_end)

            normalised.extend((size, end) for end in ends)

        normalised.sort(key=lambda bucket: bucket[1])
        buckets[:] = normalised
        return buckets

In [12]:
# Shared accumulator holding the DGIM bucket list; it starts out empty and is
# combined using the DGIMBuckets accumulator parameter.
dgim_buckets = spark.sparkContext.accumulator([], DGIMBuckets())

In [11]:
def dgim_process(row: Row):
    """Feed one streamed row into the DGIM accumulator.

    Only rows whose ``item`` equals the string ``'1'`` are recorded; the row's
    ``timestamp`` is added to the ``dgim_buckets`` accumulator.

    Args:
        row: a row exposing ``timestamp`` and ``item`` fields.
    """
    timestamp = row['timestamp']
    bit = row['item']
    # Skip rows without a timestamp: presumably these are lines whose
    # timestamp could not be parsed, and a null value cannot be ordered
    # against real timestamps inside the bucket list.
    if bit == '1' and timestamp is not None:
        print('Updated with row', row)
        # Call .add() instead of using `+=`: an augmented assignment would
        # make `dgim_buckets` a local name here and raise UnboundLocalError.
        dgim_buckets.add(timestamp)

In [14]:
# Start the streaming query: once per second the newly arrived rows are passed
# one by one to dgim_process. No .format(...) is configured because .foreach()
# installs its own sink and would override it, so this query does not create
# an in-memory table.
outputter = (items
    .writeStream
    .outputMode('append')
    .trigger(processingTime='1 second')
    .foreach(dgim_process)
    .start()
)

23/05/31 20:44:11 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-4db62f12-2c4f-475b-9bd2-8e48df614045. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
23/05/31 20:44:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


[Stage 4:>                                                        (0 + 12) / 12]

In [None]:
# Load every row of the `outputterMem` table as a DataFrame.
# NOTE(review): this needs a streaming query that writes to the memory sink
# under the name 'outputterMem'; the foreach-based query in this notebook does
# not create such a table — confirm before running this cell.
odf = spark.table('outputterMem')

Run this to quit the stream.

In [None]:
# Stop the streaming query held in `outputter`.
outputter.stop()

### Old approach (window functions)

In [None]:
# Legacy attempt at deriving buckets with window functions over `odf`.
# The frame runs from the first row of each partition up to the current row,
# so row_number() and sum() below are running values along descending
# timestamps.
w = (Window
     .partitionBy(F.dayofyear(F.col('timestamp')))  # TODO: wrong, choose better partitioning (if at all)
     .orderBy(F.desc(F.col('timestamp')))           # descending order so that row numbering and the cumulative sum start at the most recent entries
     .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# NOTE(review): `row_number() % N` is always smaller than N, so the filter on
# `n` keeps every row — presumably the intent was to keep only the N most
# recent rows; confirm.
# `bucket` is floor(log2(bitsum)), i.e. the exponent of the largest power of
# two not exceeding the running sum of `item`.
# .tail(30) returns the last 30 rows after sorting by timestamp.
(odf
    .withWatermark('timestamp', '5 minutes')
    .withColumn('n', F.row_number().over(w) % N)
    .withColumn('bitsum', F.sum('item').over(w))
    .filter(F.col('n') < N)
    .withColumn('bucket', F.floor(F.log2(F.col('bitsum'))))
    .sort('timestamp')
    .tail(30)
)