# Sliding Window Processing

Import the required libraries and start a Spark session.

In [7]:
from pyspark.sql import SparkSession, Window
from pyspark.streaming import StreamingContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np

# Build (or reuse, via getOrCreate) the Spark session for this notebook,
# running on the local master with the app name "slidingwindow".
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.executor.memory", "4G")
    .appName("slidingwindow")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

The following code computes, step by step, the top 5 most correlated sensor pairs in DataFrame `df`, using the same methods and lines of code as **Task 1**:
- Compute the moving average & std for each timegap
- Compute the possible pairs & their covariance for each timegap
- Compute the pearson correlation coefficient of each pair for each timegap

In [8]:

import pyspark.sql.functions as F
from pyspark.sql.functions import col, rank
from pyspark.sql import Window

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StringType, IntegerType, StructType, IntegerType, FloatType
def compute_analytics(df) :
    """Print the five most correlated sensor pairs for every time gap.

    Steps performed on ``df``:

    1. Per sensor, a running sum of ``Count`` ordered by ``tgap`` and a
       running average (``cum_sum / tgap``).
    2. Per sensor, a running "stddev": the square root of the running sum of
       squared deviations from the average.
    3. A self-join on ``tgap`` to build every unordered sensor pair
       (``s1_Name < s2_Name``).
    4. Per pair, a running sum of deviation products ("covariance").
    5. ``pearsonCoeff = covariance / (s1_std * s2_std)``.
    6. Ranking of the pairs inside each ``tgap`` by descending coefficient;
       pairs with rank <= 5 are displayed with ``show()``.

    Args:
        df: Spark DataFrame providing at least the columns ``tgap``,
            ``Count`` and ``DeviceName``.

    Returns:
        None. The result is printed with ``show()``. Any exception raised
        while building or evaluating the query is caught and reported on
        stdout rather than propagated to the caller.
    """
    try:
        # Running window per sensor: all rows up to and including the current tgap.
        windowval = (Window.orderBy('tgap').partitionBy("DeviceName")
                .rangeBetween(Window.unboundedPreceding, 0))

        df = df.withColumn('cum_sum', F.sum('Count').over(windowval))

        # NOTE(review): dividing by tgap presumably assumes tgap equals the
        # number of samples seen so far for the sensor (1-based) -- confirm
        # against the producer.
        df_analytics = df.withColumn('avg', df["cum_sum"]/(df["tgap"]))

        # Single-row frame used to pick up the `avg` value of a row.
        current_row_window = Window.partitionBy("DeviceName").orderBy('tgap').rowsBetween(0, 0)
        avg_current_row = F.first(F.col("avg")).over(current_row_window)

        # Square root of the running sum of squared deviations. It is not
        # divided by a sample count; the "covariance" below is an undivided
        # sum as well, so that factor cancels in the Pearson ratio.
        df_analytics = df_analytics.withColumn(
            "stddev",
            F.sqrt(
                F.sum(
                    F.pow(df_analytics["Count"] - avg_current_row, 2)
                ).over(window=windowval)
            ),
        )

        # Pair every sensor with every other sensor observed at the same tgap.
        # Keeping only s1_Name < s2_Name drops self-pairs and mirrored duplicates.
        pairs_df = (
            df_analytics.alias("sensor1")
            .join(df_analytics.alias("sensor2"),
                  col("sensor1.tgap") == col("sensor2.tgap"))
            .select(
                col("sensor1.tgap"),
                col("sensor1.DeviceName").alias("s1_Name"),
                col("sensor1.Count").alias("s1_Count"),
                col("sensor1.avg").alias("s1_avg"),
                col("sensor1.stddev").alias("s1_std"),
                col("sensor2.DeviceName").alias("s2_Name"),
                col("sensor2.Count").alias("s2_Count"),
                col("sensor2.avg").alias("s2_avg"),
                col("sensor2.stddev").alias("s2_std"),
            )
            .where(col("s1_Name") < col("s2_Name"))
            .orderBy("tgap")
        )

        # Running sum of deviation products for each pair at each time gap.
        pair_row_window = Window.partitionBy("s1_Name", "s2_Name").orderBy('tgap').rowsBetween(0, 0)
        s1_avg_current_row = F.first(F.col("s1_avg")).over(pair_row_window)
        s2_avg_current_row = F.first(F.col("s2_avg")).over(pair_row_window)

        pair_running_window = (Window.partitionBy("s1_Name", "s2_Name").orderBy('tgap')
                               .rangeBetween(Window.unboundedPreceding, 0))

        pairs_covariance = pairs_df.withColumn(
            "covariance",
            F.sum(
                (F.col("s1_Count") - s1_avg_current_row) * (F.col("s2_Count") - s2_avg_current_row)
            ).over(pair_running_window),
        )

        pairs_correl = pairs_covariance.withColumn(
            "pearsonCoeff",
            col("covariance") / (col("s1_std") * col("s2_std")),
        )

        # Rank the pairs inside each time gap, highest coefficient first.
        rank_window = Window.partitionBy("tgap").orderBy(col("pearsonCoeff").desc())
        ranked_df = pairs_correl.withColumn("rank", rank().over(rank_window))

        # Keep the top-5 pairs per time gap, most recent time gap first.
        top_5_pairs = (
            ranked_df.filter(col("rank") <= 5)
            .orderBy(F.desc("tgap"), F.desc("pearsonCoeff"))
            .select("tgap", "s1_Name", "s2_Name", "pearsonCoeff")
        )

        top_5_pairs.show()
    except Exception as err:
        # Report the actual failure. The handler used to print an undefined
        # name, which raised NameError and masked the underlying error.
        print("compute_analytics failed:", repr(err))
                       

Specify the schema for the DataFrame (**structured data streaming**)

In [9]:
# Convert an RDD to a DataFrame with named, typed columns and compute the Pearson correlation
def process_rdd( rdd):
    """Turn one RDD of parsed records into a DataFrame and run the analytics.

    Empty RDDs are skipped. Each record is expected to hold four fields, in
    the order ``tgap, Count, avg_speed, DeviceName``.

    Args:
        rdd: RDD of 4-field records, as produced by splitting each incoming
            text line on commas.
    """
    if rdd.isEmpty():
        return

    df = rdd.toDF(schema=["tgap",  "Count","avg_speed", "DeviceName" ])

    # Fields obtained from str.split are strings. Without numeric types the
    # window specs in compute_analytics would order `tgap` lexicographically
    # ("10" before "9"), corrupting the running sums. `avg_speed` is not used
    # by the analytics and is left as received.
    df = (df.withColumn("tgap", F.col("tgap").cast("int"))
            .withColumn("Count", F.col("Count").cast("double")))

    compute_analytics(df)
     

This function creates a connection to the network socket opened by the producer notebook.

In [10]:

# You may change the batch interval, and the socket host/port if the producer uses different ones.
def getDStream(spark, batch_interval=5, host="localhost", port=9997, checkpoint_dir="checkpoint"):
    """Create a streaming context and a DStream reading text lines from a socket.

    Args:
        spark: SparkSession whose SparkContext backs the streaming context.
        batch_interval: Batch interval passed to ``StreamingContext``.
        host: Host name of the socket to read from.
        port: Port of the socket to read from.
        checkpoint_dir: Directory passed to ``ssc.checkpoint``.

    Returns:
        A list ``[sc, ssc, dstream]``: the Spark context, the Spark streaming
        context, and the socket text DStream.
    """
    # Get the Spark context and silence everything below ERROR level.
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")

    # Create the streaming context with the required batch interval.
    ssc = StreamingContext(sc, batch_interval)

    # Checkpointing is needed for stateful transforms.
    ssc.checkpoint(checkpoint_dir)

    # Create a DStream that represents streaming data from a network socket.
    # See https://spark.apache.org/docs/latest/streaming-programming-guide.html#a-quick-example
    dstream = ssc.socketTextStream(host, port)

    return [sc, ssc, dstream]

In [11]:
# Open the stream fed by the producer notebook (2-unit batch interval) and
# keep handles to the Spark context, the streaming context and the DStream.
sc, ssc, dstream = getDStream(spark=spark, batch_interval=2)


In [12]:
# Sliding-window parameters (in seconds)
W = 30      # window length
delta = 10  # slide interval

# Split every incoming text line on commas into a list of fields
parsed_lines = dstream.map(lambda record: record.split(","))

# Group the parsed records into windows of length W that advance by delta
windowed_dstream = parsed_lines.window(W, delta)

# Hand each windowed RDD to process_rdd
windowed_dstream.foreachRDD(process_rdd)



In [13]:
# Start the streaming context; the output below is printed by the
# foreachRDD callback registered on the windowed DStream.
ssc.start()

+----+-------+-------+------------------+
|tgap|s1_Name|s2_Name|      pearsonCoeff|
+----+-------+-------+------------------+
|  99|CB02411| COM205|0.9767055043570113|
|  99|CB02411| CEK049|0.9675249810012775|
|  99|CB02411|  CJM90|0.9647024820705544|
|  99| CB2105| COM205|0.9562230318359805|
|  99| CEK049|  CJM90|0.9544551444700105|
|  98|CB02411| COM205| 0.976694090139714|
|  98|CB02411| CEK049|0.9675091130410687|
|  98|CB02411|  CJM90|0.9646850889643144|
|  98| CB2105| COM205|0.9562028399563188|
|  98| CEK049|  CJM90|0.9544329959738785|
|  97|CB02411| COM205|0.9766824302519685|
|  97|CB02411| CEK049|0.9674929035962033|
|  97|CB02411|  CJM90|0.9646673214042067|
|  97| CB2105| COM205|0.9561822146733346|
|  97| CEK049|  CJM90|0.9544103709409724|
|  96|CB02411| COM205|0.9766705166761354|
|  96|CB02411| CEK049|0.9674763415235428|
|  96|CB02411|  CJM90|0.9646491671663954|
|  96| CB2105| COM205|0.9561611418817408|
|  96| CEK049|  CJM90|0.9543872538247541|
+----+-------+-------+------------------+

In [14]:
# Stop the streaming context only: the SparkContext is kept alive
# (stopSparkContext=False) and the stop is not graceful (stopGraceFully=False).
ssc.stop(stopSparkContext=False,stopGraceFully=False)

+----+-------+-------+------------------+
|tgap|s1_Name|s2_Name|      pearsonCoeff|
+----+-------+-------+------------------+
|1919| CEK049|  CJM90|0.8372409203024744|
|1919|  CEK18|  CJM90|0.8263385458620547|
|1919|CB02411|  CJM90|0.7607630002273769|
|1919|  CJM90| COM205|0.7522802962809342|
|1919| CB1143|  CJM90|0.7492986266952301|
|1918| CEK049|  CJM90|0.8372298193099982|
|1918|  CEK18|  CJM90|  0.82632811430383|
|1918|CB02411|  CJM90|0.7607485130414068|
|1918|  CJM90| COM205|0.7522655992700934|
|1918| CB1143|  CJM90|0.7492853761136815|
|1917| CEK049|  CJM90|0.8372187052169054|
|1917|  CEK18|  CJM90|0.8263176705982768|
|1917|CB02411|  CJM90|0.7607340089643567|
|1917|  CJM90| COM205|0.7522508851565483|
|1917| CB1143|  CJM90|0.7492721102766033|
|1916| CEK049|  CJM90|0.8372850421313249|
|1916|  CEK18|  CJM90|0.8263421502711474|
|1916|CB02411|  CJM90|0.7607562856300675|
|1916|  CJM90| COM205|0.7522723378696997|
|1916| CB1143|  CJM90|0.7492909093240889|
+----+-------+-------+------------------+