In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import os
import sys

import pandas as pd

from pyspark.sql import functions as F
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import IntegerType, NullType


from pywrangler.pyspark.wranglers import interval_identifier

In [39]:
# Location of the oscillation time-series CSV that is read with Spark below.
# The original value is an absolute path on the author's machine, so it can be
# overridden by exporting OSCILLATION_CSV before starting the kernel; when the
# variable is not set, the original path is used unchanged.
PATH = os.environ.get(
    "OSCILLATION_CSV",
    r"C:\Users\rasboldt\Desktop\software\pywrangler\notebooks\oscillation_timeseries.csv",
)

In [40]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name "Oscillation".
spark = (
    SparkSession.Builder()
    .appName("Oscillation")
    .getOrCreate()
)

In [41]:
# Read the semicolon-delimited file at PATH; the first line supplies the
# column names.
df_all = spark.read.csv(
    PATH,
    header=True,
    sep=";",
)

In [42]:
# Keep only the three columns used below and cast the timestamp column to an
# integer type in the same projection (column order is unchanged).
df = df_all.select(
    "machinenumber",
    F.col("timestamp").cast(IntegerType()).alias("timestamp"),
    "state",
)

### Show Data

In [43]:
# Print up to 32 rows without truncating cell contents.
df.show(n=32, truncate=False)

+-------------+---------+-----+
|machinenumber|timestamp|state|
+-------------+---------+-----+
|Bla          |1        |ende |
|Bla          |2        |noise|
|Bla          |3        |start|
|Bla          |4        |noise|
|Bla          |5        |noise|
|Bla          |6        |ende |
|Bla          |7        |start|
|Bla          |8        |start|
|Bla          |9        |noise|
|Bla          |10       |noise|
|Bla          |11       |start|
|Bla          |12       |ende |
|Bla          |13       |noise|
|Bla          |14       |noise|
|Bla          |15       |start|
|Bla          |16       |ende |
|Bla          |17       |start|
|Bla          |18       |start|
|Bla          |19       |noise|
|Bla          |20       |noise|
|Bla          |21       |ende |
|Bla          |22       |noise|
|Bla          |23       |noise|
|Bla          |24       |ende |
|Bla          |25       |start|
|Bla          |26       |ende |
|Bla          |27       |noise|
|Bla          |28       |ende |
|Bla    

### Last start, last end interval

In [44]:
# "Last start, last end" configuration (see the section heading): both
# *_use_first flags are switched off.
ls_le = (
    interval_identifier.VectorizedCumSum(
        groupby_columns=["machinenumber"],
        order_columns=["timestamp"],
        marker_column="state",
        marker_start="start",
        marker_end="ende",
        marker_start_use_first=False,
        marker_end_use_first=False,
    )
    .transform(df)
)

In [45]:
# Drop rows without a machine number, then print up to 50 rows untruncated.
ls_le.where(ls_le["machinenumber"].isNotNull()).show(n=50, truncate=False)

+-------------+---------+-----+----+
|machinenumber|timestamp|state|iids|
+-------------+---------+-----+----+
|Bla          |1        |ende |0   |
|Bla          |2        |noise|0   |
|Bla          |3        |start|19  |
|Bla          |4        |noise|19  |
|Bla          |5        |noise|19  |
|Bla          |6        |ende |19  |
|Bla          |7        |start|0   |
|Bla          |8        |start|0   |
|Bla          |9        |noise|0   |
|Bla          |10       |noise|0   |
|Bla          |11       |start|13  |
|Bla          |12       |ende |13  |
|Bla          |13       |noise|0   |
|Bla          |14       |noise|0   |
|Bla          |15       |start|9   |
|Bla          |16       |ende |9   |
|Bla          |17       |start|0   |
|Bla          |18       |start|6   |
|Bla          |19       |noise|6   |
|Bla          |20       |noise|6   |
|Bla          |21       |ende |6   |
|Bla          |22       |noise|6   |
|Bla          |23       |noise|6   |
|Bla          |24       |ende |6   |
|

### First start, first end interval

In [46]:
# "First start, first end" configuration (see the section heading): both
# *_use_first flags are switched on.
fs_fe = (
    interval_identifier.VectorizedCumSum(
        groupby_columns=["machinenumber"],
        order_columns=["timestamp"],
        marker_column="state",
        marker_start="start",
        marker_end="ende",
        marker_start_use_first=True,
        marker_end_use_first=True,
    )
    .transform(df)
)

# Drop rows without a machine number, then print up to 50 rows untruncated.
fs_fe.where(fs_fe["machinenumber"].isNotNull()).show(n=50, truncate=False)

+-------------+---------+-----+----+
|machinenumber|timestamp|state|iids|
+-------------+---------+-----+----+
|Bla          |1        |ende |0   |
|Bla          |2        |noise|0   |
|Bla          |3        |start|4   |
|Bla          |4        |noise|4   |
|Bla          |5        |noise|4   |
|Bla          |6        |ende |4   |
|Bla          |7        |start|6   |
|Bla          |8        |start|6   |
|Bla          |9        |noise|6   |
|Bla          |10       |noise|6   |
|Bla          |11       |start|6   |
|Bla          |12       |ende |6   |
|Bla          |13       |noise|0   |
|Bla          |14       |noise|0   |
|Bla          |15       |start|10  |
|Bla          |16       |ende |10  |
|Bla          |17       |start|12  |
|Bla          |18       |start|12  |
|Bla          |19       |noise|12  |
|Bla          |20       |noise|12  |
|Bla          |21       |ende |12  |
|Bla          |22       |noise|0   |
|Bla          |23       |noise|0   |
|Bla          |24       |ende |0   |
|

### First start, last end interval

In [47]:
# "First start, last end" configuration (see the section heading): only the
# start marker uses the *_use_first behaviour.
fs_le = (
    interval_identifier.VectorizedCumSum(
        groupby_columns=["machinenumber"],
        order_columns=["timestamp"],
        marker_column="state",
        marker_start="start",
        marker_end="ende",
        marker_start_use_first=True,
        marker_end_use_first=False,
    )
    .transform(df)
)

# Drop rows without a machine number, then print up to 50 rows untruncated.
fs_le.where(fs_le["machinenumber"].isNotNull()).show(n=50, truncate=False)

+-------------+---------+-----+----+
|machinenumber|timestamp|state|iids|
+-------------+---------+-----+----+
|Bla          |1        |ende |0   |
|Bla          |2        |noise|0   |
|Bla          |3        |start|1   |
|Bla          |4        |noise|1   |
|Bla          |5        |noise|1   |
|Bla          |6        |ende |1   |
|Bla          |7        |start|2   |
|Bla          |8        |start|2   |
|Bla          |9        |noise|2   |
|Bla          |10       |noise|2   |
|Bla          |11       |start|2   |
|Bla          |12       |ende |2   |
|Bla          |13       |noise|0   |
|Bla          |14       |noise|0   |
|Bla          |15       |start|3   |
|Bla          |16       |ende |3   |
|Bla          |17       |start|4   |
|Bla          |18       |start|4   |
|Bla          |19       |noise|4   |
|Bla          |20       |noise|4   |
|Bla          |21       |ende |4   |
|Bla          |22       |noise|4   |
|Bla          |23       |noise|4   |
|Bla          |24       |ende |4   |
|

### Last start, first end interval

In [48]:
# "Last start, first end" configuration (see the section heading): only the
# end marker uses the *_use_first behaviour.
ls_fe = (
    interval_identifier.VectorizedCumSum(
        groupby_columns=["machinenumber"],
        order_columns=["timestamp"],
        marker_column="state",
        marker_start="start",
        marker_end="ende",
        marker_start_use_first=False,
        marker_end_use_first=True,
    )
    .transform(df)
)

# Drop rows without a machine number, then print up to 50 rows untruncated.
ls_fe.where(ls_fe["machinenumber"].isNotNull()).show(n=50, truncate=False)

+-------------+---------+-----+----+
|machinenumber|timestamp|state|iids|
+-------------+---------+-----+----+
|Bla          |1        |ende |0   |
|Bla          |2        |noise|0   |
|Bla          |3        |start|1   |
|Bla          |4        |noise|1   |
|Bla          |5        |noise|1   |
|Bla          |6        |ende |1   |
|Bla          |7        |start|0   |
|Bla          |8        |start|0   |
|Bla          |9        |noise|0   |
|Bla          |10       |noise|0   |
|Bla          |11       |start|2   |
|Bla          |12       |ende |2   |
|Bla          |13       |noise|0   |
|Bla          |14       |noise|0   |
|Bla          |15       |start|3   |
|Bla          |16       |ende |3   |
|Bla          |17       |start|0   |
|Bla          |18       |start|4   |
|Bla          |19       |noise|4   |
|Bla          |20       |noise|4   |
|Bla          |21       |ende |4   |
|Bla          |22       |noise|0   |
|Bla          |23       |noise|0   |
|Bla          |24       |ende |0   |
|

### Other Data

In [4]:
# Toy rows [machinenumber, timestamp, state] for a single machine "bla" with
# timestamps 1..18; this variant begins with an "end" marker that has no
# preceding "start".
# NOTE(review): two later cells rebind `data`. The recorded DataFrame output
# further down shows "end" in row 1, i.e. it matches this variant, so a
# top-to-bottom run would display different rows.
data = [
    ["bla", timestamp, state]
    for timestamp, state in enumerate(
        [
            "end", "noise",
            "start", "noise", "noise", "noise", "end", "end",
            "start", "start", "start", "noise", "end",
            "start", "noise", "end", "end", "end",
        ],
        start=1,
    )
]

In [25]:
# Toy rows [machinenumber, timestamp, state] for a single machine "bla" with
# timestamps 1..18, where the state is an integer code instead of a string.
# NOTE(review): position by position the codes line up with the string
# dataset that starts with "end" (1 = start, 2 = end, 3 = noise) - presumably
# the same sequence in encoded form; confirm. This cell rebinds `data`.
data = [
    ["bla", timestamp, state_code]
    for timestamp, state_code in enumerate(
        [
            2, 3,
            1, 3, 3, 3, 2, 2,
            1, 1, 1, 3, 2,
            1, 3, 2, 2, 2,
        ],
        start=1,
    )
]

In [30]:
# Toy rows [machinenumber, timestamp, state] for a single machine "bla" with
# timestamps 1..18; this variant begins with a "start" marker.
# NOTE(review): this is the last of three cells that bind `data`, so it is the
# one in effect after a top-to-bottom run, while the recorded DataFrame output
# further down shows "end" in row 1 - confirm which variant is intended.
data = [
    ["bla", timestamp, state]
    for timestamp, state in enumerate(
        [
            "start", "noise",
            "start", "noise", "noise", "noise", "end", "end",
            "start", "start", "start", "noise", "end",
            "start", "noise", "end", "end", "end",
        ],
        start=1,
    )
]

In [5]:
# Wrap the toy rows in a pandas frame with named columns and hand it straight
# to Spark, so `df` is only ever bound to the resulting Spark DataFrame.
# Note that this replaces the CSV-based `df` defined earlier in the notebook.
df = spark.createDataFrame(
    pd.DataFrame(
        data=data,
        columns=["machinenumber", "timestamp", "state"],
    )
)
# Print up to 50 rows without truncating cell contents.
df.show(n=50, truncate=False)

+-------------+---------+-----+
|machinenumber|timestamp|state|
+-------------+---------+-----+
|bla          |1        |end  |
|bla          |2        |noise|
|bla          |3        |start|
|bla          |4        |noise|
|bla          |5        |noise|
|bla          |6        |noise|
|bla          |7        |end  |
|bla          |8        |end  |
|bla          |9        |start|
|bla          |10       |start|
|bla          |11       |start|
|bla          |12       |noise|
|bla          |13       |end  |
|bla          |14       |start|
|bla          |15       |noise|
|bla          |16       |end  |
|bla          |17       |end  |
|bla          |18       |end  |
+-------------+---------+-----+



In [12]:
# Interval identifier for the toy data: "start"/"end" markers in the "state"
# column, ordered by timestamp within each machine. No *_use_first flags are
# passed here, so the class defaults apply.
sequence = interval_identifier.VectorizedCumSum(
    groupby_columns=["machinenumber"],
    order_columns=["timestamp"],
    marker_column="state",
    marker_start="start",
    marker_end="end",
)

In [13]:
# NOTE(review): this cell calls `transform_last_start_last_end`, whereas the
# cells earlier in the notebook (which carry higher execution counts) select
# the variant through constructor flags and call `.transform` - confirm that
# this method still exists in the current pywrangler API.
df_lsle = sequence.transform_last_start_last_end(df)
# Print up to 50 rows without truncating cell contents.
df_lsle.show(n=50, truncate=False)

+-------------+---------+-----+----+
|machinenumber|timestamp|state|iids|
+-------------+---------+-----+----+
|bla          |1        |end  |0   |
|bla          |2        |noise|0   |
|bla          |3        |start|7   |
|bla          |4        |noise|7   |
|bla          |5        |noise|7   |
|bla          |6        |noise|7   |
|bla          |7        |end  |7   |
|bla          |8        |end  |7   |
|bla          |9        |start|0   |
|bla          |10       |start|0   |
|bla          |11       |start|3   |
|bla          |12       |noise|3   |
|bla          |13       |end  |3   |
|bla          |14       |start|1   |
|bla          |15       |noise|1   |
|bla          |16       |end  |1   |
|bla          |17       |end  |1   |
|bla          |18       |end  |1   |
+-------------+---------+-----+----+

