In [1]:
# This script will do all necessary preprocessing for daily data to be scored by AD models

# 1. Read one full day of data from the data lake for the chosen stations
# 2. Apply a deadband filter to the data
# 3. Create and output preprocessed dataset and related features
# 4. Create and output plateaus and related features
# 5. Create and output passages and related features


# Change log
# Date Initials Change
# 20181004 PL Created

In [2]:
# Example of the path to an AVRO file in the data lake:
# /rawdata/trackcircuits/lysaker/bn-maintenance40-eh/trackcircuitlysaker/3/2018/08/29/08/02/51.avro
# General pattern:
# /rawdata/trackcircuits/<station>/bn-maintenance40-eh/trackcircuit<station>/<partition>/<year>/<month>/<day>/<hour>/<minute>/<AVRO-FILE>

# Batch name of the trained models / norm matrix, read from the notebook widget
folder = dbutils.widgets.get("folder")
#folder = 'sep2018_nov2018'

# Date to process, read from the notebook widgets.
# Day and month are left-padded with zeros to two characters to fit the format of the data lake paths.
day = dbutils.widgets.get("day").zfill(2)
month = dbutils.widgets.get("month").zfill(2)
year = dbutils.widgets.get("year")
#day = '08'
#month = '01'
#year = '2019'

# Stations that should be included
stations = [
    "lysaker",
    "skoyen",
    "nationaltheatret",
    "sandvika",
    "asker",
]

In [3]:
%run "/Smart Vedlikehold/Utvikling/Sporfelt/functions"

In [4]:
# Read the raw data for the chosen day and stations (the code for filtering/formatting lives in the function).
# The second return value is used at the end of the notebook to write the error-code counts.
data, error_code = read_unpacked_rawdata(
    year = year,
    month = month,
    day = day,
    stations = stations)

# Apply the deadband filter per station, track circuit and current, ordered on the original timestamp.
# All changes in values are permitted to pass. The filtered frame is cached.
data = perform_deadband(
    data,
    key = ["Station", "TrackCircuitId", "Current"],
    orderby = "orgTimestamp",
    value_col = "Measurement").cache()

In [5]:
# Syncing time and removing duplicates
#
# After this cell there is exactly one row per (syncTimestamp, Station, TrackCircuitId, Current),
# and syncTimestamp is renamed to Timestamp.
#
# Selection rules:
# 1. The timestamp is rounded to the nearest 250 ms (syncTimestamp). According to the original
#    notes this rounding is done before the data enters this cell.
# 2. When several rows share a rounded timestamp, the row whose original timestamp is closest
#    to the rounded timestamp is kept.
# 3. When several rows are equally close, the row with the smallest measurement is kept.
#    Rows with identical measurements collapse into one row through the same ranking.
# 4. Any remaining tie is broken on the original timestamp so that reruns give the same result.
#
# The rules are enforced with an explicit row_number() ranking. Sorting the frame and then calling
# dropDuplicates() is not sufficient: dropDuplicates() gives no guarantee about which of the
# duplicate rows is kept, so the sort order is not necessarily respected.

dedup_key = ["syncTimestamp", "Station", "TrackCircuitId", "Current"]

# Absolute distance between the original and the rounded timestamp
data = data.withColumn("diffTimestamps", F.abs(F.col("orgTimestamp") - F.col("syncTimestamp")))

# Rank the candidates within each key: closest first, then smallest measurement, then earliest original timestamp
rank_window = Window \
    .partitionBy(*dedup_key) \
    .orderBy(F.col("diffTimestamps").asc(), F.col("Measurement").asc(), F.col("orgTimestamp").asc())

data = data.withColumn("dedupRank", F.row_number().over(rank_window))
data = data.filter(F.col("dedupRank") == 1)

# Remove the helper columns and expose the rounded timestamp as the timestamp used downstream
data = data.drop("dedupRank")
data = data.drop("diffTimestamps")
data = data.withColumnRenamed('syncTimestamp', 'Timestamp')


In [6]:
# Pivot RC and FC into one row per timestamp and forward fill the values

# One row per (Date, Station, TrackCircuitId, Timestamp) with the FC and RC measurements as separate columns
df_pivot = pivot_data(
    df = data,
    group_key = ["Date", "Station", "TrackCircuitId", "Timestamp"],
    pivot_key = "Current",
    pivot_values = ["FC", "RC"],
    pivot_new_columns = ["Measurement_FC", "Measurement_RC"],
    value_col = "Measurement")

# Forward fill the NULL values per track circuit, ordered on time
df_pivot = forward_fill(
    df = df_pivot,
    key = ["Station", "TrackCircuitId"],
    orderby = "Timestamp",
    columns_to_ffill = ["Measurement_FC", "Measurement_RC"])

# Add the timestamp of the next row of the same track circuit as "End"
df_pivot = get_info_from_next_row(
    df = df_pivot,
    key = ["Station", "TrackCircuitId"],
    orderby = "Timestamp",
    value_col = "Timestamp",
    new_column_name = "End")

frequence = 4 # Data is synced to 4 Hz, so 1 second contains 4 measurements

df_pivot = (
    df_pivot
    # Time between this row and the next row
    .withColumn("Deltatime", F.col("End") - F.col("Timestamp"))
    # Deltatime is given in milliseconds
    .withColumn("DeltatimeSeconds", F.col("Deltatime") / 1000)
    # Weights for use in the later calculation of averages and standard deviations
    .withColumn("DeltaWeights", F.col("DeltatimeSeconds") * frequence)
)


In [7]:
# New version, 20180918 (built on top of Mattis' StreamAnalytics code) to detect whether a track circuit is occupied or free
## This could be replaced by information collected from the Stream Analytics calculations,
## however it is not clear today where to get these calculations

# Code used in development to start fresh
#df_pivot = df_pivot.select("Station","TrackCircuitId","Timestamp","Measurement_FC","Measurement_RC","End","Deltatime","DeltatimeSeconds")

# The states are defined in a class so that all scripts use the same values
tc_states = TrackCircuitState()
TC_OCCUPIED_STATE = tc_states.TC_OCCUPIED_STATE
TC_FREE_STATE = tc_states.TC_FREE_STATE
TC_UNKNOWN_STATE = tc_states.TC_UNKNOWN_STATE
TC_ARRIVING_STATE = tc_states.TC_ARRIVING_STATE
TC_DEPARTING_STATE = tc_states.TC_DEPARTING_STATE
TC_UNCERTAIN_STATE = tc_states.TC_UNCERTAIN_STATE

# First round of state setting
df_pivot = TrackCircuitState().set_state_first(df_pivot, folder)

# Second round: split the Unknown state into Departing, Arriving and Unknown,
# based on the track circuit behaviour before and after an unknown period
df_pivot = TrackCircuitState().set_state_final(
    df_pivot,
    folder,
    calculate_first_state = 0)

# NOTE: the state could be set with a single call by passing calculate_first_state = 1

In [8]:
# Normalize the measurements with the norm matrix that belongs to the chosen batch.
# The norm matrix is related to the data used in the training of a model (i.e. a batch name).
# There will therefore exist multiple normalization matrices (and models), separated by folders.
# NOTE(review): normalize_data is defined in the functions notebook; it presumably adds the
# *_norm columns that are written out at the end of this notebook - confirm there.

# Code used in development to start fresh:
#df_pivot = df_pivot.select("Station","TrackCircuitId","Timestamp","Measurement_FC","Measurement_RC","End","Deltatime","DeltatimeSeconds","DeltaWeights","State_RC","State_FC","State")
df_pivot = normalize_data(df_pivot, folder)


In [9]:
# Creating the plateaus. A plateauId is generated by comparing the previous state
# with the current state for each track circuit.

# State of the previous row of the same track circuit
df_pivot = get_info_from_previous_row(
    df_pivot,
    key = ["Station", "TrackCircuitId"],
    orderby = "Timestamp",
    value_col = "State",
    new_column_name = "previousState")

# Compare State with previousState and store the outcome in plateauId
df_pivot = get_changed_values_only(
    df_pivot,
    column_one = "State",
    column_two = "previousState",
    new_column_name = "plateauId",
    generate_sk = 1)

# Forward fill plateauId per track circuit, ordered on time
df_pivot = forward_fill(
    df_pivot,
    key = ["Station", "TrackCircuitId"],
    orderby = "Timestamp",
    columns_to_ffill = ['plateauId'])

# Calculation of the values that describe the plateaus
# See the functions script for the code
df_plateau = calculate_plateau_info(df_pivot)


# Code to start on a clean snippet during development (commented out in production)
#df_pivot = df_pivot.select("Date","TrackCircuitId","State","Timestamp","Measurement_FC","Measurement_RC","End","Deltatime","DeltatimeSeconds","DeltaWeights","State_RC","State_FC","wAvgFC","wAvgRC","wStdFC","wStdRC","count","Measurement_FC_norm","Measurement_RC_norm","Station","previousState","plateauId")


In [10]:
# Calculation of features on a daily level based on the measurements,
# e.g. min, max, std, and so on.
# See the functions script for the code (calculate_day_meas).
# The result is written to the 'feature_measurement' output at the end of the notebook.

df_day_meas = calculate_day_meas(df_pivot)

In [11]:
# Calculating features related to plateaus
#
# Input:  df_plateau, one row per plateau.
# Output: df_day_plat, one row per (Station, TrackCircuitId) with the plateau features of the day.
#
# The state constants (TC_OCCUPIED_STATE, TC_FREE_STATE, ...) are defined from TrackCircuitState
# earlier in the notebook and are deliberately not re-declared with hard-coded values here.
# The state names below are literals because they become the prefixes of the pivoted column names.

key = ["Station", "TrackCircuitId"]

# Every state in this list gets its own set of pivoted columns named "<State>_<aggregate alias>"
plateau_states = ["Free", "Occupied", "Unknown", "Arriving", "Departing"]

df_day_plat = (
    df_plateau
    # Spread of the measurements within each plateau
    .withColumn("diff_within_plateau_rc", F.col("Max_RC") - F.col("Min_RC"))
    .withColumn("diff_within_plateau_fc", F.col("Max_FC") - F.col("Min_FC"))
    .groupBy(key)
    .pivot("State", plateau_states)
    .agg(F.min("Std_RC").alias("min_std_rc"),
         F.min("Std_FC").alias("min_std_fc"),
         F.max("Std_RC").alias("max_std_rc"),
         F.max("Std_FC").alias("max_std_fc"),
         F.stddev("Std_RC").alias("std_std_rc"),
         F.stddev("Std_FC").alias("std_std_fc"),
         F.max("Avg_RC").alias("max_avg_rc"),
         F.min("Avg_RC").alias("min_avg_rc"),
         F.max("Avg_FC").alias("max_avg_fc"),
         F.min("Avg_FC").alias("min_avg_fc"),
         F.max("diff_within_plateau_rc").alias("max_diff_max_min_rc"),
         F.min("diff_within_plateau_rc").alias("min_diff_max_min_rc"),
         F.max("diff_within_plateau_fc").alias("max_diff_max_min_fc"),
         F.min("diff_within_plateau_fc").alias("min_diff_max_min_fc"),
         F.avg("Length").alias("avg_length"),
         F.max("Length").alias("max_length"))
)

# Difference between the highest and the lowest plateau average, for Free and Occupied and for RC and FC
for state_name in ["Free", "Occupied"]:
    for current_name in ["rc", "fc"]:
        df_day_plat = df_day_plat.withColumn(
            "{}_diff_maxmin_avg_{}".format(state_name, current_name),
            F.col("{}_max_avg_{}".format(state_name, current_name))
            - F.col("{}_min_avg_{}".format(state_name, current_name)))

# (pivoted column, output feature name), listed in the order of the output columns.
# NOTE(review): "MIMDIFF" in two of the FC names looks like a typo for "MINDIFF". The names are kept
# unchanged because consumers of this output may depend on them - fix only together with those consumers.
measurement_features = [
    # Features related to STD within a plateau
    ("Free_min_std_rc", "RC_F_MIN_STD_PLATEAU"),
    ("Free_max_std_rc", "RC_F_MAX_STD_PLATEAU"),
    ("Free_std_std_rc", "RC_F_STD_STD_PLATEAU"),
    ("Free_min_std_fc", "FC_F_MIN_STD_PLATEAU"),
    ("Free_max_std_fc", "FC_F_MAX_STD_PLATEAU"),
    ("Free_std_std_fc", "FC_F_STD_STD_PLATEAU"),
    ("Occupied_min_std_rc", "RC_O_MIN_STD_PLATEAU"),
    ("Occupied_max_std_rc", "RC_O_MAX_STD_PLATEAU"),
    ("Occupied_std_std_rc", "RC_O_STD_STD_PLATEAU"),
    ("Occupied_min_std_fc", "FC_O_MIN_STD_PLATEAU"),
    ("Occupied_max_std_fc", "FC_O_MAX_STD_PLATEAU"),
    ("Occupied_std_std_fc", "FC_O_STD_STD_PLATEAU"),
    # Features related to differences between plateaus in avg
    ("Free_diff_maxmin_avg_rc", "RC_F_DIFF_MAXAVG_MINAVG_PLATEAU"),
    ("Free_diff_maxmin_avg_fc", "FC_F_DIFF_MAXAVG_MINAVG_PLATEAU"),
    ("Occupied_diff_maxmin_avg_rc", "RC_O_DIFF_MAXAVG_MINAVG_PLATEAU"),
    ("Occupied_diff_maxmin_avg_fc", "FC_O_DIFF_MAXAVG_MINAVG_PLATEAU"),
    # Features related to differences within a plateau
    ("Free_max_diff_max_min_rc", "RC_F_MAXDIFF_MAX_MIN_PLATEAU"),
    ("Free_max_diff_max_min_fc", "FC_F_MAXDIFF_MAX_MIN_PLATEAU"),
    ("Free_min_diff_max_min_rc", "RC_F_MINDIFF_MAX_MIN_PLATEAU"),
    ("Free_min_diff_max_min_fc", "FC_F_MIMDIFF_MAX_MIN_PLATEAU"),
    ("Occupied_max_diff_max_min_rc", "RC_O_MAXDIFF_MAX_MIN_PLATEAU"),
    ("Occupied_max_diff_max_min_fc", "FC_O_MAXDIFF_MAX_MIN_PLATEAU"),
    ("Occupied_min_diff_max_min_rc", "RC_O_MINDIFF_MAX_MIN_PLATEAU"),
    ("Occupied_min_diff_max_min_fc", "FC_O_MIMDIFF_MAX_MIN_PLATEAU"),
]

# Features related to the length of a plateau. The output name is the upper-cased pivoted name,
# e.g. "Arriving_avg_length" -> "ARRIVING_AVG_LENGTH".
length_columns = [
    "Arriving_avg_length", "Departing_avg_length", "Unknown_avg_length",
    "Free_avg_length", "Occupied_avg_length",
    "Arriving_max_length", "Departing_max_length", "Unknown_max_length",
    "Free_max_length", "Occupied_max_length",
]
length_features = [(column_name, column_name.upper()) for column_name in length_columns]

# Select and rename in one projection instead of one withColumnRenamed call per column
output_columns = [F.col(key_column) for key_column in key] \
    + [F.col(source_name).alias(feature_name)
       for source_name, feature_name in measurement_features + length_features]
df_day_plat = df_day_plat.select(*output_columns)

# A state that did not occur for a track circuit has no plateaus; its length features are set to 0
df_day_plat = df_day_plat.fillna(0, subset = [feature_name for _, feature_name in length_features])


In [12]:
# Calculation of passages
#
# Only the 'Free' and 'Occupied' plateaus are considered. Within those, a passage is an 'Occupied'
# plateau whose previous and next plateau (per track circuit, ordered on "Start") are both 'Free'.

free_or_occupied = df_plateau.filter("State in ('Free','Occupied')")

passage_window = Window.partitionBy("Station", "TrackCircuitId").orderBy("Start")

# State and average measurements of the plateau before and after each plateau
with_neighbours = (
    free_or_occupied
    .withColumn("previousState", F.lag("State", 1).over(passage_window))
    .withColumn("nextState", F.lead("State", 1).over(passage_window))
    .withColumn("previousAvg_RC", F.lag("Avg_RC", 1).over(passage_window))
    .withColumn("previousAvg_FC", F.lag("Avg_FC", 1).over(passage_window))
    .withColumn("nextAvg_RC", F.lead("Avg_RC", 1).over(passage_window))
    .withColumn("nextAvg_FC", F.lead("Avg_FC", 1).over(passage_window))
)

passages = with_neighbours.filter("State = 'Occupied' and nextState = 'Free' and previousState ='Free'")

# One row per passage with the differences in average level:
#   BEF_AFT  = |before - after| (absolute value)
#   BEF_PASS = before - passage
#   PASS_AFT = passage - after
df_passage = passages.select(
    "Station", "TrackCircuitId",
    F.abs(F.col("previousAvg_RC") - F.col("nextAvg_RC")).alias("RC_PASS_AVGDIFF_BEF_AFT"),
    F.abs(F.col("previousAvg_FC") - F.col("nextAvg_FC")).alias("FC_PASS_AVGDIFF_BEF_AFT"),
    (F.col("previousAvg_RC") - F.col("Avg_RC")).alias("RC_PASS_AVGDIFF_BEF_PASS"),
    (F.col("previousAvg_FC") - F.col("Avg_FC")).alias("FC_PASS_AVGDIFF_BEF_PASS"),
    (F.col("Avg_RC") - F.col("nextAvg_RC")).alias("RC_PASS_AVGDIFF_PASS_AFT"),
    (F.col("Avg_FC") - F.col("nextAvg_FC")).alias("FC_PASS_AVGDIFF_PASS_AFT"),
)

# Daily features per track circuit: maximum, minimum and standard deviation of every passage feature.
# Output names replace "AVGDIFF" with "<STAT>DIFF_AVG",
# e.g. "RC_PASS_AVGDIFF_BEF_AFT" -> "RC_PASS_MAXDIFF_AVG_BEF_AFT".
passage_features = [
    "RC_PASS_AVGDIFF_BEF_AFT", "FC_PASS_AVGDIFF_BEF_AFT",
    "RC_PASS_AVGDIFF_BEF_PASS", "FC_PASS_AVGDIFF_BEF_PASS",
    "RC_PASS_AVGDIFF_PASS_AFT", "FC_PASS_AVGDIFF_PASS_AFT",
]
passage_statistics = [("MAX", F.max), ("MIN", F.min), ("STD", F.stddev)]

daily_aggregates = [
    statistic(feature).alias(feature.replace("AVGDIFF", label + "DIFF_AVG"))
    for label, statistic in passage_statistics
    for feature in passage_features
]

df_day_pass = df_passage.groupBy("Station", "TrackCircuitId").agg(*daily_aggregates)



In [13]:
# Writing data to files in order to maintain history and make it easier to backtrack

# Columns of the preprocessed data that are written out
cols = ['Station','TrackCircuitId','State','State_RC','State_FC','Timestamp','End','Deltatime','DeltatimeSeconds','DeltaWeights','Measurement_FC','Measurement_RC','Measurement_FC_norm','Measurement_RC_norm','wAvgFC','wAvgRC','wStdFC','wStdRC','count']

# All outputs of one run share the same folder: <year>/<month>/<day>/<batch name>
daily_root = 'mnt/root/ml/trackcircuits/data/daily/{}/{}/{}/{}'.format(year, month, day, folder)
avro_format = "com.databricks.spark.avro"


def save_avro_per_station(df, dataset_name):
    """Write df as AVRO files, partitioned by Station, to <daily_root>/<dataset_name>."""
    target_path = '{}/{}'.format(daily_root, dataset_name)
    df.write.partitionBy("Station").format(avro_format).save(target_path)


# Preprocessed data
save_avro_per_station(df_pivot.select(cols), 'preprocessed')

# Plateau data
save_avro_per_station(df_plateau, 'plateau')

# Passages data (coalesced to a single partition before writing)
save_avro_per_station(df_passage.coalesce(1), 'passage')

# Features from measurement data
save_avro_per_station(df_day_meas.coalesce(1), 'feature_measurement')

# Features from plateau data
save_avro_per_station(df_day_plat.coalesce(1), 'feature_plateau')

# Features from passages data
save_avro_per_station(df_day_pass.coalesce(1), 'feature_passage')

# Number of error codes in the data set, per station, written as CSV with a header row
error_counts = error_code.groupBy("Station").count().coalesce(1)
error_counts.write.format("csv").option("Header", True).save('{}/{}'.format(daily_root, 'error_codes'))