In [1]:
# Standard and third-party libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()   
import configparser
import os
from pathlib import Path
import argparse
import logging
import datetime
import inspect
from sklearn.utils import resample
import re
from datetime import datetime
from datetime import timedelta
import plotly.express as px
import plotly.graph_objs as go
import math
import pickle

# Internal Packages
from analyse_df import analyse_df
from rename_columns import rename_columns
import plot_settings

In [2]:
# Get the current working directory; the other paths are derived from it
current_dir = os.getcwd()
# Input files are read from the 'data' sub-folder of that directory
data_path = os.path.join(current_dir, 'data')

In [3]:
# Folder that receives the files written by this notebook
export_path = os.path.join(current_dir, 'export')

# Create the folder (and any missing parents). exist_ok=True turns the call into
# a no-op when the folder is already there, so no separate existence check is needed.
os.makedirs(export_path, exist_ok=True)

In [4]:
# Import data: read the CSV from the data folder, using its first column as the index
file_name = 'elba_dataset_pp_1.csv'
data_file_path = os.path.join(data_path, file_name)

# NOTE(review): the saved output of this cell echoes the read_csv line, which looks
# like a pandas warning raised here (possibly a mixed-dtype DtypeWarning) -- confirm
df = pd.read_csv(data_file_path, index_col=0)

  df = pd.read_csv(data_file_path, index_col=0)


In [5]:
# One index entry is written without fractional seconds, unlike the rest;
# swap it for a string in the common format
old_index_string = '2022-07-06 19:04:02+00:00'
new_index_string = '2022-07-06 19:04:02.014000+00:00'

# Index.where keeps the entries for which the condition holds and substitutes the others
keep_entry = df.index != old_index_string
df.index = df.index.where(keep_entry, new_index_string)

In [6]:
# Convert the index from strings to datetime values; the later resampling and
# truncation steps operate on this time index
df.index = pd.to_datetime(df.index)

In [7]:
# Number of rows per sensor node in the loaded data
df['Sensor_ID'].value_counts()

Sensor_ID
sensornode0010    33710
sensornode0013    33709
sensornode0011    33706
sensornode0014    33705
sensornode0008    33676
sensornode0009    33675
sensornode0012    33667
sensornode0015    33654
sensornode0016    33654
Name: count, dtype: int64

In [8]:
# First row of each sensor node, to compare their first timestamps
df.groupby('Sensor_ID').head(1)

Unnamed: 0_level_0,Sensor_ID,CO2_Room,CO_Room,H2_Room,Humidity_Room,Motion_Room,PM05_Room,PM100_Room,PM10_Room,PM25_Room,...,PM_Total_Room,Temperature_Room,UV_Room,VOC_Room,VOC_Room_RAW,scenario_label,experiment_number,progress_label,anomaly_label,ternary_label
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-07-04 14:10:54.259355+00:00,sensornode0009,1150.0,-0.43,0.13,51.5,,15.0,0.0,2.0,0.0,...,17.0,26.3,0.0,302.0,0.5,Background,,Unknown,Normal,Background
2022-07-04 14:10:54.401380+00:00,sensornode0015,1300.0,-0.08,0.12,52.85,,7.0,0.0,0.0,0.0,...,8.0,25.7,0.0,266.0,0.5,Background,,Unknown,Normal,Background
2022-07-04 14:10:55.401182+00:00,sensornode0011,412.0,-0.08,0.22,50.6,,16.0,0.0,2.0,0.0,...,19.0,26.5,0.0,264.2,0.5,Background,,Unknown,Normal,Background
2022-07-04 14:10:55.899093+00:00,sensornode0012,1100.0,-0.16,0.13,52.66,,13.0,0.0,1.0,0.0,...,15.0,25.9,0.0,284.0,0.6,Background,,Unknown,Normal,Background
2022-07-04 14:10:59.346954+00:00,sensornode0016,1392.0,0.18,0.09,52.53,,10.0,0.0,1.0,0.0,...,11.0,26.0,0.0,265.0,0.5,Background,,Unknown,Normal,Background
2022-07-04 14:11:02.112854+00:00,sensornode0014,1168.0,0.08,0.04,54.25,,22.0,0.0,3.0,0.0,...,25.0,25.1,0.0,313.6,0.5,Background,,Unknown,Normal,Background
2022-07-04 14:11:02.267016+00:00,sensornode0013,1199.0,-0.03,0.15,54.24,,23.0,0.0,3.0,0.0,...,26.0,25.2,0.0,289.6,0.5,Background,,Unknown,Normal,Background
2022-07-04 14:11:03.205235+00:00,sensornode0008,1166.0,-0.25,0.0,52.64,1.0,18.0,0.0,2.0,0.0,...,20.0,26.0,0.0,253.6,0.5,Background,,Unknown,Normal,Background
2022-07-04 14:11:04.092357+00:00,sensornode0010,1263.0,0.15,0.1,52.29,,13.0,0.0,1.0,0.0,...,15.0,26.2,0.0,302.6,0.6,Background,,Unknown,Normal,Background


### Resample per sensornode

In [9]:
# Common start: the earliest "first timestamp" found among all sensor nodes
first_stamp_per_sensor = df.groupby('Sensor_ID').apply(lambda sensor_rows: sensor_rows.index.min())
min_start_time = first_stamp_per_sensor.min()

# Define a function to adjust each group's datetime index
def adjust_start_time(group, start_time=None):
    """Return a copy of ``group`` whose first index entry is replaced by a common start time.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single sensor node, as handed over by ``groupby(...).apply``.
        Only the first row is re-stamped, so the rows are expected to be in
        chronological order -- TODO confirm the input is sorted.
    start_time : timestamp-like, optional
        Value written into the first index position. Defaults to the
        notebook-level ``min_start_time``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows; only the first index label differs.
        An empty ``group`` is returned unchanged.
    """
    if start_time is None:
        start_time = min_start_time
    if len(group) == 0:
        # Nothing to re-stamp; a one-element index would not fit zero rows
        return group
    # Work on a copy: pandas does not support functions that mutate the object
    # handed in by groupby.apply
    adjusted = group.copy()
    adjusted.index = pd.Index([start_time] + list(group.index[1:]))
    return adjusted

# Re-stamp the first row of every sensor node, then put all rows back in time order
sensor_groups = df.groupby('Sensor_ID', group_keys=False)
df_adjusted = sensor_groups.apply(adjust_start_time).sort_index()

In [10]:
# Rows without an experiment number get the placeholder 'Background'
experiment_ids = df_adjusted['experiment_number']
df_adjusted['experiment_number'] = experiment_ids.fillna('Background')

In [11]:
# Inspect the adjusted frame (the notebook shows an abbreviated view)
df_adjusted

Unnamed: 0,Sensor_ID,CO2_Room,CO_Room,H2_Room,Humidity_Room,Motion_Room,PM05_Room,PM100_Room,PM10_Room,PM25_Room,...,PM_Total_Room,Temperature_Room,UV_Room,VOC_Room,VOC_Room_RAW,scenario_label,experiment_number,progress_label,anomaly_label,ternary_label
2022-07-04 14:10:54.259355+00:00,sensornode0008,1166.0,-0.25,0.00,52.64,1.0,18.0,0.0,2.0,0.0,...,20.0,26.0,0.0,253.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0016,1392.0,0.18,0.09,52.53,,10.0,0.0,1.0,0.0,...,11.0,26.0,0.0,265.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0009,1150.0,-0.43,0.13,51.50,,15.0,0.0,2.0,0.0,...,17.0,26.3,0.0,302.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0012,1100.0,-0.16,0.13,52.66,,13.0,0.0,1.0,0.0,...,15.0,25.9,0.0,284.0,0.6,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0013,1199.0,-0.03,0.15,54.24,,23.0,0.0,3.0,0.0,...,26.0,25.2,0.0,289.6,0.5,Background,Background,Unknown,Normal,Background
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-08 12:28:56.188433+00:00,sensornode0010,797.0,1.11,-0.04,40.49,,4.0,0.0,0.0,0.0,...,5.0,25.3,0.0,37.6,0.7,Deodorant,E1,Ventilating,Anomaly,Nuisance
2022-07-08 12:29:01.238767+00:00,sensornode0011,796.0,0.92,0.04,39.63,,7.0,0.0,1.0,0.0,...,8.0,25.3,0.0,48.0,0.8,Deodorant,E1,Ventilating,Anomaly,Nuisance
2022-07-08 12:29:03.484557+00:00,sensornode0013,652.0,1.20,0.10,41.43,,6.0,0.0,1.0,0.0,...,7.0,24.5,0.0,42.8,0.5,Deodorant,E1,Ventilating,Anomaly,Nuisance
2022-07-08 12:29:03.723188+00:00,sensornode0014,634.0,1.17,0.03,41.55,,6.0,0.0,0.0,0.0,...,7.0,24.4,0.0,44.6,0.7,Deodorant,E1,Ventilating,Anomaly,Nuisance


In [12]:
# First row per sensor node: all nodes should now share the same start timestamp
df_adjusted.groupby('Sensor_ID').head(1)

Unnamed: 0,Sensor_ID,CO2_Room,CO_Room,H2_Room,Humidity_Room,Motion_Room,PM05_Room,PM100_Room,PM10_Room,PM25_Room,...,PM_Total_Room,Temperature_Room,UV_Room,VOC_Room,VOC_Room_RAW,scenario_label,experiment_number,progress_label,anomaly_label,ternary_label
2022-07-04 14:10:54.259355+00:00,sensornode0008,1166.0,-0.25,0.0,52.64,1.0,18.0,0.0,2.0,0.0,...,20.0,26.0,0.0,253.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0016,1392.0,0.18,0.09,52.53,,10.0,0.0,1.0,0.0,...,11.0,26.0,0.0,265.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0009,1150.0,-0.43,0.13,51.5,,15.0,0.0,2.0,0.0,...,17.0,26.3,0.0,302.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0012,1100.0,-0.16,0.13,52.66,,13.0,0.0,1.0,0.0,...,15.0,25.9,0.0,284.0,0.6,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0013,1199.0,-0.03,0.15,54.24,,23.0,0.0,3.0,0.0,...,26.0,25.2,0.0,289.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0015,1300.0,-0.08,0.12,52.85,,7.0,0.0,0.0,0.0,...,8.0,25.7,0.0,266.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0010,1263.0,0.15,0.1,52.29,,13.0,0.0,1.0,0.0,...,15.0,26.2,0.0,302.6,0.6,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0011,412.0,-0.08,0.22,50.6,,16.0,0.0,2.0,0.0,...,19.0,26.5,0.0,264.2,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:54.259355+00:00,sensornode0014,1168.0,0.08,0.04,54.25,,22.0,0.0,3.0,0.0,...,25.0,25.1,0.0,313.6,0.5,Background,Background,Unknown,Normal,Background


In [13]:
# Resample every sensor node separately onto a regular 10-second grid; each grid
# point takes the most recent observation of that sensor at or before it (forward fill).
# The lowercase 's' alias is used because the uppercase 'S' spelling is deprecated
# since pandas 2.2.
df_resampled = df_adjusted.groupby(by='Sensor_ID').resample('10s').ffill()

In [14]:
# Apply a backward fill so that the first row of each sensor node, which the
# forward fill leaves empty, receives values. The fill is done per sensor node
# (grouping on the 'Sensor_ID' index level): a frame-wide bfill would let missing
# values at the end of one sensor's block be filled from the next sensor's rows.
df_resampled = df_resampled.groupby(level='Sensor_ID').bfill()

In [15]:
# Drop the 'Sensor_ID' index level, leaving the timestamps as the only index,
# and order all rows by time
df_resampled = df_resampled.droplevel('Sensor_ID', axis=0).sort_index()

In [16]:
# First row per sensor node after resampling
df_resampled.groupby('Sensor_ID').head(1)

Unnamed: 0,Sensor_ID,CO2_Room,CO_Room,H2_Room,Humidity_Room,Motion_Room,PM05_Room,PM100_Room,PM10_Room,PM25_Room,...,PM_Total_Room,Temperature_Room,UV_Room,VOC_Room,VOC_Room_RAW,scenario_label,experiment_number,progress_label,anomaly_label,ternary_label
2022-07-04 14:10:50+00:00,sensornode0008,1166.0,-0.25,0.0,52.64,1.0,18.0,0.0,2.0,0.0,...,20.0,26.0,0.0,253.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0011,412.0,-0.08,0.22,50.6,,16.0,0.0,2.0,0.0,...,19.0,26.5,0.0,264.2,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0014,1168.0,0.08,0.04,54.25,,22.0,0.0,3.0,0.0,...,25.0,25.1,0.0,313.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0013,1199.0,-0.03,0.15,54.24,,23.0,0.0,3.0,0.0,...,26.0,25.2,0.0,289.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0015,1300.0,-0.08,0.12,52.85,,7.0,0.0,0.0,0.0,...,8.0,25.7,0.0,266.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0009,1150.0,-0.43,0.13,51.5,,15.0,0.0,2.0,0.0,...,17.0,26.3,0.0,302.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0016,1392.0,0.18,0.09,52.53,,10.0,0.0,1.0,0.0,...,11.0,26.0,0.0,265.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0010,1263.0,0.15,0.1,52.29,,13.0,0.0,1.0,0.0,...,15.0,26.2,0.0,302.6,0.6,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0012,1100.0,-0.16,0.13,52.66,,13.0,0.0,1.0,0.0,...,15.0,25.9,0.0,284.0,0.6,Background,Background,Unknown,Normal,Background


In [17]:
# Number of rows per sensor node after resampling
df_resampled['Sensor_ID'].value_counts()

Sensor_ID
sensornode0011    33951
sensornode0014    33950
sensornode0013    33950
sensornode0010    33949
sensornode0012    33945
sensornode0008    33922
sensornode0009    33922
sensornode0015    33892
sensornode0016    33892
Name: count, dtype: int64

In [18]:
# The sensor nodes were switched off one after another, so their recordings end
# at different times; determine a common cut-off that every node still covers.

# Last timestamp of each sensor node
last_stamp_per_sensor = df_resampled.groupby('Sensor_ID').apply(lambda sensor_rows: sensor_rows.index.max())
# Earliest of those end times, moved two minutes earlier
min_end_time = last_stamp_per_sensor.min() - pd.Timedelta(minutes=2)

# Helper that trims one sensor node's rows to the common end time
def truncate_group(group):
    """Return the rows of ``group`` from its first timestamp up to ``min_end_time``.

    ``group`` holds the rows of a single sensor node; ``min_end_time`` is read
    from the notebook namespace.
    """
    first_stamp = group.index[0]
    trimmed = group.truncate(before=first_stamp, after=min_end_time)
    return trimmed

# Trim every sensor node separately, then put all rows back in time order
per_sensor_groups = df_resampled.groupby('Sensor_ID', group_keys=False)
df_truncated = per_sensor_groups.apply(truncate_group).sort_index()

In [19]:
# Number of rows per sensor node after truncation; the counts should now match
df_truncated['Sensor_ID'].value_counts()

Sensor_ID
sensornode0008    33880
sensornode0015    33880
sensornode0010    33880
sensornode0012    33880
sensornode0013    33880
sensornode0014    33880
sensornode0011    33880
sensornode0016    33880
sensornode0009    33880
Name: count, dtype: int64

In [20]:
# First row per sensor node after truncation
df_truncated.groupby('Sensor_ID').head(1)

Unnamed: 0,Sensor_ID,CO2_Room,CO_Room,H2_Room,Humidity_Room,Motion_Room,PM05_Room,PM100_Room,PM10_Room,PM25_Room,...,PM_Total_Room,Temperature_Room,UV_Room,VOC_Room,VOC_Room_RAW,scenario_label,experiment_number,progress_label,anomaly_label,ternary_label
2022-07-04 14:10:50+00:00,sensornode0008,1166.0,-0.25,0.0,52.64,1.0,18.0,0.0,2.0,0.0,...,20.0,26.0,0.0,253.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0015,1300.0,-0.08,0.12,52.85,,7.0,0.0,0.0,0.0,...,8.0,25.7,0.0,266.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0010,1263.0,0.15,0.1,52.29,,13.0,0.0,1.0,0.0,...,15.0,26.2,0.0,302.6,0.6,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0012,1100.0,-0.16,0.13,52.66,,13.0,0.0,1.0,0.0,...,15.0,25.9,0.0,284.0,0.6,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0013,1199.0,-0.03,0.15,54.24,,23.0,0.0,3.0,0.0,...,26.0,25.2,0.0,289.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0014,1168.0,0.08,0.04,54.25,,22.0,0.0,3.0,0.0,...,25.0,25.1,0.0,313.6,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0011,412.0,-0.08,0.22,50.6,,16.0,0.0,2.0,0.0,...,19.0,26.5,0.0,264.2,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0016,1392.0,0.18,0.09,52.53,,10.0,0.0,1.0,0.0,...,11.0,26.0,0.0,265.0,0.5,Background,Background,Unknown,Normal,Background
2022-07-04 14:10:50+00:00,sensornode0009,1150.0,-0.43,0.13,51.5,,15.0,0.0,2.0,0.0,...,17.0,26.3,0.0,302.0,0.5,Background,Background,Unknown,Normal,Background


## Adjust labels

In [21]:
# Keep the resampled and truncated frame untouched; the label columns are added to this copy
df_elba = df_truncated.copy()

In [22]:
# Check which ternary label values are present before deriving the binary label
df_elba.ternary_label.unique()

array(['Background', 'Fire', 'Nuisance'], dtype=object)

In [23]:
# Binary target for the classification task: only 'Fire' stays 'Fire',
# both 'Background' and 'Nuisance' collapse into 'NoFire'
fire_label_by_ternary = {'Background': 'NoFire', 'Fire': 'Fire', 'Nuisance': 'NoFire'}
df_elba['fire_label'] = df_elba['ternary_label'].map(fire_label_by_ternary)

In [24]:
# Define limits based on PM05 and PM10 (since labeling was performed on these measures).
# A reading at or above its limit counts towards the 'Fire' control label further below.
lower_limit_pm05 = 1000
lower_limit_pm10 = 200

In [25]:
# Create the control label column, empty for now. It is created with object dtype
# because string labels are written into it afterwards; starting from a float64
# column of NaN would rely on pandas silently upcasting on assignment, which is
# deprecated since pandas 2.1.
df_elba['fire_label_control'] = pd.Series(np.nan, index=df_elba.index, dtype='object')

In [26]:
# Rule for the control label: at least one particle reading reaches its limit,
# outside the 'Ventilating' phase and outside 'Nuisance' scenarios
reaches_pm05_limit = df_elba['PM05_Room'] >= lower_limit_pm05
reaches_pm10_limit = df_elba['PM10_Room'] >= lower_limit_pm10
not_ventilating = df_elba['progress_label'] != 'Ventilating'
not_nuisance = df_elba['ternary_label'] != 'Nuisance'
mask = (reaches_pm05_limit | reaches_pm10_limit) & not_ventilating & not_nuisance

# Mark the matching rows as 'Fire'
df_elba.loc[mask, 'fire_label_control'] = 'Fire'

In [27]:
# Fill in between fire labels (where the measurements fall back under the thresholds).
# Only rows whose original fire_label is 'Fire' are considered; the commented-out
# clause would additionally exclude the 'Ventilating' phase but is not active.
mask = (df_elba['fire_label'] == 'Fire') #& (df_elba['progress_label'] != 'Ventilating')
# Forward-fill inside each (sensor, scenario, experiment) group: once a row carries
# a 'Fire' control label, the subsequent rows of that group inherit it. Rows that
# come before the first labelled row of their group stay empty here.
df_elba.loc[mask, 'fire_label_control'] = df_elba.loc[mask].groupby(by=['Sensor_ID', 'scenario_label', 'experiment_number'])['fire_label_control'].ffill()

# Set rest of fire_label_control to "NoFire"
df_elba['fire_label_control'] = df_elba['fire_label_control'].fillna('NoFire')

In [28]:
# Control-label counts per sensor, scenario, experiment and phase, limited to the
# rows selected by the current mask (fire_label == 'Fire'); first 59 entries only
df_elba.loc[mask].groupby(by=['Sensor_ID','scenario_label','experiment_number','progress_label'])['fire_label_control'].value_counts().head(59)

Sensor_ID       scenario_label  experiment_number  progress_label     fire_label_control
sensornode0008  Cable           E0                 End_of_Experiment  Fire                   31
                                                   Ignition           NoFire                 22
                                                   Outgasing          Fire                  109
                                                                      NoFire                 45
                                                   Ventilating        Fire                  122
                                E1                 End_of_Experiment  Fire                   27
                                                   Glowing            Fire                   54
                                                   Ignition           NoFire                 12
                                                   Outgasing          Fire                   15
                                               

In [29]:
# Compare the original fire_label with the rule-based fire_label_control
df_elba.groupby(by='fire_label_control')['fire_label'].value_counts()

fire_label_control  fire_label
Fire                Fire           30827
NoFire              NoFire        268913
                    Fire            5180
Name: count, dtype: int64

In [30]:
# List all columns of the labelled frame
df_elba.columns

Index(['Sensor_ID', 'CO2_Room', 'CO_Room', 'H2_Room', 'Humidity_Room',
       'Motion_Room', 'PM05_Room', 'PM100_Room', 'PM10_Room', 'PM25_Room',
       'PM40_Room', 'PM_Room_Typical_Size', 'PM_Total_Room',
       'Temperature_Room', 'UV_Room', 'VOC_Room', 'VOC_Room_RAW',
       'scenario_label', 'experiment_number', 'progress_label',
       'anomaly_label', 'ternary_label', 'fire_label', 'fire_label_control'],
      dtype='object')

In [31]:
# Remove the column that is not needed downstream
columns_to_drop = ['Motion_Room']
df_elba = df_elba.drop(columns=columns_to_drop)

In [32]:
# Check that the index is recognised as datetime before exporting
df_elba.index.inferred_type == "datetime64"

True

In [33]:
# Export the resampled and relabelled dataset to the export folder
file_name = 'elba_dataset_pp_2.csv'
data_file_path = os.path.join(export_path, file_name)
df_elba.to_csv(data_file_path)