In [1]:
# Standard libaries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns; sns.set()   
import configparser
import os
from pathlib import Path
import argparse
import logging
import datetime
import inspect
from sklearn.utils import resample
import re
from datetime import datetime
from datetime import timedelta
import math
import pickle

## Import data

In [2]:
# Get the current directory
current_dir = os.getcwd()
data_path = os.path.join(current_dir, 'data')

In [3]:
# Define Export Path
export_path = os.path.join(current_dir, 'export')

# Check if the directory exists
if not os.path.exists(export_path):
    # Create the directory if it doesn't exist
    os.makedirs(export_path)

In [4]:
# Import data
file_name = 'indoor_fire_detection_multisensornodes_dataset.csv'
data_file_path = os.path.join(data_path, file_name)

df = pd.read_csv(data_file_path, index_col=0,parse_dates=True)

In [5]:
# check if index is in Datetime format
df.index.inferred_type == "datetime64"

True

In [6]:
df['Sensor_ID'].value_counts()

sensornode0008    33880
sensornode0015    33880
sensornode0010    33880
sensornode0012    33880
sensornode0013    33880
sensornode0014    33880
sensornode0011    33880
sensornode0016    33880
sensornode0009    33880
Name: Sensor_ID, dtype: int64

In [7]:
df.columns

Index(['Sensor_ID', 'CO2_Room', 'CO_Room', 'H2_Room', 'Humidity_Room',
       'PM05_Room', 'PM100_Room', 'PM10_Room', 'PM25_Room', 'PM40_Room',
       'PM_Room_Typical_Size', 'PM_Total_Room', 'Temperature_Room', 'UV_Room',
       'VOC_Room', 'VOC_Room_RAW', 'scenario_label', 'experiment_number',
       'progress_label', 'anomaly_label', 'ternary_label', 'fire_label',
       'fire_label_control'],
      dtype='object')

In [8]:
df.anomaly_label

2022-07-04 14:10:50+00:00     Normal
2022-07-04 14:10:50+00:00     Normal
2022-07-04 14:10:50+00:00     Normal
2022-07-04 14:10:50+00:00     Normal
2022-07-04 14:10:50+00:00     Normal
                              ...   
2022-07-08 12:17:20+00:00    Anomaly
2022-07-08 12:17:20+00:00    Anomaly
2022-07-08 12:17:20+00:00    Anomaly
2022-07-08 12:17:20+00:00    Anomaly
2022-07-08 12:17:20+00:00    Anomaly
Name: anomaly_label, Length: 304920, dtype: object

In [9]:
df.scenario_label.unique()

array(['Background', 'Wood', 'Candles', 'Cable', 'Ethanol', 'Lunts',
       'Deodorant', 'Hairspray'], dtype=object)

## Define Function for Deriving non-overlapping Intervals

In [10]:
def split_intervals_non_overlapping(df_input, w):
    ChunksSizes = []
    T = len(df_input.index.unique())
    complete_chunks = T // w  # Calculate the number of complete chunks
    remaining_data = T % w    # Calculate the remaining data points
    
    for i in range(complete_chunks):
        ChunksSizes.append(w)
    
    if remaining_data >= w:
        ChunksSizes.append(w)

    SplitIndexes = [sum(ChunksSizes[:i]) for i in range(complete_chunks)] # +1
    FilteredSplitIndexes = list(map(lambda x: x * 10, SplitIndexes))
    
    # Explicitly create a copy of the DataFrame to avoid SettingWithCopyWarning
    df_input = df_input.copy()

    # Create Date column with relativ time delta
    df_input['date'] = (df_input.index - df_input.index.min()).total_seconds()
    df_input['date'] = df_input['date'].astype(int)

    # Create intervals based on the split values
    for i in range(len(FilteredSplitIndexes)):
        row = FilteredSplitIndexes[i]
        interval_label = int(i) #+ 1
        df_input.loc[df_input.date.isin([row]), 'interval_label'] = interval_label

    # Calculate the length of the remaining data after the last split index
    last_split_index = FilteredSplitIndexes[-1]

    # For rows after the last interval_label, write 'drop'
    df_input.loc[df_input.date >= last_split_index, 'interval_label'] = 'drop'

    # Apply ffill and bfill
    df_input['interval_label'] = df_input['interval_label'].ffill().bfill()

    # Drop all rows containing 'drop' in X_t.Interval_label (after the last splitindex, there is no full interval anymore)
    df_input = df_input[df_input['interval_label'] != 'drop']

    # Convert Interval_label to integer
    df_input['interval_label'] = df_input['interval_label'].astype(int)

    return df_input

## Drop Ventilation Artefacts

In [11]:
df.groupby(by= 'progress_label')['Sensor_ID'].value_counts().tail(10)

progress_label  Sensor_ID     
Unknown         sensornode0012    28372
Ventilating     sensornode0012     2193
                sensornode0013     2182
                sensornode0008     2179
                sensornode0014     2179
                sensornode0010     2177
                sensornode0016     2177
                sensornode0015     2176
                sensornode0009     2175
                sensornode0011     2174
Name: Sensor_ID, dtype: int64

In [12]:
df.Sensor_ID.value_counts()

sensornode0008    33880
sensornode0015    33880
sensornode0010    33880
sensornode0012    33880
sensornode0013    33880
sensornode0014    33880
sensornode0011    33880
sensornode0016    33880
sensornode0009    33880
Name: Sensor_ID, dtype: int64

In [13]:
# Cut ventilating phase (not relevant for detection)
mask = df['progress_label'] != 'Ventilating'
df = df[mask]

In [14]:
# resample based on every single sensor node
df_resampled = df.groupby(by='Sensor_ID').resample('10S').bfill()

In [15]:
# Reset index
df_resampled = df_resampled.droplevel('Sensor_ID', axis=0).sort_index()

In [16]:
# drop last row of each sensor node beacause of unequal ventilation label
df_resampled.drop(df_resampled.tail(1).index,inplace=True)

In [17]:
df_resampled.Sensor_ID.value_counts()

sensornode0008    33872
sensornode0009    33872
sensornode0016    33872
sensornode0010    33872
sensornode0013    33872
sensornode0012    33872
sensornode0011    33872
sensornode0014    33872
sensornode0015    33872
Name: Sensor_ID, dtype: int64

## Derive Intervals

In [18]:
# Define variables
overlapping_intervals = False #True
w = 9 # interval length as number of time points
overlap = w/4 # length of overlap of intervals

In [19]:
# derive dataframe with intervals
if overlapping_intervals == False:
    df_intervals = split_intervals_non_overlapping(df_resampled, w)
else: 
    print('work in progress')

In [20]:
df_intervals.interval_label.value_counts().unique()

array([81])

In [21]:
df_intervals.fire_label.unique()

array(['NoFire', 'Fire'], dtype=object)

## Adjust fire_label to interval size

In [22]:
# if interval includes fire sequence, the interval should be labeled as fire
# Group the DataFrame by "interval_label"
grouped = df_intervals.groupby('interval_label')

# Iterate over each group
for _, group in grouped:
    if 'Fire' in group['fire_label'].values:
        # If 'fire' label exists in the group, assign 'fire' for the whole group
        df_intervals.loc[group.index, 'fire_label'] = 'Fire'

In [23]:
df_intervals.groupby(by='interval_label')['fire_label'].value_counts()

interval_label  fire_label
0               NoFire        81
1               NoFire        81
2               NoFire        81
3               NoFire        81
4               NoFire        81
                              ..
3757            NoFire        81
3758            NoFire        81
3759            NoFire        81
3760            NoFire        81
3761            NoFire        81
Name: fire_label, Length: 3762, dtype: int64

## Export

In [24]:
# Export dataset
file_name = f'indoor_fire_detection_multisensornodes_dataset_preprocessed.csv'
data_file_path = os.path.join(data_path, file_name)
df_intervals.to_csv(data_file_path) 