### 3W compression notebook

This notebook transforms the 3W dataset into a compressed version. The time series is down-sampled from entries everry second to ten seconds, by a locaal averrageg over this period. Aditionally, features P-CKGL and T-CKGL are removed from the dataset since they only appear in real instances.

In [31]:
import pandas as pd
import numpy as np
from pathlib import Path

In [32]:
data_path = Path('data')

In [33]:
def class_and_file_generator(data_path, real=False, simulated=False, drawn=False):
    for class_path in data_path.iterdir():
        if class_path.is_dir():
            class_code = int(class_path.stem)
            for instance_path in class_path.iterdir():
                if (instance_path.suffix == '.csv'):
                    if (simulated and instance_path.stem.startswith('SIMULATED')) or \
                       (drawn and instance_path.stem.startswith('DRAWN')) or \
                       (real and (not instance_path.stem.startswith('SIMULATED')) and \
                       (not instance_path.stem.startswith('DRAWN'))):
                        yield class_code, instance_path

In [34]:
real_instances = list(class_and_file_generator(data_path, real=True, simulated=False, drawn=False))
simulated_instances = list(class_and_file_generator(data_path, real=False, simulated=True, drawn=False))
drawn_instances = list(class_and_file_generator(data_path, real=False, simulated=False, drawn=True))

In [35]:
instances = real_instances + simulated_instances

In [36]:
import os

def create_dir_tree(root_dir):
    os.mkdir(root_dir)
    for i in range(0, 9):
        path = os.path.join(root_dir, str(i))
        os.mkdir(path)


data_root_dir = "data_upgraded_downsampled/"

create_dir_tree(data_root_dir)
clset = {-1}
clset.add
for c, p in instances:
    df = pd.read_csv(p)
    df['timestamp'] = pd.to_datetime(df['timestamp'])  #down-sampling
    df = df.set_index('timestamp')
    print(df.dtypes)
    df.interpolate(method='time', inplace=True)
    df = df.resample('10s').mean()
    df.drop('P-JUS-CKGL', inplace=True, axis=1)        #removing P-CKGL and T-CKGL and QGL attributes
    df.drop('T-JUS-CKGL', inplace=True, axis=1)
    df.drop('QGL', inplace=True, axis=1)
    
    clset.update([*df['class'].unique()])
    path = os.path.join(data_root_dir, str(p)[5:])
    df.to_csv(path)
print(clset)

P-PDG         float64
P-TPT         float64
T-TPT         float64
P-MON-CKP     float64
T-JUS-CKP     float64
P-JUS-CKGL    float64
T-JUS-CKGL    float64
QGL           float64
class         float64
dtype: object
P-PDG         float64
P-TPT         float64
T-TPT         float64
P-MON-CKP     float64
T-JUS-CKP     float64
P-JUS-CKGL    float64
T-JUS-CKGL    float64
QGL           float64
class         float64
dtype: object
P-PDG         float64
P-TPT         float64
T-TPT         float64
P-MON-CKP     float64
T-JUS-CKP     float64
P-JUS-CKGL    float64
T-JUS-CKGL    float64
QGL           float64
class         float64
dtype: object
P-PDG         float64
P-TPT         float64
T-TPT         float64
P-MON-CKP     float64
T-JUS-CKP     float64
P-JUS-CKGL    float64
T-JUS-CKGL    float64
QGL           float64
class         float64
dtype: object
P-PDG         float64
P-TPT         float64
T-TPT         float64
P-MON-CKP     float64
T-JUS-CKP     float64
P-JUS-CKGL    float64
T-JUS-CKGL    float6

QGL svugdje NA za simulirane evente!
ne postoje klase 103, 104, odnosno tranzijentni periodi??

df.isna().sum(axis=0)

In [16]:
df.head()

Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-05-13 07:34:40,28766210.0,20500730.0,125.862543,4063385.0,98.3296,,0.0
2018-05-13 07:34:50,28766195.0,20500740.0,125.86255,4063385.0,98.329567,,0.0
2018-05-13 07:35:00,28766195.0,20500740.0,125.86255,4063385.0,98.32953,,0.0
2018-05-13 07:35:10,28766195.0,20500740.0,125.86255,4063385.0,98.329494,,0.0
2018-05-13 07:35:20,28766195.0,20500740.0,125.86255,4063385.0,98.329457,,0.0


In [25]:
import tsfresh as tf

df_imputed = tf.utilities.dataframe_functions.impute(df.loc[:, df.columns != 'class'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._where(cond, other, inplace, axis, level, errors=errors)


Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,QGL
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-05-13 07:34:40,28766210.0,2.050073e+07,125.862543,4063385.0,98.329600,0.0
2018-05-13 07:34:50,28766195.0,2.050074e+07,125.862550,4063385.0,98.329567,0.0
2018-05-13 07:35:00,28766195.0,2.050074e+07,125.862550,4063385.0,98.329530,0.0
2018-05-13 07:35:10,28766195.0,2.050074e+07,125.862550,4063385.0,98.329494,0.0
2018-05-13 07:35:20,28766195.0,2.050074e+07,125.862550,4063385.0,98.329457,0.0
...,...,...,...,...,...,...
2018-05-13 15:34:00,33689661.0,7.977187e+06,3.427143,4002607.8,31.241793,0.0
2018-05-13 15:34:10,33689659.0,7.977042e+06,3.427117,4002603.2,31.263274,0.0
2018-05-13 15:34:20,33689658.0,7.978292e+06,3.427058,4002548.1,31.201267,0.0
2018-05-13 15:34:30,33689659.0,7.977801e+06,3.427035,4002695.3,31.271275,0.0


Unnamed: 0_level_0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,QGL,class
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-05-13 07:34:40,28766210.0,2.050073e+07,125.862543,4063385.0,98.329600,,0.0
2018-05-13 07:34:50,28766195.0,2.050074e+07,125.862550,4063385.0,98.329567,,0.0
2018-05-13 07:35:00,28766195.0,2.050074e+07,125.862550,4063385.0,98.329530,,0.0
2018-05-13 07:35:10,28766195.0,2.050074e+07,125.862550,4063385.0,98.329494,,0.0
2018-05-13 07:35:20,28766195.0,2.050074e+07,125.862550,4063385.0,98.329457,,0.0
...,...,...,...,...,...,...,...
2018-05-13 15:34:00,33689661.0,7.977187e+06,3.427143,4002607.8,31.241793,,2.0
2018-05-13 15:34:10,33689659.0,7.977042e+06,3.427117,4002603.2,31.263274,,2.0
2018-05-13 15:34:20,33689658.0,7.978292e+06,3.427058,4002548.1,31.201267,,2.0
2018-05-13 15:34:30,33689659.0,7.977801e+06,3.427035,4002695.3,31.271275,,2.0
