# IMPORTS

## Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Data

In [2]:
from novelty_detection.data import load_data

# Read the raw export through the project loader, then show its size and a short preview.
df = load_data("gaia_data.csv", header_names=None)
print(df.shape)
df.head(5)

(203206, 13)


Unnamed: 0.1,Unnamed: 0,T_ext,Solar_irrad,T_imp,T_ret,BC1_power,BC2_power,Refr1_power,Refr2_power,BC1_flow,BC2_flow,Refr1_flow,Refr2_flow
0,2022-05-13 12:00:00+00:00,24.4,874.0,9.880005,10.869995,3.05,16.5,15.7,0.25,2923.0,2476.0,0.0,3153.0
1,2022-05-13 12:01:00+00:00,,,,,3.05,16.3,15.7,0.25,2923.0,2476.0,0.0,3153.0
2,2022-05-13 12:02:00+00:00,24.35,875.0,9.940002,10.75,3.1,16.4,15.75,0.3,2793.0,2476.0,0.0,3211.0
3,2022-05-13 12:03:00+00:00,,,,,3.1,16.5,15.65,0.25,2793.0,2606.0,0.0,3196.0
4,2022-05-13 12:04:00+00:00,24.51,875.0,9.940002,10.899994,3.05,16.4,15.55,0.25,2779.0,2505.0,0.0,3196.0


# PREPROCESSING

Preprocessing steps:
- convert the time column of df to datetime and change its name to 'datetime'
- use the datetime column as the new index of the df
- make the df continuous in time (1 min between rows):
    - split if the time jump is bigger than a threshold
    - add rows full of NaNs if lower than threshold
- keep only the rows that (should) contain all values of the measures because of the sampling time
- split the df if there are more consecutive NaNs than a threshold
- fill the remaining NaNs in each df using interpolation and remove the dfs that are shorter than a threshold

In [3]:
# NOTE(review): wildcard import kept unchanged because other cells may rely on names it
# brings into scope; consider switching to explicit imports once that usage is confirmed.
from novelty_detection.preprocessing import *

# Columns (and their order) handed to rearrange_and_keep_important_columns below.
columns = [
    'T_ext', 'Solar_irrad', 'T_imp',
    'BC1_power', 'BC2_power', 'Refr1_power',
    'Refr2_power', 'BC1_flow', 'BC2_flow',
    'Refr1_flow', 'Refr2_flow', 'T_ret',
]

# Step 1-2: build the datetime column and use it as the index.
df_date = convert_df_time_column_to_datetime(df)
df_index = convert_df_to_df_with_datetime_index(df_date)

# Step 3: make the data continuous in time; the result is a list of frames.
dfs_continuous = make_df_continuous_in_time(df_index, max_minutes=30)

# Step 4: bring all variables to the same sampling frequency.
dfs_freq_reduced = convert_dfs_variables_to_same_frequency(
    dfs_continuous, rows_to_skip=2
)

# Step 5: split wherever there are too many consecutive NaNs.
dfs_nan_split = split_dfs_based_on_consecutive_nans(
    dfs_freq_reduced, max_consecutive_nans=30
)

# Step 6: fill the remaining NaNs and keep only the long-enough frames.
dfs_valid = fill_dfs_nans_and_keep_long_dfs_only(dfs_nan_split, thresh_len=1000)

# Step 7: select and order the columns of interest.
dfs_columns = rearrange_and_keep_important_columns(dfs_valid, columns)

# Report how many frames exist after the main stages.
for stage_label, stage_dfs in (
    ('dfs_continuous', dfs_continuous),
    ('dfs_nan_split', dfs_nan_split),
    ('dfs_valid', dfs_valid),
):
    print(f'len {stage_label}', len(stage_dfs))

len dfs_continuous 2
len dfs_nan_split 87
len dfs_valid 11


In [4]:
# Preview the first rows of the first preprocessed segment.
dfs_columns[0].head(5)

Unnamed: 0_level_0,T_ext,Solar_irrad,T_imp,BC1_power,BC2_power,Refr1_power,Refr2_power,BC1_flow,BC2_flow,Refr1_flow,Refr2_flow,T_ret
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-05-13 12:00:00+00:00,24.4,874.0,9.880005,3.05,16.5,15.7,0.25,2923.0,2476.0,0.0,3153.0,10.869995
2022-05-13 12:02:00+00:00,24.35,875.0,9.940002,3.1,16.4,15.75,0.3,2793.0,2476.0,0.0,3211.0,10.75
2022-05-13 12:04:00+00:00,24.51,875.0,9.940002,3.05,16.4,15.55,0.25,2779.0,2505.0,0.0,3196.0,10.899994
2022-05-13 12:06:00+00:00,24.629999,870.0,9.990021,3.1,16.45,15.65,0.3,2808.0,2491.0,0.0,3340.0,10.809998
2022-05-13 12:08:00+00:00,24.57,868.0,10.140015,3.1,16.65,15.8,0.25,2750.0,2476.0,0.0,3355.0,10.950012


## Save data

In [5]:
from novelty_detection.data import save_data

# Write each preprocessed segment to its own numbered CSV (gaia_data_0.csv, gaia_data_1.csv, ...).
for segment_idx, segment_df in enumerate(dfs_columns):
    save_data(
        segment_df,
        f'gaia_data_{segment_idx}.csv',
        data_type='processed',
        index=True,
    )

## Load data

In [9]:
# Reload one saved segment and rebuild its datetime index to check the round trip.
# NOTE(review): this rebinds df, df_date and df_index from the earlier cells; re-running
# the preprocessing cell afterwards would start from this reloaded frame instead of the
# raw data. Names are kept unchanged here in case other cells depend on them.
df = load_data("gaia_data_0.csv", header_names=None)
df_date = convert_df_time_column_to_datetime(df)
df_index = convert_df_to_df_with_datetime_index(df_date)

print(df_index.shape)
df_index.head(5)

(2520, 12)


Unnamed: 0_level_0,T_ext,Solar_irrad,T_imp,BC1_power,BC2_power,Refr1_power,Refr2_power,BC1_flow,BC2_flow,Refr1_flow,Refr2_flow,T_ret
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-05-13 12:00:00+00:00,24.4,874.0,9.880005,3.05,16.5,15.7,0.25,2923.0,2476.0,0.0,3153.0,10.869995
2022-05-13 12:02:00+00:00,24.35,875.0,9.940002,3.1,16.4,15.75,0.3,2793.0,2476.0,0.0,3211.0,10.75
2022-05-13 12:04:00+00:00,24.51,875.0,9.940002,3.05,16.4,15.55,0.25,2779.0,2505.0,0.0,3196.0,10.899994
2022-05-13 12:06:00+00:00,24.629999,870.0,9.990021,3.1,16.45,15.65,0.3,2808.0,2491.0,0.0,3340.0,10.809998
2022-05-13 12:08:00+00:00,24.57,868.0,10.140015,3.1,16.65,15.8,0.25,2750.0,2476.0,0.0,3355.0,10.950012
