In [1]:
import pathlib
import sys

import numpy as np
import pandas as pd

In [2]:
# Time window to extract, as 'YYYY-MM-DD' strings. Both bounds are exclusive:
# rows are kept only when their timestamp is strictly after START and strictly
# before END. The two dates (with '-' replaced by '_') also name the output folder.
START = '2019-09-01'
END = '2019-10-01'

### Collect raw data frames and remove rows whose value did not change compared to the previous time step

In [5]:
folder = pathlib.Path('../data/GaiaAnomalyDataset/data')
# Output directory named after the selected window, e.g. "2019_09_01_to_2019_10_01".
short_data_folder = folder / f"{START.replace('-', '_')}_to_{END.replace('-', '_')}"
# exist_ok makes the cell safe to re-run without a separate existence check.
short_data_folder.mkdir(exist_ok=True)

# Maps each CSV file name to its windowed, change-only data frame.
data_frames = {}
for file in folder.glob('*.csv'):
    data = pd.read_csv(file, parse_dates=True, index_col='date')

    # Select indices within the desired time range (both bounds exclusive).
    in_window = np.logical_and(data.index > START, data.index < END)
    data = data[in_window]

    # Filter out the records where the value didn't change to reduce size.
    # A file with no rows inside the window has nothing to compare, so it is
    # stored as-is instead of building a mask of the wrong length.
    if len(data) > 0:
        # The reshape assumes exactly one value column per file - TODO confirm.
        values = data.values.reshape(len(data))
        changed = np.abs(values[1:] - values[:-1]) > sys.float_info.epsilon
        # The first row has no predecessor, so it is always kept.
        keep = np.concatenate(([True], changed))
        # .iloc returns a new frame; the result must be assigned back,
        # otherwise the unfiltered data is what gets stored.
        data = data.iloc[keep]

    data_frames[file.name] = data

### Join the data frames into one frame

In [None]:
if not data_frames:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("data_frames is empty: no CSV files were loaded")

result_df = None
# Outer-join the frames on their index so every timestamp seen in any
# frame gets a row; columns without a record at that timestamp hold NaN.
for df in data_frames.values():
    if result_df is not None:
        result_df = result_df.join(df, how='outer')
    else:
        result_df = df

# Fill empty cells so every column carries its original value at each timestamp.
# Forward-fill first: a gap means "no new record yet", so the value in effect is
# the last one recorded before it. Back-fill afterwards only covers the leading
# rows that precede a column's first record.
result_df = result_df.ffill().bfill()
print(result_df)

### Again remove rows where nothing changed compared to the previous timestamp

In [7]:
# Build one boolean row mask: a row is kept when at least one column differs
# from the row directly above it. The first row is always kept.
df_mask = None
for column in result_df.columns:
    column_values = result_df[column].values
    flat = column_values.reshape(len(column_values))

    step = np.abs(flat[1:] - flat[:-1])
    column_changed = np.concatenate(([True], step > 0))

    # OR the per-column masks together as they are produced.
    df_mask = (
        column_changed
        if df_mask is None
        else np.logical_or(df_mask, column_changed)
    )

# Keep only the flagged rows and write them into the window's output folder.
short_data = result_df.iloc[df_mask]
output_path = short_data_folder / "short_data.csv"
short_data.to_csv(output_path)

897429
