In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from src.preprocessing import windowing

* A partir de los 2.5 segundos no se producen predicciones

In [None]:
# Width of one analysis interval, in milliseconds (100 ms = 0.1 s).
interval_ms = 100

# Load the raw recordings.
data = pd.read_csv('data/raw/DisparoGenerador.csv')

# Number the interval of every sample: how many whole interval_ms-wide steps
# fit between Time (s) == 1 and the sample's timestamp (floor division).
# NOTE(review): floor division on floats is inexact at interval boundaries
# (0.5 // 0.1 evaluates to 4.0), so samples lying exactly on a boundary may be
# numbered one interval too low - confirm against the raw timestamps.
data['interval'] = data['Time (s)'].sub(1).floordiv(interval_ms / 1000)

# Keep only the samples whose interval number is below 6.
# NOTE(review): with interval_ms = 100 this keeps roughly Time (s) < 1.6,
# whereas the note at the top of the notebook mentions 2.5 s - confirm the
# intended cutoff.
# NOTE(review): reindex() without arguments leaves the index unchanged;
# reset_index(drop=True) may have been intended.
data = data.loc[data['interval'] < 6].reindex()

## Scenario 1

In [None]:
# Scenario 1 inputs: the frequency column plus the three unwrapped angle columns.
original_columns = [
    'Fmin (Hz)',
    'AngB1_unwrapped (degrees)',
    'AngB2_unwrapped (degrees)',
    'AngB3_unwrapped (degrees)',
]

# Build the lagged table with the project's windowing helper and store it
# as CSV without the index column.
lagged_data = windowing(data, original_columns)
lagged_data.to_csv(
    'data/processed_100ms/Scenario1_DisparoGeneradorLagged.csv',
    index=None,
)

## Scenario 2

In [None]:
# Scenario 2 inputs: only the three unwrapped angle columns (no frequency).
original_columns = [
    'AngB1_unwrapped (degrees)',
    'AngB2_unwrapped (degrees)',
    'AngB3_unwrapped (degrees)',
]

# This file is the base table that the later scenario cells read back in.
lagged_data = windowing(data, original_columns)
lagged_data.to_csv(
    'data/processed_100ms/Scenario2_DisparoCargaNoFreqLagged.csv',
    index=None,
)

## Scenario 3

In [None]:
# Scenario 3 inputs: the three angle columns without the "_unwrapped" suffix.
original_columns = [
    'AngB1 (degrees)',
    'AngB2 (degrees)',
    'AngB3 (degrees)',
]

# Same windowing as the previous scenarios, written without the index column.
lagged_data = windowing(data, original_columns)
lagged_data.to_csv(
    'data/processed_100ms/Scenario3_DisparoCargaNoFreqNoUwrappedLagged.csv',
    index=None,
)

## Scenario 4

In [None]:
def windowing_doubleph(data, columns, by=['code']):
    """Build one lagged feature row per interval from a two-interval window.

    For every group of ``data`` (grouped by ``by``) and every interval number
    from 2 up to the group's largest one, the rows belonging to that interval
    and to the one immediately before it are flattened into a single row.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``columns``, the grouping column(s) named in ``by`` and
        the columns ``'interval'``, ``'target'`` and ``'Time (s)'``.
    columns : list of str
        Signal columns to flatten into lagged features.
    by : list of str, default ``['code']``
        Grouping key(s) handed to ``DataFrame.groupby``. The default list is
        never mutated.

    Returns
    -------
    pandas.DataFrame
        One row per (group, interval). Feature columns are named
        ``'{col}-{lag}'``, where the highest lag is the first row of the
        window and lag 0 is its last row, in the order the rows appear in
        ``data``. They are followed by ``'code'`` (group key), ``'interval'``,
        ``'target'`` (taken from the first row of the group) and
        ``'pred_time'`` (``'Time (s)'`` of the last row of the current
        interval). Every row keeps index label 0, as before. An empty
        DataFrame is returned when no window can be built.

    Raises
    ------
    IndexError
        If a group has no rows for one of the interval numbers in the range.
    """
    # Collect the single-row frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies the accumulated frame every time.
    window_rows = []

    for code, group in tqdm(data.groupby(by)):
        # Recent pandas versions yield a 1-tuple key when grouping by a
        # one-element list; unwrap it so 'code' always holds the plain label.
        if isinstance(code, tuple) and len(code) == 1:
            code = code[0]

        intervals = group['interval']
        group_target = group.iloc[0]['target']

        for interval in range(2, int(intervals.max() + 1)):
            in_current = intervals == interval
            window = group.loc[in_current | (intervals == interval - 1), columns]

            lagged_columns = [
                f'{col}-{lag}'
                for col in columns
                for lag in range(len(window) - 1, -1, -1)
            ]
            # Column-major flattening keeps all lags of one signal contiguous,
            # matching the order of lagged_columns.
            df_row = pd.DataFrame(
                data=[window.values.flatten('F')],
                columns=lagged_columns,
                index=[0],
            )

            df_row['code'] = code
            df_row['interval'] = interval
            df_row['target'] = group_target
            df_row['pred_time'] = group.loc[in_current, 'Time (s)'].iloc[-1]

            window_rows.append(df_row)

    if not window_rows:
        return pd.DataFrame()
    return pd.concat(window_rows)

# Scenario 4 inputs: the three unwrapped angle columns, windowed over two
# consecutive intervals by windowing_doubleph.
original_columns = [
    'AngB1_unwrapped (degrees)',
    'AngB2_unwrapped (degrees)',
    'AngB3_unwrapped (degrees)',
]

lagged_data = windowing_doubleph(data, original_columns)
lagged_data.to_csv(
    'data/processed_100ms/Scenario4_DisparoCargaNoFreqDoublePHLagged.csv',
    index=None,
)

## Scenario 5

In [None]:
# Start from the Scenario 2 table written earlier in this notebook.
scenario_2_df = pd.read_csv('data/processed_100ms/Scenario2_DisparoCargaNoFreqLagged.csv')

# Take every second column out of three consecutive runs of 115 columns.
# NOTE(review): assumes each of the three signals occupies exactly 115
# consecutive lag columns in the Scenario 2 file - confirm against the
# windowing output.
features = []
for feat_idx in range(3):
    features += scenario_2_df.columns[115 * feat_idx:115 * (feat_idx + 1):2].tolist()

# The last four columns are carried over unchanged
# (presumably code / interval / target / pred_time - verify).
metadata = scenario_2_df.columns[-4:].tolist()

scenario_2_df[[*features, *metadata]].to_csv(
    'data/processed_100ms/Scenario5_DisparoGeneradorNoFreqHalfSampledLagged.csv',
    index=None,
)

## Scenario 6

In [None]:
def oversample(data, sigma=0.1, random_state=None):
    """Double a dataset by appending a copy with Gaussian noise on its features.

    Every column except the last four is treated as a feature; the last four
    columns are copied unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose leading columns are numeric features. It is not modified.
    sigma : float, default 0.1
        Standard deviation of the zero-mean noise added to the copy.
    random_state : None, int or numpy.random.Generator, default None
        ``None`` draws from NumPy's global random state, exactly as before,
        so an earlier ``np.random.seed`` call is still honoured. Any other
        value is passed to ``np.random.default_rng`` to make this call
        reproducible on its own.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the noisy copy, with a fresh
        0..2n-1 index.
    """
    mu = 0
    features = data.columns[:-4].tolist()
    noise_shape = data[features].shape

    if random_state is None:
        noise = np.random.normal(mu, sigma, noise_shape)
    else:
        noise = np.random.default_rng(random_state).normal(mu, sigma, noise_shape)

    data_copy = data.copy()
    data_copy[features] += noise

    return pd.concat((data, data_copy)).reset_index(drop=True)

# Scenario 6: Scenario 2 table doubled with a noisy copy (sigma = 0.1).
scenario_2_df = pd.read_csv('data/processed_100ms/Scenario2_DisparoCargaNoFreqLagged.csv')

# NOTE(review): no random seed is set in this cell, so the written file
# differs between runs unless the global NumPy RNG was seeded beforehand.
oversample(scenario_2_df, sigma=0.1).to_csv(
    'data/processed_100ms/Scenario6_DisparoGeneradorNoFreqOversampled2x1Lagged.csv',
    index=None,
)

## Scenario 7

In [None]:
# Scenario 7: same oversampling as Scenario 6 with smaller noise (sigma = 0.01).
scenario_2_df = pd.read_csv('data/processed_100ms/Scenario2_DisparoCargaNoFreqLagged.csv')

# NOTE(review): no random seed is set in this cell, so the written file
# differs between runs unless the global NumPy RNG was seeded beforehand.
oversample(scenario_2_df, sigma=0.01).to_csv(
    'data/processed_100ms/Scenario7_DisparoGeneradorNoFreqOversampled2x01Lagged.csv',
    index=None,
)

## Scenario 8, 9 & 10

In [None]:
# Scenarios 8, 9 and 10: the Scenario 2 table with one angle signal removed.
scenario_2_df = pd.read_csv('data/processed_100ms/Scenario2_DisparoCargaNoFreqLagged.csv')

# The last four columns are carried over unchanged; everything before them
# is a candidate feature column.
metadata = scenario_2_df.columns[-4:].tolist()
columns = scenario_2_df.columns[:-4]

for ang_idx in (1, 2, 3):
    # Drop every feature column whose name mentions the angle being removed.
    features = columns[~columns.str.contains(f'AngB{ang_idx}_unwrapped')]

    # The output files are numbered 8, 9 and 10 for angles 1, 2 and 3.
    scenario_2_df.loc[:, [*features, *metadata]].to_csv(
        f'data/processed_100ms/Scenario{7+ang_idx}_DisparoGeneradorNoFreqAngB{ang_idx}RemovedLagged.csv',
        index=None,
    )

# Feature engineering

In [None]:
from tsfresh import extract_relevant_features
from tsfresh.feature_extraction.settings import MinimalFCParameters

# Width of one analysis interval, in milliseconds (100 ms = 0.1 s).
interval_ms = 100

# Reload the raw recordings for the feature-engineering scenarios.
data = pd.read_csv('data/raw/DisparoGenerador.csv')

# Interval number = whole interval_ms-wide steps elapsed since Time (s) == 1.
# NOTE(review): floor division on floats is inexact at interval boundaries
# (0.5 // 0.1 evaluates to 4.0) - confirm boundary samples land where expected.
data['interval'] = data['Time (s)'].sub(1).floordiv(interval_ms / 1000)

# Keep only the samples whose interval number is below 6.
# NOTE(review): reindex() without arguments leaves the index unchanged;
# reset_index(drop=True) may have been intended.
data = data.loc[data['interval'] < 6].reindex()

In [None]:
# One tsfresh series id per (code, interval) pair, written as "<code>-<interval>".
# The interval comes from a float floor division, so its text form is e.g. "3.0".
data["id_col"] = data["code"].add("-").add(data["interval"].astype(str))

# Sort column, id column and the three unwrapped angle signals.
selected_columns = [
    'Time (s)',
    'id_col',
    'AngB1_unwrapped (degrees)',
    'AngB2_unwrapped (degrees)',
    'AngB3_unwrapped (degrees)',
]

tsfresh_scenario2_data = data.loc[:, selected_columns]

# One label per series id, indexed by that id.
# NOTE(review): assumes 'target' is constant within each id; otherwise the
# index would contain duplicate ids - verify.
target = (
    data.loc[:, ["id_col", "target"]]
    .drop_duplicates()
    .set_index('id_col')
    .loc[:, "target"]
)



In [None]:
# Extract and filter tsfresh features per series id, restricted to the
# MinimalFCParameters feature set.
extracted_features = extract_relevant_features(
    tsfresh_scenario2_data,
    target,
    column_id="id_col",
    column_sort="Time (s)",
    default_fc_parameters=MinimalFCParameters(),
)

In [None]:
# Turn the tsfresh index (the "<code>-<interval>" ids) into a regular column.
scenario11_data = extracted_features.reset_index()

# Recover code and interval from the id; the id is split only once.
id_parts = scenario11_data["index"].str.split("-")
scenario11_data["code"] = id_parts.str[0]
scenario11_data["interval"] = id_parts.str[1]

# Look each label up by its series id instead of copying target positionally,
# so the labels stay correct even if the feature rows are not in the same
# order as target.
scenario11_data["target"] = scenario11_data["index"].map(target)

scenario11_data = scenario11_data.drop("index", axis=1)

In [None]:
# Write the minimal-feature table; the output folder name embeds interval_ms.
scenario11_data.to_csv(
    f'data/processed_{interval_ms}ms/Scenario11_DisparoGeneradorMinumumFE.csv',
    index=None,
)

In [None]:
# Same extraction as for Scenario 11, but with tsfresh's default feature
# settings (no default_fc_parameters override).
extracted_features = extract_relevant_features(
    tsfresh_scenario2_data,
    target,
    column_id="id_col",
    column_sort="Time (s)",
)

In [None]:
# Turn the tsfresh index (the "<code>-<interval>" ids) into a regular column.
scenario12_data = extracted_features.reset_index()

# Recover code and interval from the id; the id is split only once.
id_parts = scenario12_data["index"].str.split("-")
scenario12_data["code"] = id_parts.str[0]
scenario12_data["interval"] = id_parts.str[1]

# Look each label up by its series id instead of copying target positionally,
# so the labels stay correct even if the feature rows are not in the same
# order as target.
scenario12_data["target"] = scenario12_data["index"].map(target)

scenario12_data = scenario12_data.drop("index", axis=1)

In [None]:
# Write the comprehensive-feature table; the output folder name embeds interval_ms.
scenario12_data.to_csv(
    f'data/processed_{interval_ms}ms/Scenario12_DisparoGeneradorComprehensiveFE.csv',
    index=None,
)