# NB 5 - Preprocessing / Feature Extraction

In [None]:
import os

import pandas as pd
from pyarrow import parquet as pq

import recipys.recipe, recipys.ingredients, recipys.step, recipys.selector

# Initial configuration

This notebook assumes that the combined cohort data file is available at `data/interim/combined.parquet`. The `path_to_data` variable below points to that directory relative to the notebook's working directory.

In [None]:
# Directory holding the interim data files, given relative to the current
# working directory. The cohort file is read from here and the extracted
# features are written back to the same directory.
path_to_data = "../data/interim"

# Load data

In [None]:
# Read the combined cohort parquet file with pyarrow and convert the resulting
# table into a pandas DataFrame.
combined_parquet_path = os.path.join(path_to_data, "combined.parquet")
combined_table = pq.read_table(combined_parquet_path)
eICU_cohort_data = combined_table.to_pandas()

In [None]:
# Show the loaded cohort table as the cell output for a quick visual check.
eICU_cohort_data

# Compute the historical mean, max, min and variance as features

In [None]:
# Every column other than `stay_id` and `time` is handed to the recipe as a
# predictor.
# NOTE(review): no grouping or sequence role is passed to the recipe, so the
# historical steps presumably accumulate over the whole table rather than per
# `stay_id` -- TODO confirm whether a group role should be declared here.
# NOTE(review): `hospitalid` and `uniquepid` stay in the predictor list even
# though the later column selection treats them as identifier columns; numeric
# ones would presumably receive derived statistics too -- verify this is wanted.
predictor_columns = eICU_cohort_data.drop(columns=['stay_id', 'time']).columns.tolist()
sepsis_recipe = recipys.recipe.Recipe(
    data=eICU_cohort_data,
    predictors=predictor_columns,
)
# Smaller alternative with only two predictors, handy for quick experiments:
# sepsis_recipe = recipys.recipe.Recipe(data=eICU_cohort_data, predictors=['resp', 'sbp'])
print(sepsis_recipe)

In [None]:
# Register one StepHistorical per accumulator on the recipe, in the order
# MEAN, MAX, MIN, VAR. Each step is built with its own freshly created
# numeric-predictor selector.
for accumulator in (
    recipys.step.Accumulator.MEAN,
    recipys.step.Accumulator.MAX,
    recipys.step.Accumulator.MIN,
    recipys.step.Accumulator.VAR,
):
    sepsis_step_to_add = recipys.step.StepHistorical(
        sel=recipys.selector.all_numeric_predictors(),
        fun=accumulator,
    )
    sepsis_recipe.add_step(sepsis_step_to_add)

In [None]:
# Run the recipe; as the last expression of the cell, the returned value is
# only displayed, not stored.
# NOTE(review): the column selection further below looks for the derived
# `_mean` / `_max` / `_min` / `_var` columns on `eICU_cohort_data`, which only
# works if bake() adds them to that frame in place -- TODO confirm against the
# installed recipys version; otherwise the returned frame must be captured.
sepsis_recipe.bake()

# Drop the raw clinical concepts and keep only the identifier columns and the derived features

In [None]:
# Identifier columns that are carried over unchanged.
identifier_columns = ['stay_id', 'time', 'hospitalid', 'uniquepid']
# Derived columns are recognised by their name suffix (_mean, _max, _min, _var).
derived_feature_columns = eICU_cohort_data.filter(
    regex='_mean$|_max$|_min$|_var$'
).columns.tolist()
columns_to_keep = identifier_columns + derived_feature_columns

In [None]:
# Restrict the cohort table to the selected identifier and feature columns.
feature_data = eICU_cohort_data.loc[:, columns_to_keep]

# Save feature data to parquet file

In [None]:
# Write the feature table next to the input data as `features.parquet`.
features_parquet_path = os.path.join(path_to_data, "features.parquet")
feature_data.to_parquet(features_parquet_path)