#### Dataset
https://www.kaggle.com/c/ventilator-pressure-prediction/data

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, RobustScaler, normalize
# from sklearn.model_selection import GroupKFold
# from sklearn import metrics

# import time
# import lightgbm as lgb

import warnings
# Suppress FutureWarning messages only; every other warning category still shows.
warnings.simplefilter(action='ignore', category=FutureWarning)
# warnings.filterwarnings("ignore")

# Remove the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)



In [53]:
# General
from IPython.display import display
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import os
import glob
import random
import gc
gc.enable()
pd.set_option('display.max_columns', None)

# Utils
from sklearn import preprocessing
# Deep Learning
import tensorflow as tf
from tensorflow import keras
# Metrics
from sklearn.metrics import mean_absolute_error

# Seed shared by every random number generator used in this notebook.
RANDOM_SEED = 42

def seed_everything(seed=RANDOM_SEED):
    """Seed Python hashing, NumPy, the `random` module and TensorFlow.

    Args:
        seed: Integer seed applied to every generator. Defaults to RANDOM_SEED.

    NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
    so exporting it here presumably only influences child processes — confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Same seeding order as before: NumPy, then `random`, then TensorFlow.
    for set_seed in (np.random.seed, random.seed, tf.random.set_seed):
        set_seed(seed)
    
seed_everything()

In [54]:
# Read the train and test CSVs (paths are relative to the working directory).
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

### Feature Engineering

In [55]:
# Sorted distinct pressure values observed in the training data.
all_pressure = np.sort(train_df['pressure'].unique())
# Smallest and largest observed pressure, as plain Python scalars.
PRESSURE_MIN, PRESSURE_MAX = all_pressure[0].item(), all_pressure[-1].item()
# Gap between the two smallest distinct pressure values.
PRESSURE_STEP = (all_pressure[1] - all_pressure[0]).item()

In [56]:
def add_features(df):
    """Return a new frame with per-breath engineered features added.

    The input must contain the columns ``breath_id``, ``time_step``, ``u_in``,
    ``u_out``, ``R`` and ``C``. The caller's frame is not modified.

    Features added (all grouped by ``breath_id``):
      * ``area``: cumulative sum of ``time_step * u_in``.
      * ``u_in_cumsum``: cumulative sum of ``u_in``.
      * ``u_*_lag{n}`` / ``u_*_lag_back{n}`` for n = 1..4: values shifted by
        ``n`` rows backward / forward within the breath.
      * ``breath_id__u_*__max``: per-breath maximum of ``u_in`` / ``u_out``.
      * ``u_*_diff{n}``: current value minus its ``n``-step lag.
      * ``breath_id__u_in__diffmax`` / ``diffmean``: per-breath max / mean of
        ``u_in`` minus the current ``u_in``.
      * ``cross`` / ``cross2``: ``u_in * u_out`` and ``time_step * u_out``.
      * One-hot columns for ``R``, ``C`` and their combination ``R__C``.

    Every NaN in the frame (including the ones the shifts create at the edges
    of a breath) is replaced by 0 before the difference features are computed.

    Note: the one-hot columns come from ``pd.get_dummies`` and therefore depend
    on the values present in ``df``; two frames with different R/C values get
    different output columns.

    Args:
        df: pandas DataFrame with the columns listed above.

    Returns:
        A new DataFrame with the original ``R`` and ``C`` columns replaced by
        their one-hot encodings and all engineered features added.
    """
    # Copy first: the column assignments below would otherwise leak the
    # intermediate features into the caller's frame.
    df = df.copy()
    shifts = (1, 2, 3, 4)

    # Running totals within each breath.
    df['area'] = (df['time_step'] * df['u_in']).groupby(df['breath_id']).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(df['breath_id']).cumsum()

    # Lag (past) and "lag_back" (future) values; shifts never cross a breath.
    grouped = {col: df.groupby('breath_id')[col] for col in ('u_in', 'u_out')}
    for n in shifts:
        for suffix, periods in ((f'lag{n}', n), (f'lag_back{n}', -n)):
            for col in ('u_in', 'u_out'):
                df[f'{col}_{suffix}'] = grouped[col].shift(periods)

    # Shifts leave NaN at breath boundaries; treat missing neighbours as 0.
    df = df.fillna(0)

    # Regroup on the filled frame so the aggregates see the same values the
    # difference features do.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    df['breath_id__u_in__max'] = u_in_by_breath.transform('max')
    df['breath_id__u_out__max'] = df.groupby('breath_id')['u_out'].transform('max')

    def _add_lag_diffs(n):
        # Current value minus the value n steps earlier in the same breath.
        df[f'u_in_diff{n}'] = df['u_in'] - df[f'u_in_lag{n}']
        df[f'u_out_diff{n}'] = df['u_out'] - df[f'u_out_lag{n}']

    # The insertion order below keeps the output column order stable.
    _add_lag_diffs(1)
    _add_lag_diffs(2)
    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']
    _add_lag_diffs(3)
    _add_lag_diffs(4)

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Treat R and C as categorical labels and add their combination.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']
    return pd.get_dummies(df)

In [57]:
# Build the feature tables.
# NOTE: this cell overwrites train_df/test_df, and add_features consumes the raw
# R and C columns, so re-running it requires reloading the CSVs first.
train_df = add_features(train_df)
test_df = add_features(test_df)

# add_features one-hot encodes each frame independently, so a category present
# in only one of them would silently yield different feature columns.
# Fail fast here; 'pressure' is excluded because test_df does not have it.
feature_mismatch = (
    train_df.columns.drop('pressure', errors='ignore')
    .symmetric_difference(test_df.columns)
)
assert feature_mismatch.empty, (
    f"train/test feature columns differ: {list(feature_mismatch)}"
)

display(train_df.head())
print(train_df.shape)
print(test_df.shape)

Unnamed: 0,id,breath_id,time_step,u_in,u_out,pressure,area,u_in_cumsum,u_in_lag1,u_out_lag1,u_in_lag_back1,u_out_lag_back1,u_in_lag2,u_out_lag2,u_in_lag_back2,u_out_lag_back2,u_in_lag3,u_out_lag3,u_in_lag_back3,u_out_lag_back3,u_in_lag4,u_out_lag4,u_in_lag_back4,u_out_lag_back4,breath_id__u_in__max,breath_id__u_out__max,u_in_diff1,u_out_diff1,u_in_diff2,u_out_diff2,breath_id__u_in__diffmax,breath_id__u_in__diffmean,u_in_diff3,u_out_diff3,u_in_diff4,u_out_diff4,cross,cross2,R_20,R_5,R_50,C_10,C_20,C_50,R__C_20__10,R__C_20__20,R__C_20__50,R__C_50__10,R__C_50__20,R__C_50__50,R__C_5__10,R__C_5__20,R__C_5__50
0,1,1,0.0,0.083334,0,5.837492,0.0,0.083334,0.0,0.0,18.383041,0.0,0.0,0.0,22.509278,0.0,0.0,0.0,22.808822,0.0,0.0,0.0,25.35585,0.0,28.313036,1,0.083334,0.0,0.083334,0.0,28.229702,10.062673,0.083334,0.0,0.083334,0.0,0.0,0.0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0
1,2,1,0.033652,18.383041,0,5.907794,0.618632,18.466375,0.083334,0.0,22.509278,0.0,0.0,0.0,22.808822,0.0,0.0,0.0,25.35585,0.0,0.0,0.0,27.259866,0.0,28.313036,1,18.299707,0.0,18.383041,0.0,9.929994,-8.237035,18.383041,0.0,18.383041,0.0,0.0,0.0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0
2,3,1,0.067514,22.509278,0,7.876254,2.138333,40.975653,18.383041,0.0,22.808822,0.0,0.083334,0.0,25.35585,0.0,0.0,0.0,27.259866,0.0,0.0,0.0,27.127486,0.0,28.313036,1,4.126236,0.0,22.425944,0.0,5.803758,-12.363271,22.509278,0.0,22.509278,0.0,0.0,0.0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0
3,4,1,0.101542,22.808822,0,11.742872,4.454391,63.784476,22.509278,0.0,25.35585,0.0,18.383041,0.0,27.259866,0.0,0.083334,0.0,27.127486,0.0,0.0,0.0,26.807732,0.0,28.313036,1,0.299544,0.0,4.425781,0.0,5.504214,-12.662816,22.725488,0.0,22.808822,0.0,0.0,0.0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0
4,5,1,0.135756,25.35585,0,12.234987,7.896588,89.140326,22.808822,0.0,27.259866,0.0,22.509278,0.0,27.127486,0.0,18.383041,0.0,26.807732,0.0,0.083334,0.0,27.864715,0.0,28.313036,1,2.547028,0.0,2.846573,0.0,2.957185,-15.209844,6.972809,0.0,25.272516,0.0,0.0,0.0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0


(6036000, 53)
(4024000, 52)


### Some references (non-exhaustive):
01. https://www.kaggle.com/shivamb/how-autoencoders-work-intro-and-usecases
02. https://machinelearningmastery.com/autoencoder-for-regression/
03. https://towardsdatascience.com/applied-deep-learning-part-3-autoencoders-1c083af4d798
04. https://www.kaggle.com/residentmario/autoencoders