In [2]:
import numpy as np
import pandas as pd

import optuna

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split, GroupKFold, KFold

from IPython.display import display

from sklearn.preprocessing import OrdinalEncoder


In [3]:
# Read the two input tables used by the rest of the notebook:
# the training file (`df`) and the test file (`test_df`).
df = pd.read_csv(
    filepath_or_buffer="../../dataset/kaggle_ventilator/train_5folds.csv",
)
test_df = pd.read_csv(
    filepath_or_buffer="../../dataset/kaggle_ventilator/test.csv",
)

In [4]:
def add_features(df, transform="robust"):
    """Return a copy of ``df`` extended with per-breath engineered features.

    The input frame is NOT modified; all work happens on a copy.

    Features added (in this column order):
      * ``area``: per-breath cumulative sum of ``time_step * u_in``.
      * ``u_in_cumsum``: per-breath cumulative sum of ``u_in``.
      * ``u_in_lag{k}`` / ``u_out_lag{k}`` / ``u_in_lag_back{k}`` /
        ``u_out_lag_back{k}`` for k = 1..4: the value k rows earlier
        (``lag``) or k rows later (``lag_back``) within the same breath.
      * ``breath_id__u_in__max`` / ``breath_id__u_out__max``: per-breath max.
      * ``u_in_diff{k}`` / ``u_out_diff{k}`` for k = 1..4: current value minus
        the (zero-filled) lag-k value.
      * ``breath_id__u_in__diffmax`` / ``breath_id__u_in__diffmean``:
        per-breath max / mean of ``u_in`` minus the current ``u_in``.
      * ``cross`` = ``u_in * u_out`` and ``cross2`` = ``time_step * u_out``.

    ``R`` and ``C`` are replaced by ordinal codes (R: 5/20/50 -> 0/1/2,
    C: 10/20/50 -> 0/1/2).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``,
        ``R`` and ``C``.
    transform : str, optional
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the features above.
        Every NaN present after the lag step (in any column) is replaced by 0.
    """
    # Copy first: the column assignments below would otherwise leak the
    # intermediate (NaN-containing) lag columns into the caller's frame.
    df = df.copy()

    breath = df['breath_id']
    df['area'] = (df['time_step'] * df['u_in']).groupby(breath).cumsum()
    df['u_in_cumsum'] = df['u_in'].groupby(breath).cumsum()

    # Backward (lag) and forward (lag_back) shifts within each breath.
    u_in_by_breath = df['u_in'].groupby(breath)
    u_out_by_breath = df['u_out'].groupby(breath)
    for step in (1, 2, 3, 4):
        df[f'u_in_lag{step}'] = u_in_by_breath.shift(step)
        df[f'u_out_lag{step}'] = u_out_by_breath.shift(step)
        df[f'u_in_lag_back{step}'] = u_in_by_breath.shift(-step)
        df[f'u_out_lag_back{step}'] = u_out_by_breath.shift(-step)

    # Shifts leave NaN at breath boundaries; zero-fill the whole frame so the
    # diff features below treat "no neighbour" as 0.
    df = df.fillna(0)

    # Re-group on the filled frame so the aggregates see the filled values.
    u_in = df['u_in']
    u_in_by_breath = u_in.groupby(df['breath_id'])
    u_in_max = u_in_by_breath.transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = (
        df['u_out'].groupby(df['breath_id']).transform('max')
    )

    # The diff features are emitted in two batches only to keep the output
    # column order stable for anything that reads the saved files.
    for step in (1, 2):
        df[f'u_in_diff{step}'] = u_in - df[f'u_in_lag{step}']
        df[f'u_out_diff{step}'] = df['u_out'] - df[f'u_out_lag{step}']

    df['breath_id__u_in__diffmax'] = u_in_max - u_in
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - u_in

    for step in (3, 4):
        df[f'u_in_diff{step}'] = u_in - df[f'u_in_lag{step}']
        df[f'u_out_diff{step}'] = df['u_out'] - df[f'u_out_lag{step}']

    df['cross'] = u_in * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Ordinal-encode R and C. Series.map yields NaN for any value that is
    # not a key of the mapping.
    r_map = {5: 0, 20: 1, 50: 2}
    c_map = {10: 0, 20: 1, 50: 2}
    df['R'] = df['R'].map(r_map)
    df['C'] = df['C'].map(c_map)

    return df


In [5]:
# Run the same feature pipeline over the training and the test frame.
train = add_features(df=df)
test = add_features(df=test_df)

In [7]:
# Scaler instance (default settings) used by the scaling cell further down to
# rescale the `cont_seq_cols` columns of `train` and `test`.
RS = RobustScaler()

In [8]:
# Columns that get passed through the scaler. R and C are not listed here,
# so they are left unscaled.
cont_seq_cols = [
    # raw signals
    'time_step',
    'u_in',
    'u_out',
    # cumulative features
    'area',
    'u_in_cumsum',
    # shift by 1 (backward, then forward)
    'u_in_lag1', 'u_out_lag1', 'u_in_lag_back1', 'u_out_lag_back1',
    # shift by 2
    'u_in_lag2', 'u_out_lag2', 'u_in_lag_back2', 'u_out_lag_back2',
    # shift by 3
    'u_in_lag3', 'u_out_lag3', 'u_in_lag_back3', 'u_out_lag_back3',
    # shift by 4
    'u_in_lag4', 'u_out_lag4', 'u_in_lag_back4', 'u_out_lag_back4',
    # per-breath maxima
    'breath_id__u_in__max',
    'breath_id__u_out__max',
    # differences against lag 1 and lag 2
    'u_in_diff1', 'u_out_diff1',
    'u_in_diff2', 'u_out_diff2',
    # distance of u_in from the per-breath max / mean
    'breath_id__u_in__diffmax',
    'breath_id__u_in__diffmean',
    # differences against lag 3 and lag 4
    'u_in_diff3', 'u_out_diff3',
    'u_in_diff4', 'u_out_diff4',
    # interaction terms
    'cross',
    'cross2',
]

In [9]:
# Learn the centering/scaling statistics from the training features only,
# then apply those same statistics to the test features. Calling
# fit_transform on the test frame would refit the scaler and put the two
# sets on different scales.
train[cont_seq_cols] = RS.fit_transform(train[cont_seq_cols])
test[cont_seq_cols] = RS.transform(test[cont_seq_cols])

In [11]:
# Write both processed frames to disk; index=False leaves the row index
# out of the CSV files.
train.to_csv(
    path_or_buf="../../dataset/kaggle_ventilator/train_5folds_nb1_robust.csv",
    index=False,
)
test.to_csv(
    path_or_buf="../../dataset/kaggle_ventilator/test_nb1_robust.csv",
    index=False,
)

In [12]:
# Show the scaled column names in alphabetical order. This is left as a bare
# expression so the notebook renders the resulting list.
sorted(cont_seq_cols)

['area',
 'breath_id__u_in__diffmax',
 'breath_id__u_in__diffmean',
 'breath_id__u_in__max',
 'breath_id__u_out__max',
 'cross',
 'cross2',
 'time_step',
 'u_in',
 'u_in_cumsum',
 'u_in_diff1',
 'u_in_diff2',
 'u_in_diff3',
 'u_in_diff4',
 'u_in_lag1',
 'u_in_lag2',
 'u_in_lag3',
 'u_in_lag4',
 'u_in_lag_back1',
 'u_in_lag_back2',
 'u_in_lag_back3',
 'u_in_lag_back4',
 'u_out',
 'u_out_diff1',
 'u_out_diff2',
 'u_out_diff3',
 'u_out_diff4',
 'u_out_lag1',
 'u_out_lag2',
 'u_out_lag3',
 'u_out_lag4',
 'u_out_lag_back1',
 'u_out_lag_back2',
 'u_out_lag_back3',
 'u_out_lag_back4']