In [1]:
import numpy as np
import pandas as pd

import optuna

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split, GroupKFold, KFold

from IPython.display import display

from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
df = pd.read_csv("/home/koga/workspace/dataset/kaggle_ventilator/train_5folds.csv")
test_df = pd.read_csv("/home/koga/workspace/dataset/kaggle_ventilator/test.csv")

In [3]:
df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold
0,1,1,20,50,0.0,0.083334,0,5.837492,4
1,2,1,20,50,0.033652,18.383041,0,5.907794,4
2,3,1,20,50,0.067514,22.509278,0,7.876254,4
3,4,1,20,50,0.101542,22.808822,0,11.742872,4
4,5,1,20,50,0.135756,25.35585,0,12.234987,4


In [4]:
def add_features(df):

    df["u_in_log1p"] = np.log1p(df["u_in"])
    df["u_in_power"] = np.power(df["u_in"], 2)
    df["u_in_round2"] = np.round(df["u_in"], 2)

    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()
    
    df['u_in_lag1'] = df.groupby('breath_id')['u_in'].shift(1)
    df['u_out_lag1'] = df.groupby('breath_id')['u_out'].shift(1)
    df['u_in_lag_back1'] = df.groupby('breath_id')['u_in'].shift(-1)
    df['u_out_lag_back1'] = df.groupby('breath_id')['u_out'].shift(-1)
    df['u_in_lag2'] = df.groupby('breath_id')['u_in'].shift(2)
    df['u_out_lag2'] = df.groupby('breath_id')['u_out'].shift(2)
    df['u_in_lag_back2'] = df.groupby('breath_id')['u_in'].shift(-2)
    df['u_out_lag_back2'] = df.groupby('breath_id')['u_out'].shift(-2)
    df['u_in_lag3'] = df.groupby('breath_id')['u_in'].shift(3)
    df['u_out_lag3'] = df.groupby('breath_id')['u_out'].shift(3)
    df['u_in_lag_back3'] = df.groupby('breath_id')['u_in'].shift(-3)
    df['u_out_lag_back3'] = df.groupby('breath_id')['u_out'].shift(-3)
    df['u_in_lag4'] = df.groupby('breath_id')['u_in'].shift(4)
    df['u_out_lag4'] = df.groupby('breath_id')['u_out'].shift(4)
    df['u_in_lag_back4'] = df.groupby('breath_id')['u_in'].shift(-4)
    df['u_out_lag_back4'] = df.groupby('breath_id')['u_out'].shift(-4)
    df = df.fillna(0)
    
    # diff
    df['u_in_diff1'] = df['u_in'] - df['u_in_lag1']
    df['u_out_diff1'] = df['u_out'] - df['u_out_lag1']
    df['u_in_diff2'] = df['u_in'] - df['u_in_lag2']
    df['u_out_diff2'] = df['u_out'] - df['u_out_lag2']
    df['u_in_diff3'] = df['u_in'] - df['u_in_lag3']
    df['u_out_diff3'] = df['u_out'] - df['u_out_lag3']
    df['u_in_diff4'] = df['u_in'] - df['u_in_lag4']
    df['u_out_diff4'] = df['u_out'] - df['u_out_lag4']

    # rolling
    df[f'windowmean5_u_in'] = df.groupby('breath_id')['u_in'].rolling(window=5, min_periods=1).mean().reset_index(drop=True)
    df[f'windowmean10_u_in'] = df.groupby('breath_id')['u_in'].rolling(window=10, min_periods=1).mean().reset_index(drop=True)

    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')
    
    df['breath_id__u_in__diffmax'] = df.groupby(['breath_id'])['u_in'].transform('max') - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']

    df['cross']= df['u_in']*df['u_out']
    df['cross2']= df['time_step']*df['u_out']
    
    r_map = {5: 0, 20: 1, 50: 2}
    c_map = {10: 0, 20: 1, 50: 2}
    df['R'] = df['R'].map(r_map)
    df['C'] = df['C'].map(c_map)
    
    return df



In [5]:
new_df = add_features(df)

In [16]:
not_used = ["id", "breath_id", "R", "C", "pressure", "kfold"]
used_columns = list(set(list(new_df.columns)) - set(not_used))
used_columns

['u_in_diff3',
 'cross',
 'u_out_lag1',
 'u_in_power',
 'u_out_lag_back2',
 'breath_id__u_in__max',
 'u_out_lag4',
 'u_in_lag_back3',
 'u_out_diff2',
 'breath_id__u_in__diffmax',
 'time_step',
 'u_out_diff1',
 'u_in_lag1',
 'u_in_log1p',
 'u_out_diff4',
 'u_out_lag_back1',
 'u_in_lag_back2',
 'u_in_lag_back1',
 'u_out_lag2',
 'u_in_diff2',
 'u_in_cumsum',
 'u_out_lag3',
 'u_out_lag_back4',
 'u_out_lag_back3',
 'u_in_diff1',
 'breath_id__u_out__max',
 'u_in_round2',
 'windowmean10_u_in',
 'u_in',
 'u_in_lag2',
 'u_in_lag3',
 'u_in_diff4',
 'breath_id__u_in__diffmean',
 'u_in_lag4',
 'windowmean5_u_in',
 'u_out',
 'u_in_lag_back4',
 'cross2',
 'area',
 'u_out_diff3']

In [17]:
RS = RobustScaler()
new_df[used_columns] = RS.fit_transform(new_df[used_columns])

In [18]:
new_df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold,u_in_log1p,u_in_power,u_in_round2,area,u_in_cumsum,u_in_lag1,u_out_lag1,u_in_lag_back1,u_out_lag_back1,u_in_lag2,u_out_lag2,u_in_lag_back2,u_out_lag_back2,u_in_lag3,u_out_lag3,u_in_lag_back3,u_out_lag_back3,u_in_lag4,u_out_lag4,u_in_lag_back4,u_out_lag_back4,u_in_diff1,u_out_diff1,u_in_diff2,u_out_diff2,u_in_diff3,u_out_diff3,u_in_diff4,u_out_diff4,windowmean5_u_in,windowmean10_u_in,breath_id__u_in__max,breath_id__u_out__max,breath_id__u_in__diffmax,breath_id__u_in__diffmean,cross,cross2
0,1,1,1,2,-0.989052,-0.937384,-1.0,5.837492,4,-1.100643,-0.779094,-0.938998,-0.516581,-0.725228,-0.864121,-1.0,2.890207,-1.0,-0.842527,-1.0,3.664026,-1.0,-0.819258,-1.0,3.743915,-1.0,-0.794152,-1.0,4.277824,-1.0,0.471409,0.0,0.217993,0.0,0.121028,0.0,0.068315,0.0,-0.920162,-0.771404,0.112208,0.0,0.326941,1.273864,0.0,-0.665541
1,2,1,1,2,-0.963608,3.049278,-1.0,5.907794,4,0.878825,12.91099,3.04793,-0.514031,-0.676829,-0.84736,-1.0,3.739712,-1.0,-0.842527,-1.0,3.724218,-1.0,-0.819258,-1.0,4.256189,-1.0,-0.794152,-1.0,4.661047,-1.0,113.467773,0.0,58.76267,0.0,38.091005,0.0,26.30625,0.0,1.015117,0.858288,0.112208,0.0,-0.178436,-1.450205,0.0,-0.665541
2,3,1,1,2,-0.938006,3.948195,-1.0,7.876254,4,1.011274,19.746583,3.947712,-0.507768,-0.617568,2.83314,-1.0,3.801381,-1.0,-0.825784,-1.0,4.236036,-1.0,-0.819258,-1.0,4.639136,-1.0,-0.794152,-1.0,4.634403,-1.0,25.549576,0.0,71.696779,0.0,46.652513,0.0,32.222406,0.0,1.951123,1.646495,0.112208,0.0,-0.292389,-2.06443,0.0,-0.665541
3,4,1,1,2,-0.912278,4.013452,-1.0,11.742872,4,1.019963,20.29652,4.013072,-0.498222,-0.557517,3.663022,-1.0,4.32576,-1.0,2.850764,-1.0,4.618643,-1.0,-0.802501,-1.0,4.612511,-1.0,-0.794152,-1.0,4.570046,-1.0,1.812565,0.0,14.110409,0.0,47.101127,0.0,32.65189,0.0,2.434965,2.053937,0.112208,0.0,-0.300662,-2.10902,0.0,-0.665541
4,5,1,1,2,-0.886409,4.568332,-1.0,12.234987,4,1.089713,25.266363,4.568627,-0.484036,-0.490761,3.723268,-1.0,4.717757,-1.0,3.679756,-1.0,4.592042,-1.0,2.877119,-1.0,4.5482,-1.0,-0.777379,-1.0,4.782785,-1.0,15.75373,0.0,9.058184,0.0,14.415967,0.0,36.184309,0.0,2.833014,2.389133,0.112208,0.0,-0.371002,-2.488167,0.0,-0.665541


In [19]:
new_df.to_csv("../../dataset/kaggle_ventilator/train_5folds_nb4_robust.csv", index=False)