In [32]:
import numpy as np
import pandas as pd

import optuna

from sklearn.metrics import mean_absolute_error as mae
from sklearn.preprocessing import RobustScaler, normalize
from sklearn.model_selection import train_test_split, GroupKFold, KFold

from IPython.display import display

from sklearn.preprocessing import OrdinalEncoder
import category_encoders as ce
from sklearn.preprocessing import KBinsDiscretizer

# Raise pandas' display limits so the wide engineered-feature frames shown
# later are printed without row/column truncation. set_option takes the
# options as consecutive (name, value) pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

In [19]:
# Load the training rows (this CSV already carries a `kfold` column, as the
# preview below shows) and the test rows.
# NOTE(review): both paths are absolute and user-specific, while the export
# cell further down writes to a relative path; consider one configurable
# data directory so the notebook runs on other machines.
df = pd.read_csv("/home/koga/workspace/dataset/kaggle_ventilator/train_5folds.csv")
test_df = pd.read_csv("/home/koga/workspace/dataset/kaggle_ventilator/test.csv")

In [20]:
# Preview the first rows of the training frame as loaded.
df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold
0,1,1,20,50,0.0,0.083334,0,5.837492,4
1,2,1,20,50,0.033652,18.383041,0,5.907794,4
2,3,1,20,50,0.067514,22.509278,0,7.876254,4
3,4,1,20,50,0.101542,22.808822,0,11.742872,4
4,5,1,20,50,0.135756,25.35585,0,12.234987,4


In [28]:
def add_features(df):
    """Return a copy of ``df`` extended with per-breath engineered features.

    The input frame is not modified. Features are added in this order:
    point-wise transforms of ``u_in``; per-breath cumulative sums; forward and
    backward lags of ``u_in``/``u_out`` (1 to 4 rows); differences against the
    backward-looking lags; per-breath rolling means of ``u_in``; per-breath
    max/mean statistics; two interaction terms. Finally ``R`` and ``C`` are
    replaced by ordinal codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R``
        and ``C``. The index must be unique (rolling results are aligned on
        it). Lags, cumulative sums and rolling windows follow the row order
        inside each breath, so rows are presumably expected to be sorted by
        time within a breath -- TODO confirm with the data source.

    Returns
    -------
    pandas.DataFrame
        New frame holding the original columns plus the features. Every NaN
        present after the lag step (lag padding at breath boundaries, but also
        any NaN already in the input or produced by ``log1p``) is replaced
        by 0. ``R`` / ``C`` values missing from the code tables become NaN.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Point-wise transforms of u_in.
    # NOTE: log1p is NaN for u_in < -1 (possible when u_in was standardized
    # beforehand); such NaNs are zero-filled together with the lag padding.
    df["u_in_log1p"] = np.log1p(df["u_in"])
    df["u_in_power"] = np.power(df["u_in"], 2)
    df["u_in_round2"] = np.round(df["u_in"], 2)

    # Running per-breath sums of time_step * u_in and of u_in itself.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()

    # Lags within a breath: "lagK" looks K rows back, "lag_backK" K rows ahead.
    # Loop nesting reproduces the column order lag1, lag_back1, lag2, ...
    lag_steps = (1, 2, 3, 4)
    signals = ('u_in', 'u_out')
    for k in lag_steps:
        for offset, suffix in ((k, f'lag{k}'), (-k, f'lag_back{k}')):
            for col in signals:
                df[f'{col}_{suffix}'] = df.groupby('breath_id')[col].shift(offset)

    # Rows shifted in from outside a breath are NaN; treat them as 0.
    # This fills NaN in every column of the frame, not only the lag columns.
    df = df.fillna(0)

    # Differences between the current value and each backward-looking lag.
    for k in lag_steps:
        for col in signals:
            df[f'{col}_diff{k}'] = df[col] - df[f'{col}_lag{k}']

    # Per-breath rolling means of u_in. groupby().rolling() returns a
    # (breath_id, original index) MultiIndex; dropping only the breath_id level
    # keeps the original row labels so the assignment aligns on the index and
    # stays correct when breaths are interleaved or the index is not 0..n-1.
    for window in (5, 10):
        df[f'windowmean{window}_u_in'] = (
            df.groupby('breath_id')['u_in']
            .rolling(window=window, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
        )

    # Per-breath summary statistics broadcast back to every row.
    u_in_max = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')

    df['breath_id__u_in__diffmax'] = u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']

    # Interaction terms with the u_out flag.
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Replace the R and C values by ordinal codes 0/1/2.
    r_map = {5: 0, 20: 1, 50: 2}
    c_map = {10: 0, 20: 1, 50: 2}
    df['R'] = df['R'].map(r_map)
    df['C'] = df['C'].map(c_map)

    return df


# NOTE(review): this definition shares its name with the `normalize` imported
# from sklearn.preprocessing in the import cell; whichever of the two cells
# was executed last decides what the name `normalize` refers to.
def normalize(train, test=None):
    """Standardize the ``u_in`` column using the training frame's statistics.

    The mean and standard deviation of ``train.u_in`` are printed and then used
    to rescale ``u_in`` in ``train`` and, when given, in ``test``. Both frames
    are modified in place; the same objects are returned.

    Parameters
    ----------
    train : frame with a ``u_in`` column; supplies the mean and std.
    test : optional frame with a ``u_in`` column, scaled with the train stats.

    Returns
    -------
    tuple
        ``(train, test)``; ``test`` is ``None`` when it was not supplied.
    """
    u_in_mean = train["u_in"].mean()
    u_in_std = train["u_in"].std()
    print(u_in_mean, u_in_std)

    train["u_in"] = (train["u_in"] - u_in_mean) / u_in_std
    if test is None:
        return train, test

    # The test frame is scaled with the *training* statistics on purpose.
    test["u_in"] = (test["u_in"] - u_in_mean) / u_in_std
    return train, test


In [23]:
# Standardize u_in in both frames with the training mean/std (printed below).
# NOTE: normalize overwrites u_in, so re-running this cell standardizes the
# already standardized values a second time; reload the CSVs before re-running.
# NOTE(review): `normalize` is also imported from sklearn.preprocessing in the
# import cell; this call needs the notebook's own definition to be the one
# bound to the name.
df, test_df = normalize(df, test_df)

7.321614728628858 13.434701105129037


In [29]:
# Build the engineered feature frame from the training data. u_in has already
# been standardized at this point, so every u_in-derived feature (log1p,
# square, lags, rolling means, ...) is computed on standardized values, as the
# preview below shows.
new_df = add_features(df)

In [35]:
# Preview the first rows of the engineered feature frame.
new_df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold,u_in_log1p,u_in_power,u_in_round2,area,u_in_cumsum,u_in_lag1,u_out_lag1,u_in_lag_back1,u_out_lag_back1,u_in_lag2,u_out_lag2,u_in_lag_back2,u_out_lag_back2,u_in_lag3,u_out_lag3,u_in_lag_back3,u_out_lag_back3,u_in_lag4,u_out_lag4,u_in_lag_back4,u_out_lag_back4,u_in_diff1,u_out_diff1,u_in_diff2,u_out_diff2,u_in_diff3,u_out_diff3,u_in_diff4,u_out_diff4,windowmean5_u_in,windowmean10_u_in,breath_id__u_in__max,breath_id__u_out__max,breath_id__u_in__diffmax,breath_id__u_in__diffmean,cross,cross2
0,1,1,1,2,0.0,-0.538775,0,5.837492,4,-0.773869,0.290278,-0.54,0.0,-0.538775,0.0,0.0,0.823347,0.0,0.0,0.0,1.13048,0.0,0.0,0.0,1.152776,0.0,0.0,0.0,1.342362,0.0,-0.538775,0.0,-0.538775,0.0,-0.538775,0.0,-0.538775,0.0,-0.538775,-0.538775,1.562478,1,2.101253,0.749006,-0.0,0.0
1,2,1,1,2,0.033652,0.823347,0,5.907794,4,0.600674,0.677901,0.82,0.027708,0.284572,-0.538775,0.0,1.13048,0.0,0.0,0.0,1.152776,0.0,0.0,0.0,1.342362,0.0,0.0,0.0,1.484086,0.0,1.362122,0.0,0.823347,0.0,0.823347,0.0,0.823347,0.0,0.142286,0.142286,1.562478,1,0.73913,-0.613116,0.0,0.0
2,3,1,1,2,0.067514,1.13048,0,7.876254,4,0.756347,1.277985,1.13,0.104031,1.415053,0.823347,0.0,1.152776,0.0,-0.538775,0.0,1.342362,0.0,0.0,0.0,1.484086,0.0,0.0,0.0,1.474232,0.0,0.307133,0.0,1.669255,0.0,1.13048,0.0,1.13048,0.0,0.471684,0.471684,1.562478,1,0.431998,-0.920249,0.0,0.0
3,4,1,1,2,0.101542,1.152776,0,11.742872,4,0.766758,1.328894,1.15,0.221087,2.567829,1.13048,0.0,1.342362,0.0,0.823347,0.0,1.484086,0.0,-0.538775,0.0,1.474232,0.0,0.0,0.0,1.450432,0.0,0.022296,0.0,0.329429,0.0,1.691551,0.0,1.152776,0.0,0.641957,0.641957,1.562478,1,0.409701,-0.942545,0.0,0.0
4,5,1,1,2,0.135756,1.342362,0,12.234987,4,0.85116,1.801936,1.34,0.40332,3.910191,1.152776,0.0,1.484086,0.0,1.13048,0.0,1.474232,0.0,0.823347,0.0,1.450432,0.0,-0.538775,0.0,1.529107,0.0,0.189586,0.0,0.211882,0.0,0.519015,0.0,1.881137,0.0,0.782038,0.782038,1.562478,1,0.220115,-1.132131,0.0,0.0


In [38]:
# Persist the engineered training frame as CSV, without the index column.
# NOTE(review): this path is relative to the notebook's working directory,
# whereas the inputs were read from an absolute path; only the training frame
# is exported in the cells shown here.
new_df.to_csv("../../dataset/kaggle_ventilator/train_5folds_nb3_normalize.csv", index=False)

In [40]:
# List every column of the exported feature frame.
print(new_df.columns)

Index(['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure', 'kfold', 'u_in_log1p', 'u_in_power', 'u_in_round2', 'area', 'u_in_cumsum', 'u_in_lag1', 'u_out_lag1', 'u_in_lag_back1', 'u_out_lag_back1', 'u_in_lag2', 'u_out_lag2', 'u_in_lag_back2', 'u_out_lag_back2', 'u_in_lag3', 'u_out_lag3', 'u_in_lag_back3', 'u_out_lag_back3', 'u_in_lag4', 'u_out_lag4', 'u_in_lag_back4', 'u_out_lag_back4', 'u_in_diff1', 'u_out_diff1', 'u_in_diff2', 'u_out_diff2', 'u_in_diff3', 'u_out_diff3', 'u_in_diff4', 'u_out_diff4', 'windowmean5_u_in', 'windowmean10_u_in', 'breath_id__u_in__max', 'breath_id__u_out__max', 'breath_id__u_in__diffmax', 'breath_id__u_in__diffmean', 'cross', 'cross2'], dtype='object')
