In [1]:
import os

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from IPython.display import display
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.preprocessing import (KBinsDiscretizer, OrdinalEncoder,
                                   RobustScaler, normalize)

# Raise pandas' display limits so wide, long feature frames are shown in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)


In [2]:
# Read the fold-annotated training split and the test split from disk.
train_df, test_df = map(
    pd.read_csv,
    (
        "../../dataset/kaggle_ventilator/train_5folds.csv",
        "../../dataset/kaggle_ventilator/test.csv",
    ),
)

In [3]:
def add_features(df, transform="robust"):
    """Return a copy of ``df`` extended with per-breath engineered features.

    Every grouped feature is computed within a ``breath_id`` group and relies
    on the row order of ``df`` inside each group. Results of grouped window
    operations are re-aligned on the frame's index, so the index is expected
    to be unique (e.g. the default ``RangeIndex`` produced by ``read_csv``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``breath_id``, ``time_step``,
        ``u_in``, ``u_out``, ``R`` and ``C``. It is not modified.
    transform : str, default "robust"
        Accepted for interface compatibility; this function does not read it.

    Returns
    -------
    pandas.DataFrame
        A new frame with the feature columns appended and ``R`` / ``C``
        replaced by the integer-coded ``R_cate``, ``C_cate``, ``RC_sum`` and
        ``RC_dot``. Missing values produced by lags and window statistics are
        filled with 0.

    Raises
    ------
    KeyError
        If any required input column is absent, for example when the function
        is applied a second time to its own output (``R`` / ``C`` are dropped).
    """
    required_columns = ('breath_id', 'time_step', 'u_in', 'u_out', 'R', 'C')
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(
            f"add_features is missing required columns {missing_columns}; "
            "was it called on an already-transformed frame?"
        )

    # Work on a copy so the caller's frame never picks up helper columns.
    df = df.copy()

    df["u_in_log1p"] = np.log1p(df["u_in"])
    df["u_in_power"] = np.power(df["u_in"], 2)

    # Time elapsed since the previous row of the same breath (0 for the first row).
    df['time_delta'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    df['delta'] = df['time_delta'] * df['u_in']
    df['area'] = df.groupby('breath_id')['delta'].cumsum()

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Running mean of u_in: running sum divided by the 1-based row position
    # inside the breath.
    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    df['u_in_cummean'] = df['u_in_cumsum'] / (df.groupby('breath_id').cumcount() + 1)

    # Backward ("lag") and forward ("lag_back") shifts within each breath.
    signals = ('u_in', 'u_out')
    for step in (1, 2, 3, 4):
        for signal in signals:
            df[f'{signal}_lag{step}'] = df.groupby('breath_id')[signal].shift(step)
        for signal in signals:
            df[f'{signal}_lag_back{step}'] = df.groupby('breath_id')[signal].shift(-step)
    for signal in signals:
        df[f'{signal}_lag_back10'] = df.groupby('breath_id')[signal].shift(-10)
    # Rows shifted past the edge of a breath have no neighbour; use 0 there.
    df = df.fillna(0)

    ### rolling window ts feats
    df['ewm_u_in_mean'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=9)
                           .mean()
                           .reset_index(level=0, drop=True))
    df['ewm_u_in_std'] = (df
                          .groupby('breath_id')['u_in']
                          .ewm(halflife=10)
                          .std()
                          .reset_index(level=0, drop=True))
    # corr() is called without `other`, i.e. u_in is correlated with itself.
    df['ewm_u_in_corr'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=15)
                           .corr()
                           .reset_index(level=0, drop=True))

    # Trailing-window statistics of u_in. Each statistic is computed with its
    # own method call and named after the signal it summarises; dropping the
    # group level restores the original row index for alignment.
    for window in (15, 45):
        rolling_u_in = (df
                        .groupby('breath_id')['u_in']
                        .rolling(window=window, min_periods=1))
        for stat in ('sum', 'min', 'max', 'mean', 'std'):
            df[f'{window}_in_{stat}'] = (getattr(rolling_u_in, stat)()
                                         .reset_index(level=0, drop=True))

    df['15_out_mean'] = (df
                         .groupby('breath_id')['u_out']
                         .rolling(window=15, min_periods=1)
                         .mean()
                         .reset_index(level=0, drop=True))

    # The first ewm/rolling std and corr values of a breath are undefined.
    df = df.fillna(0)

    # Whole-breath aggregates broadcast back to every row. `transform` keeps
    # the result row-aligned; a plain aggregation would be indexed by
    # breath_id and misalign on assignment.
    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')

    df['breath_id__u_out__mean'] = df.groupby(['breath_id'])['u_out'].transform('mean')
    df['breath_id__u_in__mean'] = df.groupby(['breath_id'])['u_in'].transform('mean')

    df['breath_id__u_in__min'] = df.groupby(['breath_id'])['u_in'].transform('min')
    df['breath_id__u_out__min'] = df.groupby(['breath_id'])['u_out'].transform('min')

    # Differences between the current value and each lagged value.
    for step in (1, 2, 3, 4):
        for signal in signals:
            df[f'{signal}_diff{step}'] = df[signal] - df[f'{signal}_lag{step}']

    df['u_in_diff_1_2'] = df['u_in_lag1'] - df['u_in_lag2']
    df['u_out_diff_1_2'] = df['u_out_lag1'] - df['u_out_lag2']
    df['u_in_lagback_diff_1_2'] = df['u_in_lag_back1'] - df['u_in_lag_back2']
    df['u_out_lagback_diff_1_2'] = df['u_out_lag_back1'] - df['u_out_lag_back2']

    for step in (1, 2):
        for signal in signals:
            df[f'{signal}_lagback_diff{step}'] = df[signal] - df[f'{signal}_lag_back{step}']

    df['breath_id__u_in__diffmax'] = df['breath_id__u_in__max'] - df['u_in']
    df['breath_id__u_in__diffmean'] = df['breath_id__u_in__mean'] - df['u_in']

    # Total u_in over the rows of the breath that share the current u_out value.
    df['u_in_partition_out_sum'] = df.groupby(['breath_id', "u_out"])['u_in'].transform("sum")

    # Integer codes for R, C and their sum/product. The codes are plain
    # labels: the product table lists 2500 before 1000 and is kept as-is so
    # existing codes do not change. Values absent from a table map to NaN.
    c_dic = {10: 0, 20: 1, 50: 2}
    r_dic = {5: 0, 20: 1, 50: 2}
    rc_sum_dic = {v: i for i, v in enumerate([15, 25, 30, 40, 55, 60, 70, 100])}
    rc_dot_dic = {v: i for i, v in enumerate([50, 100, 200, 250, 400, 500, 2500, 1000])}

    df['C_cate'] = df['C'].map(c_dic)
    df['R_cate'] = df['R'].map(r_dic)
    df['RC_sum'] = (df['R'] + df['C']).map(rc_sum_dic)
    df['RC_dot'] = (df['R'] * df['C']).map(rc_dot_dic)

    df = df.drop(['R', 'C'], axis=1)

    return df


In [4]:
# Build the engineered features for both splits with the same function.
# add_features drops 'R' and 'C' from its result and the result is bound back
# to the same name, so running this cell again without reloading the CSVs
# fails with the KeyError on 'R' recorded in this cell's output.
train_df = add_features(train_df)
test_df = add_features(test_df)

KeyError: 'R'

In [18]:
# Show the training frame's column names in alphabetical order.
sorted(train_df.columns.tolist())

['C_cate',
 'RC_dot',
 'RC_sum',
 'R_cate',
 'area',
 'breath_id',
 'breath_id__u_in__diffmax',
 'breath_id__u_in__diffmean',
 'breath_id__u_in__max',
 'cross',
 'cross2',
 'delta',
 'id',
 'kfold',
 'pressure',
 'time_delta',
 'time_step',
 'u_in',
 'u_in_cummean',
 'u_in_cumsum',
 'u_in_lag1',
 'u_in_lag2',
 'u_in_lag3',
 'u_in_lag4',
 'u_in_lag_back1',
 'u_in_lag_back2',
 'u_in_lag_back3',
 'u_in_lag_back4',
 'u_out',
 'u_out_lag1',
 'u_out_lag2',
 'u_out_lag3',
 'u_out_lag4',
 'u_out_lag_back1',
 'u_out_lag_back2',
 'u_out_lag_back3',
 'u_out_lag_back4']

In [9]:
# Columns passed to norm_scale together with both frames; the list order is
# kept exactly as before.
# NOTE(review): norm_scale is neither defined nor imported in the cells shown
# here — confirm where it comes from before relying on Restart & Run All.
norm_features = [
    'area',
    'breath_id__u_in__diffmax', 'breath_id__u_in__diffmean', 'breath_id__u_in__max',
    'cross', 'cross2',
    'delta', 'time_delta', 'time_step',
    'u_in', 'u_in_cummean', 'u_in_cumsum',
    *(f'u_in_lag{step}' for step in range(1, 5)),
    *(f'u_in_lag_back{step}' for step in range(1, 5)),
]

train_df, test_df = norm_scale(train_df, test_df, norm_features)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold,time_delta,...,u_in_lag_back4,u_out_lag_back4,breath_id__u_in__max,breath_id__u_out__max,breath_id__u_in__diffmax,breath_id__u_in__diffmean,C_cate,R_cate,RC_sum,RC_dot
0,1,1,20,50,0.0,0.083334,0,5.837492,4,0.0,...,25.35585,0.0,28.313036,1,28.229702,10.062673,2,1,6,7
1,2,1,20,50,0.033652,18.383041,0,5.907794,4,0.033652,...,27.259866,0.0,28.313036,1,9.929994,-8.237035,2,1,6,7
2,3,1,20,50,0.067514,22.509278,0,7.876254,4,0.033862,...,27.127486,0.0,28.313036,1,5.803758,-12.363271,2,1,6,7
3,4,1,20,50,0.101542,22.808822,0,11.742872,4,0.034028,...,26.807732,0.0,28.313036,1,5.504214,-12.662816,2,1,6,7
4,5,1,20,50,0.135756,25.35585,0,12.234987,4,0.034213,...,27.864715,0.0,28.313036,1,2.957185,-15.209844,2,1,6,7


In [20]:
# Write both processed frames to CSV without the index column.
for frame, out_path in (
    (train_df, "../../dataset/kaggle_ventilator/train_5folds_nb10_robust.csv"),
    (test_df, "../../dataset/kaggle_ventilator/test_nb10_robust.csv"),
):
    frame.to_csv(out_path, index=False)

In [None]:
        'C_cate',
        'RC_dot',
        'RC_sum',
        'R_cate',
        'u_out',
        'u_out_lag1',
        'u_out_lag2',
        'u_out_lag3',
        'u_out_lag4',
        'u_out_lag_back1',
        'u_out_lag_back2',
        'u_out_lag_back3',
        'u_out_lag_back4'