In [1]:
import os

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from IPython.display import display
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.preprocessing import (KBinsDiscretizer, OrdinalEncoder,
                                   RobustScaler, normalize)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


In [2]:
df = pd.read_csv("/home/koga/workspace/dataset/kaggle_ventilator/train_5folds.csv")
test_df = pd.read_csv("/home/koga/workspace/dataset/kaggle_ventilator/test.csv")

In [3]:
def add_feature(df):
    df['time_delta'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    df['delta'] = df['time_delta'] * df['u_in']
    df['area'] = df.groupby('breath_id')['delta'].cumsum()

    df['cross']= df['u_in']*df['u_out']
    df['cross2']= df['time_step']*df['u_out']
    
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()
    df['one'] = 1
    df['count'] = (df['one']).groupby(df['breath_id']).cumsum()
    df['u_in_cummean'] =df['u_in_cumsum'] / df['count']
    
    df = df.drop(['count','one'], axis=1)
    return df

def add_lag_feature(df, USE_LAG=4):
    # https://www.kaggle.com/kensit/improvement-base-on-tensor-bidirect-lstm-0-173
    for lag in range(1, USE_LAG+1):
        df[f'breath_id_lag{lag}']=df['breath_id'].shift(lag).fillna(0)
        df[f'breath_id_lag{lag}same']=np.select([df[f'breath_id_lag{lag}']==df['breath_id']], [1], 0)

        # u_in 
        df[f'u_in_lag_{lag}'] = df['u_in'].shift(lag).fillna(0) * df[f'breath_id_lag{lag}same']
        #df[f'u_in_lag_{lag}_back'] = df['u_in'].shift(-lag).fillna(0) * df[f'breath_id_lag{lag}same']
        df[f'u_in_time{lag}'] = df['u_in'] - df[f'u_in_lag_{lag}']
        #df[f'u_in_time{lag}_back'] = df['u_in'] - df[f'u_in_lag_{lag}_back']
        df[f'u_out_lag_{lag}'] = df['u_out'].shift(lag).fillna(0) * df[f'breath_id_lag{lag}same']
        #df[f'u_out_lag_{lag}_back'] = df['u_out'].shift(-lag).fillna(0) * df[f'breath_id_lag{lag}same']

    # breath_time
    df['time_step_lag'] = df['time_step'].shift(1).fillna(0) * df[f'breath_id_lag{lag}same']
    df['breath_time'] = df['time_step'] - df['time_step_lag']

    drop_columns = ['time_step_lag']
    drop_columns += [f'breath_id_lag{i}' for i in range(1, USE_LAG+1)]
    drop_columns += [f'breath_id_lag{i}same' for i in range(1, USE_LAG+1)]
    df = df.drop(drop_columns, axis=1)

    # fill na by zero
    df = df.fillna(0)
    return df

c_dic = {10: 0, 20: 1, 50:2}
r_dic = {5: 0, 20: 1, 50:2}
rc_sum_dic = {v: i for i, v in enumerate([15, 25, 30, 40, 55, 60, 70, 100])}
rc_dot_dic = {v: i for i, v in enumerate([50, 100, 200, 250, 400, 500, 2500, 1000])}    

def add_category_features(df):
    df['C_cate'] = df['C'].map(c_dic)
    df['R_cate'] = df['R'].map(r_dic)
    df['RC_sum'] = (df['R'] + df['C']).map(rc_sum_dic)
    df['RC_dot'] = (df['R'] * df['C']).map(rc_dot_dic)
    return df


def norm_scale(train_df, test_df, norm_features):
    scaler = RobustScaler()
    all_u_in = np.vstack([train_df[norm_features].values, test_df[norm_features].values])
    scaler.fit(all_u_in)
    train_df[norm_features] = scaler.transform(train_df[norm_features].values)
    test_df[norm_features] = scaler.transform(test_df[norm_features].values)
    return train_df, test_df

In [4]:
train_df = add_feature(df)
test_df = add_feature(test_df)
train_df = add_lag_feature(train_df)
test_df = add_lag_feature(test_df)
train_df = add_category_features(train_df)
test_df = add_category_features(test_df)


In [5]:
train_df.columns

Index(['id', 'breath_id', 'R', 'C', 'time_step', 'u_in', 'u_out', 'pressure', 'kfold', 'time_delta', 'delta', 'area', 'cross', 'cross2', 'u_in_cumsum', 'u_in_cummean', 'u_in_lag_1', 'u_in_time1', 'u_out_lag_1', 'u_in_lag_2', 'u_in_time2', 'u_out_lag_2', 'u_in_lag_3', 'u_in_time3', 'u_out_lag_3', 'u_in_lag_4', 'u_in_time4', 'u_out_lag_4', 'breath_time', 'C_cate', 'R_cate', 'RC_sum', 'RC_dot'], dtype='object')

In [6]:
USE_LAG = 4
CATE_FEATURES = ['R_cate', 'C_cate', 'RC_dot', 'RC_sum']
CONT_FEATURES = ['u_in', 'time_step'] + ['u_in_cumsum', 'u_in_cummean', 'area', 'cross', 'cross2']
LAG_FEATURES = ['breath_time']
LAG_FEATURES += [f'u_in_lag_{i}' for i in range(1, USE_LAG+1)]
#LAG_FEATURES += [f'u_in_lag_{i}_back' for i in range(1, USE_LAG+1)]
LAG_FEATURES += [f'u_in_time{i}' for i in range(1, USE_LAG+1)]
#LAG_FEATURES += [f'u_in_time{i}_back' for i in range(1, USE_LAG+1)]
# LAG_FEATURES += [f'u_out_lag_{i}' for i in range(1, USE_LAG+1)]
#LAG_FEATURES += [f'u_out_lag_{i}_back' for i in range(1, USE_LAG+1)]
ALL_FEATURES = CATE_FEATURES + CONT_FEATURES + LAG_FEATURES
norm_features = CONT_FEATURES + LAG_FEATURES
print(norm_features)


['u_in', 'time_step', 'u_in_cumsum', 'u_in_cummean', 'area', 'cross', 'cross2', 'breath_time', 'u_in_lag_1', 'u_in_lag_2', 'u_in_lag_3', 'u_in_lag_4', 'u_in_time1', 'u_in_time2', 'u_in_time3', 'u_in_time4']


In [7]:
train_df, test_df = norm_scale(train_df, test_df, norm_features)

In [8]:
train_df["u_in_round2"] = np.round(train_df["u_in"], 2)
test_df["u_in_round2"] = np.round(test_df["u_in"], 2)

In [10]:
train_df.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold,time_delta,delta,area,cross,cross2,u_in_cumsum,u_in_cummean,u_in_lag_1,u_in_time1,u_out_lag_1,u_in_lag_2,u_in_time2,u_out_lag_2,u_in_lag_3,u_in_time3,u_out_lag_3,u_in_lag_4,u_in_time4,u_out_lag_4,breath_time,C_cate,R_cate,RC_sum,RC_dot,u_in_round2
0,1,1,20,50,-0.989105,-0.938051,0,5.837492,4,0.0,0.0,-0.734147,0.0,-0.66557,-0.724032,-0.647256,-0.864294,0.472113,0.0,-0.841938,0.21807,0.0,-0.818664,0.1211,0.0,-0.793497,0.068265,0.0,-15.40746,2,1,6,7,-0.94
1,2,1,20,50,-0.963659,3.054611,0,5.907794,4,0.033652,0.618632,-0.682712,0.0,-0.66557,-0.675685,0.16287,-0.84752,113.649156,0.0,-0.841938,58.785756,0.0,-0.818664,38.106145,0.0,-0.793497,26.285903,0.0,0.077784,2,1,6,7,3.05
2,3,1,20,50,-0.938055,3.95488,0,7.876254,4,0.033862,0.762212,-0.619338,0.0,-0.66557,-0.616486,0.55469,2.835938,25.590379,0.0,-0.825195,71.724949,0.0,-0.818664,46.67105,0.0,-0.793497,32.197482,0.0,15.659572,2,1,6,7,3.95
3,4,1,20,50,-0.912325,4.020235,0,11.742872,4,0.034028,0.776134,-0.554806,0.0,-0.66557,-0.556499,0.757231,3.666488,1.815413,0.0,2.851513,14.115946,0.0,-0.801907,47.119843,0.0,-0.793497,32.626634,0.0,31.317608,2,1,6,7,4.02
4,5,1,20,50,-0.886455,4.57595,0,12.234987,4,0.034213,0.867507,-0.482678,0.0,-0.66557,-0.489813,0.923858,3.726782,15.77887,0.0,3.680541,9.061736,0.0,2.877892,14.421712,0.0,-0.776723,36.156321,0.0,0.33593,2,1,6,7,4.58


In [13]:
CATE_FEATURES = ['R_cate', 'C_cate', 'RC_dot', 'RC_sum'] + [f'u_out_lag_{i}' for i in range(1, USE_LAG+1)]

In [16]:
train_df[CATE_FEATURES] = train_df[CATE_FEATURES].astype(int)
test_df[CATE_FEATURES] = test_df[CATE_FEATURES].astype(int)

In [23]:
train_df["u_in_lag_2"].max()

19.24968532598747

In [18]:
train_df.to_csv("../../dataset/kaggle_ventilator/train_5folds_nb9_robust.csv", index=False)
test_df.to_csv("../../dataset/kaggle_ventilator/test_nb9_robust.csv", index=False)

In [20]:
sorted(train_df.columns)

['C',
 'C_cate',
 'R',
 'RC_dot',
 'RC_sum',
 'R_cate',
 'area',
 'breath_id',
 'breath_time',
 'cross',
 'cross2',
 'delta',
 'id',
 'kfold',
 'pressure',
 'time_delta',
 'time_step',
 'u_in',
 'u_in_cummean',
 'u_in_cumsum',
 'u_in_lag_1',
 'u_in_lag_2',
 'u_in_lag_3',
 'u_in_lag_4',
 'u_in_round2',
 'u_in_time1',
 'u_in_time2',
 'u_in_time3',
 'u_in_time4',
 'u_out',
 'u_out_lag_1',
 'u_out_lag_2',
 'u_out_lag_3',
 'u_out_lag_4']