In [11]:
import os

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from IPython.display import display
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.preprocessing import (KBinsDiscretizer, OrdinalEncoder,
                                   RobustScaler, normalize)

# Raise pandas' display limits (rows, columns, line width) for this notebook.
for _option_name, _option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
):
    pd.set_option(_option_name, _option_value)


In [12]:
# Directory that holds the competition CSVs. The default is the original
# location; set the VENTILATOR_DATA_DIR environment variable to run this
# notebook on another machine without editing the cell.
DATA_DIR = os.environ.get(
    "VENTILATOR_DATA_DIR", "/home/koga/workspace/dataset/kaggle_ventilator"
)

df = pd.read_csv(os.path.join(DATA_DIR, "train_5folds.csv"))
test_df = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

In [13]:
def add_features(df):
    """Return a copy of ``df`` extended with engineered features.

    The input must provide the columns ``breath_id``, ``time_step``, ``u_in``,
    ``u_out``, ``R`` and ``C``. All grouped statistics are computed per
    ``breath_id``. The input frame itself is left untouched.

    Features added:
      * element-wise transforms of ``u_in`` (log1p, square);
      * per-breath cumulative sums, lags/leads and differences;
      * exponentially weighted and rolling-window statistics;
      * per-breath aggregates broadcast back to every row;
      * ordinal encodings of ``R``, ``C`` and their combination ``RC``.

    Missing values produced along the way (for example at the edges of the
    lag windows) are replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows; the row index is expected to be unique because several
        features are aligned back onto it.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (``R``/``C`` replaced by their
        ordinal codes) plus the engineered feature columns.
    """
    # Work on a copy so the caller's frame is never left half-modified.
    df = df.copy()

    df["u_in_log1p"] = np.log1p(df["u_in"])
    df["u_in_power"] = np.power(df["u_in"], 2)

    df['last_value_u_in'] = df.groupby('breath_id')['u_in'].transform('last')

    # Cumulative sum of time_step * u_in within each breath.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()

    df['u_in_cumsum'] = df.groupby('breath_id')['u_in'].cumsum()
    df['time_step_cumsum'] = df.groupby('breath_id')['time_step'].cumsum()

    # Lags (past values) and "lag_back" leads (future values) within a breath.
    for step in (1, 2, 3, 4):
        df['u_in_lag{}'.format(step)] = df.groupby('breath_id')['u_in'].shift(step)
        df['u_out_lag{}'.format(step)] = df.groupby('breath_id')['u_out'].shift(step)
        df['u_in_lag_back{}'.format(step)] = df.groupby('breath_id')['u_in'].shift(-step)
        df['u_out_lag_back{}'.format(step)] = df.groupby('breath_id')['u_out'].shift(-step)
    df['u_in_lag_back10'] = df.groupby('breath_id')['u_in'].shift(-10)
    df['u_out_lag_back10'] = df.groupby('breath_id')['u_out'].shift(-10)
    # Shifts leave NaN at the breath boundaries; treat those as 0.
    df = df.fillna(0)

    # transform() broadcasts the per-breath value back onto every row. A bare
    # .first() would be indexed by breath_id and misalign on assignment.
    df['u_in_first'] = df.groupby('breath_id')['u_in'].transform('first')
    df['u_out_first'] = df.groupby('breath_id')['u_out'].transform('first')

    # Time since the previous step (0 for the first row of a breath).
    df['time_step_diff'] = df.groupby('breath_id')['time_step'].diff().fillna(0)

    # Exponentially weighted statistics of u_in within each breath.
    # reset_index(level=0) drops the breath_id level so the result aligns with
    # the original row index.
    df['ewm_u_in_mean'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=9)
                           .mean()
                           .reset_index(level=0, drop=True))
    df['ewm_u_in_std'] = (df
                          .groupby('breath_id')['u_in']
                          .ewm(halflife=10)
                          .std()
                          .reset_index(level=0, drop=True))
    # Correlation of u_in with itself (no other series is supplied).
    df['ewm_u_in_corr'] = (df
                           .groupby('breath_id')['u_in']
                           .ewm(halflife=15)
                           .corr()
                           .reset_index(level=0, drop=True))

    # Rolling-window statistics of u_in.
    # NOTE(review): the std column is named "<window>_out_std" although it is
    # computed from u_in; the name is kept so downstream column lists still match.
    rolling_stats = (("in_sum", "sum"),
                     ("in_min", "min"),
                     ("in_max", "max"),
                     ("in_mean", "mean"),
                     ("out_std", "std"))
    for window in (15, 45):
        roller = (df
                  .groupby('breath_id')['u_in']
                  .rolling(window=window, min_periods=1))
        # Compute everything before touching df so the window object never
        # observes a frame that is being extended.
        computed = [
            ("{}_{}".format(window, suffix),
             getattr(roller, stat)().reset_index(level=0, drop=True))
            for suffix, stat in rolling_stats
        ]
        for column, values in computed:
            df[column] = values

    df['15_out_mean'] = (df
                         .groupby('breath_id')['u_out']
                         .rolling(window=15, min_periods=1)
                         .mean()
                         .reset_index(level=0, drop=True))

    # Single-observation windows have no std; treat those as 0.
    df = df.fillna(0)

    # Per-breath aggregates, broadcast back to every row of the breath.
    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')
    df['breath_id__u_out__max'] = df.groupby(['breath_id'])['u_out'].transform('max')

    df['breath_id__u_out__mean'] = df.groupby(['breath_id'])['u_out'].transform('mean')
    df['breath_id__u_in__mean'] = df.groupby(['breath_id'])['u_in'].transform('mean')

    df['breath_id__u_in__min'] = df.groupby(['breath_id'])['u_in'].transform('min')
    df['breath_id__u_out__min'] = df.groupby(['breath_id'])['u_out'].transform('min')

    # Differences between the current value and each lag.
    for step in (1, 2, 3, 4):
        df['u_in_diff{}'.format(step)] = df['u_in'] - df['u_in_lag{}'.format(step)]
        df['u_out_diff{}'.format(step)] = df['u_out'] - df['u_out_lag{}'.format(step)]

    df['u_in_diff_1_2'] = df['u_in_lag1'] - df['u_in_lag2']
    df['u_out_diff_1_2'] = df['u_out_lag1'] - df['u_out_lag2']
    df['u_in_lagback_diff_1_2'] = df['u_in_lag_back1'] - df['u_in_lag_back2']
    df['u_out_lagback_diff_1_2'] = df['u_out_lag_back1'] - df['u_out_lag_back2']

    df['u_in_lagback_diff1'] = df['u_in'] - df['u_in_lag_back1']
    df['u_out_lagback_diff1'] = df['u_out'] - df['u_out_lag_back1']
    df['u_in_lagback_diff2'] = df['u_in'] - df['u_in_lag_back2']
    df['u_out_lagback_diff2'] = df['u_out'] - df['u_out_lag_back2']

    # NOTE(review): 'u_in_diff' / 'u_out_diff' are never computed anywhere in
    # this function. The previous code only set them to 0 where time_step == 0
    # and the final fillna(0) zeroed the rest, so both columns were constant 0.
    # They are kept (as explicit zeros) only to preserve the output schema;
    # confirm whether a real diff was intended.
    df['u_in_diff'] = 0.0
    df['u_out_diff'] = 0.0

    df['breath_id__u_in__diffmax'] = df.groupby(['breath_id'])['u_in'].transform('max') - df['u_in']
    df['breath_id__u_in__diffmean'] = df.groupby(['breath_id'])['u_in'].transform('mean') - df['u_in']

    # Sum of u_in over the rows of a breath that share the same u_out value.
    df['u_in_partition_out_sum'] = df.groupby(['breath_id', "u_out"])['u_in'].transform("sum")

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Ordinal codes for R, C and the combined "R_C" label. RC must be built
    # from the raw values, i.e. before R and C are replaced by their codes.
    r_map = {5: 0, 20: 1, 50: 2}
    c_map = {10: 0, 20: 1, 50: 2}
    rc_map = {'20_50': 0, '20_20': 1, '50_20': 2, '50_50': 3, '5_50': 4, '5_20': 5, '50_10': 6, '20_10': 7, '5_10': 8}
    df['RC'] = df['R'].astype(str) + '_' + df['C'].astype(str)

    df['R'] = df['R'].map(r_map)
    df['C'] = df['C'].map(c_map)

    df['RC'] = df['RC'].map(rc_map)
    # Values missing from the maps become NaN and are replaced by 0 here.
    df = df.fillna(0)
    return df


In [14]:
# Run the same feature pipeline over the training frame, then the test frame.
train, test = [add_features(frame) for frame in (df, test_df)]

In [15]:
# Identifier, target, fold and categorical columns are left out of scaling.
not_used = ["id", "breath_id", "R", "C", "RC", "pressure", "kfold"]
# Sort so the feature order is the same on every run; a bare list(set(...))
# depends on string hashing, which varies between interpreter sessions.
used_columns = sorted(set(train.columns) - set(not_used))
used_columns

['15_in_max',
 '15_in_mean',
 '15_in_min',
 '15_in_sum',
 '15_out_mean',
 '15_out_std',
 '45_in_max',
 '45_in_mean',
 '45_in_min',
 '45_in_sum',
 '45_out_std',
 'area',
 'breath_id__u_in__diffmax',
 'breath_id__u_in__diffmean',
 'breath_id__u_in__max',
 'breath_id__u_in__mean',
 'breath_id__u_in__min',
 'breath_id__u_out__max',
 'breath_id__u_out__mean',
 'breath_id__u_out__min',
 'cross',
 'cross2',
 'ewm_u_in_corr',
 'ewm_u_in_mean',
 'ewm_u_in_std',
 'last_value_u_in',
 'time_step',
 'time_step_cumsum',
 'time_step_diff',
 'u_in',
 'u_in_cumsum',
 'u_in_diff',
 'u_in_diff1',
 'u_in_diff2',
 'u_in_diff3',
 'u_in_diff4',
 'u_in_diff_1_2',
 'u_in_first',
 'u_in_lag1',
 'u_in_lag2',
 'u_in_lag3',
 'u_in_lag4',
 'u_in_lag_back1',
 'u_in_lag_back10',
 'u_in_lag_back2',
 'u_in_lag_back3',
 'u_in_lag_back4',
 'u_in_lagback_diff1',
 'u_in_lagback_diff2',
 'u_in_lagback_diff_1_2',
 'u_in_log1p',
 'u_in_partition_out_sum',
 'u_in_power',
 'u_out',
 'u_out_diff',
 'u_out_diff1',
 'u_out_diff2',

In [16]:
# Fit the scaler on the training features, then overwrite those columns with
# their scaled values.
RS = RobustScaler()
RS.fit(train[used_columns])
train[used_columns] = RS.transform(train[used_columns])
# u_in has already been scaled above, so this rounds the scaled value.
train["u_in_round2"] = train["u_in"].round(2)



AttributeError: 'RobustScaler' object has no attribute 'ransform'

In [19]:
# Apply the scaler fitted on the training data to the test features.
test_scaled = RS.transform(test[used_columns])
test[used_columns] = test_scaled
# Rounded copy of the (already scaled) u_in column.
test["u_in_round2"] = test["u_in"].round(2)

In [21]:
# Write the scaled train and test frames to CSV without the row index.
for frame, out_path in (
    (train, "../../dataset/kaggle_ventilator/train_5folds_nb6_robust.csv"),
    (test, "../../dataset/kaggle_ventilator/test_nb6_robust.csv"),
):
    frame.to_csv(out_path, index=False)