In [1]:
import os

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from IPython.display import display
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.preprocessing import (KBinsDiscretizer, OrdinalEncoder,
                                   RobustScaler, normalize, MinMaxScaler)
from sklearn import model_selection


# Raise pandas' display limits so the wide feature frames shown below are not truncated.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)


In [2]:
# Load the training rows and the test rows from the shared data directory.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_5folds.csv", "../data/test.csv")
)

In [4]:

from pykalman import KalmanFilter
def Kalman1D(observations, damping=1):
    """Return a Kalman-smoothed version of a 1-D sequence of observations.

    The filter is configured with a transition matrix of 1 and a fixed
    transition covariance of 0.1. ``damping`` is used both as the observation
    covariance and as the initial state covariance; the first observation is
    used as the initial state mean.

    Parameters
    ----------
    observations : sequence, numpy array or pandas Series
        The values to smooth, in order. Must be non-empty.
    damping : float, default 1
        Observation covariance passed to the filter.

    Returns
    -------
    The smoothed state means, i.e. the first element returned by
    ``KalmanFilter.smooth``.
    # NOTE(review): for pykalman this is presumably an array of shape
    # (len(observations), 1) -- confirm before relying on the shape.
    """
    observation_covariance = damping
    transition_matrix = 1
    transition_covariance = 0.1

    # Take the first value by POSITION. Plain ``observations[0]`` is a label
    # lookup on a pandas Series and fails when the index does not contain 0
    # (e.g. a filtered or re-indexed frame). ``np.ma.asarray`` keeps masked
    # arrays masked, so other input types behave as before.
    initial_value_guess = np.ma.asarray(observations)[0]

    kf = KalmanFilter(
        initial_state_mean=initial_value_guess,
        initial_state_covariance=observation_covariance,
        observation_covariance=observation_covariance,
        transition_covariance=transition_covariance,
        transition_matrices=transition_matrix,
    )
    pred_state, _state_cov = kf.smooth(observations)
    return pred_state

def add_features(df, transform="robust"):
    """Return a copy of ``df`` extended with per-breath engineered features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in``,
        ``u_out``, ``R`` and ``C``. The frame passed in is NOT modified.
    transform : str, default "robust"
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        A new frame with these changes:

        * ``u_in_log1p``, ``u_in_kalman``;
        * ``time_delta``, ``delta`` (= time_delta * u_in), ``area`` (cumulative
          ``delta`` within a breath);
        * ``cross`` (= u_in * u_out), ``cross2`` (= time_step * u_out);
        * ``u_in_cumsum`` and ``u_in_cummean`` within a breath;
        * lags / leads 1..7 of ``u_in`` and ``u_out`` within a breath
          (``*_lag{i}`` / ``*_lag_back{i}``), and ``u_in_diff{i}``;
        * per-breath max of ``u_in`` / ``u_out`` and the distance of ``u_in``
          from the per-breath max and mean;
        * ``R``, ``C`` replaced by ordinal codes and a combined ``RC`` code.

        Every NaN in the frame (not only in the new columns) is replaced by 0.
    """
    # Work on a copy: the original implementation added columns to the
    # caller's frame before rebinding ``df``, leaving the input half-modified.
    df = df.copy()

    u_in_by_breath = df.groupby('breath_id')['u_in']
    u_out_by_breath = df.groupby('breath_id')['u_out']

    df["u_in_log1p"] = np.log1p(df["u_in"])
    # NOTE(review): the smoother runs over the whole column at once, so the
    # smoothing crosses breath boundaries -- confirm this is intended.
    # Passing a plain array and flattening the result keeps the column
    # assignment independent of the frame's index and of the 2-D return shape.
    df["u_in_kalman"] = np.asarray(Kalman1D(df["u_in"].to_numpy())).ravel()

    # First row of each breath has no predecessor -> time_delta 0.
    df['time_delta'] = df.groupby('breath_id')['time_step'].diff().fillna(0)
    df['delta'] = df['time_delta'] * df['u_in']
    df['area'] = df.groupby('breath_id')['delta'].cumsum()

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    df['u_in_cumsum'] = u_in_by_breath.cumsum()
    # 1-based position of the row inside its breath (replaces the temporary
    # 'one' / 'count' helper columns).
    rows_so_far = u_in_by_breath.cumcount() + 1
    df['u_in_cummean'] = df['u_in_cumsum'] / rows_so_far

    for i in range(1, 8):
        df[f'u_in_lag{i}'] = u_in_by_breath.shift(i)
        df[f'u_out_lag{i}'] = u_out_by_breath.shift(i)
        df[f'u_in_lag_back{i}'] = u_in_by_breath.shift(-i)
        df[f'u_out_lag_back{i}'] = u_out_by_breath.shift(-i)

    # Shifts leave NaN at the start/end of every breath; treat them as 0.
    # This fills NaN in ALL columns of the frame, as before.
    df = df.fillna(0)

    # Differences are taken against the zero-filled lags, so the first rows of
    # a breath get ``u_in - 0``. No NaN can appear here for finite inputs,
    # which is why the former second fillna(0) pass was dropped.
    for i in range(1, 8):
        df[f'u_in_diff{i}'] = df['u_in'] - df[f'u_in_lag{i}']

    # Re-group on the filled frame so the statistics see the same values the
    # original code did.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    breath_u_in_max = u_in_by_breath.transform('max')
    df['breath_id__u_in__max'] = breath_u_in_max
    df['breath_id__u_out__max'] = df.groupby('breath_id')['u_out'].transform('max')
    df['breath_id__u_in__diffmax'] = breath_u_in_max - df['u_in']
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - df['u_in']

    # Ordinal codes for the R / C values and their combination. Values missing
    # from a map become NaN.
    r_map = {5: 0, 20: 1, 50: 2}
    c_map = {10: 0, 20: 1, 50: 2}
    rc_map = {'20_50': 0, '20_20': 1, '50_20': 2, '50_50': 3, '5_50': 4, '5_20': 5, '50_10': 6, '20_10': 7, '5_10': 8}

    # Build the combined key from the raw R / C values before they are recoded.
    df['RC'] = (df['R'].astype(str) + '_' + df['C'].astype(str)).map(rc_map)
    df['R'] = df['R'].map(r_map)
    df['C'] = df['C'].map(c_map)
    return df


In [5]:
# Run the same feature pipeline over the training rows and the test rows.
train_df = df.pipe(add_features)
test_df = test_df.pipe(add_features)

In [7]:
# Peek at the first rows of the engineered training frame.
train_df.head(n=5)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold,u_in_log1p,u_in_kalman,time_delta,delta,area,cross,cross2,u_in_cumsum,u_in_cummean,u_in_lag1,u_out_lag1,u_in_lag_back1,u_out_lag_back1,u_in_lag2,u_out_lag2,u_in_lag_back2,u_out_lag_back2,u_in_lag3,u_out_lag3,u_in_lag_back3,u_out_lag_back3,u_in_lag4,u_out_lag4,u_in_lag_back4,u_out_lag_back4,u_in_lag5,u_out_lag5,u_in_lag_back5,u_out_lag_back5,u_in_lag6,u_out_lag6,u_in_lag_back6,u_out_lag_back6,u_in_lag7,u_out_lag7,u_in_lag_back7,u_out_lag_back7,u_in_diff1,u_in_diff2,u_in_diff3,u_in_diff4,u_in_diff5,u_in_diff6,u_in_diff7,breath_id__u_in__max,breath_id__u_out__max,breath_id__u_in__diffmax,breath_id__u_in__diffmean,RC
0,1,1,1,2,0.0,0.083334,0,5.837492,4,0.080043,13.272894,0.0,0.0,0.0,0.0,0.0,0.083334,0.083334,0.0,0.0,18.383041,0.0,0.0,0.0,22.509278,0.0,0.0,0.0,22.808822,0.0,0.0,0.0,25.35585,0.0,0.0,0.0,27.259866,0.0,0.0,0.0,27.127486,0.0,0.0,0.0,26.807732,0.0,0.083334,0.083334,0.083334,0.083334,0.083334,0.083334,0.083334,28.313036,1,28.229702,10.062673,0
1,2,1,1,2,0.033652,18.383041,0,5.907794,4,2.964399,15.910805,0.033652,0.618632,0.618632,0.0,0.0,18.466375,9.233188,0.083334,0.0,22.509278,0.0,0.0,0.0,22.808822,0.0,0.0,0.0,25.35585,0.0,0.0,0.0,27.259866,0.0,0.0,0.0,27.127486,0.0,0.0,0.0,26.807732,0.0,0.0,0.0,27.864715,0.0,18.299707,18.383041,18.383041,18.383041,18.383041,18.383041,18.383041,28.313036,1,9.929994,-8.237035,0
2,3,1,1,2,0.067514,22.509278,0,7.876254,4,3.157395,18.301494,0.033862,0.762212,1.380843,0.0,0.0,40.975653,13.658551,18.383041,0.0,22.808822,0.0,0.083334,0.0,25.35585,0.0,0.0,0.0,27.259866,0.0,0.0,0.0,27.127486,0.0,0.0,0.0,26.807732,0.0,0.0,0.0,27.864715,0.0,0.0,0.0,28.313036,0.0,4.126236,22.425944,22.509278,22.509278,22.509278,22.509278,22.509278,28.313036,1,5.803758,-12.363271,0
3,4,1,1,2,0.101542,22.808822,0,11.742872,4,3.170056,20.271404,0.034028,0.776134,2.156978,0.0,0.0,63.784476,15.946119,22.509278,0.0,25.35585,0.0,18.383041,0.0,27.259866,0.0,0.083334,0.0,27.127486,0.0,0.0,0.0,26.807732,0.0,0.0,0.0,27.864715,0.0,0.0,0.0,28.313036,0.0,0.0,0.0,26.866758,0.0,0.299544,4.425781,22.725488,22.808822,22.808822,22.808822,22.808822,28.313036,1,5.504214,-12.662816,0
4,5,1,1,2,0.135756,25.35585,0,12.234987,4,3.27169,21.987572,0.034213,0.867507,3.024485,0.0,0.0,89.140326,17.828065,22.808822,0.0,27.259866,0.0,22.509278,0.0,27.127486,0.0,18.383041,0.0,26.807732,0.0,0.083334,0.0,27.864715,0.0,0.0,0.0,28.313036,0.0,0.0,0.0,26.866758,0.0,0.0,0.0,26.762803,0.0,2.547028,2.846573,6.972809,25.272516,25.35585,25.35585,25.35585,28.313036,1,2.957185,-15.209844,0


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
train_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold,u_in_log1p,u_in_kalman,time_delta,delta,area,cross,cross2,u_in_cumsum,u_in_cummean,u_in_lag1,u_out_lag1,u_in_lag_back1,u_out_lag_back1,u_in_lag2,u_out_lag2,u_in_lag_back2,u_out_lag_back2,u_in_lag3,u_out_lag3,u_in_lag_back3,u_out_lag_back3,u_in_lag4,u_out_lag4,u_in_lag_back4,u_out_lag_back4,u_in_lag5,u_out_lag5,u_in_lag_back5,u_out_lag_back5,u_in_lag6,u_out_lag6,u_in_lag_back6,u_out_lag_back6,u_in_lag7,u_out_lag7,u_in_lag_back7,u_out_lag_back7,u_in_diff1,u_in_diff2,u_in_diff3,u_in_diff4,u_in_diff5,u_in_diff6,u_in_diff7,breath_id__u_in__max,breath_id__u_out__max,breath_id__u_in__diffmax,breath_id__u_in__diffmean,RC
count,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0,6036000.0
mean,3018000.0,62838.86,1.069795,0.9547647,1.307225,7.321615,0.6204493,11.22041,2.0,1.452719,7.321613,0.03268504,0.2366825,12.98736,1.824144,1.123002,406.23,12.56786,7.262764,0.6079493,7.147731,0.6204493,7.203947,0.5954493,6.797571,0.6204493,7.145212,0.5829493,6.508529,0.6204493,7.086514,0.5704493,6.297275,0.6204493,7.027904,0.5579493,6.062309,0.6204493,6.96937,0.5454493,5.79768,0.6204493,6.910958,0.5329493,5.543617,0.6204493,0.05885066,0.1176676,0.1764026,0.2351004,0.2937106,0.3522451,0.4106564,36.02008,1.0,28.69846,1.669705e-17,4.145951
std,1742443.0,36335.26,0.8508248,0.8345633,0.7659778,13.4347,0.4852752,8.109703,1.414214,1.095093,10.81787,0.003876106,0.4347354,13.34446,2.80986,0.9555894,414.1439,14.43182,13.45586,0.4882079,13.14116,0.4852752,13.47672,0.4908049,12.42073,0.4852752,13.49727,0.4930714,11.93367,0.4852752,13.51754,0.4950121,11.65354,0.4852752,13.5375,0.4966306,11.33376,0.4852752,13.55717,0.4979301,10.90741,0.4852752,13.57653,0.4989132,10.52839,0.4852752,7.961707,10.72019,10.64721,10.57946,11.9211,13.10265,13.65175,31.40594,0.0,30.45699,12.25743,2.514575
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,-1.895744,0.0,0.0,5.154815e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.0,1.0,0.0,-96.02704,0.0
25%,1509001.0,31377.0,0.0,0.0,0.6428995,0.3936623,0.0,6.329607,1.0,0.331935,1.750569,0.03188229,0.0040908,4.284722,0.0,0.0,134.9007,3.908123,0.008853319,0.0,0.1241114,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0006808841,-0.05050907,-0.07197841,-0.1998261,-0.2955214,11.38441,1.0,5.3876,-1.72014,2.0
50%,3018000.0,62765.5,1.0,1.0,1.308123,4.386146,1.0,7.032628,2.0,1.68383,4.447039,0.03342915,0.1429761,8.835663,0.0,1.308123,275.5448,7.395767,4.296469,1.0,4.344646,1.0,4.193605,1.0,4.275508,1.0,4.074383,1.0,4.194037,1.0,3.945635,1.0,4.101774,1.0,3.816974,1.0,4.012225,1.0,3.659551,1.0,3.888248,1.0,3.48043,1.0,3.758163,1.0,0.007337235,0.01519439,0.02500432,0.03568778,0.05946485,0.07894962,0.1132884,23.65779,1.0,16.39117,1.50513,4.0
75%,4527000.0,94301.0,2.0,2.0,1.965502,4.983895,1.0,13.64103,3.0,1.789072,6.954368,0.03404236,0.170214,16.30608,4.4745,1.965502,514.7282,15.18511,4.980925,1.0,4.981339,1.0,4.977415,1.0,4.976431,1.0,4.97326,1.0,4.97201,1.0,4.968365,1.0,4.968431,1.0,4.962587,1.0,4.963984,1.0,4.95575,1.0,4.959055,1.0,4.947678,1.0,4.953309,1.0,0.161212,0.3125768,0.4812711,0.6469432,0.9427237,1.20951,1.521205,52.87211,1.0,41.59759,4.997646,6.0
max,6036000.0,125749.0,2.0,2.0,2.937238,100.0,1.0,64.82099,4.0,4.615121,97.17002,0.2510931,8.298332,91.49965,100.0,2.937238,2723.956,100.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,1.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,1.0,100.0,34.04945,8.0


In [9]:
# Add a copy of u_in rounded to two decimals to both frames.
for frame in (train_df, test_df):
    frame["u_in_round2"] = frame["u_in"].round(2)


In [10]:
# Assign every training row to one of 10 validation folds, keeping all rows of
# a breath in the same fold (grouped by breath_id).
y = train_df["pressure"].values

kf = model_selection.GroupKFold(n_splits=10)

# split() yields POSITIONAL row indices. Collect them in a positional array
# instead of writing through label-based .loc, which is only correct while
# train_df still has a default 0..n-1 index.
fold_of_row = np.full(len(train_df), -1, dtype=np.int64)
for f, (t_, v_) in enumerate(kf.split(X=train_df, y=y, groups=train_df.breath_id.values)):
    print(f)
    fold_of_row[v_] = f

assert (fold_of_row >= 0).all(), "every row must land in exactly one validation fold"
train_df["kfold"] = fold_of_row

0
1
2
3
4
5
6
7
8
9


In [11]:
# Persist the unscaled feature frames (training rows include the new fold column).
for frame, csv_path in (
    (train_df, "../data/train_10folds_nb14.csv"),
    (test_df, "../data/test_nb14.csv"),
):
    frame.to_csv(csv_path, index=False)

In [15]:
# Identifier, target, fold and categorical-code columns that are not model inputs here.
not_used = ["id", "breath_id", "R", "C", "RC", "pressure", "kfold", "time_step"]

# Keep the frame's own column order. The former set difference produced an
# order that changes between kernel sessions (string hashing is randomised),
# which made anything indexed by `used_columns` irreproducible.
used_columns = [c for c in train_df.columns if c not in not_used]

# Columns to be scaled: every used column except those derived from u_out.
norm_feature = sorted(c for c in used_columns if "u_out" not in c)

In [16]:
# Display the sorted list of columns that will be scaled (used columns whose name does not contain "u_out").
norm_feature

['area',
 'breath_id__u_in__diffmax',
 'breath_id__u_in__diffmean',
 'breath_id__u_in__max',
 'cross',
 'cross2',
 'delta',
 'time_delta',
 'u_in',
 'u_in_cummean',
 'u_in_cumsum',
 'u_in_diff1',
 'u_in_diff2',
 'u_in_diff3',
 'u_in_diff4',
 'u_in_diff5',
 'u_in_diff6',
 'u_in_diff7',
 'u_in_kalman',
 'u_in_lag1',
 'u_in_lag2',
 'u_in_lag3',
 'u_in_lag4',
 'u_in_lag5',
 'u_in_lag6',
 'u_in_lag7',
 'u_in_lag_back1',
 'u_in_lag_back2',
 'u_in_lag_back3',
 'u_in_lag_back4',
 'u_in_lag_back5',
 'u_in_lag_back6',
 'u_in_lag_back7',
 'u_in_log1p',
 'u_in_round2']

In [17]:
# Scale the continuous features with statistics learned from the TRAINING rows
# only, then apply that same fitted scaler to the test rows. Re-fitting on the
# test set (the former `fit_transform` call) put train and test on different
# scales, so a model trained on one would see shifted inputs on the other.
# NOTE: this cell overwrites the columns in place, so re-running it without
# re-running the cells above would scale already-scaled values.
RS = RobustScaler()
train_df[norm_feature] = RS.fit_transform(train_df[norm_feature])
test_df[norm_feature] = RS.transform(test_df[norm_feature])



In [18]:
# Inspect the first rows again now that the selected columns have been scaled.
train_df.head(n=5)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure,kfold,u_in_log1p,u_in_kalman,time_delta,delta,area,cross,cross2,u_in_cumsum,u_in_cummean,u_in_lag1,u_out_lag1,u_in_lag_back1,u_out_lag_back1,u_in_lag2,u_out_lag2,u_in_lag_back2,u_out_lag_back2,u_in_lag3,u_out_lag3,u_in_lag_back3,u_out_lag_back3,u_in_lag4,u_out_lag4,u_in_lag_back4,u_out_lag_back4,u_in_lag5,u_out_lag5,u_in_lag_back5,u_out_lag_back5,u_in_lag6,u_out_lag6,u_in_lag_back6,u_out_lag_back6,u_in_lag7,u_out_lag7,u_in_lag_back7,u_out_lag_back7,u_in_diff1,u_in_diff2,u_in_diff3,u_in_diff4,u_in_diff5,u_in_diff6,u_in_diff7,breath_id__u_in__max,breath_id__u_out__max,breath_id__u_in__diffmax,breath_id__u_in__diffmean,RC,u_in_round2
0,1,1,1,2,0.0,-0.937384,0,5.837492,9,-1.100643,1.696041,-15.475938,-0.860663,-0.734997,0.0,-0.665541,-0.725228,-0.648438,-0.864121,0.0,2.890207,0.0,-0.842527,0.0,3.664026,0.0,-0.819258,0.0,3.743915,0.0,-0.794152,0.0,4.277824,0.0,-0.76915,0.0,4.683263,0.0,-0.738446,0.0,4.686223,0.0,-0.703447,0.0,4.653368,0.0,0.471409,0.217993,0.121028,0.068315,0.023523,0.003111,-0.016488,0.112208,1,0.326941,1.273864,0,-0.938998
1,2,1,1,2,0.033652,3.049278,0,5.907794,9,0.878825,2.202961,0.103311,2.86327,-0.683536,0.0,-0.665541,-0.676829,0.162935,-0.84736,0.0,3.739712,0.0,-0.842527,0.0,3.724218,0.0,-0.819258,0.0,4.256189,0.0,-0.794152,0.0,4.661047,0.0,-0.76915,0.0,4.656595,0.0,-0.738446,0.0,4.621744,0.0,-0.703447,0.0,4.866757,0.0,113.467773,58.76267,38.091005,26.30625,18.058084,12.987738,10.056416,0.112208,1,-0.178436,-1.450205,0,3.04793
2,3,1,1,2,0.067514,3.948195,0,7.876254,9,1.011274,2.662374,0.200442,3.727568,-0.620131,0.0,-0.665541,-0.617568,0.555359,2.83314,0.0,3.801381,0.0,-0.825784,0.0,4.236036,0.0,-0.819258,0.0,4.639136,0.0,-0.794152,0.0,4.634403,0.0,-0.76915,0.0,4.59218,0.0,-0.738446,0.0,4.834887,0.0,-0.703447,0.0,4.957266,0.0,25.549576,71.696779,46.652513,32.222406,22.124535,15.915525,12.327665,0.112208,1,-0.292389,-2.06443,0,3.947712
3,4,1,1,2,0.101542,4.013452,0,11.742872,9,1.019963,3.040926,0.277152,3.811377,-0.555568,0.0,-0.665541,-0.557517,0.758212,3.663022,0.0,4.32576,0.0,2.850764,0.0,4.618643,0.0,-0.802501,0.0,4.612511,0.0,-0.794152,0.0,4.570046,0.0,-0.76915,0.0,4.80511,0.0,-0.738446,0.0,4.925291,0.0,-0.703447,0.0,4.665284,0.0,1.812565,14.110409,47.101127,32.65189,22.419739,16.128068,12.492546,0.112208,1,-0.300662,-2.10902,0,4.013072
4,5,1,1,2,0.135756,4.568332,0,12.234987,9,1.089713,3.370717,0.363024,4.361409,-0.483405,0.0,-0.665541,-0.490761,0.925096,3.723268,0.0,4.717757,0.0,3.679756,0.0,4.592042,0.0,2.877119,0.0,4.5482,0.0,-0.777379,0.0,4.782785,0.0,-0.76915,0.0,4.895425,0.0,-0.738446,0.0,4.633647,0.0,-0.703447,0.0,4.644297,0.0,15.75373,9.058184,14.415967,36.184309,24.929863,17.935321,13.894534,0.112208,1,-0.371002,-2.488167,0,4.568627


In [None]:
# Persist the scaled frames under their own "_robust" names. The test frame
# used to be written to "../data/test_nb14.csv", which silently replaced the
# unscaled test file saved earlier in this notebook.
# NOTE(review): any downstream code that expected scaled test rows in
# "../data/test_nb14.csv" must now read "../data/test_nb14_robust.csv".
train_df.to_csv("../data/train_10folds_nb14_robust.csv", index=False)
test_df.to_csv("../data/test_nb14_robust.csv", index=False)