In [1]:
import os

import category_encoders as ce
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from IPython.display import display
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GroupKFold, KFold, train_test_split
from sklearn.preprocessing import (KBinsDiscretizer, OrdinalEncoder,
                                   RobustScaler, normalize)
from sklearn import model_selection


# Raise pandas' display limits (row count, column count, line width) in one
# call; set_option accepts alternating option-name / value arguments.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)


In [5]:
# Read the training CSV and the test CSV (paths are relative to the
# notebook's working directory) into `df` and `test_df` respectively.
df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train_5folds.csv", "../data/test.csv")
)

In [None]:
def add_features(df, transform="robust"):
    """Return a copy of ``df`` extended with engineered features.

    All grouped features are computed within each ``breath_id`` group, in the
    row order in which the frame is passed in.

    Features added (in this column order):
      * ``u_in_log1p``, ``u_in_power``: log1p and square of ``u_in``.
      * ``time_delta``: within-group difference of ``time_step`` (0 for the
        first row of each group); ``delta`` = ``time_delta * u_in``;
        ``area``: within-group cumulative sum of ``delta``.
      * ``cross`` = ``u_in * u_out``; ``cross2`` = ``time_step * u_out``.
      * ``u_in_cumsum`` / ``u_in_cummean``: within-group running sum and
        running mean of ``u_in``.
      * ``u_in_lag{k}``, ``u_in_lag_back{k}`` for k in 1, 2, 3, 4, 8, 10, 15,
        20, 30, 40, 60, 75 (shift by +k / -k inside the group), plus the
        ``u_out`` equivalents for k in 1, 2, 3, 4, 8.
      * ``u_in_diff{k}`` = ``u_in - u_in_lag{k}`` for every k above.
      * ``breath_id__u_in__max``, ``breath_id__u_out__max``,
        ``breath_id__u_in__diffmax``, ``breath_id__u_in__diffmean``:
        per-group max / mean broadcast back to the rows.
      * ``RC``: integer code of the ``"<R>_<C>"`` string; ``R`` and ``C`` are
        themselves replaced by integer codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``breath_id``, ``time_step``, ``u_in``, ``u_out``, ``R``
        and ``C``. The frame is NOT modified; a copy is made up front.
    transform : str, optional
        Currently ignored. Kept only so existing callers that pass it keep
        working.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the features above.

    Notes
    -----
    * Every NaN in the frame (not only in the new lag columns) is replaced by
      0 before the diff features are computed, so the diff for a row with no
      predecessor equals ``u_in`` itself.
    * ``R`` / ``C`` values that are not keys of the maps below become NaN
      (``Series.map`` semantics). The ``RC`` key is built with
      ``astype(str)``, so ``R`` and ``C`` need an integer dtype for keys such
      as ``'20_50'`` to match.
    """
    # Lags for which both u_in and u_out are shifted.
    short_lags = (1, 2, 3, 4, 8)
    # Lags for which only u_in is shifted.
    long_lags = (10, 15, 20, 30, 40, 60, 75)

    r_map = {5: 0, 20: 1, 50: 2}
    c_map = {10: 0, 20: 1, 50: 2}
    rc_map = {'20_50': 0, '20_20': 1, '50_20': 2, '50_50': 3, '5_50': 4, '5_20': 5, '50_10': 6, '20_10': 7, '5_10': 8}

    # Work on a private copy so the caller's frame is never mutated (the
    # previous implementation leaked partial features and scratch columns
    # into the argument).
    df = df.copy()
    breath_key = df['breath_id']

    df["u_in_log1p"] = np.log1p(df["u_in"])
    df["u_in_power"] = np.power(df["u_in"], 2)

    df['time_delta'] = df['time_step'].groupby(breath_key).diff().fillna(0)
    df['delta'] = df['time_delta'] * df['u_in']
    df['area'] = df['delta'].groupby(breath_key).cumsum()

    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Build the grouped views once and reuse them for every shift below.
    u_in_by_breath = df['u_in'].groupby(breath_key)
    u_out_by_breath = df['u_out'].groupby(breath_key)

    df['u_in_cumsum'] = u_in_by_breath.cumsum()
    # cumcount() is 0-based, so +1 gives the 1-based position within the
    # group without needing temporary 'one' / 'count' columns.
    df['u_in_cummean'] = df['u_in_cumsum'] / (u_in_by_breath.cumcount() + 1)

    for k in short_lags:
        df[f'u_in_lag{k}'] = u_in_by_breath.shift(k)
        df[f'u_out_lag{k}'] = u_out_by_breath.shift(k)
        df[f'u_in_lag_back{k}'] = u_in_by_breath.shift(-k)
        df[f'u_out_lag_back{k}'] = u_out_by_breath.shift(-k)

    for k in long_lags:
        df[f'u_in_lag{k}'] = u_in_by_breath.shift(k)
        df[f'u_in_lag_back{k}'] = u_in_by_breath.shift(-k)

    # Shifts leave NaN at group edges; zero-fill (this fills the whole frame).
    df = df.fillna(0)

    for k in short_lags + long_lags:
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']

    df = df.fillna(0)

    # fillna() returned a new frame, so rebuild the grouped views from it.
    breath_key = df['breath_id']
    u_in = df['u_in']
    u_in_by_breath = u_in.groupby(breath_key)
    u_in_max = u_in_by_breath.transform('max')

    df['breath_id__u_in__max'] = u_in_max
    df['breath_id__u_out__max'] = df['u_out'].groupby(breath_key).transform('max')
    df['breath_id__u_in__diffmax'] = u_in_max - u_in
    df['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - u_in

    # The composite key must be built from the raw R / C values, i.e. before
    # they are replaced by their codes.
    df['RC'] = df['R'].astype(str) + '_' + df['C'].astype(str)

    df['R'] = df['R'].map(r_map)
    df['C'] = df['C'].map(c_map)

    df['RC'] = df['RC'].map(rc_map)
    return df
