<font color='blue' size=6>**Feature development**</font>

In [1]:
import os
# GPU selection / memory-growth settings. They are written to the process
# environment here, ahead of the `import tensorflow` later in this cell.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "TF_FORCE_GPU_ALLOW_GROWTH": "true",
})
import sys
import sklearn
import scipy
import numpy as np
import pandas as pd
from tqdm import tqdm
import itertools
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import tensorflow as tf
import plotly.express as px
from scipy.stats import mode
import mlflow
from tensorflow.keras import Model
from tensorflow.keras.layers import LSTM, Bidirectional, Input, Dense, BatchNormalization, Conv1D, MaxPooling1D, Flatten, Reshape, TimeDistributed
from tensorflow.keras.layers import GlobalMaxPooling1D, MaxPool1D, LSTM, Bidirectional, Dropout, Concatenate, AveragePooling1D, Concatenate, SpatialDropout1D
from tensorflow.keras.metrics import MeanSquaredError, AUC
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import BatchNormalization, Conv1DTranspose, Dropout
from tensorflow.keras.optimizers import Adam, Adadelta, Adamax
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import TensorBoard
from sklearn.model_selection import GroupShuffleSplit
from joblib import Parallel, delayed
import inspect
import datetime as dt
import tsfel

# Wall-clock reference point taken right after the imports.
global_start = dt.datetime.now()

# Print one "name:version" line per core library (name left-aligned, padded to 15 chars).
for m in (sklearn, pd, np, scipy, plotly, tf, mlflow):
    print("{:15s}:{}".format(m.__name__, m.__version__))

2022-04-29 09:02:07.515309: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


sklearn        :1.0.1
pandas         :1.3.5
numpy          :1.19.5
scipy          :1.7.3
plotly         :5.4.0
tensorflow     :2.4.0
mlflow         :1.24.0


# Source data loading

In [2]:
# All input files sit in ../data relative to the notebook.
TRAINDATA, TESTDATA, TRAINLABELS = (
    os.path.join("..", "data", file_name)
    for file_name in ("train.csv", "test.csv", "train_labels.csv")
)

# Read the three CSV files into dataframes.
train_df = pd.read_csv(TRAINDATA)
train_labels = pd.read_csv(TRAINLABELS)
test_df = pd.read_csv(TESTDATA)

# Column names of the raw sensors: sensor_00 ... sensor_12.
org_feat_names = ["sensor_{:02d}".format(x) for x in range(13)]

# NOTE: as with any top-level loop, `feat` stays defined at notebook level
# after this cell (holding the last sensor name); later cells may read it.
for feat in org_feat_names:
    print(feat)

sensor_00
sensor_01
sensor_02
sensor_03
sensor_04
sensor_05
sensor_06
sensor_07
sensor_08
sensor_09
sensor_10
sensor_11
sensor_12


In [3]:
# Preview the first three rows of the training data.
train_df.head(3)

Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12
0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492
1,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439
2,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147


- single data input is a sequence
- a sequence relates to subject and has 60 timesteps
- the dimensions of the original elementary input are 60 steps x 13 sensors, corresponding to a particular sequence and subject
- subjects in train and test data do not overlap
- there are multiple sequences for a single subject in train and test data

Consequences:
- **group k-fold must be used to cross-validate data**, to avoid using series for the same subject in both train and test data (it's possible that subjects have individual characteristics that influence the sensor results)
- **scalar values may be computed and passed to an independent input**, in addition to the sensor × sequence data
- **some data modifications**, especially those which smooth the sequences should be considered

# Time-based feature creation

In [4]:
# Names of the 13 raw sensor columns that the feature loop below iterates over.
org_feat_names

['sensor_00',
 'sensor_01',
 'sensor_02',
 'sensor_03',
 'sensor_04',
 'sensor_05',
 'sensor_06',
 'sensor_07',
 'sensor_08',
 'sensor_09',
 'sensor_10',
 'sensor_11',
 'sensor_12']

In [5]:
# Per-sequence lagged differences and rolling statistics for every raw sensor.
DIFF_WINDOWS = (3,)  # lags (in steps) used for the forward / reverse differences
ROLL_WINDOWS = (3,)  # window lengths (in steps) used for the rolling mean / variance


def make_time_features(df, feat, diff_windows=DIFF_WINDOWS, roll_windows=ROLL_WINDOWS):
    """Build the time-based features of one sensor column.

    All operations are done within each `sequence` group, so values never
    leak from one sequence into the next.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding a `sequence` column and the column named by `feat`.
    feat : str
        Name of the sensor column to derive features from.
    diff_windows : iterable of int
        Lags for the `<feat>_fwd_diff_<w>` / `<feat>_rew_diff_<w>` columns.
    roll_windows : iterable of int
        Window sizes for the `<feat>_roll_mean_<w>` / `<feat>_roll_var_<w>` columns.

    Returns
    -------
    pd.DataFrame
        New feature columns only, indexed like `df`.
    """
    per_sequence = df.groupby('sequence')[feat]
    feats = pd.DataFrame(index=df.index)

    for window_size in diff_windows:
        # Missing shifted values (sequence edges) are replaced by 0, so the first
        # `window_size` rows of fwd_diff and the last `window_size` rows of
        # rew_diff in each sequence equal the raw sensor value.
        feats[f"{feat}_fwd_diff_{window_size}"] = (
            df[feat] - per_sequence.shift(window_size).fillna(0))
        feats[f"{feat}_rew_diff_{window_size}"] = (
            df[feat] - per_sequence.shift(-window_size).fillna(0))

    for window_size in roll_windows:
        rolling = per_sequence.rolling(window=window_size, min_periods=1)
        # groupby-rolling results carry the group key as an extra index level;
        # dropping it restores the original row index for alignment.
        feats[f"{feat}_roll_mean_{window_size}"] = (
            rolling.mean().reset_index(level=0, drop=True).fillna(0))
        # The variance of a single observation is NaN -> stored as 0.
        feats[f"{feat}_roll_var_{window_size}"] = (
            rolling.var().reset_index(level=0, drop=True).fillna(0))

    return feats


train_parts, test_parts = [], []
progress = tqdm(org_feat_names)
for feat in progress:
    # Label the bar with the sensor currently processed; this no longer relies
    # on a `feat` variable left over from an earlier cell.
    progress.set_description(feat)
    train_parts.append(make_time_features(train_df, feat))
    test_parts.append(make_time_features(test_df, feat))

# Attach all new columns in one go instead of joining inside the loop.
train_df_ = pd.concat([train_df] + train_parts, axis=1)
test_df_ = pd.concat([test_df] + test_parts, axis=1)

sensor_12: 100%|████████████████████████████████| 13/13 [00:44<00:00,  3.42s/it]


In [6]:
# Show the raw sensor_00 column side by side with every column derived from it.
cols_0 = [col for col in train_df_.columns if col.startswith('sensor_00')]
train_df_.loc[:, cols_0].head()

Unnamed: 0,sensor_00,sensor_00_fwd_diff_3,sensor_00_rew_diff_3,sensor_00_roll_mean_3,sensor_00_roll_var_3
0,-0.196291,-0.196291,-0.719474,-0.196291,0.0
1,-0.44745,-0.44745,-0.719474,-0.32187,0.03154
2,0.326893,0.326893,1.400309,-0.105616,0.156068
3,0.523184,0.719474,0.172334,0.134209,0.263378
4,0.272025,0.719474,0.576507,0.374034,0.017437


In [7]:
# Train and test went through the same transformation, so their columns must match.
assert list(train_df_.columns) == list(test_df_.columns), "train/test columns differ"

# shape[1] counts every column of the frame, i.e. the columns that were already
# present in the loaded data plus the newly derived ones.
print(f"Number of features (train): {train_df_.shape[1]}")
print(f"Number of features (test): {test_df_.shape[1]}")

Number of features: 68
Number of features: 68


In [8]:
# Persist the frames with the time-based features.
# NOTE(review): absolute, machine-specific output location — consider a configurable directory.
# NOTE(review): the ".zip" suffix presumably makes pandas compress the pickle — confirm readers use pd.read_pickle.
train_df_.to_pickle(
    "/mnt/workdata/_WORK_/Kaggle_202204/data_1/train_transformed_1_small.zip"
)
test_df_.to_pickle(
    "/mnt/workdata/_WORK_/Kaggle_202204/data_1/test_transformed_1_small.zip"
)

# Scalar features

In [9]:
from scipy.signal import find_peaks

# Sample series: sensor_00 values of sequence 0.
# NOTE(review): presumably kept for ad-hoc experiments; the functions below take
# their own `ts` parameter and do not read this variable.
ts = train_df.loc[train_df['sequence'].eq(0), 'sensor_00']

def extract_base_scalar_features(ts: pd.Series, name: str) -> dict:
    """Summarise one series with a handful of scalar statistics.

    Parameters
    ----------
    ts : pd.Series
        Values to summarise.
    name : str
        Prefix for the keys of the result, e.g. ``"sensor_00"``.

    Returns
    -------
    dict
        ``{name + '_mean': ..., name + '_std': ..., name + '_mean_abs_dev': ...}``.
        (The function returns a plain dict, not a Series.)
    """
    return {
        name+'_mean': ts.mean(),
        name+'_std': ts.std(),
        name+'_mean_abs_dev': tsfel.mean_abs_deviation(ts),
        # Further candidate features, currently switched off:
        # name+'_min': ts.min(),
        # name+'_max': ts.max(),
        # name+'_gap': ts.max() - ts.min(),
        # name+'_med': ts.median(),
        # name+'_med_abs_dev': tsfel.median_abs_deviation(ts),
        # name+'_iqr': tsfel.interq_range(ts),
        # name+'_neg_cnt': len([x for x in ts if x<0]),
        # name+'_pos_cnt': len([x for x in ts if x>0]),
        # name+'_abv_mean': len([x for x in ts if x> ts.mean()]),
        # name+'_bel_mean': len([x for x in ts if x< ts.mean()]),
        # name+'_skew': ts.skew(),
        # name+'_kurt': ts.kurtosis(),
        # find_peaks returns a (peaks, properties) tuple, so the peak count is
        # the length of its first element (len of the tuple itself is always 2):
        # name+'_npeaks': len(find_peaks(ts)[0]),
        # name+"_abs_energy" : tsfel.abs_energy(ts),
        # name+"_auc" : tsfel.auc(ts,1),
        # name+"_centroid" : tsfel.calc_centroid(ts,1),
        # name+"_entropy" : tsfel.entropy(ts),
        # name+"_neg_turn_point" : tsfel.negative_turning(ts),
        # name+"_pos_turn_point" : tsfel.positive_turning(ts),
        # name+"_ptp_distance" : tsfel.pk_pk_distance(ts),
        # name+"_sig_dist" : tsfel.distance(ts),
        # name+"_slope" : tsfel.slope(ts),
        # name+"_sum_abs_diff" : tsfel.sum_abs_diff(ts),
        # name+"_sero_cross" : tsfel.zero_cross(ts),
    }

def compute_sensor_feats(df, sensor):
    """Compute the scalar features of one sensor for every sequence in `df`.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a `sequence` column and a `sensor_<sensor>` column.
    sensor : str
        Sensor suffix, e.g. ``"00"``; the column read is ``"sensor_" + sensor``.

    Returns
    -------
    dict
        Maps each sequence id to the feature dict returned by
        `extract_base_scalar_features` for that sequence.
    """
    column = "sensor_" + sensor
    per_sequence = {}
    # Iterating the grouped column yields (sequence id, values of that sequence).
    for seq_id, series in tqdm(df.groupby('sequence')[column]):
        per_sequence[seq_id] = extract_base_scalar_features(series, column)
    return per_sequence


In [10]:
# generating scalar features for train data

# Two-digit suffixes of the 13 sensor columns: '00' ... '12'.
SENSORNAMES = [f"{idx:02d}" for idx in range(13)]

# One task per sensor, spread over 13 workers; `results` holds one
# {sequence id: feature dict} mapping per entry of SENSORNAMES.
results = Parallel(n_jobs=13)(
    delayed(compute_sensor_feats)(train_df, sensor_id)
    for sensor_id in SENSORNAMES
)

100%|██████████| 25968/25968 [00:08<00:00, 2913.60it/s]

In [11]:
# transforming train data scalar features into a dataframe
train_res = dict(zip(SENSORNAMES, results))

# Each train_res[sensor] maps sequence id -> feature dict, so from_dict gives one
# column per sequence; transposing turns sequences into rows. The per-sensor
# blocks are then placed side by side in a single concat (instead of growing a
# frame inside a loop and using `_` as a working variable).
train_scalar_feats = pd.concat(
    [pd.DataFrame.from_dict(train_res[sensor_id]).T for sensor_id in SENSORNAMES],
    axis=1,
)

# NOTE(review): absolute, machine-specific output location — consider a configurable directory.
train_scalar_feats.to_pickle("/mnt/workdata/_WORK_/Kaggle_202204/data_1/train_scalarfeats_1_small.pkl")
print(train_scalar_feats.shape)
train_scalar_feats.head()

(25968, 39)


Unnamed: 0,sensor_00_mean,sensor_00_std,sensor_00_mean_abs_dev,sensor_01_mean,sensor_01_std,sensor_01_mean_abs_dev,sensor_02_mean,sensor_02_std,sensor_02_mean_abs_dev,sensor_03_mean,...,sensor_09_mean_abs_dev,sensor_10_mean,sensor_10_std,sensor_10_mean_abs_dev,sensor_11_mean,sensor_11_std,sensor_11_mean_abs_dev,sensor_12_mean,sensor_12_std,sensor_12_mean_abs_dev
0,0.041744,1.43166,0.829612,-0.013025,1.509221,1.153697,0.739306,0.1383,0.112968,0.034204,...,0.660276,-0.015075,0.733289,0.620259,0.010391,1.257716,0.967853,-1.286985,8.20688,4.388229
1,-0.069294,1.593907,0.964271,-0.019958,0.949403,0.708158,-1.602773,0.492015,0.423397,0.120162,...,0.800374,0.327065,1.607556,0.855868,0.199914,1.774826,1.089381,-1.604085,50.291288,26.509574
2,-0.001546,1.147526,0.855307,0.025,0.803884,0.596695,-1.008764,0.170335,0.103789,-0.024794,...,0.601582,0.018813,2.249767,1.100152,-0.002932,0.611138,0.482209,0.000945,0.361055,0.24947
3,0.029869,1.769705,1.379164,-0.049177,1.914868,1.24614,0.494766,0.197465,0.178603,0.033481,...,2.022037,0.017394,0.448963,0.303783,-0.02668,1.399494,1.013548,-0.003708,0.241708,0.170745
4,0.059132,2.014779,1.495183,0.014916,1.385251,0.977479,0.9,0.164884,0.146667,-0.056268,...,0.798873,-0.00065,0.342,0.302701,-0.099657,2.894626,2.106692,2.8e-05,0.0516,0.040891


In [12]:
# generating scalar features for test data

# Two-digit suffixes of the 13 sensor columns: '00' ... '12'.
SENSORNAMES = [f"{idx:02d}" for idx in range(13)]

# One task per sensor, spread over 13 workers.
# NOTE: this rebinds `results`, which previously held the train-data output.
results = Parallel(n_jobs=13)(
    delayed(compute_sensor_feats)(test_df, sensor_id)
    for sensor_id in SENSORNAMES
)



  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
  0%|          | 0/12218 [00:00<?, ?it/s]
100%|██████████| 12218/12218 [00:04<00:00, 2913.57it/s]

In [13]:
# transforming test data scalar features into a dataframe
test_res = dict(zip(SENSORNAMES, results))

# Each test_res[sensor] maps sequence id -> feature dict, so from_dict gives one
# column per sequence; transposing turns sequences into rows. The per-sensor
# blocks are then placed side by side in a single concat (instead of growing a
# frame inside a loop and using `_` as a working variable).
test_scalar_feats = pd.concat(
    [pd.DataFrame.from_dict(test_res[sensor_id]).T for sensor_id in SENSORNAMES],
    axis=1,
)

# NOTE(review): absolute, machine-specific output location — consider a configurable directory.
test_scalar_feats.to_pickle("/mnt/workdata/_WORK_/Kaggle_202204/data_1/test_scalarfeats_1_small.pkl")
print(test_scalar_feats.shape)
test_scalar_feats.head()

(12218, 39)


Unnamed: 0,sensor_00_mean,sensor_00_std,sensor_00_mean_abs_dev,sensor_01_mean,sensor_01_std,sensor_01_mean_abs_dev,sensor_02_mean,sensor_02_std,sensor_02_mean_abs_dev,sensor_03_mean,...,sensor_09_mean_abs_dev,sensor_10_mean,sensor_10_std,sensor_10_mean_abs_dev,sensor_11_mean,sensor_11_std,sensor_11_mean_abs_dev,sensor_12_mean,sensor_12_std,sensor_12_mean_abs_dev
25968,-0.002602,1.091741,0.732741,-0.036432,4.499484,1.787575,0.883711,0.29129,0.236299,0.021195,...,0.655155,0.040895,0.855084,0.527249,0.025977,2.892104,1.429141,-0.000945,0.369897,0.285707
25969,0.028516,1.45004,0.848916,0.002276,0.775379,0.625245,0.272902,0.18452,0.133818,0.008776,...,0.526072,0.026936,0.402218,0.333322,-0.001423,0.925089,0.66475,0.02429,0.659728,0.56288
25970,0.057664,5.083618,4.023308,-0.021796,2.903657,2.412885,-0.617986,0.116012,0.081883,-0.034292,...,1.337068,-0.098473,2.955076,2.621078,-0.046451,2.846617,2.126154,0.071284,19.0278,16.620936
25971,-0.004791,0.914917,0.687991,0.014863,0.950923,0.717209,-0.580709,0.142112,0.12247,-0.030841,...,0.400218,0.021339,0.324061,0.241013,-0.001886,0.545613,0.403864,0.021114,0.889882,0.758959
25972,-0.001443,2.783685,1.083572,-0.02535,23.942241,6.486993,-0.164191,0.160825,0.156203,-0.021209,...,1.257703,-0.107817,2.865205,2.258879,-0.015912,13.526715,3.776632,12.399581,153.174703,127.454656















