In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
import os
import gc



In [2]:
train = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')
print(train.shape)
print(test.shape)

sub = pd.read_csv("../input/tabular-playground-series-apr-2022/sample_submission.csv")
print(sub.shape)
labels = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
print('label', labels.shape)

(1558080, 16)
(733080, 16)
(12218, 2)
label (25968, 2)


In [3]:
features = [c for c in train.columns if 'sensor' in c]

# adding labels to train data
train = pd.merge(train, labels,how='left', on="sequence")
train.head()

Unnamed: 0,sequence,subject,step,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12,state
0,0,47,0,-0.196291,0.112395,1.0,0.329204,-1.00466,-0.131638,-0.127505,0.368702,-0.1,-0.963873,-0.985069,0.531893,4.751492,0
1,0,47,1,-0.44745,0.134454,1.0,-0.658407,0.162495,0.340314,-0.209472,-0.867176,0.2,-0.301301,0.082733,-0.231481,0.45439,0
2,0,47,2,0.326893,-0.694328,1.0,0.330088,0.473678,1.280479,-0.094718,0.535878,1.4,1.002168,0.449221,-0.58642,-4.736147,0
3,0,47,3,0.523184,0.75105,1.0,0.976991,-0.563287,-0.720269,0.79326,0.951145,-0.3,-0.995665,-0.43429,1.34465,0.429241,0
4,0,47,4,0.272025,1.07458,1.0,-0.136283,0.398579,0.044877,0.560109,-0.541985,-0.9,1.055636,0.812631,0.123457,-0.223359,0


# FE

In [4]:
def addFeatures(df):  
    
    for f in features:
        df[f + '_lag1'] = df.groupby('sequence')[f].shift(1).fillna(0).astype('float32')
        
        #df[f + '_lag_back1'] = df.groupby('sequence')[f].shift(-1).fillna(0).astype('float32')
        #df[f + '_cumsum'] = df.groupby('sequence')[f].cumsum().astype('float32')
        df[f + '_diff1'] = (df[f] - df[f + '_lag1'].astype('float32')
    return df

train = addFeatures(train)
test = addFeatures(test)

print(train.shape)
print(test.shape)
train.head()

SyntaxError: invalid syntax (276878481.py, line 9)

In [None]:
Window = 60

target = train.state.values

y = train['state'].to_numpy().reshape(-1, Window)
train.drop(["sequence","step","subject","state"], axis=1, inplace=True)
test.drop(["sequence","step","subject"], axis=1, inplace=True)
y

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler
sc = StandardScaler()

train = sc.fit_transform(train)
test = sc.transform(test)

In [None]:
train = train.reshape(-1, Window, train.shape[-1])
test = test.reshape(-1, Window, train.shape[-1])
print(train.shape)

In [None]:
import tensorflow as tf
import logging
from tensorflow.keras import *

In [None]:
# Detect hardware, return appropriate distribution strategy
print(tf.version.VERSION)
tf.get_logger().setLevel(logging.ERROR)
try: # detect TPU
    tpu = None
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError: # detect GPU(s) and enable mixed precision
    strategy = tf.distribute.MirroredStrategy() # works on GPU and multi-GPU
    policy = tf.keras.mixed_precision.experimental.Policy('mixed_float16')
    tf.config.optimizer.set_jit(True) # XLA compilation
    tf.keras.mixed_precision.experimental.set_policy(policy)
    print('Mixed precision enabled')
print("REPLICAS: ", strategy.num_replicas_in_sync)

# LSTM

In [None]:
def plotHist(hist):
    plt.plot(hist.history["auc"])
    plt.plot(hist.history["val_auc"])
    plt.title("model performance")
    plt.ylabel("area_under_curve")
    plt.xlabel("epoch")
    plt.legend(["train", "validation"], loc="upper left")
    plt.show()

In [None]:
from tensorflow.keras.metrics import AUC

def createModel():
    with strategy.scope():
        model = Sequential(
        [
            Input(shape = (Window, train.shape[-1])),
            Bidirectional(LSTM(768, return_sequences = True)),
            Bidirectional(LSTM(512, return_sequences = True)), 
            Bidirectional(LSTM(256, return_sequences = True)),
            Bidirectional(LSTM(128, return_sequences = True)),
            Dense(128, activation = 'selu'),
            Dense(1, activation = 'sigmoid')
        ]
        )
        model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = [AUC(name = 'auc')]
                     )
    return (model)

In [None]:
%%time

from sklearn.model_selection import KFold, StratifiedKFold
from tensorflow.keras.layers import *
from tensorflow.keras import *
from tensorflow.keras.callbacks import *

skf = KFold(n_splits = 5, shuffle = True, random_state = 2022)
preds = []

for fold, (train_idx, val_idx) in enumerate(skf.split(train, y)):
    print('fold', fold+1)
    print('*'*40)
    
    X_train, X_valid = train[train_idx], train[val_idx]
    y_train, y_valid = y[train_idx], y[val_idx]
    
    es = EarlyStopping(monitor = 'val_auc', mode = 'max', patience = 5, verbose = 1, restore_best_weights = True)
    lr = ReduceLROnPlateau(monitor="val_auc", factor=0.8, 
                               patience=4, verbose = 1)
    model = createModel()
    
    history = model.fit(X_train, y_train, validation_data = (X_valid, y_valid),
                       epochs = 50, batch_size = 256, callbacks = [es, lr])
    
    preds.append(model.predict(test).squeeze())
    plotHist(history)
    
    del X_train, X_valid, y_train, y_valid, model, history
    gc.collect()
    

# Submission

In [None]:
sub["state"] = sum(preds)/skf.n_splits 
sub.to_csv('submission.csv', index=False)
sub