In [None]:
import numpy as np 
import pandas as pd
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = True)
import seaborn as sns

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

import optuna

from tqdm import tqdm
tqdm.pandas()

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.layers import Concatenate, LSTM, GRU
from tensorflow.keras.layers import Bidirectional, Multiply

from xgboost import XGBClassifier

# Seed NumPy and TensorFlow so repeated runs draw the same random numbers.
np.random.seed(2022)
tf.random.set_seed(2022)

# Never truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
#########################################################
# Competition files, in order: training rows, training labels, test rows,
# and the sample submission that is later used as the output template.
train, t_lbls, test, ss = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
        '../input/tabular-playground-series-apr-2022/test.csv',
        '../input/tabular-playground-series-apr-2022/sample_submission.csv',
    )
)

# EDA

In [None]:
# Preview the first three rows of the raw training frame.
train.head(3)

In [None]:
# Compute the headline numbers first, then print them.
# Sequence counts are derived by dividing the row count by 60.
n_train_seqs = int(len(train)/60)
n_test_seqs = int(len(test)/60)
train_missing = train.isna().sum().sum()
test_missing = test.isna().sum().sum()
state_counts = t_lbls["state"].value_counts()
pct_state_1 = round(state_counts[1]/len(t_lbls)*100,2)
pct_state_0 = round(state_counts[0]/len(t_lbls)*100,2)

print('DATA INFORMATION')
print()
print('Count of sequences:')
print(f'train - {n_train_seqs} | test - {n_test_seqs}')
print()
print('Missing values:')
print(f'train - {train_missing} | test - {test_missing}')
print()
print('Distribution of target:')
print(f'"1" - {pct_state_1}% | "0" - {pct_state_0}%')

# Same mean/min/max table for both frames; columns from the fourth onward are summarised.
for heading, frame in (('Train features', train), ('Test features', test)):
    print()
    print('-'*39)
    print()
    print(heading)
    summary = frame[frame.columns.tolist()[3:]].describe().transpose()[['mean', 'min', 'max']]
    display(summary.style.background_gradient(cmap='Blues'))

In [None]:
# Plot three example sequences per target class: one row of panels per sensor,
# one column per example sequence (class 0 in red, class 1 in blue).

# Set the style once, before any axes exist, so the first panel gets it too.
sns.set_style("white")

seqs = (list(t_lbls[t_lbls['state']==0]['sequence'][:3])
        + list(t_lbls[t_lbls['state']==1]['sequence'][:3]))
colors = ['#c21b1b', '#c21b1b', '#c21b1b', '#21a5de', '#21a5de', '#21a5de']
sensors = train.columns.tolist()[3:]

# Grid dimensions follow the data instead of the previous hard-coded 13 x 6.
n_rows, n_cols = len(sensors), len(seqs)

# Slice each example sequence once rather than re-filtering `train` for every panel.
seq_rows = {seq: train[train['sequence']==seq] for seq in seqs}

fig = plt.figure(figsize = (15, 20))
i = 0
for sensor in sensors:
    for color, seq in zip(colors, seqs):
        i += 1
        plt.subplot(n_rows, n_cols, i)
        if i <= n_cols:
            # Only the top row carries the sequence id as a title.
            plt.title(f"Sequence {seq}", size = 12, fontname = 'monospace')
        sns.lineplot(data=seq_rows[seq][sensor], color = color, linewidth = 1)
        plt.xlabel('')
        plt.ylabel('')
        if (i-1) % n_cols == 0:
            # Only the left-most column carries the sensor name.
            plt.ylabel(sensor, size = 12, fontname = 'monospace')
        plt.xticks([])
        plt.yticks([])

fig.tight_layout(h_pad = 3)

plt.figtext(0.5, 1.05, 'Sequences examples', fontsize = 23, fontname = 'monospace', ha='center')
plt.figtext(0.22, 1.03, 'Target 0', fontsize = 20, fontname = 'monospace', color = '#c21b1b')
plt.figtext(0.71, 1.03, 'Target 1', fontsize = 20, fontname = 'monospace', color = '#21a5de')

plt.show()

# Preprocessing

**For DNN model**

In [None]:
# Columns to engineer: everything from the fourth column onward.
# Assumes the first three columns are the identifiers (sequence, subject, step) -- TODO confirm.
features = train.columns.tolist()[3:]
def prep(df, feature_cols=None):
    """Add lag-1 and first-difference columns to `df`, in place.

    For every feature ``f`` two columns are appended:

    * ``f_lag1``  -- the previous row's value of ``f`` within the same
      ``sequence`` (0 for the first row of each sequence);
    * ``f_diff1`` -- ``f - f_lag1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``sequence`` column and every feature column.
        It is modified in place.
    feature_cols : iterable of str, optional
        Columns to derive from. Defaults to the module-level ``features``
        list, which is what the existing ``prep(train)`` calls rely on.

    Returns
    -------
    None
    """
    if feature_cols is None:
        feature_cols = features
    # Snapshot the names: columns are appended to `df` while we iterate.
    feature_cols = list(feature_cols)

    for feature in feature_cols:
        lag_col = feature + '_lag1'
        # Fill only the NaNs created by the shift. The old whole-frame
        # `fillna(0, inplace=True)` ran once per feature and would also
        # have zeroed any genuine missing values elsewhere in `df`.
        df[lag_col] = df.groupby('sequence')[feature].shift(1).fillna(0)
        df[feature + '_diff1'] = df[feature] - df[lag_col]

# Append the lag/diff columns to both frames (prep mutates its argument).
prep(train)
prep(test)

# Re-read the column list so the freshly added lag/diff columns are scaled too.
features = train.columns.tolist()[3:]
# Standardise with statistics fitted on train only, then apply the same transform to test.
sc = StandardScaler()
train[features] = sc.fit_transform(train[features])
test[features] = sc.transform(test[features])

# Captured now because `train` stops being a DataFrame a few lines below.
# `groups` has one entry per row of the long frame, not one per sequence.
groups = train["sequence"]
labels = t_lbls["state"]

# Drop the identifier columns and view the rows as blocks of 60 consecutive rows:
# the result has shape (-1, 60, n_columns).
# Assumes rows are ordered so each sequence occupies 60 consecutive rows -- TODO confirm.
# NOTE(review): from here on `train` and `test` are NumPy arrays. The XGB preprocessing
# cell further down indexes them as DataFrames (df['sequence'], test.columns) and then
# rebinds both names again, while the DNN cells need the arrays. The two pipelines
# therefore cannot both run top to bottom in one kernel without separate variable names.
train = train.drop(["sequence", "subject", "step"], axis=1).values
train = train.reshape(-1, 60, train.shape[-1])

test = test.drop(["sequence", "subject", "step"], axis=1).values
test = test.reshape(-1, 60, test.shape[-1])

**For XGB model**

In [None]:
def _group_stats(df, key, sensor):
    """Return mean/max/min/var/mad/sum/median/skew of `sensor` per `key` group."""
    grouped = df.groupby(key)[sensor]
    stats = grouped.agg(['mean', 'max', 'min', 'var', 'sum', 'median', 'skew'])
    # Mean absolute deviation around the group mean, computed explicitly because
    # the 'mad' aggregation alias no longer exists in pandas >= 2.0.
    deviation = (df[sensor] - grouped.transform('mean')).abs()
    stats['mad'] = deviation.groupby(df[key]).mean()
    # Keep the historical column order (mad sits between var and sum).
    return stats[['mean', 'max', 'min', 'var', 'mad', 'sum', 'median', 'skew']]


def prep(df):
    """Build one row of summary features per sequence.

    For every sensor column of `df` (every column other than ``sequence``,
    ``subject`` and ``step``) eight statistics are computed twice:

    * per ``sequence``  -> columns named ``<sensor>_<stat>``;
    * per ``subject``   -> columns named ``<sensor>_subject_<stat>``,
      broadcast to every sequence of that subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with ``sequence`` and ``subject`` columns plus
        the sensor columns.

    Returns
    -------
    pandas.DataFrame
        ``sequence``, ``subject`` and the aggregate columns, with one row per
        distinct (sequence, subject) pair in order of first appearance.
    """
    # Sensor columns come from `df` itself; the old code read them from the
    # global `test` frame, coupling the function to notebook state.
    id_cols = ('sequence', 'subject', 'step')
    sensors = [col for col in df.columns if col not in id_cols]

    # Start from the distinct id pairs so every merge below works on one row
    # per sequence instead of one row per time step.
    result = df[['sequence', 'subject']].drop_duplicates().reset_index(drop=True)

    for sensor in sensors:
        aggs = _group_stats(df, 'sequence', sensor)
        aggs.columns = [f'{sensor}_{stat}' for stat in aggs.columns]
        result = result.merge(aggs.reset_index(), on='sequence', how='left')

        aggs = _group_stats(df, 'subject', sensor)
        aggs.columns = [f'{sensor}_subject_{stat}' for stat in aggs.columns]
        result = result.merge(aggs.reset_index(), on='subject', how='left')

    return result

# Aggregate both frames, then remove any repeated rows left by the aggregation.
train = prep(train).drop_duplicates()
test = prep(test).drop_duplicates()

# Everything after the two leading id columns is a model feature.
features = list(train.columns[2:])

# Fit the scaler on the training features, then apply it to both sets.
sc = StandardScaler()
sc.fit(train[features])
train[features] = sc.transform(train[features])
test[features] = sc.transform(test[features])

# Model inputs: feature matrices plus the label column from train_labels.
X, test = train[features], test[features]
y = t_lbls['state']

# XGB

In [None]:
def objective(trial, data = X, target = y):
    """Optuna objective: mean 5-fold validation AUC of an XGBClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the sampled hyper-parameters.
    data : pandas.DataFrame
        Feature matrix. The default is the module-level ``X`` as it existed
        when this function was defined.
    target : pandas.Series
        Binary labels aligned positionally with ``data``. Defaults to ``y``.

    Returns
    -------
    float
        Mean ROC AUC over the five stratified folds.

    Notes
    -----
    Each fold's validation split is used both for early stopping and for the
    reported AUC, so the returned score is optimistic.
    """
    params = {
        'n_estimators': 10000,
        'max_depth': trial.suggest_int('max_depth', 2, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'min_child_weight': trial.suggest_int('min_child_weight', 10, 300),
        'gamma': trial.suggest_float('gamma', 0.0001, 1.0, log = True),
        'alpha': trial.suggest_float('alpha', 0.0001, 10.0, log = True),
        'lambda': trial.suggest_float('lambda', 0.0001, 10.0, log = True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.8),
        'subsample': trial.suggest_float('subsample', 0.1, 0.8),
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 2022,
        'use_label_encoder': False,
        'eval_metric': 'auc'
    }

    scores = []
    k = StratifiedKFold(n_splits = 5, random_state = 228, shuffle = True)
    # Split the arguments, not the globals X / y: previously `data` and
    # `target` were accepted but silently ignored.
    for fold, (train_idx, val_idx) in enumerate(k.split(data, target)):

        X_train, X_val = data.iloc[train_idx], data.iloc[val_idx]
        y_train, y_val = target.iloc[train_idx], target.iloc[val_idx]

        # A fresh estimator per fold, matching the final cross-validation cell.
        model = XGBClassifier(**params)
        model.fit(X_train, y_train, eval_set = [(X_val, y_val)], early_stopping_rounds = 50, verbose = False)

        val_preds = model.predict_proba(X_val)[:,1]
        val_score = roc_auc_score(y_val, val_preds)

        scores.append(val_score)

        print(f"Fold {fold+1} | AUC: {val_score}")

    return np.mean(scores)

# Run the hyper-parameter search (higher AUC is better) and report the winner.
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 150)

for caption, value in (
    ('Number of finished trials:', len(study.trials)),
    ('Best trial:', study.best_trial.params),
    ('Best value:', study.best_value),
):
    print(caption, value)

In [None]:
# XGBClassifier settings for the final cross-validation run.
best_params = {
    # Fixed settings.
    **{
        'n_estimators': 10000,
        'tree_method': 'gpu_hist',
        'booster': 'gbtree',
        'random_state': 2022,
        'use_label_encoder': False,
        'eval_metric': 'auc',
    },
    # Tuned values -- presumably copied from an earlier run of the Optuna study; verify.
    **{
        'max_depth': 5,
        'learning_rate': 0.013430615331501902,
        'min_child_weight': 11,
        'gamma': 0.2507452781368943,
        'alpha': 0.00010885805925508797,
        'lambda': 0.017572701090619865,
        'colsample_bytree': 0.5450566395055472,
        'subsample': 0.6223621722492708,
    },
}

In [None]:
# 10-fold stratified CV: one model per fold, scored on its held-out part and
# used to predict the test set; the per-fold test predictions are averaged.
predictions, scores = [], []
k = StratifiedKFold(n_splits = 10, random_state = 2022, shuffle = True)

for fold_no, (train_idx, val_idx) in enumerate(k.split(X, y), start = 1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(**best_params)
    # The validation fold doubles as the early-stopping set.
    model.fit(X_train, y_train, eval_set = [(X_val, y_val)], verbose = False, early_stopping_rounds = 50)

    val_preds = model.predict_proba(X_val)[:,1]
    val_score = roc_auc_score(y_val, val_preds)
    print(f'Fold {fold_no} AUC: {round(val_score, 4)}')

    scores.append(val_score)
    predictions.append(model.predict_proba(test)[:,1])

print('------------------')
print(f'Mean AUC - {round(np.mean(scores), 4)}')

# Equal-weight average of the fold models' test probabilities.
predictions = sum(predictions)/k.n_splits

**Mean AUC on 10 folds - 0.9391**

**LB score - 0.917**

In [None]:
# Write the fold-averaged XGB probabilities into the sample-submission frame and save it.
ss["state"] = predictions
ss.to_csv('submission11_XGB.csv', index=False)

# DNN model

In [None]:
# Pick the distribution strategy: a TPU when one can be resolved, otherwise
# TensorFlow's current default strategy. Both branches bind `tpu_strategy`,
# because the training cell enters `tpu_strategy.scope()` unconditionally.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    # Scale the global batch with the number of replicas (64 per replica).
    BATCH_SIZE = tpu_strategy.num_replicas_in_sync * 64
    print("Running on TPU:", tpu.master())
    print(f"Batch Size: {BATCH_SIZE}")

except ValueError:
    # No TPU could be set up (surfaced here as ValueError): fall back.
    strategy = tf.distribute.get_strategy()
    # Previously only `strategy` was defined in this branch, so the later
    # `with tpu_strategy.scope():` raised NameError on CPU/GPU machines.
    tpu_strategy = strategy
    BATCH_SIZE = 256
    print(f"Running on {strategy.num_replicas_in_sync} replicas")
    print(f"Batch Size: {BATCH_SIZE}")

In [None]:
def dnn_model():
    """Build the (uncompiled) stacked bidirectional-LSTM classifier.

    The input shape is taken from the last two dimensions of the global
    ``train`` array, so ``train`` must already be the 3-D array built by the
    DNN preprocessing cell. Every recurrent layer returns full sequences;
    their outputs are concatenated along the last axis, max-pooled, and
    passed through a Dense(128, selu) -> Dropout(0.2) -> Dense(1, sigmoid) head.

    Returns
    -------
    tensorflow.keras.Model
        A functional model named ``'lstm_model'``.
    """

    x_input = Input(shape=(train.shape[-2:]))
    # Level 1: a single BiLSTM over the raw input.
    x1 = Bidirectional(LSTM(768, return_sequences=True))(x_input)
        
    # Level 2: one branch stacked on level 1, one branch reading the raw input again.
    x21 = Bidirectional(LSTM(512, return_sequences=True))(x1)
    x22 = Bidirectional(LSTM(512, return_sequences=True))(x_input)
    l2 = Concatenate(axis=2)([x21, x22])
        
    # Level 3: one branch on the merged level-2 features, one on x21 alone.
    x31 = Bidirectional(LSTM(384, return_sequences=True))(l2)
    x32 = Bidirectional(LSTM(384, return_sequences=True))(x21)
    l3 = Concatenate(axis=2)([x31, x32])
        
    # Level 4: same pattern. NOTE(review): the two branches use different
    # widths (256 vs 128), unlike levels 2 and 3 -- confirm this is intended.
    x41 = Bidirectional(LSTM(256, return_sequences=True))(l3)
    x42 = Bidirectional(LSTM(128, return_sequences=True))(x32)
    l4 = Concatenate(axis=2)([x41, x42])
        
    # Concatenate the outputs of every level so the head sees all of them,
    # then reduce with a global max-pool.
    l5 = Concatenate(axis=2)([x1, l2, l3, l4])
    g = GlobalMaxPooling1D()(l5)
    x7 = Dense(128, activation='selu')(g)
    x8 = Dropout(0.2)(x7)
    # Single sigmoid unit: the predicted probability for the binary target.
    x_output = Dense(units=1, activation="sigmoid")(x8)
    
    model = Model(inputs=x_input, outputs=x_output, name='lstm_model')
    
    return model

# Build one instance just to render the architecture diagram to Super_Model.png.
model = dnn_model()

plot_model(
    model, 
    to_file='Super_Model.png', 
    show_shapes=False,
    show_layer_names=True
)

In [None]:
# 15-fold cross-validated training of the DNN. Each fold builds and compiles a
# fresh model, trains with LR reduction, early stopping and best-only
# checkpointing, reloads the saved checkpoint, then scores the held-out fold
# and predicts the test set.
with tpu_strategy.scope():
    VERBOSE = True
    predictions, scores = [], []
    k = GroupKFold(n_splits = 15)

    # NOTE(review): the groups passed here are the *unique* sequence ids, i.e. one
    # distinct group per sample (assuming one row of `train` per sequence). With
    # singleton groups GroupKFold imposes no grouping constraint -- if the intent
    # was to keep each subject's sequences in one fold, the groups should be
    # subject ids. Confirm.
    for fold, (train_idx, val_idx) in enumerate(k.split(train, labels, groups.unique())):
        print('-'*15, '>', f'Fold {fold+1}', '<', '-'*15)
    
        X_train, X_val = train[train_idx], train[val_idx]
        y_train, y_val = labels.iloc[train_idx].values, labels.iloc[val_idx].values
        
        model = dnn_model()
        model.compile(optimizer="adam", loss="binary_crossentropy", metrics='AUC')

        # Multiply the learning rate by 0.5 after 2 epochs without a better val_auc.
        lr = ReduceLROnPlateau(monitor="val_auc", factor=0.5, 
                               patience=2, verbose=VERBOSE, mode="max")

        # Stop after 7 epochs without a better val_auc and restore the best weights.
        es = EarlyStopping(monitor="val_auc", patience=7, 
                           verbose=VERBOSE, mode="max", 
                           restore_best_weights=True)
        
        # Save/load options pin checkpoint I/O to '/job:localhost'
        # (presumably required when running on a TPU worker -- verify).
        save_locally = tf.saved_model.SaveOptions(experimental_io_device='/job:localhost')
        chk_point = ModelCheckpoint(f'./TPS_model_2022_{fold+1}C.h5', options=save_locally, 
                                    monitor='val_auc', verbose=VERBOSE, 
                                    save_best_only=True, mode='max')
        
        model.fit(X_train, y_train, 
                  validation_data=(X_val, y_val), 
                  epochs=20,
                  verbose=VERBOSE,
                  batch_size=BATCH_SIZE, 
                  callbacks=[lr, chk_point, es])
        
        # Replace the in-memory model with the checkpoint written for this fold.
        load_locally = tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
        model = load_model(f'./TPS_model_2022_{fold+1}C.h5', options=load_locally)
        
        # Out-of-fold AUC for this fold, plus this fold's test-set predictions.
        y_pred = model.predict(X_val, batch_size=BATCH_SIZE).squeeze()
        score = roc_auc_score(y_val, y_pred)
        scores.append(score)
        predictions.append(model.predict(test, batch_size=BATCH_SIZE).squeeze())
        print(f"Fold-{fold+1} | OOF Score: {score}")
    
    print(f'Mean AUC on {k.n_splits} folds - {np.mean(scores)}')

**Mean AUC on 15 folds - 0.9718**

**LB score - 0.968**

In [None]:
# Equal-weight average of the per-fold DNN test predictions, saved as a submission.
fold_average = sum(predictions)/k.n_splits
ss["state"] = fold_average
ss.to_csv('submission12.csv', index=False)

# Blending

Weighted blend of three submissions: the XGB model and two DNN models (one of them taken from a public Bi-LSTM notebook).

In [None]:
# Previously saved submissions that feed the blend.
s1, s2, s3 = (
    pd.read_csv(submission_path)
    for submission_path in (
        '../input/tps-apr/s_0.968.csv',
        '../input/tps-apr/s_0.97.csv',  # the result from https://www.kaggle.com/code/hamzaghanmi/tps-april-tensorflow-bi-lstm
        '../input/tps-apr/submission11_XGB.csv',
    )
)

In [None]:
# Weighted sum of the three submissions' probabilities (weights 0.3 / 0.45 / 0.25).
# Rows are combined by index, so the three files are assumed to share row order.
weighted_s1 = s1['state']*0.3
weighted_s2 = s2['state']*0.45
weighted_s3 = s3['state']*0.25
ss['state'] = weighted_s1 + weighted_s2 + weighted_s3
ss.to_csv('blend_sub12.csv', index=False)
ss