In [1]:
import numpy as np
from sklearn.preprocessing import RobustScaler, OneHotEncoder
import pickle as pk
import pandas as pd
from tensorflow import keras
import scipy 
np.random.seed(0)

# Dataset With Raw X, Y, Z

In [2]:
''' 
Sliding window False
Raw X, Y, Z data 
Sampling Rate = 20 

Using 1 PID for training and 1 PID for testing
'''

with open("../data/pickles/all_pids_data_slidingwindowFalse10_samplingrate20_df.p", 'rb') as f:
  data = pk.load(f)
dataframe = data['data']
dataframe['pid'].unique()

array(['BK7610', 'BU4707', 'DC6359', 'JB3156', 'JR8022', 'MC7070',
       'SF3079'], dtype=object)

In [3]:
train_dataframe = dataframe[(dataframe['pid'] == 'SF3079') |
                            (dataframe['pid'] == 'BU4707') |
                            (dataframe['pid'] == 'DC6359') |
                            (dataframe['pid'] == 'JB3156') |
                            (dataframe['pid'] == 'JR8022') |
                            (dataframe['pid'] == 'MC7070') ]
test_dataframe = dataframe[(dataframe['pid'] == 'BK7610')]
train_dataframe.shape, test_dataframe.shape

((1679320, 6), (614460, 6))

In [4]:
train_dataframe.shape, test_dataframe.shape

((1679320, 6), (614460, 6))

In [5]:
''' 
Downsampling - basically we now take only one record per second instead of 20  
'''

def create_dataset(X, y, time_steps=1, step=1):
    Xs, ys = [], []
    for i in range(0, len(X) - time_steps, step):
        output = X.iloc[i:(i + time_steps)].values
        labels = y.iloc[i: i + time_steps]
        Xs.append(output)
        ys.append(scipy.stats.mode(labels)[0][0])
    return np.array(Xs), np.array(ys).reshape(-1, 1)


X_train, y_train = create_dataset(train_dataframe[['x', 'y', 'z']],train_dataframe.tac,20,20)
X_test, y_test = create_dataset(test_dataframe[['x', 'y', 'z']],test_dataframe.tac,20,20)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

  ys.append(scipy.stats.mode(labels)[0][0])
  ys.append(scipy.stats.mode(labels)[0][0])


((83965, 20, 3), (30722, 20, 3), (83965, 1), (30722, 1))

# LSTM on Vanilla Time Series

In [6]:
model = keras.Sequential()
model.add(keras.layers.LSTM(units=256,input_shape=[X_train.shape[1], X_train.shape[2]]))
model.add(keras.layers.Dropout(rate=0.5))
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 256)               266240    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 299,265
Trainable params: 299,265
Non-trainable params: 0
_________________________________________________________________


In [7]:
history = model.fit(X_train, y_train,
    epochs=10,
    batch_size=32, 
    validation_split=0.0, 
    shuffle=False)   

Epoch 1/10


2023-05-11 16:16:41.309603: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
model.evaluate(X_test, y_test) 



[3.065591812133789, 0.6346917748451233]

# MLP Classifier on Vanilla Time Series

In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
clf = MLPClassifier(solver='adam', shuffle=True, random_state=1)

# X0_dash = np.reshape(X0, (X0.shape[0], X0.shape[1] * X0.shape[2]))

clf.fit(np.reshape(X_train, (83965, 60)), y_train)
# clf.get_params()

print('Accuracy ', accuracy_score(y_test, clf.predict(np.reshape(X_test, ((30722, 60))))))
print('Prec ', precision_score(y_test, clf.predict(np.reshape(X_test, ((30722, 60))))))
print('Recall ', recall_score(y_test, clf.predict(np.reshape(X_test, ((30722, 60))))))

  y = column_or_1d(y, warn=True)


Accuracy  0.6060803333116334
Prec  0.6285763949243873
Recall  0.927278321965229


# Bidirectional LSTM on vanilla Time Series

In [10]:
model = keras.Sequential()
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=256,input_shape=[X_train.shape[1], X_train.shape[2]])))
model.add(keras.layers.Dropout(rate=0.5))
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer=keras.optimizers.Adam(0.001),metrics=['acc']) 
model.build((X_train.shape)) 
model.summary() 

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirectiona  (83965, 512)             532480    
 l)                                                              
                                                                 
 dropout_1 (Dropout)         (83965, 512)              0         
                                                                 
 dense_2 (Dense)             (83965, 128)              65664     
                                                                 
 dense_3 (Dense)             (83965, 1)                129       
                                                                 
Total params: 598,273
Trainable params: 598,273
Non-trainable params: 0
_________________________________________________________________


In [11]:
history = model.fit(
    X_train, y_train,
    epochs=25,
    batch_size=32, 
    validation_split=0.01, 
    shuffle=False)   

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [33]:
model.evaluate(X_test, y_test) 



[0.7217283844947815, 0.7154703736305237]

# CNN On Vanilla Time Series

In [131]:
batch_size = 256
epochs = 20

flatten = keras.layers.Flatten()
conv_layer1 = keras.layers.Conv1D(filters = 64, kernel_size = 3, padding='SAME')
conv_layer2 = keras.layers.Conv1D(filters = 64, kernel_size = 3, padding='SAME')
dropout = keras.layers.Dropout(0.5)
max_pooling = keras.layers.MaxPool1D(pool_size=2)
fc_layer = keras.layers.Dense(units=128, activation = 'relu')
fc_layer2 = keras.layers.Dense(y_train.shape[1], activation = 'sigmoid')
base_model = keras.Sequential([
                                  conv_layer1,  
                                  conv_layer2, 
                                  dropout, 
                                  max_pooling, 
                                  flatten, 
                                  fc_layer, 
                                  fc_layer2
                                ])

base_model.compile(loss='binary_crossentropy', 
                   optimizer='adam',
                   metrics=['acc']) 



In [132]:
history = base_model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32, 
    validation_split=0.01,
    shuffle=False) 

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [133]:
base_model.evaluate(X_test, y_test) 



[0.7578141689300537, 0.3100312054157257]

# Data with Feature Engineering on Raw X, Y, Z

In [49]:
## PID = DC6359
df = pd.read_csv("../data/DC6359_feature_engg_updated_new.csv")
df = df.sample(frac=1, random_state=1).reset_index()
df = df.drop(['Unnamed: 0', 'index'], axis=1) 
df.head()

Unnamed: 0,tac,x_mean,y_mean,z_mean,x_std,y_std,z_std,x_aad,y_aad,z_aad,...,z_arg_diff,x_argmax_fft,y_argmax_fft,z_argmax_fft,x_argmin_fft,y_argmin_fft,z_argmin_fft,x_arg_diff_fft,y_arg_diff_fft,z_arg_diff_fft
0,0,-0.000585,-0.000364,0.004651,0.003299,0.004179,0.008746,0.002534,0.002625,0.005187,...,43,41,19,28,16,24,27,25,5,1
1,0,0.00071,-0.005559,-0.010026,0.043935,0.042405,0.04251,0.02088,0.019815,0.026667,...,17,1,3,1,17,42,28,16,39,27
2,1,0.023324,0.015599,-0.027149,0.128959,0.106513,0.131192,0.088268,0.074175,0.094685,...,10,3,11,6,24,16,36,21,5,30
3,0,0.012703,-0.011832,-0.009105,0.029305,0.043782,0.033665,0.020353,0.0197,0.023722,...,8,0,45,6,34,24,21,34,21,15
4,0,-0.031388,0.001648,-0.045044,0.120015,0.076248,0.114803,0.078569,0.052472,0.080129,...,5,40,0,10,24,42,9,16,42,1


In [50]:
## PID = BU4707
test_df = pd.read_csv("../data/BU4707_feature_engg_updated_new.csv")
test_df = test_df.drop(['Unnamed: 0'], axis=1) 
test_df.head()

Unnamed: 0,tac,x_mean,y_mean,z_mean,x_std,y_std,z_std,x_aad,y_aad,z_aad,...,z_arg_diff,x_argmax_fft,y_argmax_fft,z_argmax_fft,x_argmin_fft,y_argmin_fft,z_argmin_fft,x_arg_diff_fft,y_arg_diff_fft,z_arg_diff_fft
0,0,0.004578,0.007246,0.016726,0.003708,0.004323,0.004929,0.003109,0.003611,0.003963,...,22,20,23,21,24,26,31,4,3,10
1,0,0.001759,0.010015,0.016626,0.013341,0.010943,0.010745,0.006699,0.006029,0.005773,...,2,6,47,46,21,12,14,15,35,32
2,0,0.002363,0.009744,0.013846,0.017084,0.013323,0.014038,0.010165,0.00848,0.009637,...,11,6,1,4,18,24,37,12,23,33
3,0,0.002435,0.008929,0.010908,0.011838,0.00981,0.012986,0.007481,0.007362,0.009902,...,83,0,8,38,34,28,44,34,20,6
4,0,0.000526,0.009971,0.010272,0.005405,0.007611,0.010529,0.003908,0.005775,0.00774,...,5,32,16,1,35,20,26,3,4,25


In [51]:
df.shape, test_df.shape

((6216, 107), (4490, 107))

In [53]:
corr_matrix = df.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features with correlation greater than 0.95 
to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
to_drop

# Drop features 
new_df = df.drop(to_drop, axis=1)
new_test_df = test_df.drop(to_drop, axis=1)
df.shape, new_df.shape, test_df.shape, new_test_df.shape

((6216, 107), (6216, 41), (4490, 107), (4490, 41))

In [54]:
X_train = new_df.drop(['tac'], axis=1)
y_train = new_df.tac
X_test = new_test_df.drop(['tac'], axis=1)
y_test = new_test_df.tac 

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6216, 40), (6216,), (4490, 40), (4490,))

In [57]:
y_train = np.reshape(y_train, (-1,1))
y_test = np.reshape(y_test, (-1,1))
y_train.shape, y_test.shape

((6216, 1), (4490, 1))

In [200]:
batch_size = 32
epochs = 50

fc_layer1 = keras.layers.Dense(units=128, activation = 'relu')
fc_layer2 = keras.layers.Dense(units=64, activation = 'relu')
fc_layer3 = keras.layers.Dense(units=32, activation = 'relu')
fc_layer4 = keras.layers.Dense(units=1, activation = 'sigmoid')
# Cant use softmax at the end since it will normalize and give 1 
base_model = keras.Sequential([
fc_layer1,
fc_layer2,
fc_layer3,
fc_layer4
])

base_model.compile(loss=keras.losses.BinaryCrossentropy(from_logits = False), 
                   optimizer=keras.optimizers.Adam(0.001, beta_1=0.9, beta_2= 0.999), 
                   metrics=['acc']) 

base_model.build((X_train.shape))
base_model.summary()

Model: "sequential_36"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_68 (Dense)            (83965, 20, 128)          512       
                                                                 
 dense_69 (Dense)            (83965, 20, 64)           8256      
                                                                 
 dense_70 (Dense)            (83965, 20, 32)           2080      
                                                                 
 dense_71 (Dense)            (83965, 20, 1)            33        
                                                                 
Total params: 10,881
Trainable params: 10,881
Non-trainable params: 0
_________________________________________________________________


In [94]:
# model = keras.Sequential()
# model.add(keras.layers.LSTM(20, input_shape=[40,]))
# model.add(keras.layers.TimeDistributed(keras.layers.Dense(1, activation='sigmoid')))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.build((X_train.shape)) 
# model.summary() 

In [106]:
history = base_model.fit(
    X_train, y_train,
    epochs=20,
    batch_size=256, 
    validation_split=0.01, 
    shuffle=False)   

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [107]:
base_model.evaluate(X_test, y_test)



[0.7980446815490723, 0.679287314414978]

# Sliding window dataset - Model does not learn on this data

In [13]:
with open("../data/pickles/data_slidingwindowTrue10_samplingrate20_df_DC6359.p", 'rb') as f:
  data = pk.load(f)
X = data['X']
Y = data['Y']
# dataframe = data['data']
# dataframe['pid'].unique() 

In [22]:
model = keras.Sequential()
model.add(
      keras.layers.LSTM(
          units=256,
          input_shape=[X.shape[1], X.shape[2]]
      )
)
model.add(keras.layers.Dropout(rate=0.5))
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dense(y_train.shape[1], activation='sigmoid'))

model.compile(
  loss='binary_crossentropy',
  optimizer='adam', 
  metrics=['acc'] 
)

model.build((X.shape))
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_5 (LSTM)               (None, 256)               266240    
                                                                 
 dropout_5 (Dropout)         (None, 256)               0         
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dense_11 (Dense)            (None, 1)                 129       
                                                                 
Total params: 299,265
Trainable params: 299,265
Non-trainable params: 0
_________________________________________________________________


In [24]:
history = model.fit(
    X, Y,
    epochs=20,
    batch_size=256, 
    validation_split=0.01, 
    shuffle=False)   

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [26]:
model = keras.Sequential()
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units=256,input_shape=[X_train.shape[1], X_train.shape[2]])))
model.add(keras.layers.Dropout(rate=0.5))
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dense(y_train.shape[1], activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer=keras.optimizers.Adam(0.001),metrics=['acc']) 
model.build((X.shape)) 
model.summary() 

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (15543, 512)             532480    
 nal)                                                            
                                                                 
 dropout_7 (Dropout)         (15543, 512)              0         
                                                                 
 dense_14 (Dense)            (15543, 128)              65664     
                                                                 
 dense_15 (Dense)            (15543, 1)                129       
                                                                 
Total params: 598,273
Trainable params: 598,273
Non-trainable params: 0
_________________________________________________________________


In [27]:
history = model.fit(
    X, Y,
    epochs=20,
    batch_size=256, 
    validation_split=0.01, 
    shuffle=False)   

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20