# Train macro Classifiers without masking for missing data

# Window jitter and time-shift data augmentation are added

In [2]:
import glob
import os
import numpy as np
import csv
#import cv2
import io
from sklearn.preprocessing import StandardScaler
import shutil

# Set the CUDA environment variables: enumerate devices in PCI bus order and
# make only device "1" visible to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

# Options to set if needed

In [3]:
# from keras.backend.tensorflow_backend import set_session
# import tensorflow as tf
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
# config.log_device_placement = True  # to log device placement (on which device the operation ran)
# sess = tf.Session(config=config)
# set_session(sess)  # set this TensorFlow session as the default 

In [33]:
# Root directories of the train / validation / test splits; each is later
# passed as `data_dir` to the get_train_data* / get_prediction_accuracy helpers.
data_dir_train='data_2/train'
data_dir_val= 'data_2/val'
data_dir_test= 'data_2/test'

In [34]:
# Sensor sub-directories read for every recording. Each sensor contributes the
# first three CSV columns, so a sample has 3 x 3 = 9 channel rows.
sub_dirs = ['left_hip','right_arm','right_wrist' ] #three sensors, 9 channels total

In [4]:
def parse_IMU_files(parent_dir, sub_dirs, startTime, endTime, file_name, window_length):
    """Read one time window of readings for every sensor sub-directory.

    For each entry of ``sub_dirs`` every CSV matching
    ``parent_dir/<sub_dir>/<file_name>`` is scanned. The first row of each file
    is skipped as a header; in the remaining rows columns 0-2 hold the three
    values and column 3 the timestamp.

    Args:
        parent_dir: Directory that contains the sensor sub-directories.
        sub_dirs: Iterable of sensor sub-directory names.
        startTime: Inclusive lower timestamp bound of the window.
        endTime: Inclusive upper timestamp bound of the window.
        file_name: File name (or glob pattern) looked up in every sub-directory.
        window_length: Maximum number of rows kept per matched file.

    Returns:
        ``(data, data_count)`` where ``data`` has one list per sub-directory,
        each holding ``[v0, v1, v2]`` float triples, and ``data_count`` is the
        total number of rows kept across all sub-directories.
    """
    data = []
    data_count = 0

    for sub_dir in sub_dirs:
        channel = []

        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_name)):
            # The context manager guarantees the handle is closed; the file was
            # previously left open for every window of every recording.
            with open(fn, newline='') as csv_file:
                reader = csv.reader(csv_file)
                next(reader, None)  # skip the header row
                kept = 0  # rows kept from this file

                for row in reader:
                    if kept >= window_length:
                        # Cap reached: no later row can be accepted.
                        break

                    timestamp = float(row[3])  # 4th column is timestamp
                    if startTime <= timestamp <= endTime:
                        channel.append([float(row[0]), float(row[1]), float(row[2])])
                        kept += 1
                        data_count += 1

        data.append(channel)
    return data, data_count

In [5]:
#trying to play with some training data extraction
import random

def parse_IMU_files_2(parent_dir, sub_dirs, startTime, endTime, file_name, window_length):
    """Read one window per sensor with random jitter applied to both bounds.

    Same file layout as ``parse_IMU_files``: the first CSV row is a header,
    columns 0-2 are the values and column 3 is the timestamp. A row is kept
    when ``startTime + j1 <= timestamp <= endTime + j2`` where ``j1`` and
    ``j2`` are integers drawn uniformly from [-150, 150].

    NOTE(review): the two jitter values are re-drawn for every row, so the
    window edges are fuzzy rather than shifted once per window — confirm this
    is the intended augmentation.

    Args:
        parent_dir: Directory that contains the sensor sub-directories.
        sub_dirs: Iterable of sensor sub-directory names.
        startTime: Nominal lower timestamp bound of the window.
        endTime: Nominal upper timestamp bound of the window.
        file_name: File name (or glob pattern) looked up in every sub-directory.
        window_length: Maximum number of rows kept per matched file.

    Returns:
        ``(data, data_count)``: one list of ``[v0, v1, v2]`` float triples per
        sub-directory, and the total number of rows kept.
    """
    data = []
    data_count = 0

    for sub_dir in sub_dirs:
        channel = []

        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_name)):
            # Close the file deterministically; it used to be leaked.
            with open(fn, newline='') as csv_file:
                reader = csv.reader(csv_file)
                next(reader, None)  # skip the header row
                kept = 0  # rows kept from this file

                for row in reader:
                    if kept >= window_length:
                        # Cap reached: no later row can be accepted.
                        break

                    timestamp = float(row[3])  # 4th column is timestamp

                    # Independent jitter for the lower and upper bound.
                    window_jitter1 = random.randint(-150, 150)
                    window_jitter2 = random.randint(-150, 150)

                    if (startTime + window_jitter1) <= timestamp <= (endTime + window_jitter2):
                        channel.append([float(row[0]), float(row[1]), float(row[2])])
                        kept += 1
                        data_count += 1

        data.append(channel)
    return data, data_count


def parse_IMU_files_3(parent_dir, sub_dirs, startTime, endTime, file_name, window_length):
    """Read one window per sensor with a random shift of the window bounds.

    Same file layout as ``parse_IMU_files``: the first CSV row is a header,
    columns 0-2 are the values and column 3 is the timestamp. For every
    sub-directory two integers ``j1`` and ``j2`` are drawn uniformly from
    [-1500, 1500] and a row is kept when
    ``startTime + j1 <= timestamp <= endTime + j2``.

    NOTE(review): the shift is drawn once per sensor sub-directory, so the
    sensors of one sample can cover different time spans — confirm this is
    intended rather than one shared shift per window.

    Args:
        parent_dir: Directory that contains the sensor sub-directories.
        sub_dirs: Iterable of sensor sub-directory names.
        startTime: Nominal lower timestamp bound of the window.
        endTime: Nominal upper timestamp bound of the window.
        file_name: File name (or glob pattern) looked up in every sub-directory.
        window_length: Maximum number of rows kept per matched file.

    Returns:
        ``(data, data_count)``: one list of ``[v0, v1, v2]`` float triples per
        sub-directory, and the total number of rows kept.
    """
    data = []
    data_count = 0

    for sub_dir in sub_dirs:
        channel = []

        # One pair of shifts per sensor, fixed for all rows of that sensor.
        window_jitter1 = random.randint(-1500, 1500)
        window_jitter2 = random.randint(-1500, 1500)
        lower = startTime + window_jitter1
        upper = endTime + window_jitter2

        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_name)):
            # Close the file deterministically; it used to be leaked.
            with open(fn, newline='') as csv_file:
                reader = csv.reader(csv_file)
                next(reader, None)  # skip the header row
                kept = 0  # rows kept from this file

                for row in reader:
                    if kept >= window_length:
                        # Cap reached: no later row can be accepted.
                        break

                    timestamp = float(row[3])  # 4th column is timestamp
                    if lower <= timestamp <= upper:
                        channel.append([float(row[0]), float(row[1]), float(row[2])])
                        kept += 1
                        data_count += 1

        data.append(channel)
    return data, data_count

In [6]:
# Minimum total number of rows (summed over all sensors) a window must contain;
# windows below this threshold are skipped by the data-loading helpers.
data_min_count = 100 # per sample, valid points

In [7]:
def get_train_data(data_dir, sub_dirs):
    """Build fixed-size windowed samples and macro labels for one data split.

    Recordings are discovered by listing ``data_dir + '/left_hip'``. For every
    recording, windows starting at offsets 0, 1000, ..., 28000 and spanning
    10000 timestamp units are extracted with ``parse_IMU_files``. Windows with
    fewer than the module-level ``data_min_count`` rows in total are dropped.
    Each kept window becomes a ``(9, 500)`` array: three rows per sensor, with
    unused trailing columns left at zero.

    Labels are read from ``data/LabelTable.csv`` (header row skipped): column 0
    is the recording name without ``.csv``, column 1 one of ``sandwich``,
    ``fruitsalad`` or ``cereal``.

    Args:
        data_dir: Root directory of the split.
        sub_dirs: Sensor sub-directory names, in channel order.

    Returns:
        ``(all_data, all_labels)``: a list of ``(9, 500)`` arrays and the
        parallel list of integer class ids.

    Raises:
        KeyError: If a recording or a label name is missing from the mappings.
    """
    files = os.listdir(data_dir+'/left_hip')
    number_of_samples = 500

    labels_macro = {'sandwich': 0, 'fruitsalad': 1, 'cereal': 2}

    #read the labels
    labels_loc = 'data/LabelTable.csv'
    file_label_mapping = dict()
    # Context manager closes the label table; it was previously left open.
    with open(labels_loc, newline='') as file_label:
        label_reader = csv.reader(file_label)
        next(label_reader, None)  # skip the header row
        for row in label_reader:
            file_label_mapping[row[0]+'.csv'] = labels_macro[row[1]]

    # Sliding-window parameters, in timestamp units.
    end_index = 30000
    step = 1000  # overlapping window, step: 1000
    # NOTE(review): the original comment called this a "6 second window", but
    # the window spans 10000 timestamp units — confirm the timestamp unit.
    window_index = 10000

    all_data = []
    all_labels = []
    for f in files:
        print('reading file:', f)

        if f == '.DS_Store':
            continue

        curr_label_file = file_label_mapping[f]

        st_index = 0
        while st_index+step < end_index:
            data, data_count = parse_IMU_files(
                data_dir, sub_dirs, st_index, st_index+window_index, f, number_of_samples)
            st_index = st_index+step

            if data_count < data_min_count:
                continue

            train_data_sample = np.zeros((9, number_of_samples))
            # Sensor i fills rows 3i..3i+2; column t is the t-th kept reading.
            for sensor_idx, channel in enumerate(data):
                for t, (v0, v1, v2) in enumerate(channel):
                    train_data_sample[sensor_idx*3, t] = v0
                    train_data_sample[sensor_idx*3+1, t] = v1
                    train_data_sample[sensor_idx*3+2, t] = v2

            all_data.append(train_data_sample)
            all_labels.append(curr_label_file)

    return all_data, all_labels

In [8]:
def get_train_data_2(data_dir, sub_dirs):
    """Build windowed samples using the jittered reader ``parse_IMU_files_2``.

    Recordings are discovered by listing ``data_dir + '/left_hip'``. For every
    recording, windows starting at offsets 0, 1000, ..., 28000 and nominally
    spanning 10000 timestamp units are extracted with ``parse_IMU_files_2``.
    Windows with fewer than the module-level ``data_min_count`` rows in total
    are dropped. Each kept window becomes a ``(9, 500)`` array: three rows per
    sensor, with unused trailing columns left at zero.

    Labels are read from ``data/LabelTable.csv`` (header row skipped): column 0
    is the recording name without ``.csv``, column 1 one of ``sandwich``,
    ``fruitsalad`` or ``cereal``.

    Args:
        data_dir: Root directory of the split.
        sub_dirs: Sensor sub-directory names, in channel order.

    Returns:
        ``(all_data, all_labels, all_counts)``: the list of ``(9, 500)``
        arrays, the parallel list of integer class ids, and the parallel list
        of row counts reported by the reader for each kept window.

    Raises:
        KeyError: If a recording or a label name is missing from the mappings.
    """
    files = os.listdir(data_dir+'/left_hip')
    number_of_samples = 500

    labels_macro = {'sandwich': 0, 'fruitsalad': 1, 'cereal': 2}

    #read the labels
    labels_loc = 'data/LabelTable.csv'
    file_label_mapping = dict()
    # Context manager closes the label table; it was previously left open.
    with open(labels_loc, newline='') as file_label:
        label_reader = csv.reader(file_label)
        next(label_reader, None)  # skip the header row
        for row in label_reader:
            file_label_mapping[row[0]+'.csv'] = labels_macro[row[1]]

    # Sliding-window parameters, in timestamp units.
    end_index = 30000
    step = 1000  # overlapping window, step
    # NOTE(review): the original comment called this a "6 second window", but
    # the window spans 10000 timestamp units — confirm the timestamp unit.
    window_index = 10000

    all_data = []
    all_labels = []
    all_counts = []

    for f in files:
        print('reading file:', f)

        if f == '.DS_Store':
            continue

        curr_label_file = file_label_mapping[f]

        st_index = 0
        while st_index+step < end_index:
            data, data_count = parse_IMU_files_2(
                data_dir, sub_dirs, st_index, st_index+window_index, f, number_of_samples)
            st_index = st_index+step

            if data_count < data_min_count:
                continue

            train_data_sample = np.zeros((9, number_of_samples))
            # Sensor i fills rows 3i..3i+2; column t is the t-th kept reading.
            for sensor_idx, channel in enumerate(data):
                for t, (v0, v1, v2) in enumerate(channel):
                    train_data_sample[sensor_idx*3, t] = v0
                    train_data_sample[sensor_idx*3+1, t] = v1
                    train_data_sample[sensor_idx*3+2, t] = v2

            all_data.append(train_data_sample)
            all_labels.append(curr_label_file)
            all_counts.append(data_count)

    return all_data, all_labels, all_counts

In [9]:
def get_train_data_3(data_dir, sub_dirs):
    """Build windowed samples using the time-shifted reader ``parse_IMU_files_3``.

    Recordings are discovered by listing ``data_dir + '/left_hip'``. For every
    recording, windows starting at offsets 0, 1000, ..., 28000 and nominally
    spanning 10000 timestamp units are extracted with ``parse_IMU_files_3``.
    Windows with fewer than the module-level ``data_min_count`` rows in total
    are dropped. Each kept window becomes a ``(9, 500)`` array: three rows per
    sensor, with unused trailing columns left at zero.

    Labels are read from ``data/LabelTable.csv`` (header row skipped): column 0
    is the recording name without ``.csv``, column 1 one of ``sandwich``,
    ``fruitsalad`` or ``cereal``.

    Args:
        data_dir: Root directory of the split.
        sub_dirs: Sensor sub-directory names, in channel order.

    Returns:
        ``(all_data, all_labels, all_counts)``: the list of ``(9, 500)``
        arrays, the parallel list of integer class ids, and the parallel list
        of row counts reported by the reader for each kept window.

    Raises:
        KeyError: If a recording or a label name is missing from the mappings.
    """
    files = os.listdir(data_dir+'/left_hip')
    number_of_samples = 500

    labels_macro = {'sandwich': 0, 'fruitsalad': 1, 'cereal': 2}

    #read the labels
    labels_loc = 'data/LabelTable.csv'
    file_label_mapping = dict()
    # Context manager closes the label table; it was previously left open.
    with open(labels_loc, newline='') as file_label:
        label_reader = csv.reader(file_label)
        next(label_reader, None)  # skip the header row
        for row in label_reader:
            file_label_mapping[row[0]+'.csv'] = labels_macro[row[1]]

    # Sliding-window parameters, in timestamp units.
    end_index = 30000
    step = 1000  # overlapping window, step: 1000
    # NOTE(review): the original comment called this a "6 second window", but
    # the window spans 10000 timestamp units — confirm the timestamp unit.
    window_index = 10000

    all_data = []
    all_labels = []
    all_counts = []

    for f in files:
        print('reading file:', f)

        if f == '.DS_Store':
            continue

        curr_label_file = file_label_mapping[f]

        st_index = 0
        while st_index+step < end_index:
            data, data_count = parse_IMU_files_3(
                data_dir, sub_dirs, st_index, st_index+window_index, f, number_of_samples)
            st_index = st_index+step

            if data_count < data_min_count:
                continue

            train_data_sample = np.zeros((9, number_of_samples))
            # Sensor i fills rows 3i..3i+2; column t is the t-th kept reading.
            for sensor_idx, channel in enumerate(data):
                for t, (v0, v1, v2) in enumerate(channel):
                    train_data_sample[sensor_idx*3, t] = v0
                    train_data_sample[sensor_idx*3+1, t] = v1
                    train_data_sample[sensor_idx*3+2, t] = v2

            all_data.append(train_data_sample)
            all_labels.append(curr_label_file)
            all_counts.append(data_count)

    return all_data, all_labels, all_counts

In [4]:
#receive windowed training and validation data
# Un-augmented windows and integer labels for the train, validation and test
# splits (each call also re-reads data/LabelTable.csv).
train_x, train_y = get_train_data(data_dir_train,sub_dirs)
val_x, val_y = get_train_data(data_dir_val,sub_dirs)
test_x, test_y = get_train_data(data_dir_test,sub_dirs)

In [5]:
# Augmented copy of the training split built with the jittered reader
# (parse_IMU_files_2); all_counts holds the row count of every kept window.
train_x2, train_y2, all_counts = get_train_data_2(data_dir_train,sub_dirs)

In [6]:
# Augmented copy of the training split built with the time-shifted reader
# (parse_IMU_files_3). NOTE: this rebinds all_counts, discarding the counts
# returned by the earlier get_train_data_2 call.
train_x3, train_y3, all_counts = get_train_data_3(data_dir_train,sub_dirs)

In [16]:
# Convert the jittered windows / labels to arrays and show their shapes.
train_samples_2 = np.array(train_x2) 
train_labels2_2 = np.array(train_y2)
print(train_samples_2.shape)
print(train_labels2_2.shape)

(4862, 9, 500)
(4862,)


In [17]:
# Convert the time-shifted windows / labels to arrays and show their shapes.
train_samples_3 = np.array(train_x3) 
train_labels2_3 = np.array(train_y3)
print(train_samples_3.shape)
print(train_labels2_3.shape)

(4856, 9, 500)
(4856,)


In [11]:
# Convert the un-augmented windows and integer labels of each split to arrays.
# The *_labels2 names hold integer class ids (one-hot versions are built later).
train_samples = np.array(train_x) 
train_labels2 = np.array(train_y)
val_samples = np.array(val_x) 
val_labels2 = np.array(val_y)
test_samples = np.array(test_x) 
test_labels2 = np.array(test_y)

In [12]:
# Sanity check: sample and label array shapes for every split.
print(train_samples.shape)
print(train_labels2.shape)
print(val_samples.shape)
print(val_labels2.shape)
print(test_samples.shape)
print(test_labels2.shape)

(4864, 9, 500)
(4864,)
(1348, 9, 500)
(1348,)
(1356, 9, 500)
(1356,)


In [20]:
# Append both augmented sets to the original training data: samples are stacked
# along the first axis and the label vectors are concatenated in the same order.
# NOTE(review): the shapes recorded in the cells below still show 4864 training
# samples and this cell carries a later execution count — confirm the
# augmented data was actually part of the reported training run.
train_samples = np.vstack((train_samples,train_samples_2, train_samples_3))
train_labels2 = np.hstack((train_labels2, train_labels2_2, train_labels2_3))

In [13]:
# Sanity check: sample and integer-label array shapes for every split.
print(train_samples.shape)
print(train_labels2.shape)
print(val_samples.shape)
print(val_labels2.shape)
print(test_samples.shape)
print(test_labels2.shape)

(4864, 9, 500)
(4864,)
(1348, 9, 500)
(1348,)
(1356, 9, 500)
(1356,)


In [14]:
#convert to one hot encoding
from keras.utils  import to_categorical
# One-hot versions of the integer label vectors. The bare `.shape` lines are
# notebook expressions; only the last one is echoed as the cell output.
train_labels = to_categorical(train_labels2)
train_labels.shape
val_labels = to_categorical(val_labels2)
val_labels.shape
test_labels = to_categorical(test_labels2)
test_labels.shape

(1356, 3)

In [15]:
# Sanity check of the sample arrays against the one-hot label arrays.
print(train_samples.shape)
print(train_labels.shape)
print(val_samples.shape)
print(val_labels.shape)
print(test_samples.shape)
# NOTE(review): this prints the integer labels (test_labels2), unlike the
# train/val lines above which print the one-hot arrays — confirm whether
# test_labels was intended.
print(test_labels2.shape)

(4864, 9, 500)
(4864, 3)
(1348, 9, 500)
(1348, 3)
(1356, 9, 500)
(1356,)


In [16]:
#For CNN and BidirLSTM:
# Add a trailing singleton axis so every split has shape
# (n_windows, 9, number_of_samples, 1), matching the Conv2D input_shape below.
number_of_samples = 500
train_samples = train_samples.reshape((-1, 9,number_of_samples, 1))
val_samples = val_samples.reshape((-1, 9,number_of_samples, 1))
test_samples = test_samples.reshape((-1, 9,number_of_samples, 1))

# Models

In [17]:
import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, LSTM, Dense, Dropout, Flatten, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Permute, Reshape
from keras import backend as K
from keras.callbacks import ModelCheckpoint

In [18]:
def _data_reshaping(X_tr, X_va, X_ts, network_type):
    """Rearrange the three splits into the 4-D layout used by the conv models.

    ``X_tr`` must be 3-D; its last two axis sizes are taken as
    ``(win_len, dim)``. For ``'CNN'`` and ``'ConvLSTM'`` every split has axes
    1 and 2 swapped and is reshaped to ``(-1, dim, win_len, 1)``, using the
    sizes read from ``X_tr`` for all three. For any other network type the
    inputs are returned unchanged. The network type is printed either way.

    Returns:
        The tuple ``(X_tr, X_va, X_ts)``.
    """
    _, win_len, dim = X_tr.shape
    print(network_type)

    # Guard clause: only the convolutional variants need the 4-D layout.
    if network_type not in ('CNN', 'ConvLSTM'):
        return X_tr, X_va, X_ts

    target_shape = (-1, dim, win_len, 1)
    X_tr, X_va, X_ts = (
        np.reshape(np.swapaxes(split, 1, 2), target_shape)
        for split in (X_tr, X_va, X_ts)
    )
    return X_tr, X_va, X_ts

In [19]:
from keras import regularizers

def model_variant(model, num_feat_map, dim, network_type,p):
    """Append the network-specific head that follows the convolutional stack.

    * ``'ConvLSTM'``: permute axes to ``(2, 1, 3)``, reshape to
      ``(-1, num_feat_map*dim)`` and add a bidirectional 128-unit LSTM that
      returns only its final output.
    * ``'CNN'``: flatten, then Dense(64, relu), batch normalisation and
      dropout with rate ``p``.

    Any other ``network_type`` adds nothing. The type is printed in all cases.
    """
    print(network_type)

    if network_type == 'ConvLSTM':
        recurrent_head = (
            Permute((2, 1, 3)),
            Reshape((-1, num_feat_map * dim)),
            Bidirectional(LSTM(128, return_sequences=False, stateful=False)),
        )
        for layer in recurrent_head:
            model.add(layer)
    elif network_type == 'CNN':
        dense_head = (
            Flatten(),
            Dense(64, activation='relu'),
            BatchNormalization(),
            Dropout(p),
        )
        for layer in dense_head:
            model.add(layer)

        
def model_conv(model, num_feat_map,p,b):
    """Append the shared convolutional feature extractor to ``model``.

    Six ``Conv2D`` layers with ``num_feat_map`` filters, kernel ``(1, 10)``,
    ReLU and ``'same'`` padding are added in three stages, followed by max
    pooling of ``(1, 3)``, ``(1, 2)`` and ``(1, 2)`` respectively. Batch
    normalisation is inserted only when ``b == 1``; dropout with rate ``p``
    follows the second and third stage.

    The first layer's ``input_shape`` is ``(dim, win_len, 1)``, read from the
    module-level ``dim`` and ``win_len``, which must be defined before calling.
    """
    with_batch_norm = (b == 1)

    def conv(**extra):
        # Every convolution shares filters, kernel, activation and padding.
        model.add(Conv2D(num_feat_map, kernel_size=(1, 10),
                         activation='relu', padding='same', **extra))

    def batch_norm():
        if with_batch_norm:
            model.add(BatchNormalization())

    # Stage 1: three convolutions, pool by 3 along the second spatial axis.
    conv(input_shape=(dim, win_len, 1))
    conv()
    batch_norm()
    conv()
    batch_norm()
    model.add(MaxPooling2D(pool_size=(1, 3)))

    # Stage 2: two convolutions, pool by 2, dropout.
    conv()
    conv()
    batch_norm()
    model.add(MaxPooling2D(pool_size=(1, 2)))
    model.add(Dropout(p))

    # Stage 3: one convolution, pool by 2, dropout.
    conv()
    batch_norm()
    model.add(MaxPooling2D(pool_size=(1, 2)))
    model.add(Dropout(p))
    
def model_LSTM(model,p):
    """Append a two-layer LSTM stack with dropout ``p`` after each layer.

    Both layers have ``num_hidden_lstm`` units; the first declares
    ``input_shape=(win_len, dim)`` and returns full sequences, the second
    returns only its final output. ``num_hidden_lstm``, ``win_len`` and
    ``dim`` are read from module level.
    """
    stack = (
        LSTM(num_hidden_lstm, input_shape=(win_len, dim), return_sequences=True),
        Dropout(p),
        LSTM(num_hidden_lstm, return_sequences=False),
        Dropout(p),
    )
    for layer in stack:
        model.add(layer)
    
def model_output(model):
    """Append the softmax output layer with the module-level ``num_classes`` units."""
    softmax_head = Dense(num_classes, activation='softmax')
    model.add(softmax_head)

In [20]:
# Hyper-parameters. num_hidden_lstm and num_classes are read as globals by
# model_LSTM / model_output; batch_size is passed to model.fit.
batch_size = 64
num_feat_map = 128
num_hidden_lstm = 128
num_classes = 3


# Architecture selector: 'CNN', 'ConvLSTM' or 'LSTM' are the values tested below.
network_type = 'ConvLSTM'
# dim (axis 1) and win_len (axis 2) are taken from the reshaped training array;
# model_conv and model_LSTM read them as globals.
_, dim, win_len,_ = train_samples.shape

print(win_len)
print(dim)

500
9


In [21]:
# Assemble the network: convolutional stack + variant head for 'CNN'/'ConvLSTM',
# or the plain LSTM stack for 'LSTM', followed by the softmax output layer.
p=0.5 #Dropout
b = 1 #BatchNorm
print('building the model ... ')
model = Sequential()

if network_type=='CNN' or network_type=='ConvLSTM':
    model_conv(model, num_feat_map,p,b)
    model_variant(model, num_feat_map, dim, network_type,p)
if network_type=='LSTM':
    model_LSTM(model,p)
       
model_output(model)    
model.summary()

building the model ... 
Instructions for updating:
If using Keras pass *_constraint arguments to layers.

ConvLSTM
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 9, 500, 128)       1408      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 500, 128)       163968    
_________________________________________________________________
batch_normalization_1 (Batch (None, 9, 500, 128)       512       
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 9, 500, 128)       163968    
_________________________________________________________________
batch_normalization_2 (Batch (None, 9, 500, 128)       512       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 9, 166, 128)       0         
_____

In [22]:
# Conventional X/y aliases for the reshaped samples and one-hot labels
# (plain name bindings, no copies are made).
X_train = train_samples
y_train = train_labels
X_val = val_samples
y_val = val_labels
X_test = test_samples
y_test = test_labels

In [23]:
# Sanity check: shapes of the three sample arrays fed to the model.
print(X_train.shape, X_val.shape, X_test.shape)

(4864, 9, 500, 1) (1348, 9, 500, 1) (1356, 9, 500, 1)


In [24]:
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt
import itertools
# Integer class ids of the test set, recovered from the one-hot labels;
# read as a global by monitor_Training.on_epoch_end.
y_true = np.argmax(y_test, axis=1)

class monitor_Training(keras.callbacks.Callback):
    """Keras callback that prints the test-set accuracy after every epoch.

    Reads the module-level ``X_test`` and ``y_true``. All other hooks are
    explicit no-ops. The ``logs`` parameters default to ``None`` instead of a
    shared mutable ``{}``; none of the hooks read ``logs``.
    """

    def on_train_begin(self, logs=None):
        """No-op."""
        return

    def on_train_end(self, logs=None):
        """No-op."""
        return

    def on_epoch_begin(self, epoch, logs=None):
        """No-op."""
        return

    def on_epoch_end(self, epoch, logs=None):
        """Predict on ``X_test`` with the current weights and print the accuracy."""
        y_pred = np.argmax(self.model.predict(X_test), axis=1)
        accuracy = accuracy_score(y_true, y_pred)
        print('test accuracy is:', accuracy)
        return

    def on_batch_begin(self, batch, logs=None):
        """No-op."""
        return

    def on_batch_end(self, batch, logs=None):
        """No-op."""
        return

In [25]:
# checkpoint /home/sandeep/data/DCBL_macro_TTV_4.hdf5
# File the ModelCheckpoint callback writes to and load_model later reads from.
filepath="macro_without_masking.hdf5"

In [26]:
# Compile and train the model, checkpointing on validation accuracy.
epochs = 40

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer='adam',
              metrics=['accuracy'])


# With save_best_only=True the file at `filepath` is only overwritten when the
# monitored 'val_accuracy' improves.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True)

# Prints the test-set accuracy after each epoch (see monitor_Training).
checkpoint2 = monitor_Training()

callbacks_list = [checkpoint, checkpoint2]

# train_samples / train_labels are the same objects as X_train / y_train.
H = model.fit(train_samples, train_labels,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            shuffle=True,
            validation_data=(X_val, y_val),
             callbacks=callbacks_list
             )

In [27]:
# Alias for the object returned by model.fit; the plotting cell reads
# history.history.
history = H

In [28]:
import matplotlib.pyplot as plt

# Plot training & validation accuracy values
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
# Index of the epoch with the highest validation accuracy.
print(np.argmax(np.array(history.history['val_accuracy'])))
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
# The second curve is 'val_accuracy', so it is labelled 'Validation'
# (it was previously mislabelled 'Test').
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
print('Maximum validation accuracy: ',np.max(np.array(history.history['val_accuracy'])))
# Training accuracy at the epoch that achieved the best validation accuracy.
print('Training accuracy of best model: ',np.array(history.history['accuracy'])[np.argmax(np.array(history.history['val_accuracy']))])


In [29]:
from keras.models import load_model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score
import matplotlib.pyplot as plt
import itertools

In [30]:
# Evaluate the checkpointed model on the test windows.
# NOTE: this rebinds the global `model` to the network loaded from `filepath`;
# get_prediction_accuracy below uses that global.
model = load_model(filepath)
y_pred = np.argmax(model.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)
cf_matrix = confusion_matrix(y_true, y_pred)
print(cf_matrix)
# Per-class F1, rounded to two decimals *before* the mean is taken below.
class_wise_f1 = np.round(f1_score(y_true, y_pred, average=None)*100)*0.01
accuracy = accuracy_score(y_true, y_pred)


print('the mean-f1 score: {:.2f}'.format(np.mean(class_wise_f1)))
print('accuracy is: {:.2f}'.format(accuracy))


[[570  39  23]
 [ 76 277  24]
 [ 93  62 192]]
the mean-f1 score: 0.74
accuracy is: 0.77


# File-by-file macro comparison

In [31]:
# load the data windows per file and see their predictions

In [51]:
def get_prediction_accuracy(data_dir, sub_dirs):
    """Score the global ``model`` per recording by majority vote over windows.

    Recordings are discovered by listing ``data_dir + '/left_hip'``
    (``.DS_Store`` is ignored). Each recording is cut into windows exactly as
    in ``get_train_data`` (offsets 0, 1000, ..., 28000; span 10000 timestamp
    units; windows with fewer than ``data_min_count`` rows dropped). Every
    window is classified with the module-level ``model`` and the most frequent
    predicted class is the recording's prediction; ties go to the lowest class
    id because ``np.argmax`` returns the first maximum.

    A recording with no usable window cannot be classified: it is reported,
    counted in ``total_count`` and treated as not correctly predicted
    (previously this case crashed on an empty prediction array).

    Args:
        data_dir: Root directory of the split.
        sub_dirs: Sensor sub-directory names, in channel order.

    Returns:
        ``(total_count, correct_count)``: number of recordings and number of
        recordings whose majority vote equals the label from
        ``data/LabelTable.csv``.

    Raises:
        KeyError: If a recording or a label name is missing from the mappings.
    """
    files = os.listdir(data_dir+'/left_hip')
    number_of_samples = 500

    labels_macro = {'sandwich': 0, 'fruitsalad': 1, 'cereal': 2}

    #read the labels
    labels_loc = 'data/LabelTable.csv'
    file_label_mapping = dict()
    # Context manager closes the label table; it was previously left open.
    with open(labels_loc, newline='') as label_table:
        label_reader = csv.reader(label_table)
        next(label_reader, None)  # skip the header row
        for row in label_reader:
            file_label_mapping[row[0]+'.csv'] = labels_macro[row[1]]

    # Sliding-window parameters, in timestamp units.
    end_index = 30000
    step = 1000  # overlapping window, step: 1000
    # NOTE(review): the original comment called this a "6 second window", but
    # the window spans 10000 timestamp units — confirm the timestamp unit.
    window_index = 10000

    total_count = 0
    correct_count = 0

    for f in files:
        if f == '.DS_Store':
            continue

        total_count = total_count+1
        file_label = file_label_mapping[f]

        file_data = []
        st_index = 0
        while st_index+step < end_index:
            data, data_count = parse_IMU_files(
                data_dir, sub_dirs, st_index, st_index+window_index, f, number_of_samples)
            st_index = st_index+step

            if data_count < data_min_count:
                continue

            window_sample = np.zeros((9, number_of_samples))
            # Sensor i fills rows 3i..3i+2; column t is the t-th kept reading.
            for sensor_idx, channel in enumerate(data):
                for t, (v0, v1, v2) in enumerate(channel):
                    window_sample[sensor_idx*3, t] = v0
                    window_sample[sensor_idx*3+1, t] = v1
                    window_sample[sensor_idx*3+2, t] = v2

            file_data.append(window_sample)

        if not file_data:
            # Nothing to vote on; leave the recording counted as incorrect.
            print('no valid windows for file:', f)
            continue

        file_data = np.array(file_data).reshape((-1, 9, number_of_samples, 1))

        y_pred = np.argmax(model.predict(file_data), axis=1)
        counts = np.bincount(y_pred)
        prediction = np.argmax(counts)  # most frequent class over the windows

        #correct prediction
        if int(prediction) == int(file_label):
            correct_count = correct_count+1

    return total_count, correct_count
            

In [52]:
# Test data accuracy:
# Per-recording (majority-vote) accuracy on the test split.
total_count, correct_count = get_prediction_accuracy(data_dir_test, sub_dirs)
print('total_count of test files:', total_count)
print('correctly predicted test files:', correct_count)
# Raises ZeroDivisionError if the split contains no recordings.
print('Percentage accuracy:', (correct_count/total_count)*100.0)

total_count of test files: 50
correctly predicted test files: 42
Percentage accuracy: 84.0


In [53]:
# Train accuracy
# Per-recording (majority-vote) accuracy on the training split.
total_count, correct_count = get_prediction_accuracy(data_dir_train, sub_dirs)
print('total_count of train files:', total_count)
print('correctly predicted train files:', correct_count)
# Raises ZeroDivisionError if the split contains no recordings.
print('Percentage accuracy:', (correct_count/total_count)*100.0)

total_count of train files: 175
correctly predicted train files: 175
Percentage accuracy: 100.0


In [54]:
#val accuracy
# Per-recording (majority-vote) accuracy on the validation split.
total_count, correct_count = get_prediction_accuracy(data_dir_val, sub_dirs)
print('total_count of val files:', total_count)
print('correctly predicted val files:', correct_count)
# Raises ZeroDivisionError if the split contains no recordings.
print('Percentage accuracy:', (correct_count/total_count)*100.0)

total_count of val files: 48
correctly predicted val files: 45
Percentage accuracy: 93.75
