In [None]:
import scipy.io as sio
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from scipy.integrate import simps
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import FastICA
from sklearn import preprocessing
from scipy import signal

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset as torchDataset



In [None]:
# Seed the NumPy and PyTorch global random number generators so that runs of
# this notebook are repeatable (np.random.permutation in shuffle_data and the
# torch weight initialisation both draw from these generators).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
# Root directory of the raw per-patient recordings.
# NOTE(review): no other cell in the visible notebook reads `data_dir` --
# confirm whether it is still needed.
data_dir = '/home/ghn8/courses/neuroengineering/project/data'



In [None]:
def get_patient_training_data(pid, data_root='/home/ghn8/courses/neuroengineering/project/data'):
    """Load every labelled training segment of one patient.

    Segments are read from ``<data_root>/patient_<pid>/ictal train`` and
    ``<data_root>/patient_<pid>/non-ictal train``. Files are expected to be
    named ``patient_<pid>_<i>.mat`` with ``i`` running from 1 to the number of
    directory entries, and to store the recording under the key ``'data'``.

    Both classes are loaded as float32 (the dtype used by the test loader), so
    the two classes no longer carry different numerical precision.

    Args:
        pid: Integer patient id.
        data_root: Directory containing the ``patient_<pid>`` folders.

    Returns:
        Tuple ``(train_inputs, train_labels)``. ``train_inputs`` stacks the
        transposed segments (num_channels x timepoints per the original
        comment), ictal segments first; ``train_labels`` holds 1 for ictal and
        0 for non-ictal segments in the same order.

    Raises:
        ValueError: If any segment contains NaN.
    """
    ICTAL = 1
    NONICTAL = 0

    def _load_segments(segment_dir, kind):
        # Read patient_<pid>_1.mat .. patient_<pid>_N.mat from one class directory.
        segments = []
        for i in range(1, len(os.listdir(segment_dir)) + 1):
            file_path = os.path.join(segment_dir, 'patient_%d_%d.mat' % (pid, i))
            data = sio.loadmat(file_path)['data'].astype(np.float32)
            if np.isnan(data).any():
                raise ValueError("NaN in %s data" % kind)
            segments.append(data.T)  # transpose data to get num_channels x timepoints
        return segments

    patient_dir = os.path.join(data_root, 'patient_%d' % pid)
    ictal_segments = _load_segments(os.path.join(patient_dir, 'ictal train'), 'ictal')
    nonictal_segments = _load_segments(os.path.join(patient_dir, 'non-ictal train'), 'non-ictal')

    train_inputs = np.asarray(ictal_segments + nonictal_segments)
    train_labels = np.asarray([ICTAL] * len(ictal_segments) + [NONICTAL] * len(nonictal_segments))

    return train_inputs, train_labels

In [None]:
def get_patient_test_data(pid, data_root='/home/ghn8/courses/neuroengineering/project/data'):
    """Load every unlabelled test segment of one patient.

    Segments are read from ``<data_root>/patient_<pid>/test``. Candidate file
    names ``patient_<pid>_test_<i>.mat`` are probed for ``i`` from 1 to the
    number of directory entries; names that do not exist are skipped.

    Args:
        pid: Integer patient id.
        data_root: Directory containing the ``patient_<pid>`` folders.

    Returns:
        Array stacking the transposed float32 segments (num_channels x
        timepoints per the original comment), with NaN replaced by 0 and
        infinities replaced by large finite values (``np.nan_to_num``).
    """
    test_dir = os.path.join(data_root, 'patient_%d' % pid, 'test')

    test_inputs = []
    for i in range(1, len(os.listdir(test_dir)) + 1):
        file_path = os.path.join(test_dir, 'patient_%d_test_%d.mat' % (pid, i))
        if not os.path.exists(file_path):
            # NOTE(review): a gap in the numbering shifts the position of all
            # later segments; the driver cell derives submission ids from
            # positions -- confirm the numbering is contiguous.
            continue
        data = sio.loadmat(file_path)['data'].astype(np.float32)
        # nan_to_num guarantees a NaN-free array, so no further NaN check is needed.
        data = np.nan_to_num(data)
        test_inputs.append(data.T)  # transpose data to get num_channels x timepoints

    return np.asarray(test_inputs)

In [None]:
# Build (or reload) the per-patient training data cache.
NUM_PATIENTS = 7
all_train_data = {}

train_data_dump_file = os.path.join('/home/ghn8/courses/neuroengineering/project/preprocessed/all_data', 'all_train_data.pkl')

if not os.path.exists(train_data_dump_file):
    for pid in range(1, NUM_PATIENTS+1):
        train_inputs, train_labels = get_patient_training_data(pid)
        all_train_data['patient_%d' % pid] = { 'inputs': train_inputs, 'labels': train_labels}
        print('Finished reading data for patient %d' % pid)

    # Make sure the cache directory exists so the freshly loaded data is not
    # lost to a failing open(), and close the handle deterministically.
    os.makedirs(os.path.dirname(train_data_dump_file), exist_ok=True)
    with open(train_data_dump_file, 'wb') as dump_fh:
        pickle.dump(all_train_data, dump_fh)
else:
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    with open(train_data_dump_file, 'rb') as dump_fh:
        all_train_data = pickle.load(dump_fh)


In [None]:
# Build (or reload) the per-patient test data cache.
all_test_data = {}

test_data_dump_file = os.path.join('/home/ghn8/courses/neuroengineering/project/preprocessed/all_data', 'all_test_data.pkl')

if not os.path.exists(test_data_dump_file):
    for pid in range(1, NUM_PATIENTS+1):
        test_inputs = get_patient_test_data(pid)
        all_test_data['patient_%d' % pid] = test_inputs
        print('Finished reading test data for patient %d' % pid)

    # Make sure the cache directory exists so the freshly loaded data is not
    # lost to a failing open(), and close the handle deterministically.
    os.makedirs(os.path.dirname(test_data_dump_file), exist_ok=True)
    with open(test_data_dump_file, 'wb') as dump_fh:
        pickle.dump(all_test_data, dump_fh)
else:
    # NOTE: pickle.load can execute arbitrary code -- only load cache files
    # that were produced by this notebook.
    with open(test_data_dump_file, 'rb') as dump_fh:
        all_test_data = pickle.load(dump_fh)


In [None]:
def fft(timeseries):
    """Return the base-10 log of the magnitude of the real FFT of `timeseries`.

    The transform is taken along the last axis (``np.fft.rfft`` default).
    Bins with zero magnitude come out as ``-inf``.
    """
    spectrum = np.fft.rfft(timeseries)
    magnitude = np.absolute(spectrum)
    return np.log10(magnitude)

def get_spectral_features(timeseries):
    """Log-magnitude spectrum of `timeseries` with infinite entries zeroed.

    Delegates to ``fft`` and then overwrites every +/-inf value (e.g. the
    ``-inf`` produced by log10 of a zero-magnitude bin) with 0.0.
    """
    log_spectrum = fft(timeseries)
    # np.isinf is true for both +inf and -inf.
    infinite_bins = np.isinf(log_spectrum)
    log_spectrum[infinite_bins] = 0.0
    return log_spectrum
    
    

In [None]:
def shuffle_data(features, labels):
    """Shuffle `features` and `labels` along axis 0 with one shared permutation.

    The permutation is drawn from NumPy's global random state. Returns the
    reordered ``(features, labels)`` pair; the inputs are not modified.
    """
    sample_count = features.shape[0]
    assert sample_count == labels.shape[0], "Mismatched number of samples"

    order = np.random.permutation(sample_count)
    return features[order, :], labels[order]


In [None]:
def transform_inputs_to_spectral_features(inputs):
    """Apply ``get_spectral_features`` to every sample of a 3-D input array.

    Each ``inputs[i]`` is squeezed before the transform; the per-sample
    results are stacked into one array in the original sample order.
    """
    per_sample = [
        get_spectral_features(inputs[idx, :, :].squeeze())
        for idx in range(inputs.shape[0])
    ]
    return np.asarray(per_sample)


In [None]:
def ica_clean(data, n_components):
    """Fit a FastICA model with `n_components` on `data` and return the transformed data.

    NOTE(review): callers in this notebook pass a channels x timepoints array;
    scikit-learn treats rows as samples -- confirm this orientation is intended.
    """
    return FastICA(n_components=n_components).fit_transform(data)

In [None]:
def get_temporal_features(inputs, num_eigenvalues=-1, num_icas=0):
    """Correlation-based features for every sample of a 3-D input array.

    For each sample the (optionally ICA-transformed) signal is standardised
    with ``preprocessing.scale``, the row-wise correlation matrix is computed,
    and the feature vector is the strict upper triangle of that matrix
    followed by its eigenvalues in ascending order.

    Args:
        inputs: 3-D array indexed as ``inputs[sample, :, :]``.
        num_eigenvalues: If > 0, keep only the first (smallest)
            ``num_eigenvalues`` eigenvalues; otherwise keep all of them.
        num_icas: If > 0, run ``ica_clean`` with this many components first.

    Returns:
        2-D real-valued array with one feature row per sample.
    """
    num_samples = inputs.shape[0]
    features = []

    for i in range(num_samples):
        sample = inputs[i, :, :].squeeze()

        if num_icas > 0:
            sample = ica_clean(sample, num_icas)

        # NOTE(review): preprocessing.scale standardises along axis 0 by
        # default -- confirm that is the intended axis for this layout.
        norm_sample = preprocessing.scale(sample)
        corr = np.corrcoef(norm_sample)

        upper_triag_indices = np.triu_indices(corr.shape[0], 1)
        upper_triag_vals = corr[upper_triag_indices].ravel()

        # A correlation matrix is symmetric, so use the symmetric solver: it
        # always returns real eigenvalues (np.linalg.eig may return a complex
        # array, which would turn the whole feature vector complex) and it
        # already returns them in ascending order.
        eigenvalues = np.linalg.eigvalsh(corr)

        if num_eigenvalues > 0:
            eigenvalues = eigenvalues[:num_eigenvalues]

        features.append(np.concatenate((upper_triag_vals, eigenvalues)))

    return np.asarray(features)


In [None]:
def transform_inputs_to_features(inputs):
    """Build one flat feature vector per sample from spectral and temporal features.

    The previous version referenced an undefined ``lpf_freq``, passed it as an
    extra argument that ``transform_inputs_to_spectral_features`` does not
    accept, and concatenated the spectral features with themselves, so the
    temporal features were computed but never used.

    Args:
        inputs: 3-D array indexed as ``inputs[sample, :, :]``.

    Returns:
        2-D array ``(num_samples, num_features)``: the flattened spectral
        features of each sample followed by its temporal features.
    """
    spectral_features = transform_inputs_to_spectral_features(inputs)
    temporal_features = get_temporal_features(inputs)

    # Spectral features keep one axis per channel/frequency bin; flatten them
    # per sample so they can be joined with the 2-D temporal features.
    num_samples = inputs.shape[0]
    spectral_flat = spectral_features.reshape(num_samples, -1)

    features = np.concatenate((spectral_flat, temporal_features), axis=1)

    return features


In [None]:
def compute_mean_signal_by_windows(data, num_windows):
    """Downsample along the time axis by averaging consecutive windows.

    The time axis is cut into ``num_windows`` windows of
    ``num_timepoints // num_windows`` points each; the last window also
    absorbs any remaining time points, so no data is dropped when the length
    is not an exact multiple of ``num_windows``.

    Args:
        data: Array of shape ``(num_samples, num_timepoints, num_channels)``.
        num_windows: Number of output time steps; must satisfy
            ``1 <= num_windows <= num_timepoints``.

    Returns:
        Float array of shape ``(num_samples, num_windows, num_channels)``.

    Raises:
        ValueError: If ``num_windows`` is outside the valid range (windows
            would be empty and their mean NaN).
    """
    num_samples, num_timepoints, num_channels = data.shape
    if num_windows <= 0 or num_windows > num_timepoints:
        raise ValueError(
            "num_windows must be between 1 and the number of timepoints (%d), got %d"
            % (num_timepoints, num_windows))

    new_data = np.zeros((num_samples, num_windows, num_channels))
    window = num_timepoints // num_windows

    for i in range(num_windows):
        start_tp = i * window
        # The final window runs to the end of the recording.
        end_tp = num_timepoints if i == (num_windows - 1) else start_tp + window
        new_data[:, i, :] = np.mean(data[:, start_tp:end_tp, :], axis=1)

    return new_data

In [None]:
class EEGDataset(torchDataset):
    """Map-style dataset pairing a 3-D feature array with per-sample labels."""

    def __init__(self, features, labels):
        """Store the arrays.

        Args:
            features: 3-D NumPy array indexed as ``features[sample, :, :]``.
            labels: NumPy array with one entry (or one row) per sample.
        """
        self.features = features
        self.labels = labels

    def __getitem__(self, index):
        """Return ``(features, label)`` for one sample.

        ``features`` is the sample's 2-D slice as float64. ``label`` is
        always at least 1-D, so labels stored as a flat ``(N,)`` vector and
        labels stored as ``(N, 1)`` both yield shape ``(1,)`` (and batch to
        ``(batch, 1)``). The previous ``np.expand_dims(..., axis=1)`` is out
        of bounds for the scalar obtained from a flat label vector.
        """
        features = self.features[index, :,  :].astype('float')
        label = np.atleast_1d(self.labels[index])

        return features, label

    def __len__(self):
        """Number of samples (size of the first feature axis)."""
        return self.features.shape[0]

In [None]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class LSTMClassifier(nn.Module):
    """Single-layer LSTM followed by dropout and a linear layer producing one logit."""

    def __init__(self, input_dim, hidden_dim):
        """
        Args:
            input_dim: Number of features per time step.
            hidden_dim: Size of the LSTM hidden state.
        """
        super(LSTMClassifier, self).__init__()

        self.num_layers = 1
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=self.num_layers)

        self.hidden2out = nn.Linear(hidden_dim, 1)
        self.dropout_layer = nn.Dropout(p=0.1)

    def init_hidden(self, batch_size, device=None, dtype=None):
        """Draw a random initial ``(h_0, c_0)`` pair.

        Args:
            batch_size: Number of sequences in the batch.
            device: Optional device for the state tensors (torch default if None).
            dtype: Optional dtype for the state tensors (torch default if None).

        Returns:
            Tuple of two tensors of shape ``(num_layers, batch_size, hidden_dim)``.
        """
        # NOTE(review): a random initial state is drawn on every call, so the
        # output is stochastic even in eval mode -- confirm this is intended.
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (torch.randn(shape, device=device, dtype=dtype),
                torch.randn(shape, device=device, dtype=dtype))

    def forward(self, batch):
        """Return one logit per sequence.

        Args:
            batch: Tensor of shape ``(seq_len, batch_size, input_dim)``
                (the nn.LSTM default layout; batch size is read from dim 1).

        Returns:
            Tensor of shape ``(batch_size, 1)`` with un-normalised logits.
        """
        # Create the state on the same device/dtype as the input so the model
        # also works after .to(device) or .double().
        self.hidden = self.init_hidden(batch.size(1), device=batch.device, dtype=batch.dtype)

        outputs, (ht, ct) = self.lstm(batch, self.hidden)

        # ht is the last hidden state of the sequences
        # ht = (1 x batch_size x hidden_dim)
        # ht[-1] = (batch_size x hidden_dim)
        output = self.dropout_layer(ht[-1])
        output = self.hidden2out(output)

        return output
    

def train(train_loader, num_epochs, num_in_channels, num_hidden, learning_rate):
    """Train a fresh ``LSTMClassifier`` with Adam and binary cross-entropy on logits.

    Args:
        train_loader: Iterable of ``(features, label)`` tensor batches with
            features laid out as ``(batch, time, channels)``; must support ``len()``.
        num_epochs: Maximum number of passes over ``train_loader``.
        num_in_channels: Input feature size of the LSTM.
        num_hidden: Hidden state size of the LSTM.
        learning_rate: Adam learning rate.

    Returns:
        The trained model. Training stops early once the mean epoch loss
        drops to 1e-4 or below.
    """
    model = LSTMClassifier(num_in_channels, num_hidden)
    model.train()

    # Loss and optimizer
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        for features, label in train_loader:
            # The LSTM expects (time, batch, channels).
            features = features.transpose(0, 1).type(torch.FloatTensor)

            # Forward pass
            optimizer.zero_grad()
            output = model(features)

            # Align the targets with the (batch, 1) logits regardless of
            # whether the dataset yields (batch,), (batch, 1) or (batch, 1, 1).
            label = label.type(torch.FloatTensor).reshape(output.shape)
            loss = criterion(output, label)

            # Backward and optimize. The graph is rebuilt on every forward
            # pass, so it does not need to be retained.
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        epoch_loss = epoch_loss / len(train_loader)
        print("\tEpoch [%d/%d], Loss: %.4f" % (epoch+1, num_epochs, epoch_loss))
        if epoch_loss <= 1e-4:
            break

    return model

def predict(model, test_loader):
    """Return the predicted positive-class probability for every sample.

    Args:
        model: Module mapping ``(time, batch, channels)`` inputs to
            ``(batch, 1)`` logits; it is switched to eval mode.
        test_loader: Iterable of ``(inputs, labels)`` batches with inputs laid
            out as ``(batch, time, channels)``. The labels are ignored.

    Returns:
        float64 array of shape ``(num_samples, 1)`` in loader order
        (shape ``(0, 1)`` for an empty loader).
    """
    model.eval()

    # Seed the list with an empty float64 block so the result keeps shape
    # (0, 1) for an empty loader and is float64 otherwise.
    prob_batches = [np.empty((0, 1))]
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.transpose(0, 1).type(torch.FloatTensor)
            prob_batches.append(torch.sigmoid(model(inputs)).cpu().numpy())

    # Concatenate once instead of re-copying the accumulated array per batch.
    return np.concatenate(prob_batches)

def eval(model, test_loader):
    """Return hard labels and probabilities for every sample of a loader.

    NOTE: the name shadows Python's built-in ``eval``; it is kept because
    other cells call it.

    Args:
        model: Module mapping ``(time, batch, channels)`` inputs to
            ``(batch, 1)`` logits; it is switched to eval mode.
        test_loader: Iterable of ``(inputs, labels)`` batches with inputs laid
            out as ``(batch, time, channels)``. The labels are ignored.

    Returns:
        Tuple ``(predicted_labels, probs)`` of 1-D float64 arrays of length
        ``num_samples``; a label is 1 where the probability exceeds 0.5.
        Both stay 1-D even for a single sample.
    """
    model.eval()

    prob_batches = [np.empty((0, 1))]
    with torch.no_grad():
        for inputs, _ in test_loader:
            inputs = inputs.transpose(0, 1).type(torch.FloatTensor)
            prob_batches.append(torch.sigmoid(model(inputs)).cpu().numpy())

    all_probs = np.concatenate(prob_batches)
    all_predicted_labels = np.where(all_probs > 0.5, 1.0, 0.0)

    # reshape(-1) rather than squeeze(): squeeze() would collapse a
    # single-sample result to a 0-d array.
    return all_predicted_labels.reshape(-1), all_probs.reshape(-1)

In [None]:
# Per-patient driver: train an LSTM on each patient's training segments and
# either write test-set predictions (is_testing=True) or report hold-out
# accuracy / AUC (is_testing=False).
os.environ["CUDA_VISIBLE_DEVICES"] ="5"
is_testing = True

# Hyperparameters (identical for every patient)
num_epochs = 20
batch_size = 100
learning_rate = 0.1
num_hidden = 32
# Recordings longer than this are averaged down to this many time steps.
max_timepoints = 500

for pid in range(1, NUM_PATIENTS+1):
    train_inputs = all_train_data['patient_%d' % pid]['inputs']
    train_labels = all_train_data['patient_%d' % pid]['labels']
    test_inputs = all_test_data['patient_%d' % pid]

    output_file = os.path.join('/home/ghn8/courses/neuroengineering/project/gia/outputs', 'cnn_spectral_LSTM_decay1e-5_pid%d.csv' % pid)

    # (sample, channel, time) -> (sample, time, channel)
    train_inputs = np.swapaxes(train_inputs, 1, 2)
    test_inputs = np.swapaxes(test_inputs, 1, 2)

    # Apply the same temporal down-sampling to BOTH splits; previously only
    # the training data was windowed, so the model was evaluated on raw
    # full-length sequences it had never been trained on.
    if train_inputs.shape[1] > max_timepoints:
        train_inputs = compute_mean_signal_by_windows(train_inputs, max_timepoints)
    if test_inputs.shape[1] > max_timepoints:
        test_inputs = compute_mean_signal_by_windows(test_inputs, max_timepoints)
    num_samples, num_timepoints, num_channels = train_inputs.shape
    print(train_inputs.shape)
    print(test_inputs.shape)

    if is_testing:
        print("Predicting patient %d" % pid)

        train_dataset = EEGDataset(features=train_inputs,
                                  labels=train_labels)
        test_dataset = EEGDataset(features=test_inputs,
                                  labels=np.zeros((test_inputs.shape[0], 1)))

        # Data loader. The training segments are stored class by class, so
        # they must be shuffled; otherwise every batch holds a single class.
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True)
        # Test order must be preserved: ids are derived from the position.
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False)

        model = train(train_loader, num_epochs, num_channels, num_hidden, learning_rate)
        predicted_probs = predict(model, test_loader)

        with open(output_file, 'w') as fh:
            # NOTE(review): only the first patient's file gets a header --
            # presumably the per-patient files are concatenated afterwards; confirm.
            if pid == 1:
                fh.write("id,prediction\n")
            for i in range(len(predicted_probs)):
                # predicted_probs is (num_samples, 1); format the scalar entry.
                fh.write("patient_%d_%d,%f\n" % (pid, i+1, predicted_probs[i, 0]))
    else:
        print("Evaluating patient %d" % pid)
        val_fraction = 0.2

        # create training and validation data
        [shuffled_features, shuffled_labels] = shuffle_data(train_inputs, train_labels)

        num_val_samples = int(num_samples * val_fraction)

        val_features = shuffled_features[:num_val_samples, :]
        val_labels = shuffled_labels[:num_val_samples]

        bootstraped_train_features = shuffled_features[num_val_samples:, :]
        bootstraped_train_labels = shuffled_labels[num_val_samples:]

        train_dataset = EEGDataset(features=bootstraped_train_features,
                                  labels=bootstraped_train_labels)
        test_dataset = EEGDataset(features=val_features,
                                  labels=val_labels)

        # Data loader (training batches are reshuffled every epoch; the
        # validation order must match val_labels)
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                  batch_size=batch_size,
                                                  shuffle=True)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                 batch_size=batch_size,
                                                 shuffle=False)

        model = train(train_loader, num_epochs, num_channels, num_hidden, learning_rate)
        all_predicted_labels, all_probs = eval(model, test_loader)
        accuracy = (all_predicted_labels == val_labels).sum() * 100.0 / val_labels.shape[0]

        print("Accuracy: %.2f" % accuracy + "%")
        print("AUC :", roc_auc_score(val_labels, all_probs))

