In [None]:
import scipy.io as sio
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy import signal
from scipy.integrate import simps
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import FastICA
from sklearn import preprocessing
from scipy import signal


In [None]:
# Seed NumPy's global random state once so that calls made through np.random
# later in the notebook are repeatable across Restart & Run All.
SEED = 42
np.random.seed(SEED)
# Root directory of the raw recordings.
# NOTE(review): the loader functions in this notebook embed this same path
# literally rather than reading data_dir — confirm whether they should use it.
data_dir = '/home/ghn8/courses/neuroengineering/project/data'



In [None]:
def _load_labeled_recordings(directory, pid, label, error_message):
    """Load ``patient_<pid>_<i>.mat`` for i = 1..N from *directory*.

    N is the number of entries in *directory*, so the directory is expected to
    contain exactly those consecutively numbered files.

    Returns:
        (recordings, labels): a list of transposed float32 arrays and a list
        holding *label* once per recording.

    Raises:
        ValueError: with *error_message* if any recording contains NaN.
    """
    recordings = []
    for index in range(1, len(os.listdir(directory)) + 1):
        file_path = os.path.join(directory, 'patient_%d_%d.mat' % (pid, index))
        # Same dtype for every class: previously ictal data was float32 and
        # non-ictal float64, so the classes differed in numeric precision.
        data = sio.loadmat(file_path)['data'].astype(np.float32)
        if np.isnan(data).any():
            raise ValueError(error_message)
        recordings.append(data.T)  # transpose data to get num_channels x timepoints
    return recordings, [label] * len(recordings)


def get_patient_training_data(pid, data_root='/home/ghn8/courses/neuroengineering/project/data'):
    """Read all training recordings of one patient.

    Args:
        pid: Patient number used in the directory and file names.
        data_root: Directory that contains the ``patient_<pid>`` folders.
            Defaults to the location this notebook has always used.

    Returns:
        (train_inputs, train_labels): the stacked recordings (ictal files
        first, then non-ictal, each transposed from its stored layout, all
        float32) and the matching labels, 1 for ictal and 0 for non-ictal.

    Raises:
        ValueError: if any recording contains NaN.
    """
    ICTAL = 1
    NONICTAL = 0

    patient_dir = os.path.join(data_root, 'patient_%d' % pid)
    ictal_data_dir = os.path.join(patient_dir, 'ictal train')
    nonictal_data_dir = os.path.join(patient_dir, 'non-ictal train')

    ictal_inputs, ictal_labels = _load_labeled_recordings(
        ictal_data_dir, pid, ICTAL, "NaN in ictal data")
    nonictal_inputs, nonictal_labels = _load_labeled_recordings(
        nonictal_data_dir, pid, NONICTAL, "NaN in non-ictal data")

    train_inputs = np.asarray(ictal_inputs + nonictal_inputs)
    train_labels = np.asarray(ictal_labels + nonictal_labels)

    return train_inputs, train_labels

In [None]:
def get_patient_test_data(pid, data_root='/home/ghn8/courses/neuroengineering/project/data'):
    """Read all test recordings of one patient.

    Args:
        pid: Patient number used in the directory and file names.
        data_root: Directory that contains the ``patient_<pid>`` folders.
            Defaults to the location this notebook has always used.

    Returns:
        The stacked float32 recordings, each transposed from its stored
        layout, with NaN replaced by 0 (and infinities by large finite
        values) via ``np.nan_to_num``.
    """
    test_dir = os.path.join(data_root, 'patient_%d' % pid, 'test')

    test_inputs = []

    # Indices 1..N are probed, where N is the number of entries in test_dir;
    # indices without a matching file are skipped.
    # NOTE(review): if the numbering has gaps, files numbered above N are never
    # read and positions no longer line up with file numbers — confirm the
    # test folders are numbered contiguously.
    for index in range(1, len(os.listdir(test_dir)) + 1):
        file_path = os.path.join(test_dir, 'patient_%d_test_%d.mat' % (pid, index))
        if not os.path.exists(file_path):
            continue
        data = sio.loadmat(file_path)['data'].astype(np.float32)
        # nan_to_num leaves no NaN behind, so the former post-check could
        # never fire and has been removed.
        data = np.nan_to_num(data)
        test_inputs.append(data.T)  # transpose data to get num_channels x timepoints

    return np.asarray(test_inputs)

In [None]:
# Number of patients; patient ids run from 1 to NUM_PATIENTS inclusive.
NUM_PATIENTS = 7
# Maps 'patient_<pid>' -> {'inputs': ..., 'labels': ...}.
all_train_data = {}

train_data_dump_file = os.path.join('/home/ghn8/courses/neuroengineering/project/preprocessed/all_data', 'all_train_data.pkl')

# The raw .mat files are only parsed when no pickled cache exists yet.
if os.path.exists(train_data_dump_file):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(train_data_dump_file, 'rb') as cache_fh:
        all_train_data = pickle.load(cache_fh)
else:
    for pid in range(1, NUM_PATIENTS+1):
        train_inputs, train_labels = get_patient_training_data(pid)
        all_train_data['patient_%d' % pid] = { 'inputs': train_inputs, 'labels': train_labels}
        print('Finished reading data for patient %d' % pid)

    # Context manager so the cache file is flushed and closed deterministically.
    with open(train_data_dump_file, 'wb') as cache_fh:
        pickle.dump(all_train_data, cache_fh)


In [None]:
# Maps 'patient_<pid>' -> stacked test recordings of that patient.
all_test_data = {}

test_data_dump_file = os.path.join('/home/ghn8/courses/neuroengineering/project/preprocessed/all_data', 'all_test_data.pkl')

# The raw .mat files are only parsed when no pickled cache exists yet.
if os.path.exists(test_data_dump_file):
    # NOTE: pickle.load can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    with open(test_data_dump_file, 'rb') as cache_fh:
        all_test_data = pickle.load(cache_fh)
else:
    for pid in range(1, NUM_PATIENTS+1):
        test_inputs = get_patient_test_data(pid)
        all_test_data['patient_%d' % pid] = test_inputs
        print('Finished reading test data for patient %d' % pid)

    # Context manager so the cache file is flushed and closed deterministically.
    with open(test_data_dump_file, 'wb') as cache_fh:
        pickle.dump(all_test_data, cache_fh)


In [None]:
def fft(timeseries):
    """Return log10 of the magnitude of the real-input FFT of *timeseries*.

    The transform runs along the last axis (the np.fft.rfft default). Bins
    whose magnitude is exactly zero come out as -inf.
    """
    spectrum = np.fft.rfft(timeseries)
    magnitude = np.abs(spectrum)
    return np.log10(magnitude)

def get_spectral_features(timeseries, min_f=1, max_f=50):
    """Log-magnitude spectrum of each row, restricted to bins [min_f, max_f).

    Args:
        timeseries: 2-D array; the spectrum is taken along the last axis.
        min_f: Index of the first frequency bin to keep.
        max_f: Index one past the last frequency bin to keep.

    Returns:
        2-D array with one row per input row and the selected bins as columns.
        Infinite entries (log10 of a zero magnitude) are replaced by 0.0.
    """
    log_spectrum = fft(timeseries)
    # np.isinf flags both +inf and -inf.
    log_spectrum[np.isinf(log_spectrum)] = 0.0
    return log_spectrum[:, min_f:max_f]
    
    

In [None]:
def shuffle_data(features, labels):
    """Apply one random row permutation to *features* and *labels* alike.

    The permutation is drawn from NumPy's global random state. Both inputs
    must have the same length along axis 0; otherwise an AssertionError is
    raised. Returns new arrays; the inputs are not modified.
    """
    sample_count = features.shape[0]
    assert sample_count == labels.shape[0], "Mismatched number of samples"

    order = np.random.permutation(sample_count)
    return features[order, :], labels[order]


In [None]:
def transform_inputs_to_spectral_features(inputs, lpf_freq):
    """Turn each sample into a flat vector of log-magnitude spectral bins.

    Args:
        inputs: 3-D array indexed as (sample, row, column); every sample is
            passed to get_spectral_features as a 2-D array.
        lpf_freq: Upper bin index (exclusive) to keep. It is doubled for
            samples with more than 500 columns.
            NOTE(review): the 500 threshold presumably separates two
            recording lengths or sampling rates — confirm.

    Returns:
        2-D array with one row of flattened spectral features per sample.
    """
    min_f = 1
    features = []

    for sample in inputs:
        # No squeeze here: each slice is already 2-D, and squeezing turned a
        # single-channel sample into a 1-D array, which made sample.shape[1]
        # raise IndexError.
        max_f = lpf_freq * 2 if sample.shape[1] > 500 else lpf_freq
        spectral_features = get_spectral_features(sample, min_f=min_f, max_f=max_f)
        features.append(spectral_features.ravel())

    return np.asarray(features)


In [None]:
def ica_clean(data, n_components):
    """Fit a fresh FastICA model on *data* and return the transformed data.

    Args:
        data: Array handed to FastICA.fit_transform as-is.
        n_components: Number of components requested from FastICA.

    Returns:
        The result of FastICA(n_components=n_components).fit_transform(data).
    """
    return FastICA(n_components=n_components).fit_transform(data)

In [None]:
def get_temporal_features(inputs, num_eigenvalues=-1, num_icas=0):
    """Build correlation-based features for every sample.

    For each sample the row-by-row correlation matrix is computed; the
    feature vector is its strict upper triangle followed by its eigenvalues
    in ascending order.

    Args:
        inputs: 3-D array indexed as (sample, row, column).
        num_eigenvalues: If positive, keep only the first this-many
            eigenvalues of the ascending list, i.e. the smallest ones.
            NOTE(review): confirm the smallest rather than the largest
            eigenvalues are the intended selection.
        num_icas: If positive, each sample is first passed through
            ica_clean with this many components.

    Returns:
        2-D array with one feature row per sample.

    Raises:
        numpy.linalg.LinAlgError: if a correlation matrix contains NaN or
            infinite entries.
    """
    num_samples = inputs.shape[0]
    features = []

    for i in range(num_samples):
        sample = inputs[i, :, :].squeeze()

        if num_icas > 0:
            sample = ica_clean(sample, num_icas)

        # NOTE(review): preprocessing.scale standardizes along axis 0 by
        # default, i.e. each column across rows. If rows are channels this
        # scales every timepoint across channels rather than every channel
        # over time — confirm that this is intended.
        norm_sample = preprocessing.scale(sample)
        corr = np.corrcoef(norm_sample)

        upper_triag_indices = np.triu_indices(corr.shape[0], 1)
        upper_triag_vals = corr[upper_triag_indices].ravel()

        # np.linalg.eig rejected non-finite input; keep failing fast in the
        # same way now that eigvalsh (which does not check) is used.
        if not np.isfinite(corr).all():
            raise np.linalg.LinAlgError("Array must not contain infs or NaNs")

        # A correlation matrix is symmetric, so use the symmetric solver: it
        # always returns real eigenvalues, already sorted ascending. The
        # general solver could return complex values, which would have made
        # the concatenated feature vector complex.
        eigenvalues = np.linalg.eigvalsh(corr)

        if num_eigenvalues > 0:
            eigenvalues = eigenvalues[:num_eigenvalues]

        features.append(np.concatenate((upper_triag_vals, eigenvalues)))

    return np.asarray(features)


In [None]:
def transform_inputs_to_features(inputs, lpf_freq=50, n_ica=5):
    """Build the full feature matrix: spectral features followed by temporal ones.

    Args:
        inputs: 3-D array indexed as (sample, row, column).
        lpf_freq: Forwarded to transform_inputs_to_spectral_features.
        n_ica: If positive, the spectral feature matrix is replaced by its
            FastICA transform with this many components.

    Returns:
        2-D array with one row per sample; spectral columns first, then the
        temporal (correlation / eigenvalue) columns.
    """
    spectral_features = transform_inputs_to_spectral_features(inputs, lpf_freq)
    if n_ica > 0:
        # NOTE(review): ica_clean fits a new model on every call, so separate
        # calls for training and test data use different transforms —
        # confirm before enabling this for prediction.
        spectral_features = ica_clean(spectral_features, n_ica)

    temporal_features = get_temporal_features(inputs)

    # Previously the spectral block was concatenated with itself and the
    # temporal features were computed but thrown away.
    features = np.concatenate((spectral_features, temporal_features), axis=1)

    return features


In [None]:
# Channel indices kept for each patient; applied to axis 1 of the train and
# test inputs below.
selected_channels_by_patient = { 'patient_1': [0, 2, 60, 61, 76, 83],
                         'patient_2': [8, 13, 30, 31, 44],
                         'patient_3': [1, 8, 10, 11, 12],
                         'patient_4': [2, 39, 70, 71, 87],
                         'patient_5': [18, 19, 26, 41, 49, 75],
                         'patient_6': [11, 27, 38, 41, 69],
                         'patient_7': [6, 7, 20, 25, 28, 50, 60] }

for n_estimators in [1000]:
    output_file = os.path.join('/home/ghn8/courses/neuroengineering/project/gia/outputs', 'rfc_lpf50+temporal_%de_selected-channels.csv' % n_estimators)
    # True: train on all training data and write test predictions to output_file.
    # False: hold out a validation split and print accuracy / AUC instead.
    is_testing = True

    for pid in range(1, NUM_PATIENTS+1):
        patient_label = 'patient_%d' % pid
        train_inputs = all_train_data[patient_label]['inputs']
        train_labels = all_train_data[patient_label]['labels']
        test_inputs = all_test_data[patient_label]

        selected_channels = selected_channels_by_patient[patient_label]
        train_inputs = train_inputs[:, selected_channels, :]
        test_inputs = test_inputs[:, selected_channels, :]

        # hyperparameters
        lpf_freq = 50
        # ICA stays disabled for every patient. A per-patient value (20 for
        # patient 5, 0 otherwise) used to be computed here but was never
        # passed on; the calls below always received n_ica=0.
        n_ica = 0
        train_features = transform_inputs_to_features(train_inputs, lpf_freq=lpf_freq, n_ica=n_ica)
        test_features = transform_inputs_to_features(test_inputs, lpf_freq=lpf_freq, n_ica=n_ica)

        if is_testing:
            print("Predicting patient %d" % pid)
            rfc = RandomForestClassifier(n_estimators=n_estimators, bootstrap=False)

            # training
            rfc.fit(train_features, train_labels)
            ictal_pred_probs = rfc.predict_proba(test_features)[:, 1]

            # The first patient starts a new file with a header; later
            # patients append to it. The context manager closes the file even
            # if a write fails.
            is_first_patient = (pid == 1)
            with open(output_file, 'w' if is_first_patient else 'a') as fh:
                if is_first_patient:
                    fh.write("id,prediction\n")
                for i in range(len(ictal_pred_probs)):
                    fh.write("patient_%d_%d,%f\n" % (pid, i+1, ictal_pred_probs[i]))
        else:
            print("Evaluating patient %d" % pid)
            val_fraction = 0.2

            # create training and validation data
            [shuffled_features, shuffled_labels] = shuffle_data(train_features, train_labels)

            num_samples = train_features.shape[0]
            num_val_samples = int(num_samples * val_fraction)

            val_features = shuffled_features[:num_val_samples, :]
            val_labels = shuffled_labels[:num_val_samples]

            bootstraped_train_features = shuffled_features[num_val_samples:, :]
            bootstraped_train_labels = shuffled_labels[num_val_samples:]

            rfc = RandomForestClassifier(n_estimators=n_estimators)

            # training
            rfc.fit(bootstraped_train_features, bootstraped_train_labels)
            print("Accuracy: ", rfc.score(val_features, val_labels))
            val_ictal_probs = rfc.predict_proba(val_features)[:, 1]
            print("AUC :", roc_auc_score(val_labels, val_ictal_probs))

