In [1]:
# Section 1
import numpy as np
import matplotlib.pyplot as plt
import h5py
import copy

# Section 2
from scipy import signal

# 1. Loading the data #

Load and preprocess the data: copied from notebook 4 of exercises from week 10

In [9]:
# data_path = ''
fs = 24400 # Hz

# Subjects
n_subjects = 2
subjs_info_loading = {'Subject2' : {'Name' : 'p2-t2', 'Vars' : ['Baseline', 'Angio36', 'RR15', 'TV500']},
                      'Subject21' : {'Name' : 'p6-t6', 'Vars' : ['Baseline', 'Angio36', 'RR15', 'TV125']}
                     }
subjs_info_final = {'Subject2' : {'Name' : 'p2-t2', 'Vars' : ['Baseline', 'Angio36', 'RRC', 'TVC']},
                    'Subject21' : {'Name' : 'p6-t6', 'Vars' : ['Baseline', 'Angio36', 'RRC', 'TVC']}
                   }
animals_labels = ['p2-t2', 'p6-t6']

def load_data_all_subjects(subjs_info_loading, subjs_info_final, fs, type_data = 'Field_Data_Neuro'):
    subjects_names = list(subjs_info_loading.keys())
    subjs_info = subjs_info_final
    all_data = {}
    print('Start loading ... \n')
    for subject in subjects_names:
        name_subj, data_struct = load_data_one_subject(subject, subjs_info_loading, fs, type_data = type_data)
        all_data[name_subj] = data_struct
        print('================== %s loaded. =================='%subject)
    return all_data


def load_data_one_subject(subject, subjs_info_loading, fs, type_data = 'Field_Data_Neuro'):
    name_subj_to_stock = subjs_info_loading[subject]['Name']
    vars_to_load = subjs_info_loading[subject]['Vars']
    data_struct = {}
    
    # file = h5py.File('data\\' + subject + '.mat','r')
    file = h5py.File(subject + '.mat','r')
    
    # To get the names of the fields after decoding ASCII
    all_field_names = get_field_names(file)

    for var in vars_to_load:
        id_field = np.where(all_field_names == var)[0]
        curr_reference_data1 = file['Vagus_Data_Stimuli'][type_data][id_field][0][0]
        curr_reference_data2 = file[curr_reference_data1][0][0]
        final_data = np.transpose(np.asarray(file[curr_reference_data2]))
        
        if var == 'TV800' or var == 'TV500' or var == 'TV125': 
            var_name = 'TVC'
        elif var == 'RR15' or var == 'RR20': 
            var_name = 'RRC'
        else:
            var_name = var
            
        data_struct[var_name] = {}
        data_struct[var_name]['Data'] = final_data
        n_time_pts = np.shape(final_data)[-1]
        data_struct[var_name]['Time_pts'] = np.linspace(0, n_time_pts/fs, n_time_pts)
        
    return name_subj_to_stock, data_struct

def get_field_names(file):
    n_fields,_ = np.shape(file['Vagus_Data_Stimuli']['stimuli_name'])
    all_field_names = []
    for field in range(n_fields):
        curr_reference_field = file['Vagus_Data_Stimuli']['stimuli_name'][field][0]
        curr_field_ASCII = file[curr_reference_field]
        curr_field = decode_ASCII(curr_field_ASCII)
        all_field_names.append(curr_field)
    return np.asarray(all_field_names)


def decode_ASCII(numbers_array):
    name = ''
    squeezed_numbers = np.squeeze(numbers_array)
    for n in squeezed_numbers:
        name += chr(n)
    return name

data = load_data_all_subjects(subjs_info_loading, subjs_info_final, fs)
print('Loading completed \n')

#print('Showing the data file:')
#for key1 in data.keys():
#    print('=========== %s ==========='%key1)
#    for key2 in data[key1].keys():
#        print('--- %s'%key2)
#        for key3 in data[key1][key2].keys():
#            print(' - %s'%key3)
#            print('Shape : ', np.shape(data[key1][key2][key3]))

def cut_all_data_to_established_duration_per_challenge(data):
    '''
    This function is a wrap-up to the function 'cut_data_one_pig_to_established_duration_per_challenge'.
    '''
    
    new_data_struct = {}
    for pig in data.keys():
        print('================== Working on Pig %s =================='%pig)
        data_curr_pig = data[pig]
        data_curr_pig_cut = cut_data_one_pig_to_established_duration_per_challenge(data_curr_pig)
        new_data_struct[pig] = data_curr_pig_cut
        
    return new_data_struct


def cut_data_one_pig_to_established_duration_per_challenge(data_one_pig):
    ''' 
    This function is used to cut the data for each challenge to the duration shown in Suppl. Table 1 in Vallone et al., 2021. 
    We take the first part of the data for each challenge (arbitrary choice). 
    '''
    new_struct = copy.deepcopy(data_one_pig)
    
    dur_baseline = 5 #min
    dur_RRC = 2 #min
    dur_TVC = 2 #min
    
    for challenge in new_struct.keys():
        data_curr_chal = new_struct[challenge]['Data']
        time_pts_curr_chal = new_struct[challenge]['Time_pts']
        t_end_sec = time_pts_curr_chal[-1]
        
        if challenge == 'Baseline': t_end_sec = dur_baseline * 60
        elif challenge == 'RRC': t_end_sec = dur_RRC * 60
        elif challenge == 'TVC': t_end_sec = dur_TVC * 60
            
#         print('Challenge %s , t_end_sec %0.3f'%(challenge, t_end_sec))
            
        id_t_end_curr_chal = find_specific_time_index(time_pts_curr_chal, t_end_sec)
        new_struct[challenge]['Data'] = data_curr_chal[:,:id_t_end_curr_chal]
        new_struct[challenge]['Time_pts'] = time_pts_curr_chal[:id_t_end_curr_chal]
    
    return new_struct

def find_specific_time_index(time_pts, t):
    t_id = np.argmin(np.abs(time_pts - t))
    return t_id

print('Start cutting data ... \n')
cut_data = cut_all_data_to_established_duration_per_challenge(data)
print('Data cutting completed \n')

Start loading ... 



Data is structured pig -> challenge -> data, time

In [12]:
print('Showing the data files:')
for key1 in cut_data.keys():
    print('=========== %s ==========='%key1)
    for key2 in cut_data[key1].keys():
        print('--- %s'%key2)
        for key3 in cut_data[key1][key2].keys():
            print(' - %s'%key3)
            print('Shape : ', np.shape(cut_data[key1][key2][key3]), ', type: ', type(cut_data[key1][key2][key3]))

Showing the data files:
--- Baseline
 - Data
Shape :  (8, 7319999) , type:  <class 'numpy.ndarray'>
 - Time_pts
Shape :  (7319999,) , type:  <class 'numpy.ndarray'>
--- Angio36
 - Data
Shape :  (8, 14780415) , type:  <class 'numpy.ndarray'>
 - Time_pts
Shape :  (14780415,) , type:  <class 'numpy.ndarray'>
--- RRC
 - Data
Shape :  (8, 2927999) , type:  <class 'numpy.ndarray'>
 - Time_pts
Shape :  (2927999,) , type:  <class 'numpy.ndarray'>
--- TVC
 - Data
Shape :  (8, 2927999) , type:  <class 'numpy.ndarray'>
 - Time_pts
Shape :  (2927999,) , type:  <class 'numpy.ndarray'>
--- Baseline
 - Data
Shape :  (16, 7319999) , type:  <class 'numpy.ndarray'>
 - Time_pts
Shape :  (7319999,) , type:  <class 'numpy.ndarray'>
--- Angio36
 - Data
Shape :  (16, 14940159) , type:  <class 'numpy.ndarray'>
 - Time_pts
Shape :  (14940159,) , type:  <class 'numpy.ndarray'>
--- RRC
 - Data
Shape :  (16, 2927999) , type:  <class 'numpy.ndarray'>
 - Time_pts
Shape :  (2927999,) , type:  <class 'numpy.ndarray'>

# 2. Preprocessing #

The paper says they use a 4-order Butterworth filter [1000, 6000] Hz. 

Note that we have fs= 24400Hz, hence by Shannon the recorded signal can have no components above 2*fs 48000. 

Why do we bandpass at 6k Hz then???

We highpass at 1kHz to remove any forms of noise, notably due to the organism itself. 

In [67]:
f_low = 1000 # [Hz]
f_high = 6000 # [Hz]
N = 4 # order of the filter
sos = signal.butter(N, [f_low, f_high], 'bandpass', analog=False, fs=fs, output='sos')

In [184]:
def bandpass_data(data, sos):
    print('Bandpassing the data... \n')
    new_data = copy.deepcopy(data)
    for pig in data.keys():
        print('=========== %s ==========='%pig)
        for challenge in data[pig].keys():
            new_data[pig][challenge] = signal.sosfilt(sos, cut_data[pig][challenge]['Data'])
    return new_data        

Note that here we remove the 'time' field, which is not used

In [185]:
# takes ~40sec to run
data_filtered = bandpass_data(cut_data, sos)

Bandpassing the data... 



Sub-sampling of the signal by a factor 2: 

(Why???)

In [190]:
# takes ~20sec to run
data_subsampled = copy.deepcopy(data_filtered)
# data_subsampled = data_blank
for pig in data.keys():
    for challenge in data[pig].keys():
        data_subsampled[pig][challenge] = data_subsampled[pig][challenge][:,0:-1:2]

# 3. Split into test/train, perform windowing, extract features #

Following the paper, we use a 90-10 train-test split

In [234]:
def split_data(data, tt_split):
    data_train = {'Baseline': {}, 'Angio36' : {}, 'RRC': {}, 'TVC': {}}
    data_test = dict.fromkeys(data.keys(), {})
    for challenge in data.keys():
        L = data[challenge].shape[1]
        data_train[challenge]= data[challenge][:,0:int(tt_split*L)]
        data_test[challenge]= data[challenge][:,int(tt_split*L):]
    return data_train, data_test

In [250]:
# Could make a dict of dicts but for two pigs it's unecessary
tt_split = 0.9
pig1 = 'p2-t2'
pig2 = 'p6-t6'
pig1_train_data, pig1_test_data = split_data(data_subsampled[pig1], tt_split)
pig2_train_data, pig2_test_data = split_data(data_subsampled[pig2], tt_split)

Following this we have the following data structure: four separate sets, each set contains the four challenges, for each challenge the data

Compute features, as given in the paper. 

Note the structure used:

- compute_features computes features on the time series for a single channel, for a single challenge
- windowing_and_features generates windows and computes features on each window from the [channels x time] array for a given challenge
- data_windowing_and_features creates data and label vectors from the data {challenges x [channels x time]}

Other implementation detail: it's easier to perform windowing and features computation in the same function, because that way the dimension you gain from windiwing is lost in computing features, then you don't need to use 3-d arrays

In [251]:
def compute_features(data):
    # [TODO]
    '''
    :param data: vector [timesteps]
    '''
    mean = ...
    variance = ...
    skew = ...
    kurtosis = ...
    mav = ...
    max = ...
    amp = ...
    wh = ...
    pow = ...
    return np.array([mean, variance, skew, kurtosis, mav, max, amp, wh, pow])

In [252]:
def windowing_and_features(data, windowSize_n, windowOverlap_n):
    '''
    :param data: ndarray [channels x timesteps]
    '''
    windowStep_n = windowSize_n - windowOverlap_n
    features = np.array([])
    for channel in data:
        num_windows = int((channel.shape[0]-windowSize_n)/windowStep_n)
        windows = np.array([])
        for i in range(num_windows+1):
            windows = np.append(windows, data[i*windowStep_n:i*windowStep_n+windowSize_n])
        features = compute_features(windows)
    return features

In [253]:
def data_windowing_and_features(data, windowSize_n, windowOverlap_n):
    '''
    :param data: dictionnary with four keys (challenges), each challenge contains an ndarray [channels x timesteps]
    '''
    features = np.empty(0)
    labels = np.empty(0)
    for challenge in data.keys():
        print(challenge)
        challenge_features = windowing_and_features(data[challenge], windowSize_n, windowOverlap_n)
        label = list(data).index(challenge) # converts to an index
        challenge_labels = label* np.ones(challenge_features.shape[0])
        features = np.append(features, challenge_features)
        labels = np.append(labels, challenge_labels)
    return features, labels

In [254]:
windowSize_t = 50 # [ms]
windowSize_n = windowSize_t * fs #[timesteps]
windowOverlap_train_t = 20 #[ms]
windowOverlap_train_n = windowOverlap_train_t * fs
windowOverlap_test_n = 0

pig1_train_X, pig1_train_y = data_windowing_and_features(pig1_train_data, windowSize_n, windowOverlap_train_n)
pig1_test_X, pig1_test_y = data_windowing_and_features(pig1_test_data, windowSize_n, windowOverlap_test_n)

pig2_train_X, pig2_train_y = data_windowing_and_features(pig2_train_data, windowSize_n, windowOverlap_train_n)
pig2_test_X, pig2_test_y = data_windowing_and_features(pig2_test_data, windowSize_n, windowOverlap_test_n)