In [None]:
### concats training brain data and brain labels
### gets rid of voxels that have missing label values
### normalizes training data to get mean and std files
### outputs: concated data/labels, 80/20 train/validate partitions
### saved in training_inputs

### outputs: normalized testing data
### saved in testing_inputs

In [1]:
import numpy as np

In [None]:
# available data: C03, C04, C05, C06, C07, C08, P01, P02
# TRAINING: C03 - C06, P01 (5 brains)
# TESTING: C07, C08, P02 (3 brains)

### load TRAINING/VALIDATION data 

In [2]:
#load preprocessed data for TRAINING

C03_data = np.load('../preprocessed_data/C03_preprocessed_data.npy')
C04_data = np.load('../preprocessed_data/C04_preprocessed_data.npy')
C05_data = np.load('../preprocessed_data/C05_preprocessed_data.npy')
C06_data = np.load('../preprocessed_data/C06_preprocessed_data.npy')
P01_data = np.load('../preprocessed_data/P01_preprocessed_data.npy')

In [3]:
#load preprocessed labels for TRAINING

C03_labels = np.load('../preprocessed_data/C03_labels.npy')
C04_labels = np.load('../preprocessed_data/C04_labels.npy')
C05_labels = np.load('../preprocessed_data/C05_labels.npy')
C06_labels = np.load('../preprocessed_data/C06_labels.npy')
P01_labels = np.load('../preprocessed_data/P01_labels.npy')

In [4]:
orig_data = np.concatenate((C03_data, C04_data, C05_data, C06_data, P01_data),axis=0)
orig_data.shape

(3046406, 32)

In [5]:
orig_labels = np.concatenate((C03_labels, C04_labels, C05_labels, C06_labels, P01_labels),axis=0)
orig_labels.shape

(3046406, 40)

In [6]:
orig_labels

array([[0.01878405, 0.02175641, 0.02371561, ..., 0.09441077, 0.09964417,
        0.10438113],
       [0.        , 0.        , 0.        , ..., 0.04352095, 0.04483591,
        0.0460328 ],
       [0.        , 0.        , 0.        , ..., 0.04056251, 0.04269485,
        0.04465765],
       ...,
       [0.12500669, 0.09725073, 0.05055761, ..., 0.01146965, 0.00939775,
        0.0074357 ],
       [0.14649108, 0.        , 0.        , ..., 0.        , 0.        ,
        0.07779288],
       [0.15930531, 0.09019263, 0.0053288 , ..., 0.02778104, 0.0487898 ,
        0.06824179]])

In [7]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [8]:
data, labels = unison_shuffled_copies(orig_data, orig_labels)

In [9]:
print(data.shape)
print(labels.shape)

(3046406, 32)
(3046406, 40)


In [10]:
# if first feature/column of data is 0, change to nan
data[data[:,0]==0]='nan'

In [23]:
#data = data[~np.isnan(data).any(axis=1)]

In [11]:
data

array([[ 921.,  835.,  686., ...,   19.,   16.,   13.],
       [1101.,  973.,  802., ...,   40.,   27.,   30.],
       [1000.,  850.,  734., ...,   11.,   13.,   15.],
       ...,
       [1011.,  868.,  772., ...,   44.,   57.,   49.],
       [ 677.,  651.,  531., ...,   22.,   26.,    4.],
       [1133.,  939.,  797., ...,   15.,    3.,   20.]])

In [12]:
data.shape

(3046406, 32)

In [13]:
# mask of nans in first feature
# if first feature is 0, then get rid of that voxel/row
mask = ~np.isnan(data).any(axis=1)

# kevin did this to: data = data[~np.isnan(data).any(axis=1)]
# we have 40 labels though

In [14]:
masked_labels = labels[mask]

In [15]:
masked_data = data[mask]

In [16]:
masked_labels.shape

(3046406, 40)

In [17]:
masked_data.shape

(3046406, 32)

In [18]:
masked_data

array([[ 921.,  835.,  686., ...,   19.,   16.,   13.],
       [1101.,  973.,  802., ...,   40.,   27.,   30.],
       [1000.,  850.,  734., ...,   11.,   13.,   15.],
       ...,
       [1011.,  868.,  772., ...,   44.,   57.,   49.],
       [ 677.,  651.,  531., ...,   22.,   26.,    4.],
       [1133.,  939.,  797., ...,   15.,    3.,   20.]])

### normalizing data 

In [19]:
# mean across the T2 times (so 32 of them, arr is (32, 1))

mean = masked_data.mean(axis=0)
masked_data = masked_data - mean
std = masked_data.std(axis=0)
masked_data = masked_data/std

In [20]:
# save mean and std
np.save('../training_inputs/mean_train', mean)
np.save('../training_inputs/std_train', std)

In [21]:
# save shuffled and normalized data, and shuffled labels
np.save('../training_inputs/train_data_all', masked_data)
np.save('../training_inputs/train_labels_all', masked_labels)

In [22]:
mean.shape

(32,)

In [23]:
masked_data.shape

(3046406, 32)

In [24]:
masked_data[10]

array([ 1.7773128 ,  1.88286167,  1.74715168,  1.91215624,  1.39430171,
        1.70771805,  1.07902093,  1.44642893,  0.53293439,  0.95277605,
        0.13071348,  0.50923576,  0.19361348,  0.11258662, -0.12936919,
       -0.11700127, -0.21111441, -0.11435187, -0.09120936, -0.24430906,
       -0.13137178, -0.23654159, -0.24214025, -0.35084653, -0.2108348 ,
       -0.25541331, -0.23825838, -0.29992205, -0.27323009, -0.27686426,
       -0.32270468, -0.31159563])

### training data partition

In [25]:
# use 20% for validation. 35333*0.2 = 7066.6 so 7000

x_val = masked_data[:7000,:]
y_val = masked_labels[:7000, :]

partial_x_train = masked_data[7000:, :]
partial_y_train = masked_labels[7000:, :]

In [26]:
# save partitions
np.save('../training_inputs/partitions/train_data_80', partial_x_train)
np.save('../training_inputs/partitions/train_labels_80', partial_y_train)

np.save('../training_inputs/partitions/train_data_20', x_val)
np.save('../training_inputs/partitions/train_labels_20', y_val)

In [27]:
len(partial_x_train)

3039406

In [28]:
partial_x_train

array([[ 1.69361789,  1.60547707,  1.6020303 , ..., -0.57172773,
        -0.45411436, -0.56492388],
       [-0.0111154 ,  0.1797202 ,  0.17720223, ..., -0.24410165,
        -0.27890145, -0.35565271],
       [ 1.01084357,  0.97858786,  0.96877338, ..., -0.61541121,
        -0.60742564, -0.71912369],
       ...,
       [ 0.62760896,  0.46820019,  0.75109131, ..., -0.36423122,
        -0.191295  , -0.26753854],
       [-0.84365957, -0.73564899, -0.83864742, ..., -0.60449034,
        -0.53077   , -0.76318078],
       [ 1.16501842,  0.86208633,  0.91600197, ..., -0.68093643,
        -0.78263854, -0.58695243]])

### load TESTING data 

In [29]:
#load preprocessed data for TESTING

C07_data = np.load('../preprocessed_data/C07_preprocessed_data.npy')
C08_data = np.load('../preprocessed_data/C08_preprocessed_data.npy')
P02_data = np.load('../preprocessed_data/P02_preprocessed_data.npy')

In [30]:
print(C07_data.shape)
print(C08_data.shape)
print(P02_data.shape)

(559513, 32)
(618312, 32)
(524977, 32)


In [31]:
# normalize them
C07_data = C07_data-mean
C07_data = C07_data/std

In [32]:
C08_data = C08_data-mean
C08_data = C08_data/std

In [33]:
P02_data = P02_data-mean
P02_data = P02_data/std

In [34]:
print(C07_data.shape)
print(C08_data.shape)
print(P02_data.shape)

(559513, 32)
(618312, 32)
(524977, 32)


In [35]:
# save normalized testing data
np.save('../testing_inputs/C07_data_norm', C07_data)
np.save('../testing_inputs/C08_data_norm', C08_data)
np.save('../testing_inputs/P02_data_norm', P02_data)