In [1]:
import mne
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.decomposition import PCA
import seaborn as sns
import data_preprocessing as dp
import utilities
import models.training as train
import pandas as pd

In [2]:
# Load subjects 1 through 10 and immediately narrow the resulting
# (data, events) pair down to the 'inner speech' condition.
data, events = dp.choose_condition(
    *dp.load_data(subjects=range(1, 11)),
    'inner speech',
)

[[0m[38;2;255;0;0m#[38;2;252;2;0m#[38;2;249;5;0m#[38;2;247;7;0m#[38;2;244;10;0m#[38;2;242;12;0m#[38;2;239;15;0m#[38;2;237;17;0m#[38;2;234;20;0m#[38;2;232;22;0m#[38;2;229;25;0m#[38;2;226;28;0m#[38;2;224;30;0m#[38;2;221;33;0m#[38;2;219;35;0m#[38;2;216;38;0m#[38;2;214;40;0m#[38;2;211;43;0m#[38;2;209;45;0m#[38;2;206;48;0m#[38;2;204;51;0m#[38;2;201;53;0m#[38;2;198;56;0m#[38;2;196;58;0m#[38;2;193;61;0m#[38;2;191;63;0m#[38;2;188;66;0m#[38;2;186;68;0m#[38;2;183;71;0m#[38;2;181;73;0m#[38;2;178;76;0m#[38;2;175;79;0m#[38;2;173;81;0m#[38;2;170;84;0m#[38;2;168;86;0m#[38;2;165;89;0m#[38;2;163;91;0m#[38;2;160;94;0m#[38;2;158;96;0m#[38;2;155;99;0m#[38;2;153;102;0m#[38;2;150;104;0m#[38;2;147;107;0m#[38;2;145;109;0m#[38;2;142;112;0m#[38;2;140;114;0m#[38;2;137;117;0m#[38;2;135;119;0m#[38;2;132;122;0m#[38;2;130;124;0m#[38;2;127;127;0m#[38;2;124;130;0m#[38;2;122;132;0m#[38;2;119;135;0m#[38;2;117;137;0m#[38;2;114;140;0m#[38;2;112;142;0m#[38;2;109;14

### Parameters

In [3]:
# Value handed to every PCA below as `n_components`.
explained_var = 0.98
# Today's date rendered at day resolution (YYYY-MM-DD); it is interpolated
# into the output paths when the TensorFlow datasets are saved.
today = np.datetime_as_string(np.datetime64('today'), unit='D')

### Functions:
 - Plot original and reconstructed PCA data
 - Plot PCA
 - Plot difference between original and PCA reconstruction


In [6]:
def save_raw(data, name):
    """Write ``data`` as CSV to ``dataset/preprocessed/<name>``.

    Parameters
    ----------
    data : array-like
        Anything ``pandas.DataFrame`` accepts; it is wrapped in a DataFrame
        and written with the default ``to_csv`` settings (index included).
    name : str
        File name, relative to ``dataset/preprocessed``. No extension is
        appended.

    Returns
    -------
    None

    Notes
    -----
    The target directory is created when missing, so the call no longer
    fails with ``OSError`` on a fresh checkout.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    target = Path(f'dataset/preprocessed/{name}')
    target.parent.mkdir(parents=True, exist_ok=True)
    pd.DataFrame(data).to_csv(target)


### Data preparation:
Filter the relevant intervals


In [7]:
# Cut two windows out of the recordings with the same helper and the same
# third argument (256): [1, 3.5] for the main data, [0.5, 1] for the cue.
# NOTE(review): bounds look like seconds and 256 like the sampling rate in Hz
# (the later reshapes use 640 and 128 samples) - confirm against
# dp.filter_interval.
f_data, cue_data = (
    dp.filter_interval(data, window, 256)
    for window in ([1, 3.5], [0.5, 1])
)

Standardize

In [8]:
# Standardize each window with its own RobustScaler. The array is viewed as
# 2-D (all leading axes collapsed, last axis kept as columns), scaled, and
# then put back into its original shape.
scaler, cue_scaler = RobustScaler(), RobustScaler()
s_data, s_cue_data = (
    fitted.fit_transform(block.reshape(-1, block.shape[-1])).reshape(block.shape)
    for fitted, block in ((scaler, f_data), (cue_scaler, cue_data))
)

### Create PCA datasets

In [9]:
# Reshape version: flatten every entry along the first axis into one row.
# `-1` lets NumPy infer the row length, so the cell keeps working when the
# window length or channel count changes (previously hard-coded as 128*640
# and 128*128).
rf_data = s_data.reshape(len(s_data), -1)
rcue_data = s_cue_data.reshape(len(s_cue_data), -1)
# Mean version: average over the first axis, leaving one 2-D array per window.
mean_data = np.mean(s_data, axis=0)
mean_cue = np.mean(s_cue_data, axis=0)

### Fit and apply PCA:
Fit and apply on reshaped data

In [10]:
# Fit one PCA per flattened window and project that same data in one step.
# `explained_var` is passed as n_components for both models.
pca_1 = PCA(n_components=explained_var)
pca_cue = PCA(n_components=explained_var)
rf_pca, rcue_pca = pca_1.fit_transform(rf_data), pca_cue.fit_transform(rcue_data)

fit on mean data, applied on time dimension

In [11]:
# Time-dimension PCA: the components are learned from the array averaged over
# the first axis, then every element of the standardized data is projected
# on its own. Rows of each element act as samples, so the last axis is the
# one being compressed.
pca_2 = PCA(n_components=explained_var).fit(mean_data)
mean_time_pca = list(map(pca_2.transform, s_data))
# Same procedure for the cue window, with an independent model.
pca_2cue = PCA(n_components=explained_var).fit(mean_cue)
mean_time_cue_pca = list(map(pca_2cue.transform, s_cue_data))

fit on transposed mean data, applied on channel dimension

In [12]:
# Channel-dimension PCA: identical recipe on transposed inputs. The model is
# fitted on the transposed mean array; each element is transposed, projected,
# and transposed back, so the first axis of every element is the one reduced.
pca_3 = PCA(n_components=explained_var).fit(mean_data.T)
mean_channel_pca = [pca_3.transform(trial.T).T for trial in s_data]
# Cue window, with its own model.
pca_3cue = PCA(n_components=explained_var).fit(mean_cue.T)
mean_channel_cue_pca = [pca_3cue.transform(trial.T).T for trial in s_cue_data]

### save raw data

In [13]:
# Stack the per-element channel-PCA results and flatten each one to a row.
# The row count and row length are inferred from the data instead of being
# hard-coded (2076 and 42*640), because the number of PCA components depends
# on `explained_var` and on the data itself.
channel = np.array(mean_channel_pca)
channel = channel.reshape(len(channel), -1)
channel.shape

(2076, 26880)

In [14]:
# Stack the per-element time-PCA results and flatten each one to a row.
# The row count and row length are inferred from the data instead of being
# hard-coded (2076 and 128*35), because the number of PCA components depends
# on `explained_var` and on the data itself.
# NOTE(review): the name `time` would shadow the stdlib module if it were
# ever imported in this notebook.
time = np.array(mean_time_pca)
time = time.reshape(len(time), -1)
time.shape

(2076, 4480)

In [18]:
# Write the flattened time-PCA features to CSV (index column included, no
# file extension). Calling save_raw(time, <same file name>) would produce the
# same file.
# NOTE(review): the `128x35` suffix is a literal, not derived from `time`.
df_time = pd.DataFrame(time)
df_time.to_csv(
    path_or_buf=f'dataset/preprocessed/time_pca{int(explained_var*100)}_df_flat_128x35'
)

In [17]:
# Write the flattened channel-PCA features to CSV (index column included, no
# file extension).
# NOTE(review): the `42x640` suffix is a literal, not derived from `channel`.
df_channel = pd.DataFrame(channel)
df_channel.to_csv(
    path_or_buf=f'dataset/preprocessed/channel_pca{int(explained_var*100)}_df_flat_42x640'
)

In [52]:
# Write the PCA projection of the flattened main window to CSV (index column
# included, no file extension).
df = pd.DataFrame(rf_pca)
df.to_csv(
    path_or_buf=f'dataset/preprocessed/reshaped_pca{int(explained_var*100)}_df'
)

In [55]:
# Show only the dimensions; echoing the whole matrix would flood the output.
rf_pca.shape

(2076, 1603)

In [53]:
# Persist the cue-window PCA features on their own.
df_cue = pd.DataFrame(rcue_pca)
df_cue.to_csv(f'dataset/preprocessed/reshaped_pca{int(explained_var*100)}_df_cue')
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with default arguments gives the same row-wise stack.
# NOTE(review): rows are stacked, so index values repeat and columns present
# in only one frame are NaN-filled - confirm that side-by-side features
# (axis=1) were not what "e+c" was meant to hold.
df_all = pd.concat([df, df_cue])
df_all.to_csv(f'dataset/preprocessed/reshaped_pca{int(explained_var*100)}_df_e+c')
# Column 1 of `events` is written out as the label file.
label = pd.DataFrame(events[:,1])
label.to_csv('dataset/preprocessed/label')

  df_all = df.append(df_cue)


### Create Tensorflow dataset

In [None]:
# Size of the second axis of the projection, i.e. the number of columns.
pca_comp = np.shape(rf_pca)[1]

In [None]:
# Slice along the first axis: each dataset element pairs one row of the
# flattened-PCA features with the matching entry of events column 1.
rf_dataset = tf.data.Dataset.from_tensor_slices(
    tensors=(rf_pca, events[:, 1])
)

In [2]:
# Run the flattened-PCA dataset through the project pipeline with a single
# step: replace the label (element 1) by a depth-4 one-hot vector.
# A second step reshaping the features to (pca_comp, 1, 1) was tried earlier
# and is currently disabled.
rf_datasets = dp.preprocessing_pipeline(
    rf_dataset,
    functions=[
        lambda sample: (sample[0], tf.one_hot(indices=sample[1], depth=4)),
    ],
    args=[[]],  # one (empty) argument list per function
    batch_size=12,
)


NameError: name 'dp' is not defined

In [3]:
# Slice along the first axis: each dataset element pairs one time-PCA result
# with the matching entry of events column 1.
time_dataset = tf.data.Dataset.from_tensor_slices(
    tensors=(mean_time_pca, events[:, 1])
)

NameError: name 'tf' is not defined

In [4]:
# Run the time-PCA dataset through the project pipeline with two steps:
#   1. replace the label (element 1) by a depth-4 one-hot vector;
#   2. append a trailing axis of size 1 to the features (element 0).
time_datasets = dp.preprocessing_pipeline(
    time_dataset,
    functions=[
        lambda sample: (sample[0], tf.one_hot(indices=sample[1], depth=4)),
        lambda sample: (
            tf.reshape(tensor=sample[0], shape=(*sample[0].shape, 1)),
            sample[1],
        ),
    ],
    args=[[], []],  # one (empty) argument list per function
    batch_size=12,
)


NameError: name 'dp' is not defined

In [5]:
# Slice along the first axis: each dataset element pairs one channel-PCA
# result with the matching entry of events column 1.
channel_dataset = tf.data.Dataset.from_tensor_slices(
    tensors=(mean_channel_pca, events[:, 1])
)

NameError: name 'tf' is not defined

In [6]:
# Run the channel-PCA dataset through the project pipeline with two steps:
#   1. replace the label (element 1) by a depth-4 one-hot vector;
#   2. append a trailing axis of size 1 to the features (element 0).
channel_datasets = dp.preprocessing_pipeline(
    channel_dataset,
    functions=[
        lambda sample: (sample[0], tf.one_hot(indices=sample[1], depth=4)),
        lambda sample: (
            tf.reshape(tensor=sample[0], shape=(*sample[0].shape, 1)),
            sample[1],
        ),
    ],
    args=[[], []],  # one (empty) argument list per function
    batch_size=12,
)


NameError: name 'dp' is not defined

### save datasets

In [35]:
# save dataset so that we can just load the preprocessed version next time
# NumPy has no `datetime` attribute, so `np.datetime.today` raised
# AttributeError. Build the YYYY-MM-DD date string from a day-resolution
# datetime64 instead.
today = np.datetime_as_string(np.datetime64('today', 'D'))
# NOTE(review): `{explained_var*100}` renders as a float (e.g. `98.0`) while
# the CSV file names use `int(...)`, and the three path suffixes differ
# (`_`, `/`, none). Paths are left unchanged here - confirm what loaders expect.
# NOTE(review): newer TensorFlow releases offer `Dataset.save` in place of
# `tf.data.experimental.save` - confirm against the installed version.
tf.data.experimental.save(rf_datasets,
                          f'dataset/preprocessed/pca_all/{today}reshaped_pca{explained_var*100}train_ds_')
tf.data.experimental.save(time_datasets,
                          f'dataset/preprocessed/pca_all/{today}time_pca{explained_var*100}train_ds/')
tf.data.experimental.save(channel_datasets,
                          f'dataset/preprocessed/pca_all/{today}channel_pca{explained_var*100}train_ds')