## Processing of the windows
This notebook focusses on the processing of the sampled windows, to prepare them to be used as training and test data in the modelling part.

In [1]:
import pandas as pd
import numpy as np
import os
from scipy import stats

In [2]:
# Resolve the project layout relative to the working directory: the project
# root is taken to be the parent of the directory the notebook runs from.
# os.path is used instead of hard-coded '\\' so the paths also resolve on
# non-Windows systems.
project_dir = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_dir, 'data')
processed_dir = os.path.join(data_dir, 'processed')

# Collect the HDF window files, skipping files already prefixed with 'processed_'.
# Sorted because os.listdir returns entries in arbitrary order.
processed_files = sorted(
    file for file in os.listdir(processed_dir)
    if file.endswith('hdf') and not file.startswith('processed_')
)

#### Gender & Age + Outlier removal

In [4]:
# Sanity check: report how many distinct participants each window file contains.
for file in processed_files:
    # os.path.join instead of a hard-coded '\\' keeps the path portable.
    window = pd.read_hdf(os.path.join(processed_dir, file))
    # dropna=False counts a missing participant id as its own value,
    # exactly like len(window.pp.unique()) does.
    n_participants = window['pp'].nunique(dropna=False)
    print(f'In file {file} there is data from {n_participants} participants')

In file window_120_step_108.hdf there is data from 60 participants
In file window_120_step_120.hdf there is data from 60 participants
In file window_120_step_84.hdf there is data from 60 participants
In file window_180_step_125.hdf there is data from 60 participants
In file window_180_step_162.hdf there is data from 60 participants
In file window_180_step_180.hdf there is data from 60 participants


In [3]:
demogr = pd.read_csv(os.path.join(data_dir, 'raw', 'Demogr tDCS WM stress.csv'), sep=',')

# Handle the rows that were not separated properly: for those rows the 'id'
# field holds more than two comma-separated parts, of which part 0 is the id,
# part 1 the birth date and part 2 the gender. Split once and redistribute.
id_parts = demogr['id'].str.split(',')
malformed = id_parts.str.len() > 2
demogr.loc[malformed, 'geboortedatum_patient'] = id_parts[malformed].str[1]
demogr.loc[malformed, 'geslacht'] = id_parts[malformed].str[2]
demogr.loc[malformed, 'id'] = id_parts[malformed].str[0]

# .copy() so the column assignments below write to an independent frame
# rather than to a slice of the original (avoids SettingWithCopyWarning).
demogr = demogr[['id', 'geboortedatum_patient', 'geslacht']].copy()
demogr['geboortedatum_patient'] = pd.to_datetime(demogr['geboortedatum_patient'])

# Age in completed years at the moment this cell runs, stored as float so a
# missing birth date becomes NaN. astype('<m8[Y]') is not used because
# pandas >= 2.0 only supports s/ms/us/ns timedelta units; instead divide by
# the average Gregorian year length (365.2425 days) and round down.
now = pd.Timestamp('now')
demogr['leeftijd'] = np.floor((now - demogr['geboortedatum_patient']) / pd.Timedelta(days=365.2425))

In [4]:
def add_demographics(windows, demographics=None):
    """Attach gender and age to every row of a window table.

    Parameters
    ----------
    windows : pandas.DataFrame
        Window data containing a participant column ``pp``.
    demographics : pandas.DataFrame, optional
        Table with the columns ``id``, ``geslacht`` and ``leeftijd``.
        When omitted, the notebook-level ``demogr`` frame is used.

    Returns
    -------
    pandas.DataFrame
        ``windows`` left-joined with the demographics on ``pp == id``. The
        ``id`` column is kept next to ``pp``; rows without a matching
        participant get missing values in the added columns.

    Raises
    ------
    pandas.errors.MergeError
        If ``id`` is not unique in the demographics table
        (enforced by ``validate='many_to_one'``).
    """
    if demographics is None:
        # Fall back to the notebook-global table so existing calls keep working.
        demographics = demogr
    return windows.merge(
        demographics[['id', 'geslacht', 'leeftijd']],
        how='left',
        left_on='pp',
        right_on='id',
        validate='many_to_one',
    )
    
# pd.read_hdf(processed_files[0])

In [5]:
targets = ['mean_SCL', 'corrected_mean_SCL', 'range_corrected_mean_SCL', 'standardised_mean_scl',
           'frequency_NS_SCR', 'HRV_MeanNN', 'HRV_RMSSD', 'HRV_SDNN', 'HRV_MeanNN_corrected',
           'HRV_RMSSD_corrected', 'HRV_SDNN_corrected']

# Normalise the participant id to int64 so it can be matched against 'pp'.
# The cast to str first makes this cell safe to re-run: once the ids are
# integers, calling .str directly on the column would raise AttributeError.
demogr['id'] = demogr['id'].astype(str).str.split(',').str[0].astype('int64')

for file in processed_files:
    # os.path.join instead of hard-coded '\\' keeps the paths portable.
    window = pd.read_hdf(os.path.join(processed_dir, file))
    window['pp'] = window['pp'].astype('int64')  # same dtype as demogr['id'] for the merge
    df = add_demographics(window)  # Adding the age and gender of each participant
    # Written next to the input with a 'processed_' prefix. The object-typed
    # 'geslacht' column makes PyTables emit a pickling PerformanceWarning here.
    df.to_hdf(os.path.join(processed_dir, f'processed_{file}'), key='data')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['geslacht'], dtype='object')]

  pytables.to_hdf(
