## This notebook shows how a raw dataset is processed for the subsequent machine learning steps

In [6]:
# Imports
import os, sys
import numpy as np
from typing import Tuple, List, Dict

# to enable local imports
module_path = os.path.abspath(os.path.join('code'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
from utils import readFileCSV, loadConfigFile
from pipelines import (filter_signal, pre_process_signal, feature_extraction, convert_data)
import pandas as pd
from typing import Dict


def saveFeatureListToFile(featureList : List, filepath : str):
    ''' Write every entry of a feature list to a text file, one entry per line

    An already existing file at 'filepath' gets overwritten.

    @param List featureList: the entries to write; each entry is formatted with str.format()
    @param str filepath: path of the text file which gets created

    @raise Exception: if 'featureList' is not a list
    '''
    if not isinstance(featureList, list):
        raise Exception("The given feature list is not a list!")

    print("Saving a feature list to: '{}'".format(filepath))

    # the context manager closes the file even if writing fails
    with open(filepath, "w") as f:
        f.writelines("{}\n".format(feature) for feature in featureList)
    

def processRawFileWithPipeline(filepath : str, yamlConfig) -> Tuple[pd.Series, pd.DataFrame, List[str]]:
    ''' Process a given filepath with the current pipelines

    The file is read with 'readFileCSV()' and then passed through 'convert_data()',
    'filter_signal()', 'pre_process_signal()' and 'feature_extraction()' in this order.

    This creates three different data objects:
        epochSeries: This is a pandas.Series which contains dataframes. Each index of the series represents one epoch
        frequencyFeatureDf: This is a dataframe of the frequency features of the epochSeries. The index represents the epochs. The features are the columns
        channelNameList: The list of channel names which 'convert_data()' returned

    @param str filepath: path to the raw csv file
    @param yamlConfig: A loaded yaml config file, it is handed to every pipeline step

    @return: the tuple (epochSeries, frequencyFeatureDf, channelNameList)
    @raise FileNotFoundError: if 'filepath' is not an existing file
    '''
    if not os.path.isfile(filepath):
        # FileNotFoundError is still an Exception, so existing 'except Exception' handlers keep working
        raise FileNotFoundError("The file '{}' does not exists!".format(filepath))

    df = readFileCSV(filepath)
    df, channelNameList = convert_data(df=df, config=yamlConfig, starttime=None)
    df = filter_signal(df=df, config=yamlConfig) # general filtering
    epochSeries = pre_process_signal(df=df, config=yamlConfig) # pre-processing
    frequencyFeatureDf = feature_extraction(epochSeries=epochSeries, config=yamlConfig) # extract features

    return epochSeries, frequencyFeatureDf, channelNameList

def safeAndProcessRawFileWithPipeline(rawFilePath : str, fileDir : str, label : str, yamlConfig):
    ''' Run the pipeline on one raw file and store the results inside of 'fileDir'

    'processRawFileWithPipeline()' does the processing. Afterwards four files get written:
        epochSeries_<label>.pkl: the pickled epoch series
        frequencyFeaturesDf_<label>.pkl: the pickled frequency feature dataframe
        features_channel_names.txt: the channel names, one per line
        features_frequency_df.txt: the column names of the frequency feature dataframe, one per line
    The names of the two text files do not contain the label, so each call overwrites them.

    @param str rawFilePath: path to the file which gets processed
    @param str fileDir: Directory where the data objects should be stored
    @param str label: A label to know which data we process, e.g. fatigue, normal or awake data
    @param yamlConfig: A loaded yaml config file for processing the data
    '''
    print ("Starting to process {}...".format(rawFilePath))

    epochSeries, frequencyFeatureDf, channelNameList = processRawFileWithPipeline(filepath=rawFilePath,
                                                                                  yamlConfig=yamlConfig)

    # pickle both data objects
    epochSeriesPath = os.path.join(fileDir, 'epochSeries_{}.pkl'.format(label))
    epochSeries.to_pickle(epochSeriesPath)

    frequencyDfPath = os.path.join(fileDir, 'frequencyFeaturesDf_{}.pkl'.format(label))
    frequencyFeatureDf.to_pickle(frequencyDfPath)

    # write the human readable feature lists
    channelNamesPath = os.path.join(fileDir, "features_channel_names.txt")
    saveFeatureListToFile(featureList=channelNameList, filepath=channelNamesPath)

    frequencyFeatureNames = list(frequencyFeatureDf.columns)
    frequencyFeaturesPath = os.path.join(fileDir, "features_frequency_df.txt")
    saveFeatureListToFile(featureList=frequencyFeatureNames, filepath=frequencyFeaturesPath)

def processRawDatasetToPickleFiles(datasetDirPath : str, device : str, awakeFileName : str,
                                   fatigueFileName : str, normalFileName : str, unlabeledFileName : str):
    '''
    Process the raw files of every subject directory of a dataset and save the results as pickle files

    @param str datasetDirPath: Path where the directory of the dataset is
    @param str device: name of the device, to load the correct yaml file for processing

    Depending on the dataset there might be awake, normal, fatigue or unlabeled data.
    @param awakeFileName: filename of the awake data or None then it will be ignored
    @param fatigueFileName: filename of the fatigue data or None then it will be ignored
    @param normalFileName: filename of the normal data or None then it will be ignored
    @param unlabeledFileName: filename of the unlabeled data or None then it will be ignored

    @raise Exception: if 'datasetDirPath' is not an existing directory
    '''

    if not os.path.isdir(datasetDirPath):
        raise Exception("The given dir path '{}' does not exist!".format(datasetDirPath))

    # Load the yaml config file for the processing
    yamlConfig = loadConfigFile(device)

    # (label, raw filename) pairs in the order in which they get processed per subject
    rawFilePerLabel = (("awake", awakeFileName),
                       ("fatigue", fatigueFileName),
                       ("normal", normalFileName),
                       ("unlabeled", unlabeledFileName))

    # NOTE: os.walk descends recursively, so nested directories are handled like subject directories as well
    for root, dirs, files in os.walk(datasetDirPath):
        for subjectDir in dirs:
            print("#############################################")
            print("Process Subject {} Data...".format(subjectDir))
            print("---------------------------------------------")

            subjectDirPath = os.path.join(root, subjectDir)

            for label, rawFileName in rawFilePerLabel:
                if rawFileName is None:
                    continue # no data with this label in the dataset

                safeAndProcessRawFileWithPipeline(rawFilePath=os.path.join(subjectDirPath, rawFileName),
                                                  fileDir=subjectDirPath,
                                                  label=label,
                                                  yamlConfig=yamlConfig)

    print("#######################################")
    print("Done processing and saving a complete Dataset!")

def loadPickeldData(dataDir : str, label : str):
    ''' Read the pickled epoch series and frequency feature dataframe of one label

    The files 'epochSeries_<label>.pkl' and 'frequencyFeaturesDf_<label>.pkl' are
    expected inside of 'dataDir'. If one of them can not be loaded - no matter why -
    None is returned in its place.

    @param str dataDir: Directory where the data is
    @param str label: the label part of the filenames

    @return: the tuple (epochSeries, frequencyFeatureDf), each entry may be None
    '''
    def readPickleOrNone(fileNameTemplate):
        # best effort: every error while loading is turned into None
        try:
            return pd.read_pickle(os.path.join(dataDir, fileNameTemplate.format(label)))
        except Exception:
            return None

    return (readPickleOrNone('epochSeries_{}.pkl'),
            readPickleOrNone('frequencyFeaturesDf_{}.pkl'))

def loadPickeldDataset(datasetDirPath : str) -> Dict:
    ''' This function loads a complete dataset into a dict

    Every immediate sub directory of 'datasetDirPath' is treated as one subject.
    Each Subject contains a dict with 'awake', 'normal', 'fatigue' and 'unlabeled' entry.
    Each entry contains the tuple (epochSeries, frequencyFeatureDf) which 'loadPickeldData()' returns.

    @param str datasetDirPath: Path where the directory of the dataset is

    @return: dict which maps the name of each subject directory to its label dict
    @raise Exception: if 'datasetDirPath' is not an existing directory
    '''

    if not os.path.isdir(datasetDirPath):
        raise Exception("The given dir path '{}' does not exist!".format(datasetDirPath))

    # Only the first level of os.walk is used: the data is always loaded from
    # '<datasetDirPath>/<subjectDir>', so a nested directory would only add an
    # entry without data or replace the entry of a subject with the same name.
    _, subjectDirs, _ = next(os.walk(datasetDirPath), (datasetDirPath, [], []))

    datasetDict = {}

    for subjectDir in subjectDirs:
        print("Load Subject {} Data...".format(subjectDir))

        subjectDirPath = os.path.join(datasetDirPath, subjectDir)
        datasetDict[subjectDir] = {label : loadPickeldData(dataDir=subjectDirPath, label=label)
                                   for label in ("awake", "normal", "fatigue", "unlabeled")}

    return datasetDict


In [8]:
from consts import DEVICES_NEUROSCAN

# Switch which decides whether the raw data gets processed when this cell runs
PROCESS_DATA = False

if not PROCESS_DATA:
    print ("Already processed the EEG Online Data")
else:
    # Process the online EEG Data - only fatigue and normal recordings are passed in
    processRawDatasetToPickleFiles(datasetDirPath = "D:/Masterthesis/EEG_Data/eeg_data_online",
                                   device = DEVICES_NEUROSCAN,
                                   awakeFileName = None,
                                   fatigueFileName = "Fatigue_state_256hz.csv",
                                   normalFileName = "Normal_state_256hz.csv",
                                   unlabeledFileName = None)

Loading the config file for neuroscan
#############################################
Process Subject 1 Data...
---------------------------------------------
Starting to process D:/Masterthesis/EEG_Data/eeg_data_online\1\Fatigue_state_256hz.csv...
Creating sliding windows...
Converting 3d Numpy Array to a series of Df's
Normalizing data...
Deleting Nan's...
Frequenccy Bands: [(0.5, 4, 'Delta'), (4, 8, 'Theta'), (8, 12, 'Alpha'), (12, 30, 'Beta'), (30, 50, 'Gamma')]


  .format(nperseg, input_length))
  bp /= total_power


Creating bandpower, lower & upper envelope dictionary...


  if (np.sign(aTimeSeries[k]-aTimeSeries[k-1])==1) and (np.sign(aTimeSeries[k]-aTimeSeries[k+1])==1) and ((k-lastPeak)>rejectCloserThan):
  if (np.sign(aTimeSeries[k]-aTimeSeries[k-1])==-1) and ((np.sign(aTimeSeries[k]-aTimeSeries[k+1]))==-1) and ((k-lastTrough)>rejectCloserThan):


Creating statistics bandpower dict...
Creating a nice feature dataframe...
Saving a feature list to: 'D:/Masterthesis/EEG_Data/eeg_data_online\1\features_channel_names.txt'
Saving a feature list to: 'D:/Masterthesis/EEG_Data/eeg_data_online\1\features_frequency_df.txt'
Starting to process D:/Masterthesis/EEG_Data/eeg_data_online\1\Normal_state_256hz.csv...
Creating sliding windows...
Converting 3d Numpy Array to a series of Df's
Normalizing data...
Deleting Nan's...
Frequenccy Bands: [(0.5, 4, 'Delta'), (4, 8, 'Theta'), (8, 12, 'Alpha'), (12, 30, 'Beta'), (30, 50, 'Gamma')]
Creating bandpower, lower & upper envelope dictionary...
Creating statistics bandpower dict...
Creating a nice feature dataframe...
Saving a feature list to: 'D:/Masterthesis/EEG_Data/eeg_data_online\1\features_channel_names.txt'
Saving a feature list to: 'D:/Masterthesis/EEG_Data/eeg_data_online\1\features_frequency_df.txt'
#############################################
Process Subject 10 Data...
-------------------

In [10]:
# Load the pickled data of every subject into one nested dict: subject -> label -> (epochSeries, frequencyFeatureDf)
eegDataset = loadPickeldDataset(datasetDirPath= "D:/Masterthesis/EEG_Data/eeg_data_online")

Load Subject 1 Data...
Load Subject 10 Data...
Load Subject 11 Data...
Load Subject 12 Data...
Load Subject 2 Data...
Load Subject 3 Data...
Load Subject 4 Data...
Load Subject 5 Data...
Load Subject 6 Data...
Load Subject 7 Data...
Load Subject 8 Data...
Load Subject 9 Data...


Unnamed: 0,channel_2,channel_27,channel_33,channel_36,channel_37,channel_38,channel_39,channel_40
0,,,,0.533807,,,,
1,,,,0.625042,,,,
2,,,,0.503881,,,,
3,,,,0.496285,,,,
4,,,,0.496285,,,,
...,...,...,...,...,...,...,...,...
507,,,,0.620425,,,,
508,,,,0.639592,,,,
509,,,,0.668937,,,,
510,,,,0.679682,,,,


In [13]:
# Inspect subject '2', label 'normal': tuple entry 0 is the epoch series, [0] picks its entry with index 0
eegDataset['2']['normal'][0][0]

Unnamed: 0,channel_1,channel_2,channel_3,channel_4,channel_5,channel_6,channel_7,channel_8,channel_9,channel_10,...,channel_31,channel_32,channel_33,channel_34,channel_35,channel_36,channel_37,channel_38,channel_39,channel_40
0,0.0,0.0,0.0,0.770674,0.0,0.0,0.0,0.0,0.0,0.922101,...,0.0,0.397388,0.0,0.930844,0.0,0.533807,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.625042,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.503881,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
507,0.0,0.0,0.0,0.094512,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.620425,0.0,0.0,0.0,0.0
508,0.0,0.0,0.0,0.094147,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.639592,0.0,0.0,0.0,0.0
509,0.0,0.0,0.0,0.078884,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.668937,0.0,0.0,0.0,0.0
510,0.0,0.0,0.0,0.061193,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.0,0.000000,0.0,0.679682,0.0,0.0,0.0,0.0


In [11]:
# NOTE(review): 'test' is not defined in any cell visible here - confirm where it comes from
dataSeries = [test]
# wrap the one-element list into a pandas Series
dataSeries = pd.Series(dataSeries)

In [56]:
# Convert the frequency feature dataframe (tuple entry 1) of subject '2', label 'normal' into X and y
X, y = convertFeatureDfToXy(eegDataset['2']['normal'][1], target=0)

numpy.ndarray

In [9]:
def convertFeatureDfToXy(featureDf : pd.DataFrame, target : int) -> (np.ndarray, np.ndarray):
    ''' Create X and y out of a feature dataframe

    Every row of the dataframe becomes one sample which contains exactly one
    timestep, the values of the row are the features of this timestep.
    Every sample gets the same class id.

    @param pd.DataFrame featureDf: dataframe with one row per sample
    @param int target: class id which is assigned to every sample

    @return: the tuple (X, y) as numpy arrays
    '''
    # one sample per row, each sample holds a single timestep
    sampleList = [[rowValues.to_numpy()] for _, rowValues in featureDf.iterrows()]
    classIdList = [target] * len(sampleList)

    return np.array(sampleList), np.array(classIdList)
    

def createXyFromDataSeries(dataSeries : pd.Series, target : int) -> Tuple[np.ndarray, np.ndarray]:
    ''' Create X and y for machine learning

    @param pd.Series dataSeries: Should be a series of dataframes
    @param int target: The class id which gets assigned to every sample

    X should look like this [samples, timesteps, features]
        samples: The epoch
        timesteps: E.g. if the epoch contains 200 values then the timestep should contain 200 values
        features: The actual value

    y should look like this [classIds]
        classIds: The label for the sample of the X Data

    @return: the tuple (X, y) as numpy arrays
    @raise TypeError: if 'dataSeries' is None
    @raise Exception: if 'dataSeries' is not a pd.Series or contains something else than dataframes
    '''

    if dataSeries is None:
        raise TypeError("Data Series is None!")

    if not isinstance(dataSeries, pd.Series):
        raise Exception("The given dataSeries is not a pd.Series! It is {}".format(type(dataSeries)))

    samples = []

    # loop through the data Series
    for df in dataSeries:

        if not isinstance(df, pd.DataFrame): # check the type
            raise Exception("The dataseries contains a {} object - The series should dataframes only!".format(type(df)))

        # rows are the timesteps and columns the features, so the 2d array of the
        # dataframe is already one sample - no need to iterate over every row
        samples.append(df.to_numpy())

    X = np.array(samples)
    y = np.array([target] * len(samples))

    return X, y

def createMachineLearningDataset(eegDataset, targetLabelDict) -> Tuple[np.ndarray, np.ndarray]:
    ''' Create one X and one y array out of the epoch series of a complete dataset

    For every subject and every target of 'targetLabelDict' the epoch series (entry 0 of
    the tuple stored at eegDataset[subject][target]) is converted with
    'createXyFromDataSeries()'. All results get concatenated along the first axis.
    A subject/target combination which raises a TypeError - e.g. because its epoch
    series is None - is skipped.

    @param eegDataset: nested dict subject -> target name -> (epochSeries, frequencyFeatureDf)
    @param targetLabelDict: dict which maps the target name to its class id

    @return: the tuple (X, y); both entries are None if no combination provided data
    '''
    xParts = []
    yParts = []

    for subject in eegDataset:
        for target in targetLabelDict:
            print("Processing Subject {} - Target: {} ...".format(subject, target))

            try:
                dataSeries = eegDataset[subject][target][0]
                tempX, tempy = createXyFromDataSeries(dataSeries = dataSeries,
                                                      target = targetLabelDict[target])
            except TypeError:
                print("Skipping Target: {}".format(target))
                continue

            xParts.append(tempX)
            yParts.append(tempy)

    # concatenate once at the end instead of copying the growing arrays in every iteration
    X = np.concatenate(xParts) if xParts else None
    y = np.concatenate(yParts) if yParts else None

    print("Done!")
    return X, y

def saveDictToFile(myDict, filepath : str):
    ''' Write a dict to a text file, one line "<value> <KEY>" per entry

    The keys are written in upper case, therefore they have to be strings.
    An already existing file at 'filepath' gets overwritten.

    @param myDict: the dict which gets written
    @param str filepath: path of the text file which gets created
    '''
    print("Saving dict to {}".format(filepath))

    # the context manager closes the file even if writing fails
    with open(filepath, "w") as f:
        f.writelines("{v} {k}\n".format(v=value, k=key.upper()) for key, value in myDict.items())



def createAndSafeMlDataset(eegDataset : Dict[str, Dict[str ,Tuple[pd.Series, pd.DataFrame]]], targetLabelDict : Dict,
                           featureList : List[str], dirPath : str) -> (np.array, np.array):
    ''' Create the machine learning dataset and save it inside of 'dirPath'

    X and y are built with 'createMachineLearningDataset()' and saved as 'data_X.npy'
    and 'data_y.npy'. The target labels are written to 'target_labels.txt'.

    @param eegDataset: nested dict subject -> label -> (epochSeries, frequencyFeatureDf)
    @param Dict targetLabelDict: dict which maps the label name to its class id
    @param List[str] featureList: is accepted but not written to any file yet
    @param str dirPath: existing directory where the files get stored

    @return: the tuple (X, y)
    @raise Exception: if 'dirPath' is not an existing directory
    '''
    if not os.path.isdir(dirPath):
        raise Exception("The given directory path is not valid! Given path: {}".format(dirPath))

    print("Creating Machine Learning Dataset!")
    X, y = createMachineLearningDataset(eegDataset, targetLabelDict)

    print("\nSaving Machine Learning Dataset into this directory: {}".format(dirPath))
    for fileName, dataArray in (("data_X.npy", X), ("data_y.npy", y)):
        np.save(os.path.join(dirPath, fileName), dataArray)

    # NOTE: saving the feature list is still an open placeholder, 'featureList' stays unused

    # Save target labels
    targetLabelPath = os.path.join(dirPath,'target_labels.txt')
    saveDictToFile(targetLabelDict, filepath=targetLabelPath)

    return X, y

#createDatasetXy(dataSeries)

In [11]:
# Maps the label name (key inside of each subject dict) to the integer class id used for y
targetLabelDict = {"normal" : 0,
              "fatigue" : 1,
              "awake" : 2,
              "unlabeled" : 3}

# Build X and y out of the epoch series of every subject
X, y = createMachineLearningDataset(eegDataset, targetLabelDict)

# TODO: create X and y for the epoch series AND the frequency df!!

Processing Subject 1 - Target: normal ...
Processing Subject 1 - Target: fatigue ...
Processing Subject 1 - Target: awake ...
Skipping Target: awake
Processing Subject 1 - Target: unlabeled ...
Skipping Target: unlabeled
Processing Subject 10 - Target: normal ...
Processing Subject 10 - Target: fatigue ...
Processing Subject 10 - Target: awake ...
Skipping Target: awake
Processing Subject 10 - Target: unlabeled ...
Skipping Target: unlabeled
Processing Subject 11 - Target: normal ...
Processing Subject 11 - Target: fatigue ...
Processing Subject 11 - Target: awake ...
Skipping Target: awake
Processing Subject 11 - Target: unlabeled ...
Skipping Target: unlabeled
Processing Subject 12 - Target: normal ...
Processing Subject 12 - Target: fatigue ...
Processing Subject 12 - Target: awake ...
Skipping Target: awake
Processing Subject 12 - Target: unlabeled ...
Skipping Target: unlabeled
Processing Subject 2 - Target: normal ...
Processing Subject 2 - Target: fatigue ...
Processing Subject 

Saving dict to D:/Masterthesis/EEG_Data/eeg_data_online/target_labels.txt


In [84]:
# Check the type of tuple entry 1 (the frequency feature dataframe) of subject '1', label 'normal'
type(eegDataset['1']['normal'][1])

pandas.core.frame.DataFrame

In [77]:
# Check the type of the entry of one subject
type(eegDataset['1'])

dict

In [71]:
# Save X as .npy file into the dataset directory
np.save('D:/Masterthesis/EEG_Data/eeg_data_online/data_X.npy', X)

In [72]:
# Save y as .npy file into the dataset directory
np.save('D:/Masterthesis/EEG_Data/eeg_data_online/data_y.npy', y)

In [31]:
#dataSeries = eegDataset['2']['normal'][0]
#dataSeries = pd.Series(dataSeries)
#X, y = createXyFromDataSeries(dataSeries, target=0)

In [40]:
# Inspect the shape of X
X.shape

(299, 512, 40)

In [47]:
# Experiment: shape after concatenating completeX and X
# NOTE(review): 'completeX' is not defined in any cell visible here; the recorded output of this cell is a ValueError
np.concatenate((completeX, X)).shape

ValueError: zero-dimensional arrays cannot be concatenated

In [37]:
# Inspect the shape of completeX
# NOTE(review): 'completeX' is not defined in any cell visible here
completeX.shape

(299, 512, 40)