# Notebook for Testing Pipelines

In [67]:
# Imports 
from sklearn.pipeline import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import yaml
import os, sys
from typing import Dict
import numpy as np


# Local Imports
module_path = os.path.abspath(os.path.join('code'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from Signal_Transfomers import (ConvertIndexToTimestamp, ExtractSignals,
                                BandpassFilter, BandstopFilter, ReplaceOutliers,
                                CenterData)

from Pre_Processing_Transformers import (SlidingWindow, NormalizeData, DeleteFaultyEpochs,
                                            ReplaceNaNs)


from Feature_Extraction_Transformer import (Frequency_Features)

from Measuring_Functions import (getChannelUsageInEpochSeries)

from plotFunctions import (plotInteractiveEpochs, plotFeatureEpochs)
from consts import *

### Define local functions

In [2]:
def loadConfigFile(configFilePath : str) -> Dict:
    with open(configFilePath, 'r') as stream:
        try:
            yamlConfig = yaml.safe_load(stream)
            return yamlConfig
        except yaml.YAMLError as exc:
            print(exc)
            return None

def readFileCSV(filePath : str) -> pd.DataFrame:
    df = pd.read_csv(filePath)
    return df


def filter_signal(df : pd.DataFrame, config : Dict, starttime=None) -> pd.DataFrame:
    ''' Filter the signal with bandpass, bandstopp and repace outliers '''

    # signal processing pipeline - the first pipeline - e.g. extract the signal from the raw .csv and filter it
    signal_processing_pipeline = Pipeline([
        ('Convert Index to Timestamp', ConvertIndexToTimestamp(device=config['deviceName'], starttime=starttime)),
        ('Extract signals', ExtractSignals(device=config['deviceName'])),
        ('Bandpass Filter', BandpassFilter(device=config['deviceName'], lowcufreq=config['lowcutFreq_bandpass'], highcutfreq=config['highcutFreq_bandpass'], samplingRate=config['samplingRate'])),
        ('Bandstop Filter', BandstopFilter(device=config['deviceName'], lowcufreq=config['lowcutFreq_bandstopp'], highcutfreq=config['highcutFreq_bandstopp'], samplingRate=config['samplingRate'])),
        ('Replace Outliers', ReplaceOutliers(device=config['deviceName'], lowerThreshold=config['lowerThreshold'], upperThreshold=config['upperThreshold']))
    ])
    df = signal_processing_pipeline.fit_transform(df)
    return df

def pre_process_signal(df : pd.DataFrame, config : Dict) -> pd.Series:
    ''' Pre-process the signal by creating epochs, delete faulty epochs and normalize it
    
    Returns a series of dataframes
    '''
    # pre-process the pipeline for machine learning
    pre_processing_pipeline = Pipeline([
        ('Create Epochs', SlidingWindow(samplingRate=config['samplingRate'], windowSizeInSeconds=config['epochWindowSize'], overlapInSeconds=config['overlap'])),
        ('Delete Faulty Epochs', DeleteFaultyEpochs(maxFaultyRate=config['maxFaultyRate'])), # returns a numpy series with dataframes
        ('Replace NaNs with Zero', ReplaceNaNs()),
        ('Normalize Data', NormalizeData())
    ])

    epochSeries = pre_processing_pipeline.fit_transform(df)
    return epochSeries

def feature_extraction(epochSeries : pd.Series, config : Dict) -> pd.Series:
    ''' Extract features from the generated epoch series 
    
    Do the whole feature extraction in a pipeline
    '''

    feature_extraction_pipeline = Pipeline([
        ("Frequency Band feature extraction", Frequency_Features(samplingRate=config['samplingRate'], frequencyBands=config['frequencyBands'],
                                                                numberOfChannels=config['numberOfChannels'], epochSizeCalculation=config['epochSizeCalculation']))
    ])

    epochSeries = feature_extraction_pipeline.fit_transform(epochSeries)

    return epochSeries

def generateFeatureDf(csvFilePath, starttime, yamlConfig, label : str ) -> pd.DataFrame:
    ''' Generates datframes of given csv filepaths'''
    
    print ("Generating {} driving feature df...".format(label))
    
    # load dataset
    df = readFileCSV(csvFilePath)  
    
    # Filter the signal
    df = filter_signal(df=df, config=yamlConfig, starttime=starttime)

    # Pre-process the signal
    epochSeries = pre_process_signal(df=df, config=yamlConfig)
    # Save epochSeries
    print("Saving epoch series...")
    epochSeries.to_pickle(os.path.join(mainDir,'generatedData','epochSeries_{}.pkl'.format(label)))


    # Extract Frequency Features
    frequencyFeatureDf = feature_extraction(epochSeries=epochSeries, config=yamlConfig)
    # Save frequency features dataframe
    print("Saving frequency feature dataframe...")
    frequencyFeatureDf.to_pickle(os.path.join(mainDir,'generatedData','frequencyFeaturesDf_{}.pkl'.format(label)))
    
    return frequencyFeatureDf

## Main

In [51]:
GENERATE_DATA = False
# load config
mainDir = "D:/Masterthesis/thesis_eeg/"

TARGET_AWAKE = "awake"
TARGET_NON_AWAKE = "non_awake"
TARGET_UNLABELED = "unlabeled"

configFilePath = "D:/Masterthesis/thesis_eeg/config/openBci.yaml"
yamlConfig = loadConfigFile(configFilePath)
#print(yamlConfig)

subjectDict = {"subjectId" : 1,
               "awakeCsvPath" : "D:/OneDrive - bwedu/Masterthesis/Experiments+Data/Fahren+AimLab/2020_03_05_Propand_1/openBci_record-[2020.03.05-12.27.35]_raw_awake_aimlab.csv",
               "unlabeledCsvPath" : "D:/OneDrive - bwedu/Masterthesis/Experiments+Data/Fahren+AimLab/2020_03_05_Propand_1/openBci_record-[2020.03.05-12.34.34]_raw_driving_unlabled.csv"}

if GENERATE_DATA:
    print("Generating data...")
    
    # create awake feature df
    frequencyFeatureDf_awake = generateFeatureDf(csvFilePath= subjectDict['awakeCsvPath'],
                                                 starttime = pd.Timestamp(datetime.strptime('[2020.03.05-12.27.27]', "[%Y.%m.%d-%H.%M.%S]")),
                                                 yamlConfig = yamlConfig,
                                                 label = TARGET_AWAKE)

    # create undetermined feature df
    frequencyFeatureDf_unlabeled = generateFeatureDf(csvFilePath= subjectDict['unlabeledCsvPath'],
                                                 starttime = pd.Timestamp(datetime.strptime('[2020.03.05-12.34.34]', "[%Y.%m.%d-%H.%M.%S]")),
                                                 yamlConfig = yamlConfig,
                                                 label = TARGET_UNLABELED)
    
    

else:
    # load data
    print("Loading data...")
    # ---- AWAKE DATA -----
    #epochSeries_awake = pd.read_pickle(os.path.join(mainDir,'generatedData','epochSeries_{}.pkl'.format(TARGET_AWAKE)))
    frequencyFeatureDf_awake = pd.read_pickle(os.path.join(mainDir,'generatedData','frequencyFeaturesDf_{}.pkl'.format(TARGET_AWAKE)))
    
    # ---- UNLABELED DATA -----
    #epochSeries_unlabeled = pd.read_pickle(os.path.join(mainDir,'generatedData','epochSeries_{}.pkl'.format(TARGET_UNLABELED)))
    frequencyFeatureDf_unlabeled = pd.read_pickle(os.path.join(mainDir,'generatedData','frequencyFeaturesDf_{}.pkl'.format(TARGET_UNLABELED)))
    
    
    # ---- Create NON-WAKE DATA ----
    frequencyFeatureDf_non_awake = frequencyFeatureDf_unlabeled[110:160] # 50 epochs = 50 x 5 seconds = 250 seconds
    frequencyFeatureDf_non_awake.reset_index(drop=True, inplace=True)
    
#print(epochSeries[0].head())
#plt.show(epochSeries[0].plot())



print("Frequency Feature Df has {} Epochs and {} Features.".format(frequencyFeatureDf_unlabeled.shape[0],
                                                                    frequencyFeatureDf_unlabeled.shape[1],
                                                                    yamlConfig['epochSizeCalculation']))




Loading data...
Frequency Feature Df has 182 Epochs and 570 Features.


Unnamed: 0,channel_1_Delta_mean_bandpower_list,channel_1_Delta_mean_bandpower_lower_envelope_list,channel_1_Delta_mean_bandpower_upper_envelope_list,channel_1_Delta_std_dev_bandpower_list,channel_1_Delta_std_dev_bandpower_lower_envelope_list,channel_1_Delta_std_dev_bandpower_upper_envelope_list,channel_1_Theta_mean_bandpower_list,channel_1_Theta_mean_bandpower_lower_envelope_list,channel_1_Theta_mean_bandpower_upper_envelope_list,channel_1_Theta_std_dev_bandpower_list,...,channel_19_Beta_mean_bandpower_upper_envelope_list,channel_19_Beta_std_dev_bandpower_list,channel_19_Beta_std_dev_bandpower_lower_envelope_list,channel_19_Beta_std_dev_bandpower_upper_envelope_list,channel_19_Gamma_mean_bandpower_list,channel_19_Gamma_mean_bandpower_lower_envelope_list,channel_19_Gamma_mean_bandpower_upper_envelope_list,channel_19_Gamma_std_dev_bandpower_list,channel_19_Gamma_std_dev_bandpower_lower_envelope_list,channel_19_Gamma_std_dev_bandpower_upper_envelope_list
0,0.773782,0.0,0.0,0.077154,0.0,0.0,0.078978,0.0,0.0,0.049088,...,0.084043,0.013363,0.015392,0.010509,0.028557,0.013575,0.036507,0.007687,0.007525,0.000819
1,0.673283,0.0,0.0,0.19459,0.0,0.0,0.088678,0.0,0.0,0.053534,...,0.084564,0.023242,0.011996,0.005177,0.025418,0.018853,0.034445,0.01043,0.006501,0.001915
2,0.762959,0.0,0.0,0.102904,0.0,0.0,0.054094,0.0,0.0,0.043404,...,0.105048,0.018805,0.011338,0.011589,0.034479,0.01763,0.045483,0.009854,0.002371,0.003858
3,0.715105,0.0,0.0,0.137318,0.0,0.0,0.043181,0.0,0.0,0.028929,...,0.060451,0.014022,0.009067,0.010806,0.021435,0.017938,0.025899,0.004281,0.002917,0.005276
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.130505,0.038057,0.021739,0.030455,0.01985,0.016104,0.044569,0.012477,0.007558,0.009065
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.192792,0.07833,0.005118,0.006863,0.017816,-0.00068,0.060914,0.024566,0.001768,0.0017
6,0.707586,0.0,0.0,0.131417,0.0,0.0,0.102497,0.0,0.0,0.063822,...,0.119611,0.014318,0.020984,0.037417,0.022183,0.015994,0.042023,0.00624,0.008204,0.008885
7,0.794907,0.0,0.0,0.092811,0.0,0.0,0.045652,0.0,0.0,0.048353,...,0.089806,0.020845,0.021805,0.023408,0.024306,0.018112,0.035738,0.009347,0.009164,0.004267
8,0.264261,0.0,0.0,0.295209,0.0,0.0,0.024343,0.0,0.0,0.017875,...,0.135288,0.043319,0.010646,0.011446,0.032674,0.007035,0.04469,0.013472,0.003529,0.001248
9,0.444416,0.0,0.0,0.178817,0.0,0.0,0.077406,0.0,0.0,0.012687,...,0.083084,0.008273,0.0051,0.011438,0.025612,0.020298,0.035338,0.005355,0.002681,0.003536


In [89]:
target_dict = {"awake" : 0, "non-awake" : 1, "unlabeled" : 2}

def createDataAndTargetArray(awakeDf, non_awakeDf, unlabeledDf):
    targetArray = []
    dataDf = pd.DataFrame()
    
    if awakeDf is not None:
        
        # TODO - Checken ob gut oder schlecht?!?!?
        awakeDf.fillna(0)
        
        startCounter = 0
        if dataDf.empty:
            # Do this because then it will copy also all columns, then we can append stuff
            dataDf = awakeDf.loc[:1]
            
            # append to awake into the target array
            targetArray.append(target_dict['awake'])
            targetArray.append(target_dict['awake'])
            startCounter = 2
            
        # We have to start at 2 if we added 0:1 already
        for i in range(startCounter, len(awakeDf)):
            dataDf = dataDf.append(awakeDf.loc[i], ignore_index=True)
            targetArray.append(target_dict['awake'])

            
    if non_awakeDf is not None:
        
        # TODO - Checken ob gut oder schlecht?!?!?
        non_awakeDf.fillna(0)
        
        startCounter = 0
        if dataDf.empty:
            # Do this because then it will copy also all columns, then we can append stuff
            dataDf = non_awakeDf.loc[:1]
            
            # append to awake into the target array
            targetArray.append(target_dict['non-awake'])
            targetArray.append(target_dict['non-awake'])
            startCounter = 2
        
        # We have to start at 2 if we added 0:1 already
        for i in range(startCounter, len(non_awakeDf)):
            dataDf = dataDf.append(non_awakeDf.loc[i], ignore_index=True)
            targetArray.append(target_dict['non-awake'])
            
            
    if unlabeledDf is not None:
        
        # TODO - Checken ob gut oder schlecht?!?!?
        unlabeledDf.fillna(0)
        
        startCounter = 0
        if dataDf.empty:
            # Do this because then it will copy also all columns, then we can append stuff
            dataDf = unlabeledDf.loc[:1]
            
            # append to awake into the target array
            targetArray.append(target_dict['unlabeled'])
            targetArray.append(target_dict['unlabeled'])
            startCounter = 2
        
        # We have to start at 2 if we added 0:1 already
        for i in range(startCounter, len(unlabeledDf)):
            dataDf = dataDf.append(unlabeledDf.loc[i], ignore_index=True)
            targetArray.append(target_dict['unlabeled'])

            
    return (dataDf.to_numpy(), np.array(targetArray))

dataArray, targetArray = createDataAndTargetArray(awakeDf = frequencyFeatureDf_awake,
                                               non_awakeDf = frequencyFeatureDf_non_awake,
                                               unlabeledDf = None)



In [90]:
dataArray

array([[0.8736504453478214, 0.7984500132084075, 0.9012756792793742, ...,
        0.0022718345962174855, 0.00703569909419123, 0.02715597200067778],
       [0.8224975913628956, 0.7303482996582106, 0.8997157720135239, ...,
        0.021473586428483543, 0.008277927888654545, 0.006649606894895675],
       [0.805816657371629, 0.7782152182768824, 0.8623125125745467, ...,
        0.010608974789590313, 0.0075707386836198366,
        0.0037140509726986634],
       ...,
       [nan, nan, nan, ..., 0.016384232025278868, 0.006945705695864618,
        0.008227443431086645],
       [nan, nan, nan, ..., 0.006402541845865167, 0.006032997161945762,
        0.0026230385723960753],
       [nan, nan, nan, ..., 0.010099139352405398, 0.0014701650996359777,
        0.006491969383578458]], dtype=object)

In [93]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(dataArray, targetArray, test_size=0.3,random_state=109) # 70% training and 30% test

In [96]:
#Import svm model
from sklearn import svm


#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel


#Train the model using the training sets
clf.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [5]:
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()

In [18]:
cancer.data.shape

(569, 30)

In [23]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3,random_state=109) # 70% training and 30% test