# Notebook for Testing Pipelines

In [67]:
# Imports 
from sklearn.pipeline import Pipeline
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import yaml
import os, sys
from typing import Dict
import numpy as np


# Local Imports
module_path = os.path.abspath(os.path.join('code'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from Signal_Transfomers import (ConvertIndexToTimestamp, ExtractSignals,
                                BandpassFilter, BandstopFilter, ReplaceOutliers,
                                CenterData)

from Pre_Processing_Transformers import (SlidingWindow, NormalizeData, DeleteFaultyEpochs,
                                            ReplaceNaNs)


from Feature_Extraction_Transformer import (Frequency_Features)

from Measuring_Functions import (getChannelUsageInEpochSeries)

from plotFunctions import (plotInteractiveEpochs, plotFeatureEpochs)
from consts import *

### Define local functions

In [2]:
def loadConfigFile(configFilePath : str) -> Dict:
    with open(configFilePath, 'r') as stream:
        try:
            yamlConfig = yaml.safe_load(stream)
            return yamlConfig
        except yaml.YAMLError as exc:
            print(exc)
            return None

def readFileCSV(filePath : str) -> pd.DataFrame:
    df = pd.read_csv(filePath)
    return df


def filter_signal(df : pd.DataFrame, config : Dict, starttime=None) -> pd.DataFrame:
    ''' Filter the signal with bandpass, bandstopp and repace outliers '''

    # signal processing pipeline - the first pipeline - e.g. extract the signal from the raw .csv and filter it
    signal_processing_pipeline = Pipeline([
        ('Convert Index to Timestamp', ConvertIndexToTimestamp(device=config['deviceName'], starttime=starttime)),
        ('Extract signals', ExtractSignals(device=config['deviceName'])),
        ('Bandpass Filter', BandpassFilter(device=config['deviceName'], lowcufreq=config['lowcutFreq_bandpass'], highcutfreq=config['highcutFreq_bandpass'], samplingRate=config['samplingRate'])),
        ('Bandstop Filter', BandstopFilter(device=config['deviceName'], lowcufreq=config['lowcutFreq_bandstopp'], highcutfreq=config['highcutFreq_bandstopp'], samplingRate=config['samplingRate'])),
        ('Replace Outliers', ReplaceOutliers(device=config['deviceName'], lowerThreshold=config['lowerThreshold'], upperThreshold=config['upperThreshold']))
    ])
    df = signal_processing_pipeline.fit_transform(df)
    return df

def pre_process_signal(df : pd.DataFrame, config : Dict) -> pd.Series:
    ''' Pre-process the signal by creating epochs, delete faulty epochs and normalize it
    
    Returns a series of dataframes
    '''
    # pre-process the pipeline for machine learning
    pre_processing_pipeline = Pipeline([
        ('Create Epochs', SlidingWindow(samplingRate=config['samplingRate'], windowSizeInSeconds=config['epochWindowSize'], overlapInSeconds=config['overlap'])),
        ('Delete Faulty Epochs', DeleteFaultyEpochs(maxFaultyRate=config['maxFaultyRate'])), # returns a numpy series with dataframes
        ('Replace NaNs with Zero', ReplaceNaNs()),
        ('Normalize Data', NormalizeData())
    ])

    epochSeries = pre_processing_pipeline.fit_transform(df)
    return epochSeries

def feature_extraction(epochSeries : pd.Series, config : Dict) -> pd.Series:
    ''' Extract features from the generated epoch series 
    
    Do the whole feature extraction in a pipeline
    '''

    feature_extraction_pipeline = Pipeline([
        ("Frequency Band feature extraction", Frequency_Features(samplingRate=config['samplingRate'], frequencyBands=config['frequencyBands'],
                                                                numberOfChannels=config['numberOfChannels'], epochSizeCalculation=config['epochSizeCalculation']))
    ])

    epochSeries = feature_extraction_pipeline.fit_transform(epochSeries)

    return epochSeries

def generateFeatureDf(csvFilePath, starttime, yamlConfig, label : str ) -> pd.DataFrame:
    ''' Generates datframes of given csv filepaths'''
    
    print ("Generating {} driving feature df...".format(label))
    
    # load dataset
    df = readFileCSV(csvFilePath)  
    
    # Filter the signal
    df = filter_signal(df=df, config=yamlConfig, starttime=starttime)

    # Pre-process the signal
    epochSeries = pre_process_signal(df=df, config=yamlConfig)
    # Save epochSeries
    print("Saving epoch series...")
    epochSeries.to_pickle(os.path.join(mainDir,'generatedData','epochSeries_{}.pkl'.format(label)))


    # Extract Frequency Features
    frequencyFeatureDf = feature_extraction(epochSeries=epochSeries, config=yamlConfig)
    # Save frequency features dataframe
    print("Saving frequency feature dataframe...")
    frequencyFeatureDf.to_pickle(os.path.join(mainDir,'generatedData','frequencyFeaturesDf_{}.pkl'.format(label)))
    
    return frequencyFeatureDf

## Main

In [51]:
GENERATE_DATA = False
# load config
mainDir = "D:/Masterthesis/thesis_eeg/"

TARGET_AWAKE = "awake"
TARGET_NON_AWAKE = "non_awake"
TARGET_UNLABELED = "unlabeled"

configFilePath = "D:/Masterthesis/thesis_eeg/config/openBci.yaml"
yamlConfig = loadConfigFile(configFilePath)
#print(yamlConfig)

subjectDict = {"subjectId" : 1,
               "awakeCsvPath" : "D:/OneDrive - bwedu/Masterthesis/Experiments+Data/Fahren+AimLab/2020_03_05_Propand_1/openBci_record-[2020.03.05-12.27.35]_raw_awake_aimlab.csv",
               "unlabeledCsvPath" : "D:/OneDrive - bwedu/Masterthesis/Experiments+Data/Fahren+AimLab/2020_03_05_Propand_1/openBci_record-[2020.03.05-12.34.34]_raw_driving_unlabled.csv"}

if GENERATE_DATA:
    print("Generating data...")
    
    # create awake feature df
    frequencyFeatureDf_awake = generateFeatureDf(csvFilePath= subjectDict['awakeCsvPath'],
                                                 starttime = pd.Timestamp(datetime.strptime('[2020.03.05-12.27.27]', "[%Y.%m.%d-%H.%M.%S]")),
                                                 yamlConfig = yamlConfig,
                                                 label = TARGET_AWAKE)

    # create undetermined feature df
    frequencyFeatureDf_unlabeled = generateFeatureDf(csvFilePath= subjectDict['unlabeledCsvPath'],
                                                 starttime = pd.Timestamp(datetime.strptime('[2020.03.05-12.34.34]', "[%Y.%m.%d-%H.%M.%S]")),
                                                 yamlConfig = yamlConfig,
                                                 label = TARGET_UNLABELED)
    
    

else:
    # load data
    print("Loading data...")
    # ---- AWAKE DATA -----
    #epochSeries_awake = pd.read_pickle(os.path.join(mainDir,'generatedData','epochSeries_{}.pkl'.format(TARGET_AWAKE)))
    frequencyFeatureDf_awake = pd.read_pickle(os.path.join(mainDir,'generatedData','frequencyFeaturesDf_{}.pkl'.format(TARGET_AWAKE)))
    
    # ---- UNLABELED DATA -----
    #epochSeries_unlabeled = pd.read_pickle(os.path.join(mainDir,'generatedData','epochSeries_{}.pkl'.format(TARGET_UNLABELED)))
    frequencyFeatureDf_unlabeled = pd.read_pickle(os.path.join(mainDir,'generatedData','frequencyFeaturesDf_{}.pkl'.format(TARGET_UNLABELED)))
    
    
    # ---- Create NON-WAKE DATA ----
    frequencyFeatureDf_non_awake = frequencyFeatureDf_unlabeled[110:160] # 50 epochs = 50 x 5 seconds = 250 seconds
    frequencyFeatureDf_non_awake.reset_index(drop=True, inplace=True)
    
#print(epochSeries[0].head())
#plt.show(epochSeries[0].plot())



print("Frequency Feature Df has {} Epochs and {} Features.".format(frequencyFeatureDf_unlabeled.shape[0],
                                                                    frequencyFeatureDf_unlabeled.shape[1],
                                                                    yamlConfig['epochSizeCalculation']))




Loading data...
Frequency Feature Df has 182 Epochs and 570 Features.


In [122]:
target_dict = {"awake" : 1, "non-awake" : 0, "unlabeled" : 2}

def createDataAndTargetArray(awakeDf, non_awakeDf, unlabeledDf):
    targetArray = []
    dataDf = pd.DataFrame()
    
    if awakeDf is not None:
        
        # TODO - Checken ob gut oder schlecht?!?!?
        awakeDf = awakeDf.fillna(0)
        
        startCounter = 0
        if dataDf.empty:
            # Do this because then it will copy also all columns, then we can append stuff
            dataDf = awakeDf.loc[:1]
            
            # append to awake into the target array
            targetArray.append(target_dict['awake'])
            targetArray.append(target_dict['awake'])
            startCounter = 2
            
        # We have to start at 2 if we added 0:1 already
        for i in range(startCounter, len(awakeDf)):
            dataDf = dataDf.append(awakeDf.loc[i], ignore_index=True)
            targetArray.append(target_dict['awake'])

            
    if non_awakeDf is not None:
        
        # TODO - Checken ob gut oder schlecht?!?!?
        non_awakeDf = non_awakeDf.fillna(0)
        
        startCounter = 0
        if dataDf.empty:
            # Do this because then it will copy also all columns, then we can append stuff
            dataDf = non_awakeDf.loc[:1]
            
            # append to awake into the target array
            targetArray.append(target_dict['non-awake'])
            targetArray.append(target_dict['non-awake'])
            startCounter = 2
        
        # We have to start at 2 if we added 0:1 already
        for i in range(startCounter, len(non_awakeDf)):
            dataDf = dataDf.append(non_awakeDf.loc[i], ignore_index=True)
            targetArray.append(target_dict['non-awake'])
            
            
    if unlabeledDf is not None:
        
        # TODO - Checken ob gut oder schlecht?!?!?
        unlabeledDf = unlabeledDf.fillna(0)
        
        startCounter = 0
        if dataDf.empty:
            # Do this because then it will copy also all columns, then we can append stuff
            dataDf = unlabeledDf.loc[:1]
            
            # append to awake into the target array
            targetArray.append(target_dict['unlabeled'])
            targetArray.append(target_dict['unlabeled'])
            startCounter = 2
        
        # We have to start at 2 if we added 0:1 already
        for i in range(startCounter, len(unlabeledDf)):
            dataDf = dataDf.append(unlabeledDf.loc[i], ignore_index=True)
            targetArray.append(target_dict['unlabeled'])

            
    return (dataDf.to_numpy(), np.array(targetArray))

dataArray, targetArray = createDataAndTargetArray(awakeDf = frequencyFeatureDf_awake,
                                               non_awakeDf = frequencyFeatureDf_non_awake,
                                               unlabeledDf = None)



In [124]:
targetArray

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [125]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(dataArray, targetArray, test_size=0.3,random_state=109, shuffle=False) # 70% training and 30% test

In [148]:
#Import svm model
from sklearn import svm


#Create a svm Classifier

clf = svm.SVC(kernel="poly", degree=3, coef0=1, C=5, gamma="auto")


#Train the model using the training sets
clf.fit(X_train, y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)
y_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [149]:
test = frequencyFeatureDf_unlabeled.fillna(0)
test = test.to_numpy()
clf.predict(test)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1])

In [150]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

# Model Precision: what percentage of positive tuples are labeled as such?
print("Precision:",metrics.precision_score(y_test, y_pred))

# Model Recall: what percentage of positive tuples are labelled as such?
print("Recall:",metrics.recall_score(y_test, y_pred))

Accuracy: 0.0
Precision: 0.0
Recall: 0.0


  'recall', 'true', average, warn_for)


In [151]:
from sklearn.model_selection import cross_val_score
cross_val_score(clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.82758621, 0.85714286, 0.85185185])

In [153]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
y_train_pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [154]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, y_train_pred)

array([[ 0, 13],
       [ 0, 71]], dtype=int64)

In [5]:
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()

In [18]:
cancer.data.shape

(569, 30)

In [23]:
# Import train_test_split function
from sklearn.model_selection import train_test_split

# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=0.3,random_state=109) # 70% training and 30% test