In [189]:
import pandas as pd
import numpy as np
from itertools import combinations

from sklearn.feature_selection import RFECV
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import (cross_val_score, GridSearchCV, KFold, LeaveOneOut,
                                     ShuffleSplit, StratifiedKFold, train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier



In [167]:
#All file paths: ['Data\\Data20SecondOverlapAllChannels\\bandpower_theta.csv','Data\\bandpower_delta.csv','Data\\bandpower_alpha.csv', 'Data\\bandpower_beta.csv', 'Data\\bandpower_gamma.csv', 'Data\\coherence_delta.csv','Data\\coherence_theta.csv','Data\\coherence_alpha.csv','Data\\coherence_beta.csv', 'Data\\coherence_gamma.csv']

X_files = ['Data\\Data20SecondOverlapAllChannels\\bandpower_theta.csv', 'Data\\Data20SecondOverlapAllChannels\\bandpower_delta.csv', 'Data\\Data20SecondOverlapAllChannels\\bandpower_alpha.csv', 'Data\\Data20SecondOverlapAllChannels\\bandpower_beta.csv', 'Data\\Data20SecondOverlapAllChannels\\bandpower_gamma.csv', 'Data\\Data20SecondOverlapAllChannels\\coherence_delta.csv', 'Data\\Data20SecondOverlapAllChannels\\coherence_theta.csv', 'Data\\Data20SecondOverlapAllChannels\\coherence_alpha.csv', 'Data\\Data20SecondOverlapAllChannels\\coherence_beta.csv', 'Data\\Data20SecondOverlapAllChannels\\coherence_gamma.csv']
Y_files = 'Data\\Data20SecondOverlapAllChannels\\stress_raw.csv'

In [168]:
#EEG features
X_dfs = [pd.read_csv(file) for file in X_files]


Y_dfs = pd.read_csv(Y_files)
Y_dfs = pd.DataFrame(Y_dfs)
stressThreshold = 6.
Y_dfs = (Y_dfs >= stressThreshold).astype(int)


In [169]:
num_rows = len(X_dfs[0])
combined_rows = [pd.concat([df.iloc[i] for df in X_dfs], ignore_index=True) for i in range(num_rows)]


X_dfs = pd.concat(combined_rows, axis=1).transpose()


#Drop subject 7(Unconforming Data)
X_dfs = X_dfs.drop(range(723, 872))
X_dfs = X_dfs.reset_index(drop=True)
Y_dfs = Y_dfs.drop(range(723, 872))
Y_dfs = Y_dfs.reset_index(drop=True)

#Drop subject 5(Unconforming Data)
X_dfs = X_dfs.drop(range(468, 585))
X_dfs = X_dfs.reset_index(drop=True)
Y_dfs = Y_dfs.drop(range(468, 585))
Y_dfs = Y_dfs.reset_index(drop=True)

print("X shape", X_dfs.shape)
print("Y shape", Y_dfs.shape)


X shape (863, 1500)
Y shape (863, 1)


In [170]:
#Find how many stress and no stress samples we have.

value_counts = Y_dfs.value_counts()

num_zeros = value_counts.get(0, 0)
num_ones = value_counts.get(1, 0)

print(f"Number of 0s(No stress): {num_zeros}")
print(f"Number of 1s(Stress): {num_ones}")

Number of 0s(No stress): 425
Number of 1s(Stress): 438


In [171]:
def looCV(x, y):
    ##LOOCV

    # Instantiate the SVM classifier
    svm_classifier = SVC(kernel='linear')

    # Instantiate LeaveOneOut
    loo = LeaveOneOut()

    # Initialize a list to store the accuracy for each fold
    accuracies = []

    # Loop over each train-test split
    for train_index, test_index in loo.split(x):
        # Split the data
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.values[train_index], y.values[test_index]
        
        # Train the classifier
        svm_classifier.fit(X_train, y_train.ravel())
        
        # Make a prediction for the test set
        y_pred = svm_classifier.predict(X_test)
        
        # Calculate the accuracy for this fold
        accuracies.append(accuracy_score(y_test, y_pred))

    # Calculate the mean accuracy across all folds
    mean_accuracy = np.mean(accuracies)

    print(f"Mean LOOCV Accuracy: {mean_accuracy}")

def kFoldCV(x, y):
    #K-Fold


    # Define the classifier
    svm_classifier = SVC(kernel='linear')

    # Choose the number of folds
    k = 10  

    # Perform k-fold cross-validation
    scores = cross_val_score(svm_classifier, x, y.values.ravel(), cv=k, scoring='accuracy')

    # Display the results
    print(f"Accuracy scores for each fold: {scores}")
    print(f"Mean accuracy across all folds: {scores.mean()}")
    print(f"Standard deviation of the accuracy across all folds: {scores.std()}")

def kFoldStratCV(x,y):
        #Stratified K-Fold

    # Initialize the classifier
    svm_classifier = SVC(kernel='linear')

    # Define the number of folds
    k = 10

    # Create the StratifiedKFold object
    skf = StratifiedKFold(n_splits=k)

    # Initialize an array to store the accuracy for each fold
    accuracies = []

    # Perform Stratified K-Fold Cross-Validation
    for train_index, test_index in skf.split(x, y):
        # Split the data into training and test sets
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train the classifier
        svm_classifier.fit(X_train, y_train.values.ravel())

        # Predict the labels for the test set
        y_pred = svm_classifier.predict(X_test)

        # Calculate the accuracy for this fold
        accuracies.append(accuracy_score(y_test, y_pred))

    # Calculate the mean and standard deviation of the accuracies
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    print(f"Mean Accuracy: {mean_accuracy}")
    print(f"Standard Deviation of Accuracy: {std_accuracy}")

def shuffleSplitCV(x, y):
        #Shuffle Split

    # Initialize the classifier
    svm_classifier = SVC(kernel='linear')

    # Define the number of iterations and test set size
    n_splits = 100  
    test_size = 0.2  

    # Create the ShuffleSplit object
    cv = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=0)

    # Initialize an array to store the accuracy for each split
    accuracies = []

    # Perform the splits, train the model, and calculate accuracy
    for train_index, test_index in cv.split(x):
        # Ensure we're using iloc to access rows by integer location
        X_train, X_test = x.iloc[train_index], x.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Fit the model and make predictions
        svm_classifier.fit(X_train, y_train.values.ravel())
        y_pred = svm_classifier.predict(X_test)
        
        # Calculate and store the accuracy
        accuracies.append(accuracy_score(y_test, y_pred))

    # Calculate the mean and standard deviation of the accuracy
    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    print(f"Mean Accuracy: {mean_accuracy}")
    print(f"Standard Deviation of Accuracy: {std_accuracy}")



In [174]:
def crossValidations(x,y):
    looCV(x,y)
    print("Kfold: \n\n\n")
    kFoldCV(x,y)
    print("KfoldStrat: \n\n\n")
    kFoldStratCV(x,y)
    print("Shuffle Split: \n\n\n")
    shuffleSplitCV(x,y)

No feature Selection methods:

In [173]:
crossValidations(X_dfs, Y_dfs)

Mean LOOCV Accuracy: 0.7960602549246814
Accuracy scores for each fold: [0.56321839 0.55172414 0.57471264 0.44186047 0.58139535 0.51162791
 0.51162791 0.58139535 0.72093023 0.54651163]
Mean accuracy across all folds: 0.5585004009623095
Standard deviation of the accuracy across all folds: 0.06775358894533128
Mean Accuracy: 0.5585004009623095
Standard Deviation of Accuracy: 0.06775358894533128
Mean Accuracy: 0.7913294797687861
Standard Deviation of Accuracy: 0.029377585571267198


In [185]:
def RecurrentFeatureElim(x,y,n):
    svc = SVC(kernel="linear")

    # Number of RFE iterations and the number of features to select
    n_features_to_select = n  

    # Initialize a DataFrame to store the selection count for each feature
    rfe = RFE(estimator=svc, n_features_to_select=n_features_to_select, step=1)

    # Fit RFE
    rfe.fit(x, y.values.ravel())

    X_transformed = rfe.transform(x)
    X_transformed_df = pd.DataFrame(X_transformed)

    return X_transformed_df

Use RFE for feature selection

In [186]:
X_dfs_RFE = RecurrentFeatureElim(X_dfs,Y_dfs,200)

crossValidations(X_dfs_RFE, Y_dfs)

Mean LOOCV Accuracy: 0.9177288528389339
Kfold: 



Accuracy scores for each fold: [0.73563218 0.79310345 0.93103448 0.90697674 0.73255814 0.79069767
 0.8255814  0.76744186 0.87209302 0.96511628]
Mean accuracy across all folds: 0.8320235231221599
Standard deviation of the accuracy across all folds: 0.07833258898391907
KfoldStrat: 



Mean Accuracy: 0.8320235231221599
Standard Deviation of Accuracy: 0.07833258898391907
Shuffle Split: 



Mean Accuracy: 0.9115028901734106
Standard Deviation of Accuracy: 0.019545693834765118


In [192]:
def RecurrentFeatureElimCV(x,y):
    svc = SVC(kernel="linear")


    # Initialize a DataFrame to store the selection count for each feature
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(10), scoring='accuracy')

    # Fit RFE
    rfecv.fit(x, y.values.ravel())

    X_transformed = rfecv.transform(x)
    X_transformed_df = pd.DataFrame(X_transformed)

    return X_transformed_df

In [194]:
X_dfs_RFECV = RecurrentFeatureElimCV(X_dfs,Y_dfs)


KeyboardInterrupt: 