# Import And Functions

In [2]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [3]:
lda = LinearDiscriminantAnalysis()
svc_lin = SVC(C=1000.0, kernel='linear')
svc_rbf = SVC(kernel='rbf', C=1000.0, gamma=0.1)
logreg = LogisticRegression(max_iter=100000)

In [4]:
def cross_validation(dfs, classifier, n_splits=21):
    scores = []
    
    # Determine base group size and the number of groups that need an extra sample
    base_group_size = len(dfs) // n_splits
    num_extra = len(dfs) % n_splits
    
    # Create the dynamic groups
    grouped_pairs = []
    start = 0
    for i in range(n_splits):
        end = start + base_group_size
        if i < num_extra:  # Add an extra sample to the first 'num_extra' groups
            end += 1
        grouped_pairs.append(dfs[start:end])
        start = end
    
    for i in range(n_splits):
        # Prepare the training and test data
        test_df = pd.concat(grouped_pairs[i], ignore_index=True)
        train_df = pd.concat([df for j, group in enumerate(grouped_pairs) for df in group if j != i], ignore_index=True)
        
        X_train = train_df.drop('label', axis=1)
        y_train = train_df['label']
        X_test = test_df.drop('label', axis=1)
        y_test = test_df['label']
        
        # Handle class imbalance for the training set
        ros = RandomOverSampler(random_state=42)
        X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
        
        # Train and validate the classifier
        classifier.fit(X_resampled, y_resampled)
        score = classifier.score(X_test, y_test)
        scores.append(score)
        print(f"{score * 100:.2f}%")
    
    return scores

# Average cross-validation score
def print_average(scores):
    average_score = sum(scores) / len(scores)
    print(f"Average Cross-Validation Accuracy: {average_score * 100:.2f}%")

# 95% Confidence Interval
def ci_95(scores):
    std_dev = np.std(scores)
    mean_score = np.mean(scores)
    error_range = (mean_score - 1.96 * std_dev, mean_score + 1.96 * std_dev)
    print(f"95% Confidence Interval for Accuracy: ({error_range[0] * 100:.2f}%, {error_range[1] * 100:.2f}%)")

# 🧪Experiment Data
- 19 EEG channels
- Bad Epochs removed
- Transformed to BEDNR representations (3 types: no normalization, Half normalized, normalized)
- Averaged the 8 representations per epoch

## 🔺Non Normalized

In [4]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs_nonorm = []

for pair in pairs:
    df = pd.read_csv('BENDR/BENDR_no-norm_av/pair'+pair+'_df.csv')
    dfs_nonorm.append(df)

#print the length if all the dfs were concatenated
print(pd.concat(dfs_nonorm, ignore_index=True).shape)

(4825, 1025)


In [5]:
# LDA
scores_nonorm_lda = cross_validation(dfs_nonorm, lda)
print_average(scores_nonorm_lda)
ci_95(scores_nonorm_lda)

57.80%
50.40%
49.80%
51.52%
53.85%
57.98%
51.57%
47.24%
50.21%
50.81%
52.99%
55.28%
52.07%
59.75%
59.07%
51.41%
51.20%
46.92%
57.61%
55.36%
50.21%
Average Cross-Validation Accuracy: 53.00%
95% Confidence Interval for Accuracy: (45.83%, 60.18%)


In [6]:
# SVC RBF
scores_nonorm_svc_rbf = cross_validation(dfs_nonorm, svc_rbf)
print_average(scores_nonorm_svc_rbf)
ci_95(scores_nonorm_svc_rbf)

54.13%
53.60%
52.59%
54.11%
56.80%
53.78%
52.76%
48.82%
52.28%
57.32%
58.12%
58.13%
52.07%
56.43%
50.21%
57.43%
50.80%
60.77%
56.79%
57.14%
57.51%
Average Cross-Validation Accuracy: 54.84%
95% Confidence Interval for Accuracy: (48.89%, 60.79%)


In [7]:
# SVC Linear
scores_nonorm_svc_lin = cross_validation(dfs_nonorm, svc_lin)
print_average(scores_nonorm_svc_lin)
ci_95(scores_nonorm_svc_lin)

55.96%
54.80%
52.99%
52.81%
53.85%
52.94%
49.61%
48.03%
51.45%
58.13%
58.55%
60.16%
50.00%
59.75%
55.27%
55.42%
52.00%
56.15%
57.61%
54.76%
57.94%
Average Cross-Validation Accuracy: 54.68%
95% Confidence Interval for Accuracy: (48.24%, 61.11%)


In [8]:
# Logistic Regression
scores_nonorm_logreg = cross_validation(dfs_nonorm, logreg)
print_average(scores_nonorm_logreg)
ci_95(scores_nonorm_logreg)

53.67%
54.00%
50.20%
53.25%
55.03%
52.52%
54.33%
48.82%
53.94%
50.41%
53.42%
58.94%
50.41%
56.02%
55.70%
56.63%
51.20%
72.31%
52.67%
54.76%
55.79%
Average Cross-Validation Accuracy: 54.48%
95% Confidence Interval for Accuracy: (45.37%, 63.59%)


## 🔹Half Normalized

In [9]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs_halfnorm = []

for pair in pairs:
    df = pd.read_csv('BENDR/BENDR_half-norm_av/pair'+pair+'_df.csv')
    dfs_halfnorm.append(df)

#print the length if all the dfs were concatenated
print(pd.concat(dfs_halfnorm, ignore_index=True).shape)

(4825, 1025)


In [10]:
# LDA
scores_halfnorm_lda = cross_validation(dfs_halfnorm, lda)
print_average(scores_halfnorm_lda)
ci_95(scores_halfnorm_lda)

50.46%
47.20%
56.18%
54.98%
56.21%
63.03%
49.21%
51.97%
52.28%
50.00%
52.14%
58.94%
54.13%
50.21%
50.63%
51.00%
48.00%
53.85%
54.73%
55.36%
48.07%
Average Cross-Validation Accuracy: 52.79%
95% Confidence Interval for Accuracy: (45.35%, 60.22%)


In [11]:
# SVC RBF
scores_halfnorm_svc_rbf = cross_validation(dfs_halfnorm, svc_rbf)
print_average(scores_halfnorm_svc_rbf)
ci_95(scores_halfnorm_svc_rbf)

55.05%
54.00%
53.39%
48.48%
57.40%
65.97%
43.31%
53.94%
49.38%
50.41%
53.42%
60.57%
54.13%
48.96%
54.85%
55.02%
51.60%
53.85%
57.61%
53.57%
51.07%
Average Cross-Validation Accuracy: 53.62%
95% Confidence Interval for Accuracy: (44.74%, 62.49%)


In [12]:
# SVC Linear
scores_halfnorm_svc_lin = cross_validation(dfs_halfnorm, svc_lin)
print_average(scores_halfnorm_svc_lin)
ci_95(scores_halfnorm_svc_lin)

55.96%
43.20%
56.57%
52.38%
60.36%
65.55%
47.24%
49.21%
49.79%
51.22%
54.70%
56.10%
51.65%
56.02%
55.70%
52.21%
48.40%
46.92%
58.44%
59.52%
54.51%
Average Cross-Validation Accuracy: 53.60%
95% Confidence Interval for Accuracy: (43.60%, 63.60%)


In [13]:
# Logistic Regression
scores_halfnorm_logreg = cross_validation(dfs_halfnorm, logreg)
print_average(scores_halfnorm_logreg)
ci_95(scores_halfnorm_logreg)

55.96%
48.40%
56.18%
51.08%
57.99%
75.21%
48.03%
54.72%
49.79%
49.59%
58.12%
56.50%
55.37%
49.79%
62.03%
54.62%
52.80%
30.00%
65.43%
51.79%
51.50%
Average Cross-Validation Accuracy: 54.04%
95% Confidence Interval for Accuracy: (37.96%, 70.13%)


## 🔸Normalized

In [6]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs_norm = []

for pair in pairs:
    df = pd.read_csv('BENDR/BENDR_norm_av/pair'+pair+'_df.csv')
    dfs_norm.append(df)

#print the length if all the dfs were concatenated
print(pd.concat(dfs_norm, ignore_index=True).shape)

(4825, 1025)


In [15]:
# LDA
scores_norm_lda = cross_validation(dfs_norm, lda)
print_average(scores_norm_lda)
ci_95(scores_norm_lda)

45.87%
54.00%
45.82%
45.45%
49.11%
53.78%
48.82%
48.43%
51.45%
53.25%
44.44%
49.59%
52.07%
50.21%
48.52%
49.40%
45.20%
50.00%
47.33%
44.64%
44.64%
Average Cross-Validation Accuracy: 48.67%
95% Confidence Interval for Accuracy: (42.75%, 54.58%)


In [16]:
# SVC RBF
scores_norm_svc_rbf = cross_validation(dfs_norm, svc_rbf)
print_average(scores_norm_svc_rbf)
ci_95(scores_norm_svc_rbf)

47.71%
54.40%
51.39%
46.32%
52.66%
44.96%
47.64%
50.39%
50.62%
48.37%
48.29%
51.63%
50.83%
45.64%
47.68%
51.00%
48.40%
51.54%
42.80%
47.02%
47.21%
Average Cross-Validation Accuracy: 48.88%
95% Confidence Interval for Accuracy: (43.47%, 54.29%)


In [6]:
# SVC Linear
scores_norm_svc_lin = cross_validation(dfs_norm, svc_lin)
print_average(scores_norm_svc_lin)
ci_95(scores_norm_svc_lin)

In [7]:
# Logistic Regression
scores_norm_logreg = cross_validation(dfs_norm, logreg)
print_average(scores_norm_logreg)
ci_95(scores_norm_logreg)

50.00%
56.40%
47.41%
51.08%
52.66%
52.94%
48.43%
48.43%
52.70%
49.19%
44.02%
50.41%
50.41%
47.72%
49.79%
51.41%
50.00%
52.31%
46.09%
45.24%
44.64%
Average Cross-Validation Accuracy: 49.58%
95% Confidence Interval for Accuracy: (43.70%, 55.47%)


# 🤖Altered Data
- 19 EEG channels
- Bad Epochs removed
- Artificially added a 10Hz sinusoid signal with strength $0.5% \cdot maxamplitude$
- Transformed to BEDNR representations (3 types: no normalization, Half normalized, normalized)
- Averaged the 8 representations per epoch

## 🔺Non Normalized

In [8]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs_nonorm_10Hz = []

for pair in pairs:
    df = pd.read_csv('BENDR/BENDR_no-norm_10Hz_av/pair'+pair+'_df.csv')
    dfs_nonorm_10Hz.append(df)

#print the length if all the dfs were concatenated
print(pd.concat(dfs_nonorm_10Hz, ignore_index=True).shape)

(4825, 1025)


In [9]:
# LDA
scores_nonorm_lda = cross_validation(dfs_nonorm_10Hz, lda)
print_average(scores_nonorm_lda)
ci_95(scores_nonorm_lda)

83.94%
78.80%
97.21%
91.77%
78.11%
72.69%
92.13%
81.89%
95.02%
89.84%
75.21%
92.68%
90.08%
95.02%
93.67%
95.18%
81.20%
86.92%
90.53%
84.52%
79.40%
Average Cross-Validation Accuracy: 86.94%
95% Confidence Interval for Accuracy: (72.90%, 100.99%)


In [10]:
# SVC RBF
scores_nonorm_svc_rbf = cross_validation(dfs_nonorm_10Hz, svc_rbf)
print_average(scores_nonorm_svc_rbf)
ci_95(scores_nonorm_svc_rbf)

83.03%
74.40%
89.64%
88.31%
70.41%
73.53%
84.65%
75.98%
92.53%
84.96%
75.21%
87.80%
85.54%
87.97%
84.81%
88.35%
74.80%
85.38%
79.42%
83.93%
68.24%
Average Cross-Validation Accuracy: 81.85%
95% Confidence Interval for Accuracy: (68.57%, 95.14%)


In [11]:
# SVC Linear
scores_nonorm_svc_lin = cross_validation(dfs_nonorm_10Hz, svc_lin)
print_average(scores_nonorm_svc_lin)
ci_95(scores_nonorm_svc_lin)

84.40%
80.00%
94.02%
89.18%
75.15%
72.69%
86.22%
79.92%
95.85%
87.80%
79.91%
90.65%
87.60%
91.70%
89.87%
89.96%
80.40%
86.92%
88.07%
86.90%
72.10%
Average Cross-Validation Accuracy: 85.21%
95% Confidence Interval for Accuracy: (72.47%, 97.94%)


In [12]:
# Logistic Regression
scores_nonorm_logreg = cross_validation(dfs_nonorm_10Hz, logreg)
print_average(scores_nonorm_logreg)
ci_95(scores_nonorm_logreg)

63.76%
55.60%
54.18%
61.90%
53.85%
56.30%
57.48%
52.36%
64.32%
53.25%
57.69%
67.48%
59.92%
63.90%
56.54%
61.04%
55.20%
73.08%
58.02%
62.50%
58.80%
Average Cross-Validation Accuracy: 59.39%
95% Confidence Interval for Accuracy: (49.46%, 69.32%)


## 🔹Half Normalized

In [14]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs_halfnorm_10Hz = []

for pair in pairs:
    df = pd.read_csv('BENDR/BENDR_half_norm_10Hz_av/pair'+pair+'_df.csv')
    dfs_halfnorm_10Hz.append(df)

#print the length if all the dfs were concatenated
print(pd.concat(dfs_halfnorm_10Hz, ignore_index=True).shape)

(4825, 1025)


In [15]:
# LDA
scores_halfnorm_lda = cross_validation(dfs_halfnorm_10Hz, lda)
print_average(scores_halfnorm_lda)
ci_95(scores_halfnorm_lda)

76.61%
68.40%
94.02%
95.67%
73.37%
64.71%
87.01%
69.29%
95.02%
71.54%
67.95%
86.99%
84.30%
88.38%
81.86%
93.57%
76.00%
85.38%
87.65%
85.12%
72.96%
Average Cross-Validation Accuracy: 81.23%
95% Confidence Interval for Accuracy: (62.37%, 100.09%)


In [16]:
# SVC RBF
scores_halfnorm_svc_rbf = cross_validation(dfs_halfnorm_10Hz, svc_rbf)
print_average(scores_halfnorm_svc_rbf)
ci_95(scores_halfnorm_svc_rbf)

77.06%
60.40%
67.73%
83.98%
62.72%
65.55%
56.30%
57.48%
73.86%
55.69%
65.81%
74.39%
71.90%
68.46%
62.45%
79.12%
64.00%
75.38%
70.37%
60.71%
57.51%
Average Cross-Validation Accuracy: 67.19%
95% Confidence Interval for Accuracy: (51.74%, 82.63%)


In [17]:
# SVC Linear
scores_halfnorm_svc_lin = cross_validation(dfs_halfnorm_10Hz, svc_lin)
print_average(scores_halfnorm_svc_lin)
ci_95(scores_halfnorm_svc_lin)

78.90%
64.40%
81.27%
90.91%
68.05%
64.71%
73.62%
62.60%
89.21%
61.38%
68.80%
82.52%
81.40%
86.72%
75.95%
90.76%
74.00%
80.77%
78.19%
77.38%
60.52%
Average Cross-Validation Accuracy: 75.81%
95% Confidence Interval for Accuracy: (57.24%, 94.39%)


In [18]:
# Logistic Regression
scores_halfnorm_logreg = cross_validation(dfs_halfnorm_10Hz, logreg)
print_average(scores_halfnorm_logreg)
ci_95(scores_halfnorm_logreg)

56.88%
48.00%
55.78%
51.95%
58.58%
74.37%
47.64%
54.33%
49.79%
49.59%
57.69%
56.10%
56.61%
48.96%
60.34%
55.42%
54.40%
29.23%
65.43%
48.21%
51.50%
Average Cross-Validation Accuracy: 53.85%
95% Confidence Interval for Accuracy: (37.61%, 70.09%)


## 🔸Normalized

In [19]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs_norm_10Hz = []

for pair in pairs:
    df = pd.read_csv('BENDR/BENDR_norm_10Hz_av/pair'+pair+'_df.csv')
    dfs_norm_10Hz.append(df)

#print the length if all the dfs were concatenated
print(pd.concat(dfs_norm_10Hz, ignore_index=True).shape)

(4825, 1025)


In [20]:
# LDA
scores_norm_lda = cross_validation(dfs_norm_10Hz, lda)
print_average(scores_norm_lda)
ci_95(scores_norm_lda)

62.39%
58.40%
68.13%
65.37%
64.50%
58.40%
66.14%
58.27%
70.12%
71.14%
58.12%
68.29%
66.94%
65.56%
62.45%
66.27%
62.00%
70.00%
64.20%
72.02%
53.22%
Average Cross-Validation Accuracy: 64.38%
95% Confidence Interval for Accuracy: (54.82%, 73.94%)


In [21]:
# SVC RBF
scores_norm_svc_rbf = cross_validation(dfs_norm_10Hz, svc_rbf)
print_average(scores_norm_svc_rbf)
ci_95(scores_norm_svc_rbf)

60.55%
59.60%
64.54%
58.44%
67.46%
55.46%
62.20%
57.09%
70.54%
65.45%
59.83%
60.98%
63.22%
64.73%
62.45%
67.87%
60.00%
66.15%
60.49%
60.71%
59.66%
Average Cross-Validation Accuracy: 62.26%
95% Confidence Interval for Accuracy: (55.01%, 69.51%)


In [23]:
# SVC Linear
scores_norm_svc_lin = cross_validation(dfs_norm_10Hz, svc_lin)
print_average(scores_norm_svc_lin)
ci_95(scores_norm_svc_lin)

In [22]:
# Logistic Regression
scores_norm_logreg = cross_validation(dfs_norm_10Hz, logreg)
print_average(scores_norm_logreg)
ci_95(scores_norm_logreg)

64.22%
64.40%
66.53%
67.10%
62.72%
58.82%
62.99%
59.45%
72.61%
67.48%
57.69%
63.82%
64.46%
68.05%
61.60%
69.48%
60.40%
70.77%
62.14%
69.64%
55.36%
Average Cross-Validation Accuracy: 64.27%
95% Confidence Interval for Accuracy: (55.59%, 72.96%)
