# Imports and Functions

In [1]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [2]:
lda = LinearDiscriminantAnalysis()
svc_lin = SVC(C=1000.0, kernel='linear')
svc_rbf = SVC(kernel='rbf', C=1000.0)
logreg = LogisticRegression(max_iter=100000)

In [3]:
def cross_validation(dfs, classifier, n_splits=21):
    scores = []
    
    # Determine base group size and the number of groups that need an extra sample
    base_group_size = len(dfs) // n_splits
    num_extra = len(dfs) % n_splits
    
    # Create the dynamic groups
    grouped_pairs = []
    start = 0
    for i in range(n_splits):
        end = start + base_group_size
        if i < num_extra:  # Add an extra sample to the first 'num_extra' groups
            end += 1
        grouped_pairs.append(dfs[start:end])
        start = end
    
    for i in range(n_splits):
        # Prepare the training and test data
        test_df = pd.concat(grouped_pairs[i], ignore_index=True)
        train_df = pd.concat([df for j, group in enumerate(grouped_pairs) for df in group if j != i], ignore_index=True)
        
        X_train = train_df.drop('label', axis=1)
        y_train = train_df['label']
        X_test = test_df.drop('label', axis=1)
        y_test = test_df['label']
        
        # Handle class imbalance for the training set
        ros = RandomOverSampler(random_state=42)
        X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
        
        # Train and validate the classifier
        classifier.fit(X_resampled, y_resampled)
        score = classifier.score(X_test, y_test)
        scores.append(score)
        print(f"{score * 100:.2f}%")
    
    return scores

# Average cross-validation score
def print_average(scores):
    average_score = sum(scores) / len(scores)
    print(f"Average Cross-Validation Accuracy: {average_score * 100:.2f}%")

# 95% Confidence Interval
def ci_95(scores):
    std_dev = np.std(scores)
    mean_score = np.mean(scores)
    error_range = (mean_score - 1.96 * std_dev, mean_score + 1.96 * std_dev)
    print(f"95% Confidence Interval for Accuracy: ({error_range[0] * 100:.2f}%, {error_range[1] * 100:.2f}%)")

# Experiment Data
- 19 EEG Channels
- Bad Epochs removed
- TFA calculated on it

In [5]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs = []

for pair in pairs:
    df = pd.read_csv('TF_df_19/pair'+pair+'_df.csv')
    dfs.append(df)

# Concatenate all DataFrames
concatenated_df = pd.concat(dfs, ignore_index=True)

# Print the total number of rows in the concatenated DataFrame
print(concatenated_df.shape)

# Print the number of rows labeled as 1 (Unoupled)
unoupled_count = concatenated_df['label'].value_counts().get(1, 0)
print("Unoupled:", unoupled_count)

# Print the number of rows labeled as 2 (Coupled)
coupled_count = concatenated_df['label'].value_counts().get(2, 0)
print("Coupled:", coupled_count)

(4825, 3421)
Unoupled: 2282
Coupled: 2543


In [5]:
# LDA
scores_lda = cross_validation(dfs, lda)
print_average(scores_lda)
ci_95(scores_lda)

64.22%
56.40%
59.36%
53.25%
62.13%
56.30%
54.72%
59.45%
51.45%
56.50%
53.85%
62.20%
55.37%
58.92%
58.65%
55.82%
49.60%
64.62%
61.32%
59.52%
64.38%
Average Cross-Validation Accuracy: 58.00%


In [8]:
# SVC RBF
scores_svc_rbf = cross_validation(dfs, svc_rbf)
print_average(scores_svc_rbf)
ci_95(scores_svc_rbf)

72.94%
60.80%
54.58%
53.25%
62.13%
65.97%
62.20%
67.72%
65.98%
64.63%
62.82%
59.35%
63.64%
69.29%
67.51%
68.67%
48.00%
33.08%
73.66%
52.98%
81.55%
Average Cross-Validation Accuracy: 62.42%
95% Confidence Interval for Accuracy: (42.86%, 81.97%)


In [9]:
# SVC Linear
scores_svc_lin = cross_validation(dfs, svc_lin)
print_average(scores_svc_lin)
ci_95(scores_svc_lin)

43.12%
49.20%
49.80%
51.52%
37.87%
49.58%
49.61%
49.61%
49.38%
48.37%
45.30%
52.03%
47.11%
51.45%
50.63%
48.59%
50.00%
2.31%
47.33%
50.60%
46.35%
Average Cross-Validation Accuracy: 46.18%
95% Confidence Interval for Accuracy: (25.99%, 66.37%)


In [10]:
# Logistic Regression
scores_logreg = cross_validation(dfs, logreg)
print_average(scores_logreg)
ci_95(scores_logreg)

43.12%
49.20%
49.80%
51.52%
37.87%
49.58%
49.61%
49.61%
49.38%
48.37%
45.30%
52.03%
47.11%
51.45%
50.63%
48.59%
50.00%
2.31%
47.33%
50.60%
46.35%
Average Cross-Validation Accuracy: 46.18%
95% Confidence Interval for Accuracy: (25.99%, 66.37%)


# Synthetic Data
- 19 EEG Channels
- Bad Epochs removed
- Artificially added a 10Hz sinusoid signal with strength $0.5% \cdot maxamplitude$
- TFA calculated on it

In [11]:
pairs = ["003", "004", "005", "007", "008", "009", "010", "011", "012", "013", "014", "016", "017", "018", "019", "020", "022", "023", "024", "025", "027"]
dfs_10Hz = []

for pair in pairs:
    df = pd.read_csv('TFA_10Hz/pair'+pair+'_df.csv')
    dfs_10Hz.append(df)

#print the length if all the dfs were concatenated
print(pd.concat(dfs_10Hz, ignore_index=True).shape)

(4825, 3421)


In [12]:
# LDA
scores_10Hz_lda = cross_validation(dfs_10Hz, lda)
print_average(scores_10Hz_lda)
ci_95(scores_10Hz_lda)

87.16%
60.80%
57.77%
59.74%
57.99%
58.82%
75.98%
63.39%
58.92%
58.13%
56.41%
60.16%
56.20%
63.49%
59.49%
60.24%
41.60%
70.00%
87.24%
57.14%
70.39%
Average Cross-Validation Accuracy: 62.91%
95% Confidence Interval for Accuracy: (42.94%, 82.87%)


In [13]:
# SVC RBF
scores_10Hz_svc_rbf = cross_validation(dfs_10Hz, svc_rbf)
print_average(scores_10Hz_svc_rbf)
ci_95(scores_10Hz_svc_rbf)

91.74%
60.80%
57.37%
87.88%
60.36%
60.08%
98.43%
82.68%
95.85%
84.96%
58.12%
71.95%
65.70%
90.04%
96.62%
90.76%
50.00%
73.85%
97.53%
55.36%
84.12%
Average Cross-Validation Accuracy: 76.87%
95% Confidence Interval for Accuracy: (45.47%, 108.27%)


In [14]:
# SVC Linear
scores_10Hz_svc_lin = cross_validation(dfs_10Hz, svc_lin)
print_average(scores_10Hz_svc_lin)
ci_95(scores_10Hz_svc_lin)

43.12%
49.20%
49.80%
51.52%
37.87%
49.58%
49.61%
49.61%
49.38%
48.37%
45.30%
52.03%
47.11%
51.45%
50.63%
48.59%
50.00%
2.31%
47.33%
50.60%
46.35%
Average Cross-Validation Accuracy: 46.18%
95% Confidence Interval for Accuracy: (25.99%, 66.37%)


In [15]:
# Logistic Regression
scores_10Hz_logreg = cross_validation(dfs_10Hz, logreg)
print_average(scores_10Hz_logreg)
ci_95(scores_10Hz_logreg)

43.12%
49.20%
49.80%
51.52%
37.87%
49.58%
49.61%
49.61%
49.38%
48.37%
45.30%
52.03%
47.11%
51.45%
50.63%
48.59%
50.00%
2.31%
47.33%
50.60%
46.35%
Average Cross-Validation Accuracy: 46.18%
95% Confidence Interval for Accuracy: (25.99%, 66.37%)
