In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns 
import os

# Switch the kernel's working directory to the project folder; any relative
# path used after this cell resolves against it.
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
path = "/home/marta/Documenti/eeg-ml-thesis/"
os.chdir(path)


In [3]:
# Matplotlib rc overrides for the whole notebook, handed to seaborn below.
custom_params = {
    # Spines: hide the top and right axes lines.
    "axes.spines.top": False,
    "axes.spines.right": False,
    # Grid: enabled, drawn as thin dashed gray lines.
    "axes.grid": True,
    "grid.linestyle": "--",
    "grid.linewidth": 0.5,
    "grid.color": "gray",
    # Backgrounds: the same off-white for the figure and the axes area.
    "axes.facecolor": "#fbfbfb",
    "figure.facecolor": "#fbfbfb",
    # Font sizes for axis labels, tick labels and legend entries.
    "axes.labelsize": 12,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 10,
}

# Activate the "ticks" seaborn style with the overrides above.
sns.set_theme(style="ticks", rc=custom_params)

In [5]:
import os
import numpy as np
import pandas as pd
import umap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Directories holding the EEG .npz files. Judging by the folder names, the
# second one is a 50-component PCA variant of the first — TODO confirm.
raw_data_folder = "/home/marta/Documenti/data-milt-preprocessed/train_w1000_ovr0_A_vs_F_vs_C"
pca_data_folder = "/home/marta/Documenti/data-milt-preprocessed/train_w1000_ovr0_pca50_A_vs_F_vs_C"

# Participant table (tab-separated); load_eeg_data() joins on its
# `participant_id` column.
metadata_df = pd.read_csv(
    "/home/marta/Documenti/milt_dataset/datatset/participants.tsv",
    sep="\t",
)

def extract_subject_id(filename):
    """Return the part of ``filename`` that precedes its first underscore.

    For names shaped like ``sub-XXX_*`` this is the ``sub-XXX`` subject ID.
    A name without any underscore is returned unchanged.
    """
    subject_id, _, _ = filename.partition("_")
    return subject_id

def load_eeg_data(folder_path, metadata_df):
    """Load every ``.npz`` file in a folder and join it with participant metadata.

    Parameters
    ----------
    folder_path : str or path-like
        Directory scanned (non-recursively) for files ending in ``.npz``.
        Each archive must contain an ``x_data`` entry.
    metadata_df : pandas.DataFrame
        Participant table; must have a ``participant_id`` column.

    Returns
    -------
    pandas.DataFrame
        One row per loaded file whose subject ID occurs in ``metadata_df``
        (inner join on ``participant_id``): the columns pandas builds from
        the ``x_data`` arrays, then ``participant_id``, then the metadata
        columns. Rows follow the sorted file names.

    Raises
    ------
    KeyError
        If an archive has no ``x_data`` entry.
    """
    eeg_data = []
    subjects = []

    # os.listdir() returns names in arbitrary order; sorting makes the row
    # order of the result reproducible across runs and machines.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".npz"):
            continue

        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code — only point this at trusted files.
        # The `with` block closes the archive's file handle as soon as the
        # array has been read, instead of leaking one handle per file.
        with np.load(os.path.join(folder_path, file), allow_pickle=True) as data:
            eeg_data.append(data["x_data"])
        subjects.append(extract_subject_id(file))

    # NOTE(review): assumes each x_data is 1-D so that it becomes one row of
    # the frame — TODO confirm against the preprocessing output.
    df_eeg = pd.DataFrame(eeg_data)
    df_eeg["participant_id"] = subjects

    # Inner join: files without a metadata row (and vice versa) are dropped.
    return df_eeg.merge(metadata_df, on="participant_id", how="inner")



KeyboardInterrupt: 

In [None]:
import umap
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load the participants table (tab-separated).
# `file_path` must be the path itself: the previous version stored the
# DataFrame returned by read_csv here and then passed that DataFrame to
# read_csv a second time, which fails because it is not a path or buffer.
# NOTE(review): the code below expects EEG feature columns in `df` — confirm
# that this file really contains them.
file_path = "/home/marta/Documenti/milt_dataset/datatset/participants.tsv"  # Update with your actual file path
df = pd.read_csv(file_path, sep="\t")

# Expected layout of `df`: the metadata columns listed here, plus one column
# per EEG feature ('eeg_feature_1', ..., 'eeg_feature_N').
meta_columns = ['subject_id', 'age', 'sex', 'mmse', 'label']

# Every non-metadata column is taken as part of the EEG feature matrix.
eeg_features = df.drop(columns=meta_columns).values
# Per-row metadata kept aside to annotate the embeddings later.
labels, sex, age, mmse = (df[column].values for column in ('label', 'sex', 'age', 'mmse'))

# Standardize the features (StandardScaler fitted on the full matrix).
scaler = StandardScaler()
eeg_features_scaled = scaler.fit_transform(eeg_features)

# 50-component PCA of the standardized features, used as a second UMAP input.
pca = PCA(n_components=50)
eeg_pca = pca.fit_transform(eeg_features_scaled)

# Two UMAP reducers with identical settings: one per input representation.
_umap_kwargs = dict(n_components=2, n_neighbors=30, min_dist=0.1, random_state=42)
umap_raw = umap.UMAP(**_umap_kwargs)
umap_pca = umap.UMAP(**_umap_kwargs)

eeg_umap_raw = umap_raw.fit_transform(eeg_features_scaled)
eeg_umap_pca = umap_pca.fit_transform(eeg_pca)

# Wrap each 2-D embedding in a DataFrame and attach the metadata columns
# (label, sex, age, mmse — in that order) for plotting.
_metadata = dict(label=labels, sex=sex, age=age, mmse=mmse)
df_umap_raw = pd.DataFrame(eeg_umap_raw, columns=['UMAP1', 'UMAP2']).assign(**_metadata)
df_umap_pca = pd.DataFrame(eeg_umap_pca, columns=['UMAP1', 'UMAP2']).assign(**_metadata)

def plot_umap(df_umap, title):
    """Draw a 2-D UMAP embedding as a scatter plot and show it.

    ``df_umap`` must provide the columns ``UMAP1``, ``UMAP2``, ``label``,
    ``sex`` and ``mmse``: points are placed at (UMAP1, UMAP2), coloured by
    ``label``, given a marker style per ``sex`` and sized by ``mmse``.
    ``title`` becomes the plot title. Returns nothing.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(
        data=df_umap,
        x='UMAP1',
        y='UMAP2',
        hue='label',
        style='sex',
        size='mmse',
        palette='viridis',
        alpha=0.7,
        ax=ax,
    )
    ax.set_title(title)
    ax.legend(title='Dementia Type')
    plt.show()

# Scatter plots of both embeddings: standardized features vs. their PCA reduction.
plot_umap(df_umap_raw, 'UMAP on Raw EEG Features')
plot_umap(df_umap_pca, 'UMAP on PCA-Reduced EEG Features')

# Box plots of age and MMSE score, grouped by the `label` column
# (one figure per variable, each with its own palette and title).
for _column, _palette, _title in (
    ('age', 'coolwarm', 'Age Distribution by Dementia Type'),
    ('mmse', 'magma', 'MMSE Score Distribution by Dementia Type'),
):
    plt.figure(figsize=(10, 5))
    sns.boxplot(x=df_umap_raw['label'], y=df_umap_raw[_column], palette=_palette)
    plt.title(_title)
    plt.show()
