In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# dataset paths used by the cells below (all under /content/drive/MyDrive)
# can be read and written.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# ==========================
# 2️⃣ Imports
# ==========================
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
def load_universe_dataset_with_labels(base_path):
    """Load every feature pickle under ``base_path`` into one DataFrame.

    Expected layout (missing folders are skipped silently)::

        base_path/<participant>/<Lab1|Lab2|Wild>/Task_Labels.csv      (optional)
        base_path/<participant>/<Lab1|Lab2|Wild>/Features/<task>/<name>.pickle

    Every pickle is read with ``pd.read_pickle`` and tagged with four
    metadata columns: ``participant``, ``session``, ``task`` (the folder
    names) and ``feature_type`` (the file name without its extension).
    If the session has a ``Task_Labels.csv`` the labels are attached by
    ``_attach_task_labels``.

    Directory listings are sorted so the row order of the result is the same
    on every run (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    base_path : str
        Folder containing one sub-folder per participant.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded frames with a fresh index.

    Raises
    ------
    ValueError
        If no pickle could be loaded at all.
    """
    all_data = []

    for participant in sorted(os.listdir(base_path)):
        participant_path = os.path.join(base_path, participant)
        if not os.path.isdir(participant_path):
            continue

        for session in ["Lab1", "Lab2", "Wild"]:
            session_path = os.path.join(participant_path, session)
            if not os.path.exists(session_path):
                continue

            # Labels are optional; a session without them still contributes features.
            label_file = os.path.join(session_path, "Task_Labels.csv")
            labels = pd.read_csv(label_file) if os.path.exists(label_file) else None

            features_path = os.path.join(session_path, "Features")
            if not os.path.exists(features_path):
                continue

            for task in sorted(os.listdir(features_path)):
                task_path = os.path.join(features_path, task)
                if not os.path.isdir(task_path):
                    continue

                for file in sorted(os.listdir(task_path)):
                    if not file.endswith(".pickle"):
                        continue

                    file_path = os.path.join(task_path, file)
                    try:
                        # SECURITY: unpickling executes arbitrary code; only
                        # point this loader at dataset files you trust.
                        df = pd.read_pickle(file_path)

                        # Add metadata
                        df["participant"] = participant
                        df["session"] = session
                        df["task"] = task
                        df["feature_type"] = os.path.splitext(file)[0]

                        if labels is not None:
                            df = _attach_task_labels(df, labels, task)

                        all_data.append(df)

                    except Exception as e:
                        # Best effort: report the unreadable file and keep going.
                        print("Error reading:", file_path, e)

    if not all_data:
        raise ValueError("No data found!")

    dataset = pd.concat(all_data, ignore_index=True)
    return dataset


def _attach_task_labels(df, labels, task):
    """Attach session labels to one task's feature frame.

    The first rule that applies is used:

    1. Both frames have ``trial_id``: left-merge on it.
    2. ``labels`` has a ``Task`` column containing ``task``: the matching
       label row(s) are joined onto every feature row of this task, so all
       rows of a task carry that task's labels.
    3. Otherwise: column-wise positional concat (the legacy behaviour, kept
       for label files that cannot be matched by name).
    """
    if "trial_id" in df.columns and "trial_id" in labels.columns:
        return df.merge(labels, on="trial_id", how="left")

    if "Task" in labels.columns:
        task_rows = labels[labels["Task"] == task]
        if not task_rows.empty:
            return df.merge(task_rows, left_on="task", right_on="Task", how="left")

    return pd.concat(
        [df.reset_index(drop=True), labels.reset_index(drop=True)], axis=1
    )


In [None]:
# Dataset root on the mounted Drive -- change this to your own path.
BASE_PATH = "/content/drive/MyDrive/universe/UNIVERSE"

# One sample path per nesting level (participant -> session -> Features -> task),
# used only to eyeball the folder layout.
BASE_PATH2 = "/content/drive/MyDrive/universe/UNIVERSE/UN_101"
BASE_PATH3 = "/content/drive/MyDrive/universe/UNIVERSE/UN_101/Lab1"
BASE_PATH4 = "/content/drive/MyDrive/universe/UNIVERSE/UN_101/Lab1/Features"
BASE_PATH5 = "/content/drive/MyDrive/universe/UNIVERSE/UN_101/Lab1/Features/arithmetix_easy"

print("BASE FOLDER CONTENT:")
# Print the directory listing of each level, outermost first.
for level_path in (BASE_PATH, BASE_PATH2, BASE_PATH3, BASE_PATH4, BASE_PATH5):
    print(os.listdir(level_path))



BASE FOLDER CONTENT:
['UN_101', 'UN_104', 'UN_105', 'UN_107', 'UN_103', 'UN_106', 'UN_112', 'UN_110', 'UN_109', 'UN_108', 'UN_111', 'UN_102']
['Lab1', 'Lab2', 'Wild']
['Labeled', 'Raw', 'Lab_Notes.pdf', 'Preprocessed', 'Features', 'Task_Labels.csv']
['relaxation_video', 'arithmetix_easy', 'n_back_easy', 'stroop_easy', 'sudoku_easy', 'arithmetix_hard', 'n_back_hard', 'stroop_hard', 'sudoku_hard']
['EEG_features.pickle', 'HRV_features.pickle', 'EDA_features.pickle', 'TEMP_features.pickle']


In [None]:
# Root folder of the UNIVERSE dataset on the mounted Drive
# (one sub-folder per participant).
BASE_PATH = "/content/drive/MyDrive/universe/UNIVERSE"

# Load dataset with labels: walks every participant/session/task folder and
# returns a single concatenated DataFrame.
dataset = load_universe_dataset_with_labels(BASE_PATH)

# Now check columns to see which feature and label columns came through.
print(dataset.columns)


Index(['mean_δ', 'mean_θ', 'mean_α', 'mean_β', 'mean_γ', 'α/θ', 'θ/α',
       'frontal_α_asy', 'δ_asy', 'θ_asy', 'α_asy', 'β_asy', 'γ_asy',
       'participant', 'session', 'task', 'feature_type', 'Task',
       'Mental Demand', 'Physical Demand', 'Temporal Demand', 'Performance',
       'Effort', 'Frustration', 'physical_demand__vs__temporal_demand',
       'performance__vs__effort', 'mental_demand__vs__physical_demand',
       'effort__vs__frustration', 'physical_demand__vs__performance',
       'mental_demand__vs__effort', 'performance__vs__frustration',
       'physical_demand__vs__effort', 'mental_demand__vs__performance',
       'temporal_demand__vs__performance', 'physical_demand__vs__frustration',
       'temporal_demand__vs__effort', 'mental_demand__vs__frustration',
       'mental_demand__vs__temporal_demand',
       'temporal_demand__vs__frustration', 'Weighted Nasa Score', 'HRV_MeanNN',
       'HRV_SDNN', 'HRV_RMSSD', 'HRV_LFn', 'HRV_HFn', 'HRV_ratio_LFn_HFn',
       'SCR_P

In [None]:
# Short target names used by the rest of the notebook, keyed by the original
# column names. Keys that are absent from `dataset` are ignored by rename().
# NOTE(review): "Weighted Nasa Score" is the NASA-TLX workload score; confirm
# it is the intended proxy for "fatigue".
LABEL_RENAMES = {
    "Weighted Nasa Score": "fatigue",
    "Mental stress level": "stress",
    "Mental effort level": "focus",
}

dataset = dataset.rename(columns=LABEL_RENAMES)


In [None]:
# Columns kept for modelling, grouped by the signal they come from.
eeg_columns = [
    'mean_δ', 'mean_θ', 'mean_α', 'mean_β', 'mean_γ',
    'α/θ', 'θ/α', 'frontal_α_asy', 'δ_asy', 'θ_asy', 'α_asy', 'β_asy', 'γ_asy',
]
hrv_columns = [
    'HRV_MeanNN', 'HRV_SDNN', 'HRV_RMSSD', 'HRV_LFn', 'HRV_HFn', 'HRV_ratio_LFn_HFn',
]
eda_columns = ['SCR_Peaks_N', 'SCR_Peaks_Amplitude_Mean']
temperature_columns = ['mean_temp', 'std_temp']
nasa_tlx_columns = [
    'Mental Demand', 'Physical Demand', 'Temporal Demand',
    'Performance', 'Effort', 'Frustration',
]
target_columns = ['fatigue', 'stress', 'focus']

# Same order as before: EEG, HRV, EDA/SCR, temperature, NASA TLX, labels.
ml_columns = (
    eeg_columns
    + hrv_columns
    + eda_columns
    + temperature_columns
    + nasa_tlx_columns
    + target_columns
)

# Keep only the wanted columns that actually exist; everything else
# (including the participant/session/task metadata) is dropped.
present_columns = [name for name in ml_columns if name in dataset.columns]
dataset = dataset[present_columns]

In [None]:
# ==========================
# 2️⃣ Clean & preprocess the dataset
# ==========================
def clean_dataset(df, label_columns=["fatigue", "stress", "focus"]):
    """Impute, encode and standardize a feature table.

    The input frame is left untouched; all work happens on a copy, so the
    cell calling this function can be re-run safely.

    Steps, in order:

    1. Every numeric column (label columns included) has its NaNs replaced
       by that column's mean.
    2. Every ``object`` column except ``participant``, ``session``, ``task``
       and ``feature_type`` is integer-encoded with ``LabelEncoder`` on its
       string representation.
    3. Every column that was numeric on entry and is not in
       ``label_columns`` is standardized with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to clean.
    label_columns : list of str
        Columns treated as targets; they are imputed but never standardized.

    Returns
    -------
    (pandas.DataFrame, StandardScaler)
        The cleaned copy and the scaler. The scaler is returned unfitted when
        there was nothing to scale (no rows or no numeric feature columns).

    NOTE(review): the scaler is fitted on all rows passed in. If the result
    is split into train/test afterwards, test-set statistics have already
    influenced the scaling -- consider fitting on the training rows only.
    """
    # Copy the (shared) default so it can never be mutated through this call.
    label_columns = list(label_columns)
    cleaned = df.copy()

    # 1. Fill missing values only for numeric columns.
    #    "number" covers every int/float width, not just the 64-bit ones.
    numeric_cols = cleaned.select_dtypes(include="number").columns
    if len(numeric_cols) > 0:
        cleaned[numeric_cols] = cleaned[numeric_cols].fillna(cleaned[numeric_cols].mean())

    # 2. Encode categorical columns (metadata columns are kept as text).
    metadata_cols = ["participant", "session", "task", "feature_type"]
    for col in cleaned.select_dtypes(include=["object"]).columns:
        if col not in metadata_cols:
            le = LabelEncoder()
            cleaned[col] = le.fit_transform(cleaned[col].astype(str))

    # 3. Standardize numeric features; labels keep their original scale.
    feature_cols = [c for c in numeric_cols if c not in label_columns]
    scaler = StandardScaler()
    # StandardScaler raises on zero rows or zero columns, so skip it then.
    if feature_cols and len(cleaned) > 0:
        cleaned[feature_cols] = scaler.fit_transform(cleaned[feature_cols])

    return cleaned, scaler

In [None]:
# ==========================
# 3️⃣ Split into train/test
# ==========================
def prepare_and_save_train_test(df, label_columns=["fatigue", "stress", "focus"], base_folder="/content/drive/MyDrive/universe_ml", test_size=0.2, random_state=42):
    """Split ``df`` into train/test sets and write both to CSV.

    The split is stratified on the first label column when possible. If
    scikit-learn rejects the stratified split (for example because a label
    value occurs only once, which is common for continuous scores), the
    function reports it and falls back to a plain random split instead of
    failing.

    Files written::

        <base_folder>/train/train_dataset.csv
        <base_folder>/test/test_dataset.csv

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both feature and label columns.
    label_columns : list of str
        Columns treated as targets; they are placed after the features in
        the saved files. A missing label column raises ``KeyError``.
    base_folder : str
        Output root; ``train`` and ``test`` sub-folders are created as needed.
    test_size : float or int
        Forwarded to ``train_test_split``.
    random_state : int
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` exactly as written to disk.

    NOTE(review): rows are split at random, so rows from the same
    participant can land in both sets -- confirm that is acceptable.
    """
    # Copy the (shared) default so it can never be mutated through this call.
    label_columns = list(label_columns)

    X = df.drop(columns=label_columns)
    y = df[label_columns]

    # Stratify on the first label for a balanced split (can be changed).
    stratify_on = y[label_columns[0]] if label_columns else None

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state,
            stratify=stratify_on,
        )
    except ValueError as err:
        if stratify_on is None:
            raise
        # If the input itself is invalid the retry below raises again.
        print("⚠️ Stratified split not possible (%s); using a random split instead." % err)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state,
            stratify=None,
        )

    # Combine features + labels
    train_df = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
    test_df = pd.concat([X_test.reset_index(drop=True), y_test.reset_index(drop=True)], axis=1)

    # Create the folder structure only once the split has succeeded.
    train_dir = os.path.join(base_folder, "train")
    test_dir = os.path.join(base_folder, "test")
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)

    train_path = os.path.join(train_dir, "train_dataset.csv")
    test_path = os.path.join(test_dir, "test_dataset.csv")

    train_df.to_csv(train_path, index=False)
    test_df.to_csv(test_path, index=False)

    print("✅ Train/Test datasets saved:")
    print("Train:", train_path)
    print("Test:", test_path)

    return train_df, test_df

In [None]:
# ==========================
# 4️⃣ Run preprocessing pipeline
# ==========================
# Clean the dataset: mean-impute numeric columns, label-encode remaining
# object columns and standardize the numeric features. The scaler that was
# used is returned alongside the cleaned frame.
cleaned_dataset, scaler = clean_dataset(dataset)

# Split into train/test and save both parts as CSV under the function's
# default base_folder (80/20 split, since test_size defaults to 0.2).
train_df, test_df = prepare_and_save_train_test(cleaned_dataset)

✅ Train/Test datasets saved:
Train: /content/drive/MyDrive/universe_ml/train/train_dataset.csv
Test: /content/drive/MyDrive/universe_ml/test/test_dataset.csv


In [None]:
# Show, as a plain list, which columns `dataset` holds after the column selection.
print(dataset.columns.tolist())


['mean_δ', 'mean_θ', 'mean_α', 'mean_β', 'mean_γ', 'α/θ', 'θ/α', 'frontal_α_asy', 'δ_asy', 'θ_asy', 'α_asy', 'β_asy', 'γ_asy', 'HRV_MeanNN', 'HRV_SDNN', 'HRV_RMSSD', 'HRV_LFn', 'HRV_HFn', 'HRV_ratio_LFn_HFn', 'SCR_Peaks_N', 'SCR_Peaks_Amplitude_Mean', 'mean_temp', 'std_temp', 'Mental Demand', 'Physical Demand', 'Temporal Demand', 'Performance', 'Effort', 'Frustration', 'fatigue', 'stress', 'focus']
