<a href="https://colab.research.google.com/github/mhazary/agitation-unsupervised/blob/main/agitaion_with_smote_perturb_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Approach 1: Adaptive SMOTE k_neighbors

In [7]:
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

# 📂 Input and output paths
FEATURES_DIR = "/content/drive/My Drive/Agitation_Detection_Project/only_agitation_files"
RESULTS_DIR = "/content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs"
os.makedirs(RESULTS_DIR, exist_ok=True)

# 🔍 Find all feature files
files = glob.glob(os.path.join(FEATURES_DIR, '*_final_features.csv'))
print(f"Total files found: {len(files)}")

# 🧾 Initialize counters
total_files_attempted = len(files)
files_processed_with_smote = 0

for file_path in tqdm(files, desc="Processing files"):
    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        print(f"❌ Skipping {os.path.basename(file_path)} due to read error: {e}")
        continue

    if 'label' not in df.columns:
        print(f"⚠️ Skipping {os.path.basename(file_path)} - 'label' column missing.")
        continue

    positive_count = df['label'].sum()
    if positive_count == 0:
        print(f"⚠️ Skipping {os.path.basename(file_path)} - no positive samples.")
        continue

    # 🧮 Feature prep
    X = df.drop(columns=['label'])
    y = df['label']
    X_numeric = X.select_dtypes(include=[np.number])
    if X_numeric.shape[1] == 0:
        print(f"⚠️ Skipping {os.path.basename(file_path)} - no numeric features.")
        continue

    X_imputed = pd.DataFrame(SimpleImputer(strategy='mean').fit_transform(X_numeric), columns=X_numeric.columns)
    X_scaled = pd.DataFrame(StandardScaler().fit_transform(X_imputed), columns=X_imputed.columns)

    # ⚖️ Handle SMOTE edge case
    minority_samples = sum(y == 1)
    k_neighbors = min(5, minority_samples - 1)
    if minority_samples <= 1:
        print(f"⚠️ Skipping {os.path.basename(file_path)} - insufficient minority samples ({minority_samples}).")
        continue

    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=0.3, stratify=y, random_state=42
        )
    except ValueError as ve:
        #print(f"⚠️ Skipping {os.path.basename(file_path)} - train_test_split error: {ve}")
        continue

    try:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    except Exception as e:
        #print(f"⚠️ SMOTE failed on {os.path.basename(file_path)}: {e}")
        continue

    # ✅ File successfully passed SMOTE
    files_processed_with_smote += 1

    # 🧠 Train model & evaluate
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train_res, y_train_res)
    y_pred = clf.predict(X_test)

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='binary', zero_division=0
    )

    # 📄 Save result for this file
    result = pd.DataFrame([{
        'file': os.path.basename(file_path),
        'positive_samples': positive_count,
        'k_neighbors': k_neighbors,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }])

    result_file_name = os.path.splitext(os.path.basename(file_path))[0] + ".csv"
    result_path = os.path.join(RESULTS_DIR, result_file_name)
    result.to_csv(result_path, index=False)
    print(f"✅ Saved result to: {result_path}")

# 📊 Final summary
print("\n📊 Summary:")
print(f"🧾 Original files found: {total_files_attempted}")
print(f"✅ Files successfully processed with SMOTE: {files_processed_with_smote}")

Total files found: 147


Processing files:   1%|          | 1/147 [00:00<02:08,  1.14it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant10_Day12_final_features.csv


Processing files:   1%|▏         | 2/147 [00:01<01:51,  1.31it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant10_Day11_final_features.csv


Processing files:   2%|▏         | 3/147 [00:02<01:36,  1.49it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant10_Day21_final_features.csv


Processing files:   4%|▍         | 6/147 [00:02<00:53,  2.62it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant10_Day24_final_features.csv


Processing files:   5%|▍         | 7/147 [00:03<00:57,  2.45it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant10_Day4_final_features.csv


Processing files:   5%|▌         | 8/147 [00:03<00:59,  2.35it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant10_Day3_final_features.csv


Processing files:   7%|▋         | 10/147 [00:04<00:48,  2.85it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant11_Day3_final_features.csv


Processing files:   8%|▊         | 12/147 [00:04<00:40,  3.37it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant10_Day5_final_features.csv


Processing files:  12%|█▏        | 17/147 [00:05<00:25,  5.18it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant11_Day11_final_features.csv


Processing files:  18%|█▊        | 26/147 [00:06<00:18,  6.62it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant12_Day1_final_features.csv


Processing files:  21%|██        | 31/147 [00:07<00:19,  6.03it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant12_Day34_final_features.csv


Processing files:  22%|██▏       | 33/147 [00:08<00:23,  4.95it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant12_Day41_final_features.csv


Processing files:  24%|██▍       | 35/147 [00:09<00:31,  3.52it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant12_Day4_final_features.csv


Processing files:  24%|██▍       | 36/147 [00:10<00:42,  2.59it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant12_Day6_final_features.csv


Processing files:  28%|██▊       | 41/147 [00:11<00:23,  4.42it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant12_Day5_final_features.csv
✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day17_final_features.csv


Processing files:  29%|██▉       | 43/147 [00:12<00:33,  3.13it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day18_final_features.csv
✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day1_final_features.csv


Processing files:  31%|███       | 45/147 [00:13<00:41,  2.45it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day21_final_features.csv


Processing files:  31%|███▏      | 46/147 [00:14<00:44,  2.27it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day26_final_features.csv


Processing files:  32%|███▏      | 47/147 [00:14<00:51,  1.96it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day23_final_features.csv


Processing files:  33%|███▎      | 48/147 [00:15<00:57,  1.72it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day24_final_features.csv


Processing files:  34%|███▍      | 50/147 [00:16<00:45,  2.15it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day28_final_features.csv


Processing files:  35%|███▍      | 51/147 [00:17<00:49,  1.94it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day2_final_features.csv


Processing files:  37%|███▋      | 54/147 [00:17<00:36,  2.56it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day32_final_features.csv


Processing files:  37%|███▋      | 55/147 [00:18<00:36,  2.49it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day35_final_features.csv


Processing files:  38%|███▊      | 56/147 [00:18<00:41,  2.20it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day36_final_features.csv


Processing files:  39%|███▉      | 58/147 [00:19<00:36,  2.43it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day40_final_features.csv


Processing files:  40%|████      | 59/147 [00:20<00:42,  2.06it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day42_final_features.csv


Processing files:  41%|████      | 60/147 [00:21<00:48,  1.78it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day43_final_features.csv


Processing files:  41%|████▏     | 61/147 [00:21<00:50,  1.69it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day4_final_features.csv


Processing files:  42%|████▏     | 62/147 [00:22<00:55,  1.53it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant13_Day51_final_features.csv


Processing files:  44%|████▍     | 65/147 [00:23<00:38,  2.15it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant15_Day2_final_features.csv


Processing files:  46%|████▋     | 68/147 [00:24<00:32,  2.45it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant16_Day23_final_features.csv


Processing files:  48%|████▊     | 70/147 [00:25<00:30,  2.53it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant17_Day6_final_features.csv


Processing files:  48%|████▊     | 71/147 [00:25<00:32,  2.33it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day20_final_features.csv


Processing files:  49%|████▉     | 72/147 [00:26<00:34,  2.19it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day23_final_features.csv


Processing files:  50%|████▉     | 73/147 [00:27<00:35,  2.09it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day22_final_features.csv


Processing files:  50%|█████     | 74/147 [00:27<00:37,  1.96it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day2_final_features.csv


Processing files:  51%|█████     | 75/147 [00:28<00:36,  2.00it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day27_final_features.csv


Processing files:  52%|█████▏    | 76/147 [00:28<00:38,  1.86it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day4_final_features.csv


Processing files:  52%|█████▏    | 77/147 [00:29<00:39,  1.78it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day34_final_features.csv


Processing files:  53%|█████▎    | 78/147 [00:29<00:39,  1.75it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day3_final_features.csv


Processing files:  54%|█████▎    | 79/147 [00:30<00:37,  1.81it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day30_final_features.csv


Processing files:  54%|█████▍    | 80/147 [00:31<00:46,  1.45it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day11_final_features.csv


Processing files:  55%|█████▌    | 81/147 [00:32<00:44,  1.49it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant19_Day8_final_features.csv


Processing files:  56%|█████▌    | 82/147 [00:33<00:50,  1.29it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day12_final_features.csv


Processing files:  56%|█████▋    | 83/147 [00:33<00:48,  1.33it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day13_final_features.csv


Processing files:  57%|█████▋    | 84/147 [00:35<00:58,  1.08it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day15_final_features.csv


Processing files:  58%|█████▊    | 85/147 [00:36<01:03,  1.02s/it]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day14_final_features.csv


Processing files:  59%|█████▊    | 86/147 [00:37<01:07,  1.10s/it]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day18_final_features.csv


Processing files:  59%|█████▉    | 87/147 [00:38<01:03,  1.06s/it]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day1_final_features.csv


Processing files:  60%|█████▉    | 88/147 [00:39<00:55,  1.06it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day17_final_features.csv


Processing files:  61%|██████    | 89/147 [00:40<00:55,  1.04it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day16_final_features.csv


Processing files:  61%|██████    | 90/147 [00:40<00:46,  1.23it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day25_final_features.csv


Processing files:  62%|██████▏   | 91/147 [00:41<00:41,  1.34it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day26_final_features.csv


Processing files:  63%|██████▎   | 92/147 [00:42<00:43,  1.28it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day2_final_features.csv


Processing files:  63%|██████▎   | 93/147 [00:42<00:38,  1.39it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day3_final_features.csv


Processing files:  64%|██████▍   | 94/147 [00:43<00:41,  1.27it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day5_final_features.csv


Processing files:  65%|██████▍   | 95/147 [00:44<00:39,  1.32it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day4_final_features.csv


Processing files:  66%|██████▌   | 97/147 [00:45<00:29,  1.67it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day9_final_features.csv


Processing files:  67%|██████▋   | 98/147 [00:45<00:29,  1.69it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day6_final_features.csv


Processing files:  67%|██████▋   | 99/147 [00:46<00:30,  1.55it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant1_Day8_final_features.csv


Processing files:  68%|██████▊   | 100/147 [00:47<00:29,  1.60it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant20_Day1_final_features.csv


Processing files:  69%|██████▊   | 101/147 [00:48<00:32,  1.42it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day10_final_features.csv


Processing files:  69%|██████▉   | 102/147 [00:48<00:33,  1.34it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day12_final_features.csv


Processing files:  70%|███████   | 103/147 [00:49<00:33,  1.32it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day13_final_features.csv


Processing files:  71%|███████   | 104/147 [00:50<00:37,  1.14it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day11_final_features.csv


Processing files:  71%|███████▏  | 105/147 [00:51<00:37,  1.13it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day14_final_features.csv


Processing files:  72%|███████▏  | 106/147 [00:52<00:35,  1.14it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day17_final_features.csv


Processing files:  73%|███████▎  | 107/147 [00:53<00:34,  1.17it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day15_final_features.csv


Processing files:  73%|███████▎  | 108/147 [00:54<00:31,  1.23it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day18_final_features.csv


Processing files:  74%|███████▍  | 109/147 [00:54<00:29,  1.28it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day19_final_features.csv


Processing files:  75%|███████▍  | 110/147 [00:55<00:27,  1.34it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day1_final_features.csv


Processing files:  76%|███████▌  | 111/147 [00:56<00:25,  1.39it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day3_final_features.csv


Processing files:  76%|███████▌  | 112/147 [00:56<00:24,  1.44it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day6_final_features.csv


Processing files:  77%|███████▋  | 113/147 [00:57<00:23,  1.43it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day22_final_features.csv


Processing files:  78%|███████▊  | 114/147 [00:58<00:21,  1.52it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day7_final_features.csv


Processing files:  78%|███████▊  | 115/147 [00:59<00:23,  1.36it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day5_final_features.csv


Processing files:  79%|███████▉  | 116/147 [00:59<00:21,  1.46it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day2_final_features.csv


Processing files:  80%|███████▉  | 117/147 [01:00<00:21,  1.42it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day8_final_features.csv


Processing files:  80%|████████  | 118/147 [01:01<00:20,  1.40it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant2_Day9_final_features.csv


Processing files:  81%|████████  | 119/147 [01:02<00:21,  1.31it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day12_final_features.csv


Processing files:  82%|████████▏ | 120/147 [01:03<00:23,  1.13it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day10_final_features.csv


Processing files:  82%|████████▏ | 121/147 [01:04<00:25,  1.03it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day1_final_features.csv


Processing files:  84%|████████▎ | 123/147 [01:05<00:19,  1.21it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day26_final_features.csv


Processing files:  84%|████████▍ | 124/147 [01:06<00:20,  1.14it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day23_final_features.csv


Processing files:  85%|████████▌ | 125/147 [01:07<00:19,  1.13it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day5_final_features.csv


Processing files:  86%|████████▌ | 126/147 [01:08<00:17,  1.21it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day3_final_features.csv


Processing files:  86%|████████▋ | 127/147 [01:08<00:15,  1.30it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day34_final_features.csv


Processing files:  87%|████████▋ | 128/147 [01:09<00:12,  1.50it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day4_final_features.csv


Processing files:  88%|████████▊ | 129/147 [01:10<00:12,  1.45it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant3_Day6_final_features.csv


Processing files:  89%|████████▉ | 131/147 [01:11<00:09,  1.62it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant4_Day1_final_features.csv


Processing files:  90%|████████▉ | 132/147 [01:12<00:10,  1.40it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant4_Day2_final_features.csv


Processing files:  90%|█████████ | 133/147 [01:12<00:09,  1.46it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant5_Day10_final_features.csv


Processing files:  91%|█████████ | 134/147 [01:13<00:09,  1.34it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant5_Day11_final_features.csv


Processing files:  92%|█████████▏| 135/147 [01:14<00:08,  1.34it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant5_Day13_final_features.csv


Processing files:  93%|█████████▎| 136/147 [01:14<00:07,  1.48it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant5_Day15_final_features.csv


Processing files:  93%|█████████▎| 137/147 [01:15<00:06,  1.51it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant5_Day3_final_features.csv


Processing files:  94%|█████████▍| 138/147 [01:16<00:06,  1.44it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant6_Day1_final_features.csv


Processing files:  95%|█████████▍| 139/147 [01:17<00:07,  1.09it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant6_Day20_final_features.csv


Processing files:  95%|█████████▌| 140/147 [01:19<00:07,  1.04s/it]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant6_Day22_final_features.csv


Processing files:  96%|█████████▌| 141/147 [01:20<00:06,  1.05s/it]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant6_Day8_final_features.csv


Processing files:  97%|█████████▋| 142/147 [01:20<00:04,  1.08it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant6_Day23_final_features.csv


Processing files:  97%|█████████▋| 143/147 [01:21<00:03,  1.20it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant6_Day7_final_features.csv


Processing files:  98%|█████████▊| 144/147 [01:21<00:02,  1.35it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant6_Day27_final_features.csv


Processing files:  99%|█████████▊| 145/147 [01:22<00:01,  1.33it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant8_Day1_final_features.csv


Processing files:  99%|█████████▉| 146/147 [01:23<00:00,  1.37it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant8_Day5_final_features.csv


Processing files: 100%|██████████| 147/147 [01:24<00:00,  1.75it/s]

✅ Saved result to: /content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs/Participant9_Day4_final_features.csv

📊 Summary:
🧾 Original files found: 147
✅ Files successfully processed with SMOTE: 109





# Get the SMOTE Performance Summary

In [8]:
import os
import glob
import pandas as pd

# 📁 Folder containing SMOTE model result files
RESULTS_DIR = "/content/drive/My Drive/Agitation_Detection_Project/smote_results_csvs"

# 🔍 Load all result CSVs
result_files = glob.glob(os.path.join(RESULTS_DIR, '*.csv'))
print(f"📂 Loaded {len(result_files)} result files.")

# 📄 Combine all results into one DataFrame
all_results = pd.concat([pd.read_csv(f) for f in result_files], ignore_index=True)
print("🔢 Combined results shape:", all_results.shape)
print("📌 Columns in DataFrame:", all_results.columns.tolist())

# 🏷️ Add a 'model' column with value "SMOTE"
all_results['model'] = 'SMOTE'

# 📊 Performance Summary for SMOTE Model
metrics = ['precision', 'recall', 'f1_score']
summary_stats = (
    all_results.groupby('model')[metrics]
    .agg(['mean', 'median', 'std'])
    .round(4)
)

print("\n📊 Performance Summary for SMOTE Model:")
print(summary_stats)

📂 Loaded 110 result files.
🔢 Combined results shape: (327, 6)
📌 Columns in DataFrame: ['file', 'positive_samples', 'k_neighbors', 'precision', 'recall', 'f1_score']

📊 Performance Summary for SMOTE Model:
      precision               recall                f1_score                
           mean median    std   mean median     std     mean  median     std
model                                                                       
SMOTE    0.6913    0.8  0.326  0.521    0.5  0.2996   0.5728  0.6341  0.2946


# 🧪 Approach 2: Cluster-Based Oversampling (Unsupervised)

In [9]:
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import hdbscan
from sklearn.exceptions import UndefinedMetricWarning
from tqdm import tqdm
import random

# --- Suppress warnings ---
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# --- CONFIG ---
data_dir = "/content/drive/My Drive/Agitation_Detection_Project/normalized_features_csvs"
results_dir = "/content/drive/My Drive/Agitation_Detection_Project/cluster_results_csvs"
os.makedirs(results_dir, exist_ok=True)
results = []

# --- Preprocessing ---
def window_data_1min(df):
    df['start_time'] = pd.to_datetime(df['start_time'], errors='coerce')
    df = df.set_index('start_time').sort_index()
    agg_dict = {}
    for col in df.columns:
        if col == 'label':
            agg_dict[col] = 'max'
        elif pd.api.types.is_numeric_dtype(df[col]):
            agg_dict[col] = ['mean', 'std']
    df_resampled = df.resample('60s').agg(agg_dict)
    df_resampled.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in df_resampled.columns]
    df_resampled = df_resampled.rename(columns={'label_max': 'label'}) if 'label_max' in df_resampled.columns else df_resampled.assign(label=0)
    return df_resampled.dropna().reset_index()

def enrich_features(df):
    signal_cols = df.select_dtypes(include='number').columns.difference(['label'])
    df['signal_mean'] = df[signal_cols].mean(axis=1)
    df['duration_sec'] = 60
    df['duration_times_mean'] = df['duration_sec'] * df['signal_mean']
    return df

# --- Evaluation ---
def evaluate_model(name, y_pred, y_true):
    try:
        auc = roc_auc_score(y_true, y_pred)
    except ValueError:
        auc = None
    return {
        'model': name,
        'precision': precision_score(y_true, y_pred, zero_division=0),
        'recall': recall_score(y_true, y_pred, zero_division=0),
        'f1_score': f1_score(y_true, y_pred, zero_division=0),
        'auc_roc': auc,
        'tp': ((y_pred == 1) & (y_true == 1)).sum(),
        'fp': ((y_pred == 1) & (y_true == 0)).sum(),
        'fn': ((y_pred == 0) & (y_true == 1)).sum(),
        'predicted_anomalies': (y_pred == 1).sum(),
        'agitated_events': (y_true == 1).sum()
    }

# --- Geometric Perturbation ---
def generate_synthetic(cluster_data, n_samples=50):
    synthetic = [
        cluster_data[i] + np.random.uniform(0.05, 0.3) * (cluster_data[j] - cluster_data[i])
        for _ in range(n_samples)
        for i, j in [random.sample(range(len(cluster_data)), 2)]
    ]
    return np.array(synthetic)

# --- Main Loop ---
for file in tqdm(os.listdir(data_dir)):
    if not file.endswith('.csv'):
        continue

    input_path = os.path.join(data_dir, file)
    try:
        df = pd.read_csv(input_path)
        df = window_data_1min(df)
        df = enrich_features(df)

        y = df['label'].astype(int).values
        if y.sum() == 0 or len(np.unique(y)) < 2 or len(y) < 20:
            continue

        exclude_cols = ['label'] + df.select_dtypes(['object', 'datetime', 'category']).columns.tolist()
        X = df.drop(columns=exclude_cols)
        X = X.apply(pd.to_numeric, errors='coerce').fillna(X.mean())
        X_scaled = StandardScaler().fit_transform(X)

        # --- HDBSCAN Clustering ---
        clusters = hdbscan.HDBSCAN(min_cluster_size=6, min_samples=4).fit_predict(X_scaled)
        clustered_df = pd.DataFrame(X_scaled)
        clustered_df['label'] = y
        clustered_df['cluster'] = clusters
        clustered_df = clustered_df[clustered_df['cluster'] != -1]

        if clustered_df.empty:
            continue

        # --- Identify Minority Clusters (high anomaly ratio)
        minority_clusters = [
            cid for cid, group in clustered_df.groupby('cluster')
            if group['label'].mean() > 0.3 and len(group) >= 3
        ]

        synthetic_data = []
        for cid in minority_clusters:
            group = clustered_df[clustered_df['cluster'] == cid].drop(columns=['label', 'cluster']).values
            synthetic_data.append(generate_synthetic(group, n_samples=50))

        # --- Oversample if clusters found
        if synthetic_data:
            X_aug = np.vstack([X_scaled] + synthetic_data)
            y_aug = np.concatenate([y] + [np.ones(len(s)) for s in synthetic_data])
            model_label = "ClusterOversampled"
        else:
            X_aug, y_aug = X_scaled, y
            model_label = "Original"

        X_train, X_test, y_train, y_test = train_test_split(X_aug, y_aug, test_size=0.3, stratify=y_aug, random_state=42)
        clf = RandomForestClassifier(random_state=42)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        result = evaluate_model(model_label, y_pred, y_test)
        result['file'] = file
        results.append(result)

    except Exception as e:
        results.append({'file': file, 'model': 'failed', 'note': str(e)})



100%|██████████| 435/435 [01:21<00:00,  5.35it/s]


# Get the Perturbed Cluster Performance Summary

In [6]:
# --- Summary ---
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(results_dir, 'results.csv'), index=False)  # Save full results
filtered_df = results_df[results_df['model'] != 'failed']
best_by_file = filtered_df.loc[filtered_df.groupby('file')['f1_score'].idxmax()]
model_wins = best_by_file['model'].value_counts().reset_index(name='f1_wins')
model_wins.columns = ['model', 'f1_wins']
model_wins.to_csv(os.path.join(results_dir, 'model_wins.csv'), index=False)  # Save model wins

agg_stats = filtered_df.groupby('model')[['precision', 'recall', 'f1_score', 'auc_roc']].agg(['mean', 'median', 'std']).round(4)
agg_stats.to_csv(os.path.join(results_dir, 'agg_stats.csv'))  # Save aggregated stats

print("\n🏆 F1 Score Winners by Model:")
print(model_wins)

print("\n📊 Aggregated Performance Stats:")
print(agg_stats)

print(f"\n✅ Results saved to: {results_dir}")


🏆 F1 Score Winners by Model:
                model  f1_wins
0            Original      134
1  ClusterOversampled        5

📊 Aggregated Performance Stats:
                   precision                 recall                f1_score  \
                        mean median     std    mean median     std     mean   
model                                                                         
ClusterOversampled    0.8486   0.85  0.0654  0.7085    0.7  0.1585   0.7629   
Original              0.1694   0.00  0.3690  0.0720    0.0  0.1838   0.0950   

                                  auc_roc                  
                   median     std    mean  median     std  
model                                                      
ClusterOversampled    0.8  0.1052  0.8273  0.8466  0.0835  
Original              0.0  0.2231  0.5357  0.5000  0.0914  

✅ Results saved to: /content/drive/My Drive/Agitation_Detection_Project/cluster_results_csvs
