In [1]:
import os
import pandas as pd
import numpy as np
from data_processing import (
    load_sensor_data, 
    load_meta_data, 
    read_sensor_data, 
    read_meta_data, 
    add_diagnosis_to_sensor, 
    filter_and_label_diagnosis,
    add_feature_to_sensor,
    convert_yob_to_age,
    impute_age_by_patient_avg,
    binary_gender_encode,
    impute_updrs3_by_knn,
    split_dataset,
    normalize_data,
    create_fixed_windows_with_overlap
)

from feature_extraction import (
    apply_fft_to_windows,
    extract_time_domain_features,
    extract_fft_features
)
from data_visuals import plot_sensor_signals

# Notebook display settings: lift the column limit and keep wide frames from
# being wrapped across several blocks when they are rendered.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

In [2]:
# Root folder of the dataset. Set the FOG_DATASET_ROOT environment variable to
# point the notebook at another location; the original path stays the default.
root_directory = os.environ.get("FOG_DATASET_ROOT", "G:/My Drive/fog_dataset/")

### Loading Data

In [3]:
# Export one sensor CSV and one metadata CSV per task into <root>/diagnosis.
# Files that already exist are left untouched, so re-running the cell only
# fills in what is missing.
diagnosis_dir = os.path.join(root_directory, "diagnosis")

tasks = ["TUG", "2minwalk", "4x10mFastWithStop", "4x10mPrefWithoutStop", "4x10mSlowWithStop"]

# exist_ok=True removes the check-then-create race; the message is still only
# printed when the directory was missing beforehand.
directory_was_missing = not os.path.exists(diagnosis_dir)
os.makedirs(diagnosis_dir, exist_ok=True)
if directory_was_missing:
    print(f"Created directory: {diagnosis_dir}")

for task_name in tasks:
    print(f"\nProcessing task: {task_name}...")

    sensor_save_path = os.path.join(diagnosis_dir, f"sensor_{task_name}.csv")
    meta_save_path = os.path.join(diagnosis_dir, f"meta_{task_name}.csv")

    if os.path.exists(sensor_save_path) and os.path.exists(meta_save_path):
        print(f"Files for {task_name} already exist. Skipping...")
        continue

    # Both exports follow the same steps, so they share one code path.
    export_jobs = (
        ("sensor data", load_sensor_data, sensor_save_path),
        ("metadata", load_meta_data, meta_save_path),
    )
    for label, loader, save_path in export_jobs:
        if os.path.exists(save_path):
            continue
        try:
            loaded_df = loader(root_directory, task_name)
            if loaded_df.empty:
                print(f"No {label} found for {task_name}. Skipping...")
                continue
            # Write to a sibling file and rename it afterwards: an interrupted
            # export must not leave a truncated CSV that a later run would
            # mistake for a finished one.
            partial_path = save_path + ".tmp"
            loaded_df.to_csv(partial_path, index=False)
            os.replace(partial_path, save_path)
        except Exception as e:
            # Best effort by design: report the failure and move on to the
            # next export instead of aborting the whole loop.
            print(f"Error loading {label} for {task_name}: {e}")


Created directory: G:/My Drive/fog_dataset/diagnosis

Processing task: TUG...


Loading Sensor Data for TUG: 100%|██████████| 122/122 [00:20<00:00,  6.01it/s]
Loading Metadata for TUG: 100%|██████████| 122/122 [00:02<00:00, 42.18it/s]



Processing task: 2minwalk...


Loading Sensor Data for 2minwalk: 100%|██████████| 122/122 [00:41<00:00,  2.93it/s]
Loading Metadata for 2minwalk: 100%|██████████| 122/122 [00:02<00:00, 44.91it/s]



Processing task: 4x10mFastWithStop...


Loading Sensor Data for 4x10mFastWithStop: 100%|██████████| 122/122 [00:24<00:00,  4.94it/s]
Loading Metadata for 4x10mFastWithStop: 100%|██████████| 122/122 [00:02<00:00, 43.72it/s]



Processing task: 4x10mPrefWithoutStop...


Loading Sensor Data for 4x10mPrefWithoutStop: 100%|██████████| 122/122 [00:23<00:00,  5.19it/s]
Loading Metadata for 4x10mPrefWithoutStop: 100%|██████████| 122/122 [00:02<00:00, 42.63it/s]



Processing task: 4x10mSlowWithStop...


Loading Sensor Data for 4x10mSlowWithStop: 100%|██████████| 122/122 [00:29<00:00,  4.14it/s]
Loading Metadata for 4x10mSlowWithStop: 100%|██████████| 122/122 [00:03<00:00, 40.00it/s]


### Reading Data

In [3]:
def _read_task(task_name):
    """Return the (sensor, metadata) pair read for one task, sensor first."""
    return read_sensor_data(root_directory, task_name), read_meta_data(root_directory, task_name)


# One (sensor, metadata) pair per task, read in the same order as before.
sensor_tug, meta_tug = _read_task("TUG")

sensor_2minwalk, meta_2minwalk = _read_task("2minwalk")

sensor_slow, meta_slow = _read_task("4x10mSlowWithStop")

sensor_pref, meta_pref = _read_task("4x10mPrefWithoutStop")

sensor_fast, meta_fast = _read_task("4x10mFastWithStop")


Reading sensor data for TUG...
Reading metadata for TUG...
Reading sensor data for 2minwalk...
Reading metadata for 2minwalk...
Reading sensor data for 4x10mSlowWithStop...
Reading metadata for 4x10mSlowWithStop...
Reading sensor data for 4x10mPrefWithoutStop...
Reading metadata for 4x10mPrefWithoutStop...
Reading sensor data for 4x10mFastWithStop...
Reading metadata for 4x10mFastWithStop...


### Diagnosis Classification

#### Data Processing

In [4]:
# Add the diagnosis to every task's sensor frame, pairing each frame with the
# metadata of the same task.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = (
    add_diagnosis_to_sensor(sensor_df, meta_df)
    for sensor_df, meta_df in (
        (sensor_tug, meta_tug),
        (sensor_2minwalk, meta_2minwalk),
        (sensor_slow, meta_slow),
        (sensor_pref, meta_pref),
        (sensor_fast, meta_fast),
    )
)

In [5]:
# Diagnoses that remain in the dataset; everything else is filtered out.
diagnosis_to_keep = [
    "Parkinson's disease",
    "Control",
    "Parkinson's disease and dementia",
    "Parkinsonism unspecified",
    "Secondary parkinsonism: other"
]

# Diagnoses mapped to label 0 ...
label_0_diagnoses = ["Control"]

# ... and diagnoses mapped to label 1.
label_1_diagnoses = [
    "Parkinson's disease",
    "Parkinson's disease and dementia",
    "Parkinsonism unspecified",
    "Secondary parkinsonism: other"
]

# Filter and map every task frame with the same three lists
# (the Diagnosis column will now be 0 or 1).
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = (
    filter_and_label_diagnosis(sensor_df, diagnosis_to_keep, label_1_diagnoses, label_0_diagnoses)
    for sensor_df in (sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast)
)

In [6]:
# Stage 1: bring the Year of Birth (YOB) column over from each task's metadata.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = (
    add_feature_to_sensor(sensor_df, meta_df, "YOB")
    for sensor_df, meta_df in (
        (sensor_tug, meta_tug),
        (sensor_2minwalk, meta_2minwalk),
        (sensor_slow, meta_slow),
        (sensor_pref, meta_pref),
        (sensor_fast, meta_fast),
    )
)

# Stage 2: convert YOB to Age.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = map(
    convert_yob_to_age,
    (sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast),
)

# Stage 3: impute missing Age values based on unique patient averages.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = map(
    impute_age_by_patient_avg,
    (sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast),
)

# Stage 4: YOB is no longer needed once Age exists; drop it from each frame in place.
for age_frame in (sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast):
    age_frame.drop(columns=["YOB"], inplace=True)

In [7]:
# Bring the Gender column over from each task's metadata.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = (
    add_feature_to_sensor(sensor_df, meta_df, "Gender")
    for sensor_df, meta_df in (
        (sensor_tug, meta_tug),
        (sensor_2minwalk, meta_2minwalk),
        (sensor_slow, meta_slow),
        (sensor_pref, meta_pref),
        (sensor_fast, meta_fast),
    )
)

# Convert Gender to a binary encoding.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = map(
    binary_gender_encode,
    (sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast),
)


In [8]:
# Bring the UPDRS part 3 score over from each task's metadata and give the
# column the short name UPDRS3.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = (
    add_feature_to_sensor(sensor_df, meta_df, "MSDS-UPDRS part 3").rename(
        columns={"MSDS-UPDRS part 3": "UPDRS3"}
    )
    for sensor_df, meta_df in (
        (sensor_tug, meta_tug),
        (sensor_2minwalk, meta_2minwalk),
        (sensor_slow, meta_slow),
        (sensor_pref, meta_pref),
        (sensor_fast, meta_fast),
    )
)

# Impute missing UPDRS3 values.
sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast = map(
    impute_updrs3_by_knn,
    (sensor_tug, sensor_2minwalk, sensor_slow, sensor_pref, sensor_fast),
)


#### Train Test Split

In [9]:
# Processed sensor frames keyed by the short task name used from here on.
df_dict = {
    "TUG": sensor_tug,
    "2minwalk": sensor_2minwalk,
    "Slow": sensor_slow,
    "Pref": sensor_pref,
    "Fast": sensor_fast,
}

# Build the paths with os.path.join, like the export cell does, instead of
# string concatenation (which produced "//" because root_directory already
# ends with a slash).
save_paths = {
    task: os.path.join(root_directory, "diagnosis", f"df_{task}")
    for task in df_dict
}

split_results = {}

# Perform train-validation-test split.
# NOTE(review): the recorded output shows split_dataset loading existing split
# files found under save_path. If that is the case, changes made by the
# processing cells above are not reflected until those files are removed --
# confirm against split_dataset.
for task_name, df in df_dict.items():
    split_results[task_name] = split_dataset(df, dataset_name=task_name, save_path=save_paths[task_name])



Found existing split files for TUG. Loading data...
Dataset Split Summary for TUG:
  - Total Patients: 112 (Original)
  - Train Patients: 71
  - Validation Patients: 18
  - Test Patients: 23

Found existing split files for 2minwalk. Loading data...
Dataset Split Summary for 2minwalk:
  - Total Patients: 115 (Original)
  - Train Patients: 73
  - Validation Patients: 19
  - Test Patients: 23

Found existing split files for Slow. Loading data...
Dataset Split Summary for Slow:
  - Total Patients: 110 (Original)
  - Train Patients: 70
  - Validation Patients: 18
  - Test Patients: 22

Found existing split files for Pref. Loading data...
Dataset Split Summary for Pref:
  - Total Patients: 109 (Original)
  - Train Patients: 69
  - Validation Patients: 18
  - Test Patients: 22

Found existing split files for Fast. Loading data...
Dataset Split Summary for Fast:
  - Total Patients: 110 (Original)
  - Train Patients: 70
  - Validation Patients: 18
  - Test Patients: 22


#### Normalization

In [10]:
# Filtered accelerometer and gyroscope columns: X/Y/Z for the L_ and R_ prefixed sensors.
sensor_columns = [
    'L_AccX_filt', 'L_AccY_filt', 'L_AccZ_filt',
    'L_GyrX_filt', 'L_GyrY_filt', 'L_GyrZ_filt',
    'R_AccX_filt', 'R_AccY_filt', 'R_AccZ_filt',
    'R_GyrX_filt', 'R_GyrY_filt', 'R_GyrZ_filt'
]

# Normalize the sensor columns of every split, replacing each split in place
# inside split_results.
# NOTE(review): normalize_data is called separately on train, val and test. If
# it derives its statistics from the frame it receives, the three splits are
# scaled independently rather than with train statistics -- confirm.
for task_name, splits in split_results.items():
    for split_name in ("train", "val", "test"):
        splits[split_name] = normalize_data(splits[split_name], sensor_columns)


#### Windowing

In [11]:
# Window length and overlap passed to create_fixed_windows_with_overlap
# (the recorded output shows windows of 300 rows by 12 sensor columns).
window_size = 300
overlap = 150


def _window_split(split_df):
    """Window one split using the notebook-level sensor_columns, window_size and overlap."""
    return create_fixed_windows_with_overlap(
        split_df, sensor_columns, window_size=window_size, overlap=overlap
    )


# Windows, labels and subject ids for the Train, Validation and Test sets of every task.
windowed_results = {}

for task_name, splits in split_results.items():
    print(f"\nApplying windowing to {task_name} dataset...")

    windows_train, labels_train, sub_ids_train = _window_split(splits["train"])
    windows_val, labels_val, sub_ids_val = _window_split(splits["val"])
    windows_test, labels_test, sub_ids_test = _window_split(splits["test"])

    windowed_results[task_name] = {
        "windows_train": windows_train,
        "labels_train": labels_train,
        "sub_ids_train": sub_ids_train,
        "windows_val": windows_val,
        "labels_val": labels_val,
        "sub_ids_val": sub_ids_val,
        "windows_test": windows_test,
        "labels_test": labels_test,
        "sub_ids_test": sub_ids_test,
    }
    print(f"{task_name} - Windows (Train): {windows_train.shape}")
    print(f"{task_name} - Windows (Validation): {windows_val.shape}")
    print(f"{task_name} - Windows (Test): {windows_test.shape}")




Applying windowing to TUG dataset...
TUG - Windows (Train): (477, 300, 12)
TUG - Windows (Validation): (129, 300, 12)
TUG - Windows (Test): (149, 300, 12)

Applying windowing to 2minwalk dataset...
2minwalk - Windows (Train): (5878, 300, 12)
2minwalk - Windows (Validation): (1540, 300, 12)
2minwalk - Windows (Test): (1850, 300, 12)

Applying windowing to Slow dataset...
Slow - Windows (Train): (2777, 300, 12)
Slow - Windows (Validation): (817, 300, 12)
Slow - Windows (Test): (918, 300, 12)

Applying windowing to Pref dataset...
Pref - Windows (Train): (1977, 300, 12)
Pref - Windows (Validation): (578, 300, 12)
Pref - Windows (Test): (640, 300, 12)

Applying windowing to Fast dataset...
Fast - Windows (Train): (1592, 300, 12)
Fast - Windows (Validation): (452, 300, 12)
Fast - Windows (Test): (511, 300, 12)


#### Frequency Domain (FFT)

In [12]:
# FFT of every window, per task and split. Labels and subject ids are carried
# over unchanged from windowed_results so each result entry is self-contained.
fft_results = {}

for task_name, windows in windowed_results.items():
    print(f"\nApplying FFT to {task_name} dataset...")

    # The frequency bins are taken from the train call only; the second return
    # value of the val/test calls is discarded.
    fft_train, freq_bins = apply_fft_to_windows(windows["windows_train"])  # Full FFT spectrum
    fft_val, _ = apply_fft_to_windows(windows["windows_val"])
    fft_test, _ = apply_fft_to_windows(windows["windows_test"])

    fft_results[task_name] = {
        "fft_train": fft_train,
        "labels_train": windows["labels_train"],
        "sub_ids_train": windows["sub_ids_train"],
        "fft_val": fft_val,
        "labels_val": windows["labels_val"],
        "sub_ids_val": windows["sub_ids_val"],
        "fft_test": fft_test,
        "labels_test": windows["labels_test"],
        "sub_ids_test": windows["sub_ids_test"],
        "freq_bins": freq_bins,
    }

    print(f"{task_name} - FFT Train Shape: {fft_train.shape}")
    print(f"{task_name} - FFT Validation Shape: {fft_val.shape}")
    print(f"{task_name} - FFT Test Shape: {fft_test.shape}")



Applying FFT to TUG dataset...
TUG - FFT Train Shape: (477, 151, 12)
TUG - FFT Validation Shape: (129, 151, 12)
TUG - FFT Test Shape: (149, 151, 12)

Applying FFT to 2minwalk dataset...
2minwalk - FFT Train Shape: (5878, 151, 12)
2minwalk - FFT Validation Shape: (1540, 151, 12)
2minwalk - FFT Test Shape: (1850, 151, 12)

Applying FFT to Slow dataset...
Slow - FFT Train Shape: (2777, 151, 12)
Slow - FFT Validation Shape: (817, 151, 12)
Slow - FFT Test Shape: (918, 151, 12)

Applying FFT to Pref dataset...
Pref - FFT Train Shape: (1977, 151, 12)
Pref - FFT Validation Shape: (578, 151, 12)
Pref - FFT Test Shape: (640, 151, 12)

Applying FFT to Fast dataset...
Fast - FFT Train Shape: (1592, 151, 12)
Fast - FFT Validation Shape: (452, 151, 12)
Fast - FFT Test Shape: (511, 151, 12)


#### Time Domain Features

In [13]:
# Time-domain feature matrices per task and split, stored together with the
# labels and subject ids of the windows they were computed from.
time_features_results = {}

for task_name, windows in windowed_results.items():
    print(f"\nExtracting time-domain features for {task_name} dataset...")

    X_train, X_val, X_test = map(
        extract_time_domain_features,
        (windows["windows_train"], windows["windows_val"], windows["windows_test"]),
    )

    time_features_results[task_name] = {
        "X_train": X_train,
        "labels_train": windows["labels_train"],
        "sub_ids_train": windows["sub_ids_train"],
        "X_val": X_val,
        "labels_val": windows["labels_val"],
        "sub_ids_val": windows["sub_ids_val"],
        "X_test": X_test,
        "labels_test": windows["labels_test"],
        "sub_ids_test": windows["sub_ids_test"],
    }

    print(f"{task_name} - Time-Domain Features Shape (Train): {X_train.shape}")




Extracting time-domain features for TUG dataset...
TUG - Time-Domain Features Shape (Train): (477, 156)

Extracting time-domain features for 2minwalk dataset...
2minwalk - Time-Domain Features Shape (Train): (5878, 156)

Extracting time-domain features for Slow dataset...
Slow - Time-Domain Features Shape (Train): (2777, 156)

Extracting time-domain features for Pref dataset...
Pref - Time-Domain Features Shape (Train): (1977, 156)

Extracting time-domain features for Fast dataset...
Fast - Time-Domain Features Shape (Train): (1592, 156)


#### Frequency Domain Features

In [14]:
# Feature matrices derived from the FFT output, per task and split. Every
# split of a task is extracted with that task's freq_bins.
fft_features_results = {}

for task_name, fft_data in fft_results.items():
    print(f"\nExtracting FFT features for {task_name} dataset...")

    X_train, X_val, X_test = (
        extract_fft_features(split_fft, fft_data["freq_bins"])
        for split_fft in (fft_data["fft_train"], fft_data["fft_val"], fft_data["fft_test"])
    )

    fft_features_results[task_name] = {
        "X_train": X_train,
        "labels_train": fft_data["labels_train"],
        "sub_ids_train": fft_data["sub_ids_train"],
        "X_val": X_val,
        "labels_val": fft_data["labels_val"],
        "sub_ids_val": fft_data["sub_ids_val"],
        "X_test": X_test,
        "labels_test": fft_data["labels_test"],
        "sub_ids_test": fft_data["sub_ids_test"],
    }

    print(f"{task_name} - FFT Feature Shape (Train): {X_train.shape}")



Extracting FFT features for TUG dataset...
TUG - FFT Feature Shape (Train): (477, 60)

Extracting FFT features for 2minwalk dataset...
2minwalk - FFT Feature Shape (Train): (5878, 60)

Extracting FFT features for Slow dataset...
Slow - FFT Feature Shape (Train): (2777, 60)

Extracting FFT features for Pref dataset...
Pref - FFT Feature Shape (Train): (1977, 60)

Extracting FFT features for Fast dataset...
Fast - FFT Feature Shape (Train): (1592, 60)


#### Model Structure