In [193]:
import json
from pathlib import Path
import pandas as pd
import os
from tabulate import tabulate

# Creating Complete Patients Tabular Dataset

In [194]:
# Raw inputs: the patient JSON files and the questionnaire JSON files
patients_dir = Path('./pads-dataset-1.0.0/patients/')
questionnaire_dir = Path('./pads-dataset-1.0.0/questionnaire/')

# Output folder for the merged per-patient tables. Kept as a plain string with a
# trailing slash because file names are appended to it by string formatting.
ptabular_path = './pads-project-main/data/ptables/'
os.makedirs(ptabular_path, exist_ok=True)

In [195]:
# Distinct "condition" values found across every patient JSON file
condition_set = {
    json.loads(patient_json.read_text())["condition"]
    for patient_json in patients_dir.glob("*.json")
}

In [196]:
# Condition name -> class label. "Healthy" is 0, "Parkinson's" is 1, and the
# remaining four conditions are pooled into one shared class, 2.
condition_map = {
    "Healthy": 0,
    "Parkinson's": 1,
    **dict.fromkeys(
        (
            "Atypical Parkinsonism",
            "Essential Tremor",
            "Multiple Sclerosis",
            "Other Movement Disorders",
        ),
        2,
    ),
}

# Signed encodings for two-valued attributes
gender_map = dict(male=-1, female=1)
hand_map = dict(right=-1, left=1)

# NOTE(review): this 1/0 mapping differs from map_bool_like, which encodes
# False as -1; it does not appear to be referenced in the visible cells.
bool_map = {True: 1, False: 0}

# Effect of alcohol on tremor; "worsens" and "worsening" share the same code
effal_map = {"unknown": 0, "worsens": 1, "worsening": 1, "no effect": 2, "improves": 3}

def map_bool_like(value):
    """Encode a boolean-like value as a signed integer.

    The value is converted to a string, stripped of surrounding whitespace and
    compared case-insensitively: ``"true"`` gives 1, ``"false"`` gives -1, and
    anything else (None, "null", "N/A", "", unexpected values) gives 0.
    """
    normalized = str(value).strip().lower()
    return {"true": 1, "false": -1}.get(normalized, 0)


In [197]:
import numpy as np

def safe_num(val):
    """Return ``val`` as a non-negative float, or ``np.nan`` if that is impossible.

    Values that ``float()`` cannot convert (None, non-numeric strings, integers
    too large for a float), negative numbers and NaN all map to ``np.nan``.
    """
    try:
        number = float(val)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as "missing". Unlike a bare ``except``,
        # KeyboardInterrupt and unrelated errors still propagate.
        return np.nan
    return number if number >= 0 else np.nan


def normalize_token(value):
    """Return the stripped, lower-cased string form of ``value`` (None -> "")."""
    return "" if value is None else str(value).strip().lower()


# Build one single-row table per patient by merging the patient record with the
# matching questionnaire response, and pickle it into ``ptabular_path``.
for patient_file in patients_dir.glob("*.json"):
    # The patient ID is the last underscore-separated piece of the file stem
    patient_id = patient_file.stem.split("_")[-1]
    questionnaire_file = questionnaire_dir / f"questionnaire_response_{patient_id}.json"

    if not questionnaire_file.exists():
        print(f"Missing questionnaire for patient {patient_id}. Skipping...")
        continue

    # Explicit UTF-8 so decoding does not depend on the platform default encoding
    with open(patient_file, encoding="utf-8") as f:
        p_data = json.load(f)

    with open(questionnaire_file, encoding="utf-8") as f:
        q_data = json.load(f)

    condition_str = p_data["condition"]
    condition_encoded = condition_map.get(condition_str, -1)

    if condition_encoded == -1:
        print(f"Unknown condition '{condition_str}' for patient {patient_id}. Skipping...")
        continue

    # Encode values. normalize_token tolerates null / non-string JSON values,
    # which previously raised AttributeError on ``.lower()``.
    gender_enc = gender_map.get(normalize_token(p_data.get("gender")), 0)  # 0 = unknown/neutral
    hand_enc = hand_map.get(normalize_token(p_data.get("handedness")), 0)
    apprik_enc = map_bool_like(p_data.get("appearance_in_kinship"))
    apprifgk_enc = map_bool_like(p_data.get("appearance_in_first_grade_kinship"))
    # -10 marks a value that is not a key of effal_map (including a missing
    # field); it is deliberately distinct from the explicit "unknown" code 0.
    effal_enc = effal_map.get(normalize_token(p_data.get("effect_of_alcohol_on_tremor")), -10)

    # Flatten patient info; invalid/missing numeric values become np.nan
    patient_flat = {
        "p_id": p_data["id"],
        "s_id": p_data["study_id"],
        "con": condition_str,
        "con_lbl": condition_encoded,
        "age": safe_num(p_data.get("age")),
        "age_diag": safe_num(p_data.get("age_at_diagnosis")),
        "height": safe_num(p_data.get("height")),
        "weight": safe_num(p_data.get("weight")),
        "gender": gender_enc,
        "hand": hand_enc,
        "apprik": apprik_enc,
        "apprifgk": apprifgk_enc,
        "effal": effal_enc
    }

    # Flatten questionnaire answers into q01, q02, ... in item order
    q_flat = {
        f"q{str(i).zfill(2)}": map_bool_like(item["answer"])
        for i, item in enumerate(q_data["item"], start=1)
    }

    # Combine and save as .bin (a pickled single-row DataFrame)
    df = pd.DataFrame([{**patient_flat, **q_flat}])
    df.to_pickle(f"{ptabular_path}{patient_id}_tbl_ml.bin")

print("Tabular files created and saved successfully.")

Tabular files created and saved successfully.


# Validating Created Files

In [198]:
# Echo the output folder that the per-patient tables were written to
print(f"Tabular data saved in: {ptabular_path}")

Tabular data saved in: ./pads-project-main/data/ptables/


In [199]:
# Count the per-patient tables on disk by their "_tbl_ml.bin" file-name suffix
ptabular_path = './pads-project-main/data/ptables/'
created_files = list(
    filter(lambda name: name.endswith('_tbl_ml.bin'), os.listdir(ptabular_path))
)

print(f"Total tabular files created: {len(created_files)}")

Total tabular files created: 469


In [200]:
import pandas as pd

# Spot-check one saved table; printed transposed so that columns become rows
sample_file = './pads-project-main/data/ptables/001_tbl_ml.bin'
df = pd.read_pickle(sample_file)
print(df.transpose())

                0
p_id          001
s_id         PADS
con       Healthy
con_lbl         0
age          56.0
age_diag     56.0
height      173.0
weight       78.0
gender         -1
hand           -1
apprik          1
apprifgk        1
effal           0
q01            -1
q02            -1
q03            -1
q04            -1
q05            -1
q06            -1
q07            -1
q08            -1
q09            -1
q10            -1
q11            -1
q12            -1
q13            -1
q14            -1
q15            -1
q16            -1
q17            -1
q18            -1
q19            -1
q20            -1
q21            -1
q22            -1
q23            -1
q24            -1
q25            -1
q26            -1
q27            -1
q28            -1
q29            -1
q30            -1


In [201]:
# Show the dtype of every column of ``df``, under a one-line heading
print("Data types:", df.dtypes, sep="\n")

Data types:
p_id         object
s_id         object
con          object
con_lbl       int64
age         float64
age_diag    float64
height      float64
weight      float64
gender        int64
hand          int64
apprik        int64
apprifgk      int64
effal         int64
q01           int64
q02           int64
q03           int64
q04           int64
q05           int64
q06           int64
q07           int64
q08           int64
q09           int64
q10           int64
q11           int64
q12           int64
q13           int64
q14           int64
q15           int64
q16           int64
q17           int64
q18           int64
q19           int64
q20           int64
q21           int64
q22           int64
q23           int64
q24           int64
q25           int64
q26           int64
q27           int64
q28           int64
q29           int64
q30           int64
dtype: object


In [202]:
import pandas as pd
import numpy as np
from pathlib import Path

ptable_path = Path("./pads-project-main/data/ptables/")
bad_files = []

# Features to check
numeric_cols = ["age", "age_diag", "height", "weight"]
# Signed encodings in which 0 is reported as "unknown".
# The "apprifgk" and "effal" columns are not checked here.
bool_cols = ["gender", "hand", "apprik"]
label_col = "con_lbl"


def collect_row_issues(row, file_name):
    """List the validation messages for one patient row.

    Args:
        row: Mapping-like row (for example a pandas Series) holding every column
            named in ``numeric_cols``, ``bool_cols`` and ``label_col``.
        file_name: Name of the file the row came from, used in the messages.

    Returns:
        Messages in check order: NaN numeric columns first, then columns whose
        value is 0, then a label equal to -1. Empty when the row is clean.
    """
    issues = [
        f"NaN in {col} of file {file_name}"
        for col in numeric_cols
        if pd.isna(row[col])
    ]
    issues += [
        f"Unknown (0) in {col} of file {file_name}"
        for col in bool_cols
        if row[col] == 0
    ]
    if row[label_col] == -1:
        issues.append(f"Invalid condition label in file {file_name}")
    return issues


# Scan the directory once, in sorted order: the report is then deterministic and
# the summary count always refers to exactly the files that were checked.
table_files = sorted(ptable_path.glob("*.bin"))

# Iterate through .bin files
for file in table_files:
    df = pd.read_pickle(file)

    issues = collect_row_issues(df.iloc[0], file.name)  # one row per file
    for message in issues:
        print(message)

    if issues:
        bad_files.append(file.name)

print("\nFiles with unusual values:")
for name in bad_files:
    print(f" - {name}")

print(f"\nChecked {len(table_files)} files; Found {len(bad_files)} with issues.")



Files with unusual values:

Checked 469 files; Found 0 with issues.


In [203]:
import os
import pickle
import pandas as pd
from tqdm import tqdm

tabular_path = "./pads-project-main/data/ptables/"
table_suffix = "_tbl_ml.bin"

# Take the patient IDs from the table files actually on disk instead of assuming
# a fixed 001..469 range, so tables outside that range are not silently dropped.
# Sorting keeps the CSV row order stable between runs.
patient_ids = sorted(
    name[: -len(table_suffix)]
    for name in (os.listdir(tabular_path) if os.path.isdir(tabular_path) else [])
    if name.endswith(table_suffix)
)

records = []

for pid in tqdm(patient_ids):
    tab_file = os.path.join(tabular_path, f"{pid}{table_suffix}")

    try:
        # pd.read_pickle is the reader counterpart of DataFrame.to_pickle.
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this notebook produced itself.
        tab_data = pd.read_pickle(tab_file)
        records.append(tab_data.iloc[0].to_dict())  # each table holds one row
    except Exception as e:
        # Best effort: report the unreadable file and keep merging the rest
        print(f"Error loading {pid}: {e}")


# One row per successfully loaded patient, written out as a single CSV
df = pd.DataFrame.from_records(records)
df.to_csv("./pads-project-main/data/tabular_all_patients.csv", index=False)
print(f"Saved {len(df)} patients to tabular_all_patients.csv")


100%|██████████| 469/469 [00:00<00:00, 4425.65it/s]

Saved 469 patients to tabular_all_patients.csv



