In [1]:
import os
import json
import pandas as pd
from datetime import datetime

# Folder resource yang sudah diproses (output folder)
output_folder = "synthea/output/processed/"

# Fungsi untuk memuat resource dari file JSON
def load_resource(resource_type):
    path = os.path.join(output_folder, f"{resource_type}.json")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Muat resource yang diperlukan
patients = load_resource("Patient")
fh_records = load_resource("FamilyMemberHistory")
related_persons = load_resource("RelatedPerson")
conds = load_resource("Condition")

# Target penyakit (dengan huruf kecil untuk pencocokan case-insensitive)
target_diseases = ["diabetes", "hypertension", "cancer", "heart disease", "alzheimer", "asthma"]


In [2]:
# Fungsi untuk menghitung umur dengan tanggal referensi (misalnya 2025-01-01)
def calculate_age(birth_str, ref_date=datetime(2025, 1, 1)):
    try:
        birth = datetime.strptime(birth_str, "%Y-%m-%d")
        age = (ref_date - birth).days // 365
        return age
    except Exception:
        return None

# --- Agregasi Data per Pasien ---

# Buat DataFrame dasar pasien
df_patients = pd.DataFrame([{
    "patient_id": p["id"],
    "birthDate": p.get("birthDate"),
    "gender": p.get("gender")
} for p in patients])
df_patients["age"] = df_patients["birthDate"].apply(lambda x: calculate_age(x) if x else None)

In [3]:
# Helper: Ambil teks kondisi dari record kondisi (FamilyMemberHistory) yang mungkin berupa list
def extract_fh_conditions(record):
    cond_texts = []
    if "condition" in record:
        for cond in record["condition"]:
            text = cond["code"].get("text", "").strip()
            if text:
                cond_texts.append(text)
    return cond_texts

# Agregasi kondisi dari FamilyMemberHistory per pasien.
# Field 'patient.reference' berbentuk "Patient/<id>"
fh_agg = {}
for rec in fh_records:
    pat_ref = rec.get("patient", {}).get("reference", "")
    if pat_ref.startswith("Patient/"):
        pid = pat_ref.split("/")[-1]
        texts = extract_fh_conditions(rec)
        if pid in fh_agg:
            fh_agg[pid].extend(texts)
        else:
            fh_agg[pid] = texts

In [4]:
# Agregasi kondisi dari RelatedPerson.
# Pertama, buat mapping dari RelatedPerson ID ke patient ID (yang dimiliki RelatedPerson)
rp_mapping = {}
for rp in related_persons:
    # Field rp["patient"]["reference"] = "Patient/<id>"
    pat_ref = rp.get("patient", {}).get("reference", "")
    if pat_ref.startswith("Patient/"):
        patient_id = pat_ref.split("/")[-1]
        rp_mapping[rp["id"]] = patient_id

In [5]:
# Selanjutnya, dari resource Condition, ambil yang subject-nya adalah RelatedPerson/<id>
rp_cond_agg = {}
for cond in conds:
    subj_ref = cond.get("subject", {}).get("reference", "")
    if subj_ref.startswith("RelatedPerson/"):
        rp_id = subj_ref.split("/")[-1]
        # Ambil kondisi dari field text di code
        disease = cond["code"].get("text", "").strip()
        if rp_id in rp_mapping:
            pid = rp_mapping[rp_id]
            if pid in rp_cond_agg:
                rp_cond_agg[pid].append(disease)
            else:
                rp_cond_agg[pid] = [disease]

In [17]:
conds

[{'resourceType': 'Condition',
  'id': 'ded1426d-62e2-77ad-0c8b-5b34075c89a9',
  'meta': {'profile': ['http://hl7.org/fhir/us/core/StructureDefinition/us-core-condition-encounter-diagnosis']},
  'clinicalStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-clinical',
     'code': 'resolved'}]},
  'verificationStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-ver-status',
     'code': 'confirmed'}]},
  'category': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-category',
      'code': 'encounter-diagnosis',
      'display': 'Encounter Diagnosis'}]}],
  'code': {'coding': [{'system': 'http://snomed.info/sct',
     'code': '314529007',
     'display': 'Medication review due (situation)'}],
   'text': 'Medication review due (situation)'},
  'subject': {'reference': 'urn:uuid:7da148be-b73e-73e3-ed5c-67d7c712a253'},
  'encounter': {'reference': 'urn:uuid:879a2d85-2d6a-6d2b-c3e0-8ab3527f2f47'},
  'onsetDateTi

In [20]:
# Selain itu, kita juga ambil kondisi langsung pada pasien (jika diperlukan) dari Condition
patient_cond_agg = {}
for cond in conds:
    print(cond)
    subj_ref = cond.get("subject", {}).get("reference", "")
    if subj_ref.startswith("Patient/"):
        pid = subj_ref.split("/")[-1]
        disease = cond["code"].get("text", "").strip()
        print(pid, disease)
        if pid in patient_cond_agg:
            patient_cond_agg[pid].append(disease)
        else:
            patient_cond_agg[pid] = [disease]

{'resourceType': 'Condition', 'id': 'ded1426d-62e2-77ad-0c8b-5b34075c89a9', 'meta': {'profile': ['http://hl7.org/fhir/us/core/StructureDefinition/us-core-condition-encounter-diagnosis']}, 'clinicalStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-clinical', 'code': 'resolved'}]}, 'verificationStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-ver-status', 'code': 'confirmed'}]}, 'category': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-category', 'code': 'encounter-diagnosis', 'display': 'Encounter Diagnosis'}]}], 'code': {'coding': [{'system': 'http://snomed.info/sct', 'code': '314529007', 'display': 'Medication review due (situation)'}], 'text': 'Medication review due (situation)'}, 'subject': {'reference': 'urn:uuid:7da148be-b73e-73e3-ed5c-67d7c712a253'}, 'encounter': {'reference': 'urn:uuid:879a2d85-2d6a-6d2b-c3e0-8ab3527f2f47'}, 'onsetDateTime': '2013-10-11T02:42:24+07:00', 'abatementDateTime':

In [29]:
conds[-270]

{'resourceType': 'Condition',
 'id': '1236303a-5347-3bb0-70bd-ee0ff69d1667',
 'meta': {'profile': ['http://hl7.org/fhir/us/core/StructureDefinition/us-core-condition-encounter-diagnosis']},
 'clinicalStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-clinical',
    'code': 'resolved'}]},
 'verificationStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-ver-status',
    'code': 'confirmed'}]},
 'category': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-category',
     'code': 'encounter-diagnosis',
     'display': 'Encounter Diagnosis'}]}],
 'code': {'coding': [{'system': 'http://snomed.info/sct',
    'code': '66383009',
    'display': 'Gingivitis (disorder)'}],
  'text': 'Gingivitis (disorder)'},
 'subject': {'reference': 'urn:uuid:f875dfc8-362f-4e0b-7af5-8e0aa600ec49'},
 'encounter': {'reference': 'urn:uuid:c71eccd7-b8e2-04ed-d503-58e5bf6c6773'},
 'onsetDateTime': '2021-05-21T05:04:59+07:00',
 'abatem

In [30]:
conds[-1]

{'resourceType': 'Condition',
 'id': 'condition-related-8a9451f2-3fe4-a23a-18fd-c285095cfeb6-asthma',
 'subject': {'reference': 'RelatedPerson/8a9451f2-3fe4-a23a-18fd-c285095cfeb6'},
 'code': {'coding': [{'system': 'http://snomed.info/sct',
    'code': '22298006',
    'display': 'Asthma'}],
  'text': 'Asthma'}}

In [7]:
# Gabungkan semua kondisi keluarganya (dari FamilyMemberHistory dan RelatedPerson) sebagai fitur teks
def aggregate_conditions(pid):
    texts = []
    if pid in fh_agg:
        texts.extend(fh_agg[pid])
    if pid in rp_cond_agg:
        texts.extend(rp_cond_agg[pid])
    return " ".join(texts)

df_patients["relative_conditions_text"] = df_patients["patient_id"].apply(aggregate_conditions)

In [9]:
# Tentukan label: 1 jika di relative_conditions_text terdapat salah satu target penyakit, 0 jika tidak.
def has_hereditary(text):
    text_lower = text.lower()
    for d in target_diseases:
        if d in text_lower:
            return 1
    return 0

df_patients["has_hereditary_disease"] = df_patients["relative_conditions_text"].apply(has_hereditary)


In [12]:
df_patients['relative_conditions_text'].head(50)

0                                                      
1                                         Heart Disease
2                      Hypertension Hypertension Asthma
3                            Hypertension Heart Disease
4                             Hypertension Hypertension
5                      Asthma Alzheimer Diabetes Asthma
6     Cancer Heart Disease Hypertension Hypertension...
7                             Hypertension Hypertension
8                                          Hypertension
9      Heart Disease Heart Disease Asthma Heart Disease
10    Hypertension Asthma Heart Disease Cancer Heart...
11     Diabetes Hypertension Heart Disease Hypertension
12                           Heart Disease Hypertension
13                                  Cancer Hypertension
14                                                     
15            Hypertension Diabetes Hypertension Asthma
16              Heart Disease Hypertension Hypertension
17         Alzheimer Heart Disease Asthma Heart 

In [None]:

# Jika diinginkan, kita bisa juga meng-aggregate kondisi pasien sendiri dan menggabungkannya,
# tetapi sesuai permintaan, labelnya berdasarkan adanya penyakit keturunan (keluarga).

# --- Simpan Training Dataset ---
training_csv = os.path.join(output_folder, "training_dataset.csv")
df_patients.to_csv(training_csv, index=False)
print(f"\n🚀 Training dataset telah disimpan di {training_csv}")

# Tampilkan beberapa baris contoh
print(df_patients.head())


In [31]:
import os
import json
import pandas as pd
from datetime import datetime

# Folder resource yang sudah diproses (output folder)
output_folder = "synthea/output/processed/"

# Fungsi untuk memuat resource dari file JSON
def load_resource(resource_type):
    path = os.path.join(output_folder, f"{resource_type}.json")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Muat resource yang diperlukan
patients = load_resource("Patient")
fh_records = load_resource("FamilyMemberHistory")
related_persons = load_resource("RelatedPerson")
conds = load_resource("Condition")

# Target penyakit (dengan huruf kecil untuk pencocokan case-insensitive)
target_diseases = ["diabetes", "hypertension", "cancer", "heart disease", "alzheimer", "asthma"]

# Fungsi untuk menghitung umur dengan tanggal referensi (misalnya 2025-01-01)
def calculate_age(birth_str, ref_date=datetime(2025, 1, 1)):
    try:
        birth = datetime.strptime(birth_str, "%Y-%m-%d")
        age = (ref_date - birth).days // 365
        return age
    except Exception:
        return None

# --- Agregasi Data Pasien ---
# Buat DataFrame dasar pasien
df_patients = pd.DataFrame([{
    "patient_id": p["id"],
    "birthDate": p.get("birthDate"),
    "gender": p.get("gender")
} for p in patients])
df_patients["age"] = df_patients["birthDate"].apply(lambda x: calculate_age(x) if x else None)

# --- Agregasi Kondisi Keluarga (sebagai fitur teks) ---
# FamilyMemberHistory: patient.reference berbentuk "Patient/<id>" atau "urn:uuid:<id>"
def extract_fh_conditions(record):
    cond_texts = []
    if "condition" in record:
        for cond in record["condition"]:
            text = cond["code"].get("text", "").strip()
            if text:
                cond_texts.append(text)
    return cond_texts

fh_agg = {}
for rec in fh_records:
    pat_ref = rec.get("patient", {}).get("reference", "")
    if pat_ref:
        # Dukung format "Patient/<id>" atau "urn:uuid:<id>"
        if pat_ref.startswith("Patient/"):
            pid = pat_ref.split("/")[-1]
        elif pat_ref.startswith("urn:uuid:"):
            pid = pat_ref.split(":")[-1]
        else:
            continue
        texts = extract_fh_conditions(rec)
        fh_agg.setdefault(pid, []).extend(texts)

# RelatedPerson: Buat mapping RelatedPerson ID ke patient ID
rp_mapping = {}
for rp in related_persons:
    pat_ref = rp.get("patient", {}).get("reference", "")
    if pat_ref:
        if pat_ref.startswith("Patient/"):
            patient_id = pat_ref.split("/")[-1]
        elif pat_ref.startswith("urn:uuid:"):
            patient_id = pat_ref.split(":")[-1]
        else:
            continue
        rp_mapping[rp["id"]] = patient_id

rp_cond_agg = {}
for cond in conds:
    subj_ref = cond.get("subject", {}).get("reference", "")
    if subj_ref.startswith("RelatedPerson/"):
        rp_id = subj_ref.split("/")[-1]
        disease = cond["code"].get("text", "").strip()
        if rp_id in rp_mapping:
            pid = rp_mapping[rp_id]
            rp_cond_agg.setdefault(pid, []).append(disease)

# --- Agregasi Kondisi Pasien (untuk label) ---
# Dari resource Condition yang subject-nya mengacu ke pasien (mendukung "Patient/..." dan "urn:uuid:...")
patient_cond_agg = {}
for cond in conds:
    subj_ref = cond.get("subject", {}).get("reference", "")
    if subj_ref.startswith("Patient/"):
        pid = subj_ref.split("/")[-1]
    elif subj_ref.startswith("urn:uuid:"):
        pid = subj_ref.split(":")[-1]
    else:
        continue
    disease = cond["code"].get("text", "").strip()
    patient_cond_agg.setdefault(pid, []).append(disease)

# --- Gabungkan semua kondisi relatif sebagai fitur teks ---
def aggregate_relative_conditions(pid):
    texts = []
    if pid in fh_agg:
        texts.extend(fh_agg[pid])
    if pid in rp_cond_agg:
        texts.extend(rp_cond_agg[pid])
    return " ".join(texts)

df_patients["relative_conditions_text"] = df_patients["patient_id"].apply(aggregate_relative_conditions)

# --- Tentukan Label: dari kondisi pasien saja ---
# Buat DataFrame untuk kondisi pasien
df_patient_conditions = pd.DataFrame({"patient_id": df_patients["patient_id"]})
for disease in target_diseases:
    df_patient_conditions[disease] = df_patient_conditions["patient_id"].map(
        lambda pid: int(any(disease in d.lower() for d in patient_cond_agg.get(pid, [])))
    )
df_patient_conditions["has_hereditary_disease"] = df_patient_conditions[target_diseases].max(axis=1)

# --- Gabungkan data pasien dengan label ---
df_train = df_patients.merge(df_patient_conditions[["patient_id", "has_hereditary_disease"]], on="patient_id", how="left")
df_train["has_hereditary_disease"] = df_train["has_hereditary_disease"].fillna(0).astype(int)



In [None]:
import os
import json
import pandas as pd
from datetime import datetime

# ========================
# Helper Functions
# ========================

# Fungsi untuk menghitung umur dengan tanggal referensi (misalnya 2025-01-01)
def calculate_age(birth_str, ref_date=datetime(2025, 1, 1)):
    try:
        birth = datetime.strptime(birth_str, "%Y-%m-%d")
        age = (ref_date - birth).days // 365
        return age
    except Exception:
        return None

# Fungsi untuk memuat resource dari file JSON
def load_resource(resource_type, folder):
    path = os.path.join(folder, f"{resource_type}.json")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Fungsi bantu untuk mengekstrak kondisi dari FamilyMemberHistory
def extract_fh_conditions(record):
    cond_texts = []
    if "condition" in record:
        for cond in record["condition"]:
            text = cond["code"].get("text", "").strip()
            if text:
                cond_texts.append(text)
    return cond_texts

# Fungsi untuk menormalisasi reference (mengambil id)
def extract_id(ref):
    if ref.startswith("Patient/"):
        return ref.split("/")[-1]
    elif ref.startswith("RelatedPerson/"):
        return ref.split("/")[-1]
    elif ref.startswith("urn:uuid:"):
        return ref.split(":")[-1]
    else:
        return ref

# ========================
# Path Folder
# ========================
folder = "synthea/output/processed/"

# ========================
# Muat Resource
# ========================
patients = load_resource("Patient", folder)
fh_records = load_resource("FamilyMemberHistory", folder)
related_persons = load_resource("RelatedPerson", folder)
conds = load_resource("Condition", folder)

# ========================
# DF1: Data Pasien
# ========================
df_patients = pd.DataFrame([{
    "patient_id": p["id"],
    "birthDate": p.get("birthDate"),
    "gender": p.get("gender")
} for p in patients])
df_patients["age"] = df_patients["birthDate"].apply(lambda x: calculate_age(x) if x else None)

# ========================
# DF2: Aggregasi FamilyMemberHistory
# ========================
fh_agg = {}
for rec in fh_records:
    pat_ref = rec.get("patient", {}).get("reference", "")
    pid = extract_id(pat_ref)
    texts = extract_fh_conditions(rec)
    fh_agg.setdefault(pid, []).extend(texts)

# Ubah fh_agg ke DataFrame: satu baris per patient dengan kolom 'fh_conditions'
df_fh = pd.DataFrame([
    {"patient_id": pid, "fh_conditions": " ".join(conds)}
    for pid, conds in fh_agg.items()
])

# ========================
# DF3: Data RelatedPerson
# ========================
# Ambil informasi dasar dari RelatedPerson
rp_list = []
for rp in related_persons:
    pat_ref = rp.get("patient", {}).get("reference", "")
    patient_id = extract_id(pat_ref)
    # Ambil relationship text; jika ada lebih dari satu, gabungkan dengan koma
    rel_text = ", ".join([r.get("text", "") for r in rp.get("relationship", [])])
    rp_list.append({
        "patient_id": patient_id,
        "related_person_id": rp["id"],
        "relationship": rel_text,
        "rp_name": rp.get("name", [{"use": "official", "family": "Unknown"}])[0].get("family", "Unknown")
    })
df_rp = pd.DataFrame(rp_list)

# ========================
# DF4: Aggregasi Kondisi Pasien (untuk label)
# ========================
patient_cond_agg = {}
for cond in conds:
    subj_ref = cond.get("subject", {}).get("reference", "")
    if subj_ref.startswith("Patient/") or subj_ref.startswith("urn:uuid:"):
        pid = extract_id(subj_ref)
        disease = cond["code"].get("text", "").strip()
        if disease:
            patient_cond_agg.setdefault(pid, []).append(disease)

# Buat DataFrame dengan satu baris per patient, kolom 'patient_conditions_text'
df_patient_conditions = pd.DataFrame([
    {"patient_id": pid, "patient_conditions_text": " ".join(diseases)}
    for pid, diseases in patient_cond_agg.items()
])

# ========================
# DF5: Final Training Dataset (satu baris per pasien)
# ========================
# Label dihitung dari kondisi pasien (DF4) saja.
def has_hereditary(text):
    text_lower = text.lower()
    for d in target_diseases:
        if d in text_lower:
            return 1
    return 0

df_patient_conditions["has_hereditary_disease"] = df_patient_conditions["patient_conditions_text"].apply(has_hereditary)

# Gabungkan df_patients, df_fh, dan df_patient_conditions (jika tidak ada data, isikan string kosong)
df_train = df_patients.merge(df_fh, on="patient_id", how="left").merge(df_patient_conditions, on="patient_id", how="left")
df_train["fh_conditions"] = df_train["fh_conditions"].fillna("")
df_train["patient_conditions_text"] = df_train["patient_conditions_text"].fillna("")
df_train["has_hereditary_disease"] = df_train["has_hereditary_disease"].fillna(0).astype(int)

# ========================
# Simpan DataFrame ke CSV
# ========================
training_csv = os.path.join(folder, "training_dataset.csv")
df_train.to_csv(training_csv, index=False)
print(f"\n🚀 Training dataset telah disimpan di {training_csv}")

# ========================
# Tampilkan beberapa baris contoh
print("Contoh data training:")
print(df_train.head())

# ========================
# Optional: Simpan DataFrame RelatedPerson dan FamilyMemberHistory secara terpisah jika diperlukan
df_rp.to_csv(os.path.join(folder, "related_persons.csv"), index=False)
print("🚀 Data RelatedPerson telah disimpan sebagai related_persons.csv")


{'resourceType': 'Condition',
 'id': 'condition-related-8a9451f2-3fe4-a23a-18fd-c285095cfeb6-asthma',
 'subject': {'reference': 'RelatedPerson/8a9451f2-3fe4-a23a-18fd-c285095cfeb6'},
 'code': {'coding': [{'system': 'http://snomed.info/sct',
    'code': '22298006',
    'display': 'Asthma'}],
  'text': 'Asthma'}}

In [41]:

df_train['relative_conditions_text']

0                                                       
1                                          Heart Disease
2                       Hypertension Hypertension Asthma
3                             Hypertension Heart Disease
4                              Hypertension Hypertension
                             ...                        
103           Hypertension Diabetes Heart Disease Asthma
104                               Diabetes Heart Disease
105                            Hypertension Hypertension
106    Diabetes Hypertension Cancer Heart Disease Hyp...
107                                                     
Name: relative_conditions_text, Length: 108, dtype: object

In [3]:
import os
import json
import pandas as pd

# ========================
# Helper Functions
# ========================

def extract_id(ref):
    if ref.startswith("Patient/"):
        return ref.split("/")[-1]
    elif ref.startswith("RelatedPerson/"):
        return ref.split("/")[-1]
    elif ref.startswith("urn:uuid:"):
        return ref.split(":")[-1]
    else:
        return ref

def load_json(filepath):
    with open(filepath, "r", encoding="utf-8") as f:
        return json.load(f)

def save_as_csv(data, output_filepath):
    df = pd.DataFrame(data)
    df.to_csv(output_filepath, index=False)
    print(f"✅ CSV disimpan di {output_filepath}")

def save_as_json(data, output_filepath):
    with open(output_filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)
    print(f"✅ JSON disimpan di {output_filepath}")

# ========================
# Ekstraksi Minimal untuk Setiap Resource
# ========================

def extract_patients(data):
    """Ekstrak informasi dasar pasien: id, birthDate, gender dan (opsional) name."""
    extracted = []
    for p in data:
        extracted.append({
            "patient_id": p.get("id", ""),
            "birthDate": p.get("birthDate", ""),
            "gender": p.get("gender", ""),
            # Jika terdapat informasi name, ambil salah satu field (misalnya family name)
            "name": p.get("name", [{"family": "Unknown"}])[0].get("family", "Unknown")
        })
    return extracted

def extract_family_member_history(data):
    """Ekstrak informasi FamilyMemberHistory: id, patient_id, relationship, dan kondisi."""
    extracted = []
    for rec in data:
        patient_ref = rec.get("patient", {}).get("reference", "")
        patient_id = extract_id(patient_ref)
        # Ambil relationship (jika ada)
        relationship = ""
        if "relationship" in rec and "coding" in rec["relationship"] and rec["relationship"]["coding"]:
            relationship = rec["relationship"]["coding"][0].get("display", "")
        # Ekstrak kondisi (jika ada)
        conditions = []
        if "condition" in rec:
            for cond in rec["condition"]:
                cond_text = cond.get("code", {}).get("text", "")
                if cond_text:
                    conditions.append(cond_text)
        extracted.append({
            "family_member_history_id": rec.get("id", ""),
            "patient_id": patient_id,
            "relationship": relationship,
            "conditions": "; ".join(conditions)  # Gabungkan kondisi jika lebih dari satu
        })
    return extracted

def extract_related_person(data):
    """Ekstrak informasi RelatedPerson: id, patient_id, relationship, nama, gender, dan birthDate."""
    extracted = []
    for rec in data:
        patient_ref = rec.get("patient", {}).get("reference", "")
        patient_id = extract_id(patient_ref)
        # Gabungkan teks relationship jika lebih dari satu
        relationship = ", ".join([r.get("text", "") for r in rec.get("relationship", [])])
        rp_name = ""
        if "name" in rec and isinstance(rec["name"], list) and len(rec["name"]) > 0:
            rp_name = rec["name"][0].get("family", "")
        extracted.append({
            "related_person_id": extract_id(rec.get("id", "")),
            "patient_id": patient_id,
            "relationship": relationship,
            "rp_name": rp_name,
            "gender": rec.get("gender", ""),
            "birthDate": rec.get("birthDate", "")
        })
    return extracted

def extract_conditions(data):
    """Ekstrak informasi Condition: id, patient_id (dari subject), dan nama penyakit (code.text)."""
    extracted = []
    for rec in data:
        subject_ref = rec.get("subject", {}).get("reference", "")
        patient_id = extract_id(subject_ref)
        disease = rec.get("code", {}).get("text", "")
        extracted.append({
            "condition_id": rec.get("id", ""),
            "patient_id": patient_id,
            "disease": disease
        })
    return extracted



In [4]:
import os
import json
import pandas as pd

# Folder hasil proses
processed_folder = "synthea/output/processed/"

# Resource yang dianggap berguna untuk model prediktif
resources = ["Patient", "FamilyMemberHistory", "RelatedPerson", "Condition"]

# Mapping resource ke fungsi ekstraksi yang sudah didefinisikan
extraction_functions = {
    "Patient": extract_patients,
    "FamilyMemberHistory": extract_family_member_history,
    "RelatedPerson": extract_related_person,
    "Condition": extract_conditions
}

# Dictionary untuk menyimpan DataFrame hasil ekstraksi
dataframes = {}

for res in resources:
    filepath = os.path.join(processed_folder, f"{res}.json")
    if os.path.exists(filepath):
        print(f"📂 Memproses {res} dari {filepath}")
        data = load_json(filepath)
        # Jika data berbentuk dictionary, ubah menjadi list
        if isinstance(data, dict):
            data = [data]
        extracted_data = extraction_functions[res](data)
        df = pd.DataFrame(extracted_data)
        dataframes[res] = df
        print(f"✅ Dataframe {res} memiliki shape: {df.shape}")
    else:
        print(f"⚠️ File {res}.json tidak ditemukan di {processed_folder}")

# Tampilkan beberapa baris contoh untuk setiap DataFrame
for res, df in dataframes.items():
    print(f"\nContoh DataFrame untuk {res}:")
    display(df.head())


📂 Memproses Patient dari synthea/output/processed/Patient.json
✅ Dataframe Patient memiliki shape: (108, 4)
📂 Memproses FamilyMemberHistory dari synthea/output/processed/FamilyMemberHistory.json
✅ Dataframe FamilyMemberHistory memiliki shape: (193, 4)
📂 Memproses RelatedPerson dari synthea/output/processed/RelatedPerson.json
✅ Dataframe RelatedPerson memiliki shape: (261, 6)
📂 Memproses Condition dari synthea/output/processed/Condition.json
✅ Dataframe Condition memiliki shape: (4093, 3)

Contoh DataFrame untuk Patient:


Unnamed: 0,patient_id,birthDate,gender,name
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267



Contoh DataFrame untuk FamilyMemberHistory:


Unnamed: 0,family_member_history_id,patient_id,relationship,conditions
0,family-3a644dcd-672c-9579-cdeb-65ce6783da97-,7da148be-b73e-73e3-ed5c-67d7c712a253,Father,Asthma
1,family-8463087b-be64-1139-b779-97d09881e034-,7da148be-b73e-73e3-ed5c-67d7c712a253,Sister,Hypertension; Heart Disease
2,family-00a4d481-551d-9741-dd8f-fa88fe29ab79-,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Father,Hypertension
3,family-8c97920a-fc41-8150-f54e-9dcfc1f48fef-,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Mother,Diabetes; Hypertension
4,family-2b27a9c6-3b32-83fe-c4eb-ff271de3536b-,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,Father,Cancer



Contoh DataFrame untuk RelatedPerson:


Unnamed: 0,related_person_id,patient_id,relationship,rp_name,gender,birthDate
0,3a644dcd-672c-9579-cdeb-65ce6783da97,7da148be-b73e-73e3-ed5c-67d7c712a253,Father,Barton704,female,1975-09-20
1,67ed8fab-19a2-40c5-e56c-3dfdab2c9805,7da148be-b73e-73e3-ed5c-67d7c712a253,Mother,Schowalter414,male,1989-02-23
2,8463087b-be64-1139-b779-97d09881e034,7da148be-b73e-73e3-ed5c-67d7c712a253,Sister,Boyle917,male,1965-09-03
3,00a4d481-551d-9741-dd8f-fa88fe29ab79,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Father,Bernhard322,female,1968-08-26
4,8c97920a-fc41-8150-f54e-9dcfc1f48fef,d4f1d88b-aecc-493e-2977-44a72e0de2d9,Mother,Jerde200,female,2003-10-06



Contoh DataFrame untuk Condition:


Unnamed: 0,condition_id,patient_id,disease
0,ded1426d-62e2-77ad-0c8b-5b34075c89a9,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)
1,40f951b4-d966-312a-c6b2-2b9b89ca5f30,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)
2,aa6822b6-d4e2-bbe9-d4fa-574aac5c27ca,7da148be-b73e-73e3-ed5c-67d7c712a253,Gingivitis (disorder)
3,35aac9a9-f1fe-8563-79d0-d279208f9098,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)
4,63b4c19c-4488-56fe-e4eb-f5dd262aa4b2,7da148be-b73e-73e3-ed5c-67d7c712a253,Medication review due (situation)


In [12]:
df_patient

Unnamed: 0,patient_id,birthDate,gender,name
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267
...,...,...,...,...
103,f5f6a690-af00-7c0f-c5c0-7bae4c3401ad,1996-06-19,female,Gorczany269
104,be222f9e-05e3-7c64-349b-02949d6222c7,1954-12-02,female,Crist667
105,3a644dcd-672c-9579-cdeb-65ce6783da97,1975-09-20,female,Barton704
106,f875dfc8-362f-4e0b-7af5-8e0aa600ec49,1973-06-29,female,Rath779


In [None]:
pd.merge(df_patient, df_fmh, on="patient_id", how="left")

Unnamed: 0,patient_id,birthDate,gender,name,family_member_history_id,relationship,conditions
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,family-3a644dcd-672c-9579-cdeb-65ce6783da97-,Father,Asthma
1,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,family-8463087b-be64-1139-b779-97d09881e034-,Sister,Hypertension; Heart Disease
2,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,family-00a4d481-551d-9741-dd8f-fa88fe29ab79-,Father,Hypertension
3,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,family-8c97920a-fc41-8150-f54e-9dcfc1f48fef-,Mother,Diabetes; Hypertension
4,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,family-2b27a9c6-3b32-83fe-c4eb-ff271de3536b-,Father,Cancer
...,...,...,...,...,...,...,...
188,3a644dcd-672c-9579-cdeb-65ce6783da97,1975-09-20,female,Barton704,family-7018ba97-af04-3601-0869-435454482e52-,Brother,Diabetes; Asthma
189,f875dfc8-362f-4e0b-7af5-8e0aa600ec49,1973-06-29,female,Rath779,family-f4267dde-ccd3-6383-7dff-c81af29d29ff-,Father,Hypertension
190,f875dfc8-362f-4e0b-7af5-8e0aa600ec49,1973-06-29,female,Rath779,family-3ada63c8-2ff1-1032-9ed5-39b5fba54c89-,Mother,Hypertension
191,f875dfc8-362f-4e0b-7af5-8e0aa600ec49,1973-06-29,female,Rath779,family-5b1e4f94-922f-22e7-f548-e592b29f9129-,Sister,Hypertension


In [37]:
# === 1. Merge Patient dengan FamilyMemberHistory berdasarkan patient_id (dengan pivot untuk unique patient_id) ===
df_fmh_grouped = (
    df_fmh.groupby(["patient_id", "relationship"])["conditions"]
    .apply(lambda x: "; ".join(x.dropna().unique()))
    .reset_index()
)

# Pivot agar setiap patient_id unik dan masing-masing relationship menjadi kolom
df_fmh_pivot = df_fmh_grouped.pivot(index="patient_id", columns="relationship", values="conditions").reset_index()

# Ubah nama kolom relationship menjadi lower-case + "_condition"
df_fmh_pivot = df_fmh_pivot.rename(columns=lambda x: x.lower() + "_condition" if x != "patient_id" else x)

# Merge dengan df_patient
df_patient_fmh = pd.merge(df_patient, df_fmh_pivot, on="patient_id", how="left")
print("Merged Patient dan FamilyMemberHistory (unique) shape:", df_patient_fmh.shape)
display(df_patient_fmh.head())

# === 2. Group Condition berdasarkan patient_id dan gabungkan nama penyakit ===
df_condition_grouped = (
    df_condition.groupby("patient_id")["disease"]
    .apply(lambda x: " ".join(x))
    .reset_index()
    .rename(columns={"disease": "patient_conditions_text"})
)
print("Grouped Condition shape:", df_condition_grouped.shape)
display(df_condition_grouped.head())

# === 3. Gabungkan data pasien dengan kondisi mereka ===
df_training = pd.merge(df_patient_fmh, df_condition_grouped, on="patient_id", how="left")
print("Training DataFrame shape:", df_training.shape)
display(df_training.head())

# === 4. Proses RelatedPerson dengan cara yang sama seperti FamilyMemberHistory ===
# a. Gabungkan RelatedPerson dengan Condition (berdasarkan related_person_id == patient_id di Condition)
df_rp_condition = pd.merge(
    df_rp, df_condition_grouped, 
    left_on="related_person_id", right_on="patient_id", 
    how="left"
).rename(columns={"patient_conditions_text": "related_conditions"})

# b. Hapus kolom 'patient_id_y' yang redundant (karena sudah ada dari df_rp)
df_rp_condition.drop(columns=["patient_id_y"], inplace=True)

# c. Group berdasarkan patient_id dan relationship, lalu gabungkan kondisi dalam satu baris per patient_id
df_rp_grouped = (
    df_rp_condition.groupby(["patient_id_x", "relationship"])["related_conditions"]
    .apply(lambda x: "; ".join(x.dropna().unique()))
    .reset_index()
)

# d. Pivot agar setiap patient_id unik dan masing-masing relationship menjadi kolom
df_rp_pivot = df_rp_grouped.pivot(index="patient_id_x", columns="relationship", values="related_conditions").reset_index()

# e. Ubah nama kolom relationship menjadi lower-case + "_related_condition"
df_rp_pivot = df_rp_pivot.rename(columns=lambda x: x.lower() + "_related_condition" if x != "patient_id_x" else x)

# f. Merge df_rp_pivot ke df_training berdasarkan patient_id
df_training = pd.merge(df_training, df_rp_pivot, left_on="patient_id", right_on="patient_id_x", how="left")

# g. Hapus kolom redundant
df_training.drop(columns=["patient_id_x"], inplace=True)

# h. Jika ada nilai NaN pada kolom related_conditions, ganti dengan string kosong
df_training.fillna("", inplace=True)

print("Final Training DataFrame shape (after fixing RelatedPerson logic):", df_training.shape)
display(df_training.head())


Merged Patient dan FamilyMemberHistory (unique) shape: (108, 8)


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,


Grouped Condition shape: (108, 2)


Unnamed: 0,patient_id,patient_conditions_text
0,00a4d481-551d-9741-dd8f-fa88fe29ab79,Housing unsatisfactory (finding) Received high...
1,047b9787-9e0b-6cf4-7b40-1cd2ab9422c1,Received higher education (finding) Transport ...
2,06671679-d2c8-8426-da09-7017cc0bda53,Received higher education (finding) Lack of ac...
3,0d1b91dc-9b9e-11cd-e150-6837c9cb3e54,Medication review due (situation) Medication r...
4,18b84736-db6e-baac-84e2-62d3e8dacd0f,Chronic sinusitis (disorder) Received higher e...


Training DataFrame shape: (108, 9)


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition,patient_conditions_text
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease,Medication review due (situation) Medication r...
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,,Medication review due (situation) Gingivitis (...
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma,Childhood asthma (disorder) Perennial allergic...
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,,Medication review due (situation) Primary dent...
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,,Impacted molars (disorder) Chronic pain (findi...


Final Training DataFrame shape (after fixing RelatedPerson logic): (108, 13)


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition,patient_conditions_text,brother_related_condition,father_related_condition,mother_related_condition,sister_related_condition
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease,Medication review due (situation) Medication r...,,Recurrent urinary tract infection (disorder) E...,Loss of teeth (disorder) Received higher educa...,Risk activity involvement (finding) Received h...
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,,Medication review due (situation) Gingivitis (...,,Housing unsatisfactory (finding) Received high...,Medication review due (situation) Risk activit...,
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma,Childhood asthma (disorder) Perennial allergic...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Received h...,,Received higher education (finding) Loss of te...
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,,Medication review due (situation) Primary dent...,Medication review due (situation) Perennial al...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Only recei...,
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,,Impacted molars (disorder) Chronic pain (findi...,,,Educated to high school level (finding) Predia...,


In [39]:
# === 1. Daftar penyakit target ===
target_diseases = ["Diabetes", "Hypertension", "Cancer", "Heart Disease", "Alzheimer", "Asthma"]

# === 2. Membuat kolom binary untuk setiap penyakit berdasarkan patient_conditions_text ===
for disease in target_diseases:
    df_training[disease] = df_training["patient_conditions_text"].apply(lambda x: 1 if disease.lower() in x.lower() else 0)
# === 3. Tampilkan hasil ===
print("Training DataFrame with Multi-Label Encoding:")
display(df_training.head())

# === 4. Cek distribusi label untuk tiap penyakit ===
print("Distribusi Label Multi-Label:")
display(df_training[target_diseases].sum())


Training DataFrame with Multi-Label Encoding:


Unnamed: 0,patient_id,birthDate,gender,name,brother_condition,father_condition,mother_condition,sister_condition,patient_conditions_text,brother_related_condition,father_related_condition,mother_related_condition,sister_related_condition,Diabetes,Hypertension,Cancer,Heart Disease,Alzheimer,Asthma
0,7da148be-b73e-73e3-ed5c-67d7c712a253,2010-05-07,female,Runolfsdottir785,,Asthma,,Hypertension; Heart Disease,Medication review due (situation) Medication r...,,Recurrent urinary tract infection (disorder) E...,Loss of teeth (disorder) Received higher educa...,Risk activity involvement (finding) Received h...,1,1,0,0,0,1
1,d4f1d88b-aecc-493e-2977-44a72e0de2d9,2002-11-28,female,Jerde200,,Hypertension,Diabetes; Hypertension,,Medication review due (situation) Gingivitis (...,,Housing unsatisfactory (finding) Received high...,Medication review due (situation) Risk activit...,,0,1,1,0,0,0
2,9f7675c1-1f29-10ac-92e5-8aaf367f05c3,2007-06-07,female,Sanford861,Diabetes,Cancer,,Asthma,Childhood asthma (disorder) Perennial allergic...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Received h...,,Received higher education (finding) Loss of te...,1,1,1,1,0,1
3,839e461d-9a4d-a110-1fe9-97bd16378bfd,2008-05-28,male,Ruecker817,Cancer,Heart Disease,,,Medication review due (situation) Primary dent...,Medication review due (situation) Perennial al...,Received higher education (finding) Past pregn...,Risk activity involvement (finding) Only recei...,,0,1,0,0,0,0
4,7e101445-eafd-cd17-0e6b-57f85baa3f44,1985-10-07,female,Kerluke267,,,,,Impacted molars (disorder) Chronic pain (findi...,,,Educated to high school level (finding) Predia...,,1,1,0,1,0,1


Distribusi Label Multi-Label:


Diabetes         65
Hypertension     82
Cancer           29
Heart Disease    56
Alzheimer        11
Asthma           31
dtype: int64

In [40]:
import re

def remove_target_diseases(text, target_diseases):
    if pd.isna(text):  # Jika NaN, langsung return ""
        return ""

    # Konversi text ke lowercase untuk mencocokkan semua bentuk penyakit
    text = text.lower()

    # Looping untuk hapus semua keyword yang mengandung target penyakit
    for disease in target_diseases:
        if disease.lower() in text:  # Jika ada disease sebagai substring
            pattern = r"\b\w*" + re.escape(disease.lower()) + r"\w*\b"  # Hapus kata yang mengandung target
            text = re.sub(pattern, "", text).strip()  # Hapus dan bersihkan spasi

    # Hapus double spaces yang muncul setelah penghapusan kata
    text = re.sub(r"\s+", " ", text)

    return text

# === 1. Hilangkan target penyakit dari patient_conditions_text (case-insensitive) ===
df_training["patient_conditions_text_cleaned"] = df_training["patient_conditions_text"].apply(
    lambda x: remove_target_diseases(x, target_diseases)
)

# === 2. Tampilkan beberapa contoh untuk verifikasi ===
print("Training DataFrame after removing target diseases from patient_conditions_text:")
display(df_training[["patient_conditions_text", "patient_conditions_text_cleaned"]].head())


Training DataFrame after removing target diseases from patient_conditions_text:


Unnamed: 0,patient_conditions_text,patient_conditions_text_cleaned
0,Medication review due (situation) Medication r...,medication review due (situation) medication r...
1,Medication review due (situation) Gingivitis (...,medication review due (situation) gingivitis (...
2,Childhood asthma (disorder) Perennial allergic...,childhood (disorder) perennial allergic rhinit...
3,Medication review due (situation) Primary dent...,medication review due (situation) primary dent...
4,Impacted molars (disorder) Chronic pain (findi...,impacted molars (disorder) chronic pain (findi...


In [55]:
len(Y)

108

In [56]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# === 1. Pilih Fitur dan Label ===
text_column = "patient_conditions_text_cleaned"  # Fitur teks yang akan diproses
label_columns = ["Diabetes", "Hypertension", "Cancer", "Heart Disease", "Alzheimer", "Asthma"]  # Label multi-label

X = df_training[text_column]  # Fitur teks
Y = df_training[label_columns].values  # Label multi-label dalam bentuk numpy array

# === 2. Preprocessing Teks ===
def clean_text(text):
    if pd.isna(text) or text.strip() == "":
        return ""  # Handle NaN atau teks kosong
    text = text.lower()  # Lowercase
    text = re.sub(r"\d+", "", text)  # Hapus angka
    text = re.sub(r"[^\w\s]", "", text)  # Hapus tanda baca
    text = re.sub(r"\s+", " ", text).strip()  # Hapus spasi berlebih
    return text

# Terapkan preprocessing ke semua teks
X_cleaned = X.apply(clean_text)

# === 3. Bag-of-Words (BoW) ===
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X_cleaned)

# === 4. Split Data dengan Iterative Stratification ===
X_train, y_train, X_test, y_test = iterative_train_test_split(X_bow, Y, test_size=0.2)

from sklearn.multiclass import OneVsRestClassifier

# === 5. Train Model Logistic Regression dengan OneVsRest untuk Multi-Label ===
clf = OneVsRestClassifier(LogisticRegression(max_iter=1000))
clf.fit(X_train, y_train)

# === 6. Prediksi ===
y_pred = clf.predict(X_test)

# === 7. Evaluasi Model ===
def evaluate_multilabel(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred, average="macro"))
    print("Recall:", recall_score(y_test, y_pred, average="macro"))
    print("F1 Score:", f1_score(y_test, y_pred, average="macro"))

evaluate_multilabel(y_test, y_pred)




Accuracy: 0.0
Precision: 0.4801587301587302
Recall: 0.6556672494172494
F1 Score: 0.5374386905759455
