In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from deltalake import DeltaTable
from dotenv import load_dotenv

# Load environment variables from the demo env file before accessing storage.
load_dotenv(".demo.env")

# Do not truncate long cell values when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# Options handed to DeltaTable; the endpoint is a service on localhost:9000.
storage_options = dict(AWS_ENDPOINT_URL="http://localhost:9000")

# Read the Condition Delta table into a pandas DataFrame.
condition_table_uri = "s3://fhir/default/Condition.parquet"
conditions = DeltaTable(condition_table_uri, storage_options=storage_options).to_pandas()

In [None]:
# Display the loaded Condition DataFrame.
conditions

In [None]:
# Start by keeping only the columns that are relevant for this analysis.
columns_of_interest = ["id", "subject", "code"]
conditions_simple = conditions.loc[:, columns_of_interest]
conditions_simple

In [None]:
# Show the structure of the "subject" column: a dict with several attributes
conditions_simple["subject"]

In [None]:
def _patient_reference(subject):
    """Return the "reference" entry of a subject dict."""
    return subject["reference"]


# Keep only the patient reference from each subject dict in its own column.
conditions_simple["subject_reference"] = conditions_simple["subject"].map(
    _patient_reference
)
conditions_simple

In [None]:
def _get_field(record, field):
    """Return ``record[field]`` for a dict, else ``None``.

    Conditions without a code, or with an empty/missing coding list, carry
    None/NaN instead of a dict (explode turns an empty list into NaN). Those
    are mapped to None instead of raising a TypeError on subscripting.
    """
    if isinstance(record, dict):
        return record[field]
    return None


# Extract the "coding" attribute from the code dict.
conditions_simple["code_coding"] = conditions_simple["code"].apply(
    lambda code: _get_field(code, "coding")
)

# explode creates one row per element of the coding list.
conditions_exploded = conditions_simple.explode("code_coding")

# Add one column per attribute of the coding dict: system, code and display.
conditions_exploded = conditions_exploded.assign(
    **{
        f"code_coding_{attribute}": conditions_exploded["code_coding"].apply(
            # Bind the attribute as a default so each lambda keeps its own value.
            lambda coding, attribute=attribute: _get_field(coding, attribute)
        )
        for attribute in ("system", "code", "display")
    }
)

conditions_exploded

In [None]:
# Keep only the rows whose coding uses the SNOMED CT system URI.
SNOMED_SYSTEM = "http://snomed.info/sct"
is_snomed = conditions_exploded["code_coding_system"] == SNOMED_SYSTEM
conditions_snomed = conditions_exploded.loc[is_snomed]

In [None]:
# Number of most frequent diagnoses shown in the chart.
TOP_N_DIAGNOSES = 25

# Count distinct patients per diagnosis display text and keep the most frequent ones.
code_counts = (
    conditions_snomed.groupby("code_coding_display")["subject_reference"]
    .nunique()
    .reset_index()
    .rename(columns={"subject_reference": "patient_count"})
    .sort_values(by="patient_count", ascending=False)
    .head(TOP_N_DIAGNOSES)
)

# Use the explicit Figure/Axes interface instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(code_counts["code_coding_display"], code_counts["patient_count"])
ax.set_xlabel("Diagnose")
ax.set_ylabel("Anzahl Patienten")
ax.set_title("Histogramm: Anzahl Patienten pro Diagnose-Code")
# Rotate the x tick labels by 75 degrees.
ax.tick_params(axis="x", labelrotation=75)
fig.tight_layout()
plt.show()

# Übung

Nun stratifiziere die Verteilung zusätzlich nach dem Geschlecht der Patienten.
Dazu müssen die Patient-Ressourcen über die Patienten-Referenz mit den Condition-Ressourcen gejoint werden, siehe <https://pandas.pydata.org/docs/user_guide/merging.html>


In [None]:
# Read the Patient Delta table into a pandas DataFrame.
patient_table_uri = "s3://fhir/default/Patient.parquet/"
patients = DeltaTable(patient_table_uri, storage_options=storage_options).to_pandas()

In [None]:
# The conditions' "subject_reference" column holds the patient reference in the
# form "Patient/123". Rebuild that same string from each patient id here.
patient_ids = patients["id"].astype(str)
patients["id_as_reference"] = "Patient/" + patient_ids

In [None]:
# Set the display column width option to 128, then show the patients frame.
pd.set_option("display.max_colwidth", 128)
patients

In [None]:
# TODO: merge the two DataFrames conditions_snomed and patients
# merged = patients.merge( ... )
# merged

In [None]:
# TODO: plot the number of patients per diagnosis, stratified by gender