In [None]:
!pip install pandas sqlalchemy python-dotenv
!pip install psycopg2-binary

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
from sqlalchemy import create_engine, text
from dotenv import load_dotenv
import os


# Load environment variables
# load_dotenv() runs first, then the PostgreSQL settings are read from the
# process environment. The port falls back to "5432" when it is not set.
import urllib.parse

load_dotenv()

POSTGRES_USERNAME = os.getenv("POSTGRES_USERNAME")
POSTGRES_PASSWORD = os.getenv("POSTGRES_PASSWORD")
POSTGRES_SERVER   = os.getenv("POSTGRES_SERVER")
POSTGRES_DATABASE = os.getenv("POSTGRES_DATABASE")
POSTGRES_PORT     = os.getenv("POSTGRES_PORT", "5432")

# Fail fast with a readable message instead of building a URL that contains
# the literal text "None" and only breaks later, at query time.
_required_settings = {
    "POSTGRES_USERNAME": POSTGRES_USERNAME,
    "POSTGRES_PASSWORD": POSTGRES_PASSWORD,
    "POSTGRES_SERVER": POSTGRES_SERVER,
    "POSTGRES_DATABASE": POSTGRES_DATABASE,
}
_missing_settings = [name for name, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Build connection string
# The username and password are percent-encoded so that characters such as
# "@", ":" or "/" inside a credential cannot be mistaken for URL delimiters.
db_url = (
    f"postgresql://{urllib.parse.quote(POSTGRES_USERNAME, safe='')}"
    f":{urllib.parse.quote(POSTGRES_PASSWORD, safe='')}"
    f"@{POSTGRES_SERVER}:{POSTGRES_PORT}/{POSTGRES_DATABASE}"
)

# Create SQLAlchemy engine (TLS is required via sslmode=require)
engine = create_engine(db_url, connect_args={"sslmode": "require"})

# create_engine() does not open a connection by itself, so run a trivial
# query to make sure the success message below is actually true.
with engine.connect() as connection:
    connection.execute(text("SELECT 1"))

print("Connected to PostgreSQL")


# ---------------------------------------------------------
# 1. SQL JOIN + VISIT-LEVEL ROLLUP
# ---------------------------------------------------------

# ---------------------------------------------------------
# Identifying Emergency Room (ER) Claims
# ---------------------------------------------------------
# A claim row is flagged as ED ("IS_ED" = 1) when any of these match:
#   1. Revenue Center Codes 450-459 or 981 (i.e. 0450–0459, 0981)
#   2. CPT/HCPCS ED E/M Codes (99281–99285) in "HCPCS_CD"
#   3. The same E/M codes appearing in "ICD_PRCDR_CD1"
# NOTE(review): "REV_CNTR" is compared against the un-padded literals
# '450'..'459' and '981'. If the column stores zero-padded text such as
# '0450', the revenue-center test will never match -- confirm the format.
#
# ED rows are then rolled up to one row per visit, where a visit is the
# combination of beneficiary, facility, year and claim start date.
# ---------------------------------------------------------

# CCSR body-system label -> name of the 0/1 indicator column in the result.
# The dict order defines the column order of the indicators.
BODY_SYSTEM_FLAGS = {
    "Respiratory": "BodySystem_Respiratory",
    "Circulatory": "BodySystem_Circulatory",
    "Infectious": "BodySystem_Infectious",
    "Digestive": "BodySystem_Digestive",
    "Mental/Behavioral": "BodySystem_MentalBehavioral",
    "Musculoskeletal": "BodySystem_Musculoskeletal",
    "Neoplasms": "BodySystem_Neoplasms",
    "Nervous System": "BodySystem_NervousSystem",
    "Injury/Poisoning": "BodySystem_InjuryPoisoning",
    "Skin/Subcutaneous": "BodySystem_Skin",
    "Genitourinary": "BodySystem_Genitourinary",
    "Endocrine/Metabolic": "BodySystem_Endocrine",
    "Blood/Immune": "BodySystem_BloodImmune",
    "Symptoms/Signs": "BodySystem_Symptoms",
    "External Causes": "BodySystem_ExternalCauses",
    "Congenital": "BodySystem_Congenital",
    "Perinatal": "BodySystem_Perinatal",
    "Pregnancy/Childbirth": "BodySystem_Pregnancy",
    "Dental": "BodySystem_Dental",
    "Eye": "BodySystem_Eye",
    "Ear": "BodySystem_Ear",
    "Health Status/Contact": "BodySystem_HealthStatus",
    "Unacceptable Diagnosis": "BodySystem_Unacceptable",
}

# The indicator is evaluated per row and then aggregated with MAX, so a visit
# is flagged for every body system that appears among its rows. Testing
# MAX("Body_System") instead would only ever flag the alphabetically last
# body system of the visit.
_BODY_SYSTEM_FLAG_SQL = (
    """    MAX(CASE WHEN "Body_System" = '{label}' THEN 1 ELSE 0 END) AS {alias}"""
)
body_system_sql = ",\n".join(
    _BODY_SYSTEM_FLAG_SQL.format(label=label, alias=alias)
    for label, alias in BODY_SYSTEM_FLAGS.items()
)

query = f"""

WITH ed_claims AS (
    SELECT
        c."CLM_ID",
        c."BENE_ID",
        c."ORG_NPI_NUM" AS facility_id,
        c."CLM_FROM_DT",
        c."CLM_THRU_DT",
        c."CLM_PMT_AMT",
        c."CLM_TYPE",
        c."YEAR",

        b."AGE_AT_END_REF_YR",
        b."BENE_RACE_CD",
        b."SEX_IDENT_CD",
        b."STATE_CODE",

        d."ICD_DGNS_CD" AS dx1,
        COALESCE(ci."CHRONIC_FLAG", 9) AS "chronic_flag",

        COALESCE(ed."Non_Emergent",0) AS "Non_Emergent",
        COALESCE(ed."Emergent_PC_Treatable",0) AS "Emergent_PC_Treatable",
        COALESCE(ed."ED_Care_Needed_Preventable_Avoidable",0) AS "ED_Care_Needed_Preventable_Avoidable",
        COALESCE(ed."ED_Care_Needed_Not_Preventable",0) AS "ED_Care_Needed_Not_Preventable",
        COALESCE(ed."Alcohol",0) AS "Alcohol",
        COALESCE(ed."Drug",0) AS "Drug",
        COALESCE(ed."Injury",0) AS "Injury",
        COALESCE(ed."Psych",0) AS "Psych",
        COALESCE(ed."Unclassified",0) AS "Unclassified",

        ccsr."Body_System",

        CASE 
            WHEN (c."REV_CNTR" BETWEEN '450' AND '459')
              OR (c."REV_CNTR" = '981')
              OR (c."HCPCS_CD" IN ('99281','99282','99283','99284','99285'))
              OR (c."ICD_PRCDR_CD1" IN ('99281','99282','99283','99284','99285'))
            THEN 1 ELSE 0
        END AS "IS_ED"
    FROM "Claims" c
    LEFT JOIN "Beneficiary" b
        ON c."BENE_ID" = b."BENE_ID" AND c."YEAR" = b."YEAR"
    LEFT JOIN "Diagnosis" d
        ON c."CLM_ID" = d."CLM_ID"
    LEFT JOIN "ICD10_ChronicIndicator" ci
        ON d."ICD_DGNS_CD" = ci."ICD10"
    LEFT JOIN "ED_Algorithm_ICD10" ed
        ON d."ICD_DGNS_CD" = ed."ICD10"
    LEFT JOIN "CCSR_ICD10" ccsr
        ON d."ICD_DGNS_CD" = ccsr."ICD10"
),

visit_base AS (
    SELECT
        *,
        MIN("CLM_FROM_DT") OVER (
            PARTITION BY "BENE_ID", facility_id, "YEAR", "CLM_FROM_DT"::date
        ) AS visit_start,
        /* Numbers the rows of each claim inside its visit so the payment
           amount can be summed exactly once per claim further down. */
        ROW_NUMBER() OVER (
            PARTITION BY "BENE_ID", facility_id, "YEAR", "CLM_FROM_DT"::date, "CLM_ID"
            ORDER BY dx1
        ) AS claim_row_num
    FROM ed_claims
    WHERE "IS_ED" = 1
)

SELECT
    "BENE_ID",
    facility_id,
    "YEAR",
    visit_start,
    MAX("CLM_THRU_DT") AS visit_end,

    COALESCE(
        MAX(CASE WHEN "CLM_TYPE" IN ('Inpatient','Outpatient') THEN dx1 END),
        MIN(dx1)
    ) AS primary_dx,

    COALESCE(
        MAX(CASE WHEN "CLM_TYPE" IN ('Inpatient','Outpatient') THEN chronic_flag END),
        MAX(chronic_flag)
    ) AS primary_dx_chronic_flag,

    MAX("AGE_AT_END_REF_YR") AS "AGE_AT_END_REF_YR",
    MAX("BENE_RACE_CD") AS "BENE_RACE_CD",
    MAX("SEX_IDENT_CD") AS "SEX_IDENT_CD",
    MAX("STATE_CODE") AS "STATE_CODE",

    MAX("Non_Emergent") AS "Non_Emergent",
    MAX("Emergent_PC_Treatable") AS "Emergent_PC_Treatable",
    MAX("ED_Care_Needed_Preventable_Avoidable") AS "ED_Care_Needed_Preventable_Avoidable",
    MAX("ED_Care_Needed_Not_Preventable") AS "ED_Care_Needed_Not_Preventable",
    MAX("Alcohol") AS "Alcohol",
    MAX("Drug") AS "Drug",
    MAX("Injury") AS "Injury",
    MAX("Psych") AS "Psych",
    MAX("Unclassified") AS "Unclassified",

    /* The joins above can emit several rows per claim, so a plain SUM would
       add the same claim payment once per joined row. Only the first row of
       each claim contributes. Assumes "CLM_PMT_AMT" is a claim-level amount
       that is identical on every row of a claim -- confirm. */
    SUM(CASE WHEN claim_row_num = 1 THEN "CLM_PMT_AMT" END) AS total_paid_amt,
    COUNT(DISTINCT "CLM_ID") AS claim_count_in_visit,

    /* Body System Indicators */
{body_system_sql}

FROM visit_base
GROUP BY
    "BENE_ID",
    facility_id,
    "YEAR",
    visit_start
ORDER BY
    "BENE_ID",
    visit_start;
"""

df = pd.read_sql(query, engine)
print("✅ Visit-level ER dataset loaded:", df.shape)
# Only rendered by the notebook when it is the last expression of a cell.
df.head()



# ---------------------------------------------------------
# 2. Compute ED Category
# ---------------------------------------------------------
# Each visit is labeled with the name of its highest-scoring column among
# prob_cols. Ties go to the column listed first.
prob_cols = [
    "Non_Emergent",
    "Emergent_PC_Treatable",
    "ED_Care_Needed_Preventable_Avoidable",
    "ED_Care_Needed_Not_Preventable",
    "Alcohol",
    "Drug",
    "Injury",
    "Psych"
]

category_scores = df[prob_cols].fillna(0)

# The query COALESCEs unmatched diagnoses to 0 in every score column. On such
# an all-zero row idxmax() simply returns the first column, which would label
# every visit without any score as "Non_Emergent". Label those rows
# "Unclassified" instead.
has_any_score = category_scores.max(axis=1) > 0
df["ED_Category"] = category_scores.idxmax(axis=1).where(has_any_score, "Unclassified")
print("✅ ED category assigned")


# ---------------------------------------------------------
# 3. Compute avoidable ED probability
# ---------------------------------------------------------
# P_avoidable is the sum of the three "avoidable" score columns, with missing
# values counted as 0.
# NOTE(review): each component is a per-visit MAX taken in the SQL rollup, so
# the sum may exceed 1 for visits with several diagnoses -- confirm whether
# that is acceptable for a value used as a probability.
avoidable_components = [
    "Non_Emergent",
    "Emergent_PC_Treatable",
    "ED_Care_Needed_Preventable_Avoidable",
]

p_avoidable = df[avoidable_components[0]].fillna(0)
for component in avoidable_components[1:]:
    p_avoidable = p_avoidable + df[component].fillna(0)
df["P_avoidable"] = p_avoidable

print("✅ Avoidable ED probability computed")


# ---------------------------------------------------------
# 5. Flag avoidable ED visits
# ---------------------------------------------------------
# A visit counts as avoidable (1) as soon as its avoidable score is positive.
is_avoidable = df["P_avoidable"].gt(0)
df["Avoidable_ED_Visit"] = is_avoidable.astype(int)
print("✅ Avoidable ED visits flagged")


# ---------------------------------------------------------
# 6. Summary by year (ED-only)
# ---------------------------------------------------------
# YEAR is replaced by the calendar year of visit_start before grouping.
visit_start_ts = pd.to_datetime(df["visit_start"])
df["YEAR"] = visit_start_ts.dt.year

# Per year: number of visits (non-null BENE_ID values) and number of visits
# flagged as avoidable.
yearly_aggregations = {
    "ER_Visits": ("BENE_ID", "count"),
    "Avoidable_ED_Visits": ("Avoidable_ED_Visit", "sum"),
}
yearly_totals = df.groupby("YEAR").agg(**yearly_aggregations)
summary_by_year = yearly_totals.reset_index().sort_values("YEAR")

display(summary_by_year)
print("✅ Summary by year computed")

# Persist the visit-level dataset for the modeling step.
df.to_parquet("er_visits_modeling_dataset.parquet", index=False)
print("Saved modeling dataset to er_visits_modeling_dataset.parquet")


Connected to PostgreSQL
✅ Visit-level ER dataset loaded: (7559, 45)
✅ ED category assigned
✅ Avoidable ED probability computed
✅ Avoidable ED visits flagged


Unnamed: 0,YEAR,ER_Visits,Avoidable_ED_Visits
0,2020,2050,182
1,2021,2327,225
2,2022,2703,264
3,2023,479,47


✅ Summary by year computed
Saved modeling dataset to er_visits_modeling_dataset.parquet
