In [None]:
import csv
import os
import sys
from pathlib import Path
from urllib.parse import quote

import pandas as pd
import psycopg2
from dotenv import load_dotenv
from sqlalchemy import create_engine, text


# -----------------------------
# Project setup
# -----------------------------
# The parent of the current working directory is treated as the project root
# and put on sys.path so that the `src` package below can be imported.
cwd = Path.cwd()
project_root = cwd.parent
if str(project_root) not in sys.path:  # do not pile up duplicates when the cell is re-run
    sys.path.append(str(project_root))

# Project-local import; it has to come after the sys.path adjustment above.
from src.utils.db_loader import load_env_vars
env = load_env_vars()


# -----------------------------
# Build Postgres connection string
# -----------------------------
POSTGRES_USERNAME = env["POSTGRES_USERNAME"]
POSTGRES_PASSWORD = env["POSTGRES_PASSWORD"]
POSTGRES_SERVER   = env["POSTGRES_SERVER"]
POSTGRES_DATABASE = env["POSTGRES_DATABASE"]
POSTGRES_PORT     = env.get("POSTGRES_PORT", "5432")

# Percent-encode the credentials: characters such as "@", "/", ":" or "%"
# in a username/password would otherwise be parsed as URL delimiters and
# produce a wrong host or a failed login.
_quoted_username = quote(str(POSTGRES_USERNAME), safe="")
_quoted_password = quote(str(POSTGRES_PASSWORD), safe="")

db_url = (
    f"postgresql://{_quoted_username}:{_quoted_password}"
    f"@{POSTGRES_SERVER}:{POSTGRES_PORT}/{POSTGRES_DATABASE}"
)
# sslmode=require is passed through to the database driver.
engine = create_engine(db_url, connect_args={"sslmode": "require"})


# -----------------------------
# Define data path from .env
# -----------------------------
data_path = Path(env["DATA_PATH"])


# Logical dataset name -> file name inside `data_path`
files_to_load = {
    "beneficiary_2020": "beneficiary_2020.csv",
    "beneficiary_2021": "beneficiary_2021.csv",
    "beneficiary_2022": "beneficiary_2022.csv",
    "beneficiary_2023": "beneficiary_2023.csv",
    "inpatient": "inpatient.csv",
    "outpatient": "outpatient.csv",
    "carrier": "carrier.csv"
}


# -----------------------------
# Create schema
# -----------------------------

# Tables are dropped children-first; "Beneficiary" (referenced by "Claims")
# goes last. CASCADE additionally removes dependent objects.
tables_to_drop = (
    "Revenue",
    "Provider",
    "ProcedureCode",
    "Diagnosis",
    "Claims",
    "Beneficiary",
)

# engine.begin() opens one transaction for the whole block: it commits when
# the block finishes and rolls back if any statement raises, so the schema is
# never left half-created. It also works on SQLAlchemy versions whose
# Connection object has no .commit() method.
with engine.begin() as conn:
    # Drop old tables
    for table_name in tables_to_drop:
        conn.execute(text(f'DROP TABLE IF EXISTS "{table_name}" CASCADE;'))

    # Beneficiary table: one row per beneficiary per year
    conn.execute(text("""
    CREATE TABLE "Beneficiary" (
        "BeneficiaryID" SERIAL PRIMARY KEY,
        "BENE_ID" VARCHAR(15) NOT NULL,
        "AGE_AT_END_REF_YR" INT,
        "BENE_RACE_CD" CHAR(1),
        "SEX_IDENT_CD" CHAR(1),
        "STATE_CODE" CHAR(2),
        "YEAR" INT NOT NULL,
        UNIQUE ("BENE_ID","YEAR")
    );
    """))

    # Claims table: central table, linked to "Beneficiary" on (BENE_ID, YEAR)
    conn.execute(text("""
    CREATE TABLE "Claims" (
        "BENE_ID" TEXT NOT NULL,             
        "ClaimID" SERIAL PRIMARY KEY,
        "CLM_ID" TEXT UNIQUE NOT NULL,
        "CLM_LINE_NUM" TEXT,  
        "CLM_TYPE" TEXT,                 
        "YEAR" INT NOT NULL,
        "CLM_FROM_DT" DATE,                  
        "CLM_THRU_DT" DATE,
        "ORG_NPI_NUM" TEXT,   
        "HCPCS_CD" TEXT,    
        "ICD_PRCDR_CD1" TEXT,                               
        "PRNCPAL_DGNS_CD" TEXT,
        "REV_CNTR" TEXT,   
        "CLM_PMT_AMT" NUMERIC(12,2),
        FOREIGN KEY ("BENE_ID","YEAR") REFERENCES "Beneficiary"("BENE_ID","YEAR")
    );
    """))

    # Diagnosis table: diagnosis codes keyed by claim
    conn.execute(text("""
    CREATE TABLE "Diagnosis" (
        "DiagnosisID" SERIAL PRIMARY KEY,
        "CLM_ID" TEXT REFERENCES "Claims"("CLM_ID"),
        "ICD_DGNS_CD" TEXT
    );
    """))

    # Procedure table: ICD procedure and HCPCS codes share one table,
    # distinguished by "CODE_TYPE"
    conn.execute(text("""
    CREATE TABLE "ProcedureCode" (
        "ProcedureCodeID" SERIAL PRIMARY KEY,
        "CLM_ID" TEXT REFERENCES "Claims"("CLM_ID"),
        "CODE" TEXT,
        "CODE_TYPE" TEXT   -- 'ICD' or 'HCPCS'
    );
    """))

    # Revenue table: revenue center codes keyed by claim
    conn.execute(text("""
    CREATE TABLE "Revenue" (
        "RevenueID" SERIAL PRIMARY KEY,
        "CLM_ID" TEXT REFERENCES "Claims"("CLM_ID"),
        "REV_CNTR" TEXT
    );
    """))

    # Provider table: organization NPI keyed by claim
    conn.execute(text("""
    CREATE TABLE "Provider" (
        "ProviderID" SERIAL PRIMARY KEY,
        "ORG_NPI_NUM" TEXT,
        "CLM_ID" TEXT REFERENCES "Claims"("CLM_ID")
    );
    """))

# Reached only after the transaction above has committed.
print("✅ Normalized tables created with `Claims` as the central fact table")

# -----------------------------
# Insert Beneficiary data (2020–2023)
# -----------------------------
years_to_load = [2020, 2021, 2022, 2023]

# Columns written to "Beneficiary". "YEAR" is not read from the file; it is
# stamped from the year of the file being loaded.
keep_cols = [
    "BENE_ID",
    "AGE_AT_END_REF_YR",
    "BENE_RACE_CD",
    "SEX_IDENT_CD",
    "STATE_CODE",
    "YEAR"
]
source_cols = [col for col in keep_cols if col != "YEAR"]

# Identifier/code columns must be read as text. With pandas' default type
# inference a code such as "01" becomes the integer 1 (or 1.0 when the column
# has blanks), which no longer matches the CHAR/VARCHAR columns of the table.
text_cols = ["BENE_ID", "BENE_RACE_CD", "SEX_IDENT_CD", "STATE_CODE"]

for yr in years_to_load:
    bene_file = data_path / files_to_load[f"beneficiary_{yr}"]
    print(f"📥 Loading {bene_file} ...")

    df = pd.read_csv(
        bene_file,
        delimiter="|",
        usecols=source_cols,                      # skip columns that are never loaded
        dtype={col: str for col in text_cols},
    )
    df["YEAR"] = yr
    df = df[keep_cols]

    df.to_sql(
        "Beneficiary",
        engine,
        if_exists="append",
        index=False,
        method="multi",
        chunksize=5000
    )
    print(f"✅ Beneficiary {yr} data loaded ({len(df)} rows)")

# -----------------------------
# Validation: distinct counts by year
# -----------------------------
# distinct_beneficiaries and total_rows should be equal for every year,
# because the table enforces UNIQUE ("BENE_ID", "YEAR").
query = """
SELECT "YEAR", COUNT(DISTINCT "BENE_ID") AS distinct_beneficiaries,
       COUNT(*) AS total_rows
FROM "Beneficiary"
GROUP BY "YEAR"
ORDER BY "YEAR";
"""

df_counts = pd.read_sql(query, engine)
print("📊 Distinct Beneficiary Counts by Year:")
print(df_counts)

# -----------------------------
# Loader for Claims + Normalized Tables
# -----------------------------
def _append_rows(frame, table_name, conn):
    """Append `frame` to the existing table `table_name` using connection `conn`."""
    frame.to_sql(
        table_name, conn,
        if_exists="append", index=False, method="multi", chunksize=5000
    )


def _claim_code_rows(df, source_col, target_col):
    """Return the non-null (CLM_ID, source_col) pairs with source_col renamed to target_col."""
    return (
        df[["CLM_ID", source_col]]
        .dropna()
        .rename(columns={source_col: target_col})
    )


def load_claims(file_path, claim_type, first_year=2020, last_year=2023):
    """Load one pipe-delimited claims file into "Claims" and its child tables.

    Rows are kept only when the year of CLM_THRU_DT lies in
    [first_year, last_year]. "Claims" receives one row per CLM_ID (first
    occurrence in the file). "Diagnosis", "ProcedureCode", "Revenue" and
    "Provider" receive every non-null (CLM_ID, code) pair found in the
    filtered rows, so repeated lines of a claim are not de-duplicated there.

    All inserts for the file run in a single transaction: if any table fails
    to load, nothing from this file is kept and the call can simply be retried.

    Parameters
    ----------
    file_path : str or path-like
        Location of the "|"-delimited claims file.
    claim_type : str
        Stored in "Claims"."CLM_TYPE". When equal to "Carrier", a LINE_NUM
        column is renamed to CLM_LINE_NUM.
    first_year, last_year : int, default 2020 and 2023
        Inclusive range of claim years to keep; the defaults match the
        beneficiary years loaded elsewhere in this notebook.

    Raises
    ------
    KeyError
        If the file has no CLM_THRU_DT column (the claim year cannot be derived).
    """
    # Identifier/code columns are read as text: type inference would turn
    # "0450" into 450 and an NPI column with blanks into floats ("123.0").
    text_cols = (
        "BENE_ID", "CLM_ID", "CLM_LINE_NUM", "LINE_NUM", "ORG_NPI_NUM",
        "HCPCS_CD", "ICD_PRCDR_CD1", "PRNCPAL_DGNS_CD", "REV_CNTR",
    )
    # Peek at the header so the dtype map only names columns this file has.
    header = pd.read_csv(file_path, delimiter="|", nrows=0).columns
    df = pd.read_csv(
        file_path,
        delimiter="|",
        dtype={col: str for col in text_cols if col in header},
    )

    # Rename LINE_NUM → CLM_LINE_NUM for Carrier
    if claim_type == "Carrier" and "LINE_NUM" in df.columns:
        df = df.rename(columns={"LINE_NUM": "CLM_LINE_NUM"})

    if "CLM_THRU_DT" not in df.columns:
        raise KeyError(f"CLM_THRU_DT column not found in {file_path}")

    # Dates and year; unparseable dates become NaT and are filtered out below.
    df["CLM_THRU_DT"] = pd.to_datetime(df["CLM_THRU_DT"], errors="coerce")
    df["CLM_FROM_DT"] = pd.to_datetime(df.get("CLM_FROM_DT"), errors="coerce")
    df["YEAR"] = df["CLM_THRU_DT"].dt.year

    # -----------------------------
    # Filter claims to match Beneficiary years (2020–2023 by default)
    # -----------------------------
    df = df[df["YEAR"].between(first_year, last_year)].copy()
    # .dt.year is float when NaT values were present; the column is INT in the DB.
    df["YEAR"] = df["YEAR"].astype(int)

    # -----------------------------
    # Claims header
    # -----------------------------
    claims_cols = [
         "BENE_ID", "CLM_ID", "CLM_LINE_NUM", "YEAR", "CLM_FROM_DT", "CLM_THRU_DT", "ORG_NPI_NUM",
         "HCPCS_CD", "ICD_PRCDR_CD1",  "PRNCPAL_DGNS_CD", "REV_CNTR","CLM_PMT_AMT"
    ]
    available_cols = [c for c in claims_cols if c in df.columns]
    df_claims = (
        df[available_cols]
        .assign(CLM_TYPE=claim_type)
        .drop_duplicates(subset=["CLM_ID"])   # "Claims"."CLM_ID" is UNIQUE
    )

    with engine.begin() as conn:
        # Parent rows first: the child tables reference "Claims"("CLM_ID").
        _append_rows(df_claims, "Claims", conn)

        # -----------------------------
        # Diagnosis (use only primary diagnosis code)
        # -----------------------------
        if "PRNCPAL_DGNS_CD" in df.columns:
            _append_rows(
                _claim_code_rows(df, "PRNCPAL_DGNS_CD", "ICD_DGNS_CD"),
                "Diagnosis", conn,
            )

        # -----------------------------
        # Procedure + HCPCS (Unified Table)
        # -----------------------------
        if "ICD_PRCDR_CD1" in df.columns:
            df_icd = _claim_code_rows(df, "ICD_PRCDR_CD1", "CODE").assign(CODE_TYPE="ICD")
            _append_rows(df_icd, "ProcedureCode", conn)

        if "HCPCS_CD" in df.columns:
            df_hcpcs = _claim_code_rows(df, "HCPCS_CD", "CODE").assign(CODE_TYPE="HCPCS")
            _append_rows(df_hcpcs, "ProcedureCode", conn)

        # -----------------------------
        # Revenue (REV_CNTR only)
        # -----------------------------
        if "REV_CNTR" in df.columns:
            _append_rows(_claim_code_rows(df, "REV_CNTR", "REV_CNTR"), "Revenue", conn)

        # -----------------------------
        # Provider
        # -----------------------------
        if "ORG_NPI_NUM" in df.columns:
            _append_rows(
                _claim_code_rows(df, "ORG_NPI_NUM", "ORG_NPI_NUM"),
                "Provider", conn,
            )

    # Count of filtered source rows (lines), not of distinct claims.
    print(f"✅ {claim_type} claims loaded ({len(df)} rows)")

# -----------------------------
# Load Inpatient, Outpatient, Carrier
# -----------------------------
# (key in files_to_load, label stored as the claim type), processed in this order
claim_sources = (
    ("inpatient", "Inpatient"),
    ("outpatient", "Outpatient"),
    ("carrier", "Carrier"),
)

for file_key, claim_label in claim_sources:
    load_claims(data_path / files_to_load[file_key], claim_label)



In [22]:
# -----------------------------
# Load NYU Mapping File
# -----------------------------

# Path to Excel file
ed_file = data_path / "NYU_ED_Algorithm_ICD10.xlsx"

# Load the first sheet (or specify sheet name if needed)
df_ed = pd.read_excel(ed_file, sheet_name=0)

# Database column names, applied by position to the sheet's columns.
ed_columns = [
    "ICD10",
    "ICD10_Description",
    "Non_Emergent",
    "Emergent_PC_Treatable",
    "ED_Care_Needed_Preventable_Avoidable",
    "ED_Care_Needed_Not_Preventable",
    "Alcohol",
    "Drug",
    "Injury",
    "Psych",
    "Unclassified"
]

# The rename is positional, so fail with a readable message if the sheet
# layout is not the expected one.
if df_ed.shape[1] != len(ed_columns):
    raise ValueError(
        f"{ed_file} has {df_ed.shape[1]} columns, expected {len(ed_columns)}"
    )
df_ed.columns = ed_columns

# "replace" rather than "append": this table is not dropped by the schema
# cell, so appending would duplicate every row each time the cell is re-run.
df_ed.to_sql(
    "ED_Algorithm_ICD10",
    engine,
    if_exists="replace",
    index=False,
    method="multi",
    chunksize=5000
)

print("✅ NYU ED Algorithm ICD-10 codes loaded from Excel")

# Row count of the freshly written table; should equal len(df_ed).
pd.read_sql('SELECT COUNT(*) FROM "ED_Algorithm_ICD10";', engine)

✅ NYU ED Algorithm ICD-10 codes loaded from Excel


Unnamed: 0,count
0,75242
