In [17]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

In [18]:
def inspect_file_columns(file_path, n_rows=5):
    """
    Quickly read the first few rows of a pipe-delimited file
    and print cleaned column names.
    """
    df = pd.read_csv(
        file_path,
        delimiter="|",
        nrows=n_rows,
        dtype=str
    )
    # Clean columns
    df.columns = df.columns.str.strip(" ~'")
    print(f"Columns in {file_path}:")
    for col in df.columns:
        print(f"- {col}")

In [19]:
# Provider Impressions
inspect_file_columns("../data/raw/FACTPCRPRIMARYIMPRESSION.txt")
inspect_file_columns("../data/raw/FACTPCRSECONDARYIMPRESSION.txt")

# Symptoms
inspect_file_columns("../data/raw/FACTPCRPRIMARYSYMPTOM.txt")
inspect_file_columns("../data/raw/FACTPCRADDITIONALSYMPTOM.txt")

# Cause of Injury and Trauma
inspect_file_columns("../data/raw/FACTPCRCAUSEOFINJURY.txt")
inspect_file_columns("../data/raw/FACTPCRINJURYRISKFACTOR.txt")
inspect_file_columns("../data/raw/FACTPCRTRAUMACRITERIA.txt")

# Barriers to Care
inspect_file_columns("../data/raw/FACTPCRBARRIERTOCARE.txt")

# Delay Types
inspect_file_columns("../data/raw/FACTPCRDISPATCHDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRRESPONSEDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRSCENEDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRTRANSPORTDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRTURNAROUNDDELAY.txt")

# Destination Details
inspect_file_columns("../data/raw/FACTPCRDESTINATIONREASON.txt")
inspect_file_columns("../data/raw/FACTPCRDESTINATIONTEAM.txt")

# Vitals
inspect_file_columns("../data/raw/FACTPCRVITAL.txt")

# Arrest & CPR
inspect_file_columns("../data/raw/FACTPCRARRESTCPRPROVIDED.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTRESUSCITATION.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTRHYTHMDESTINATION.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTROSC.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTWITNESS.txt")

# Medication & Procedure
inspect_file_columns("../data/raw/FACTPCRMEDICATION.txt")
inspect_file_columns("../data/raw/FACTPCRPROCEDURE.txt")
inspect_file_columns("../data/raw/FACTPCRPROTOCOL.txt")

# Additional Modes
inspect_file_columns("../data/raw/FACTPCRADDITIONALRESPONSEMODE.txt")
inspect_file_columns("../data/raw/FACTPCRADDITIONALTRANSPORTMODE.txt")

# Alcohol/Drug Use Indicator
inspect_file_columns("../data/raw/FACTPCRALCOHOLDRUGUSEINDICATOR.txt")

# Work Related Exposure
inspect_file_columns("../data/raw/FACTPCRWORKRELATEDEXPOSURE.txt")

# Patient Groupings
inspect_file_columns("../data/raw/PCRPATIENTRACEGROUP.txt")

# Procedure/ECG Groupings
inspect_file_columns("../data/raw/PCRPROCCOMPGROUP.txt")
inspect_file_columns("../data/raw/PCRMEDCOMPGROUP.txt")
inspect_file_columns("../data/raw/PCRVITALECGGROUP.txt")
inspect_file_columns("../data/raw/PCRVITALECGINTERPRETATIONGROUP.txt")
inspect_file_columns("../data/raw/PCRVITALGLASGOWQUALIFIERGROUP.txt")

# Core Events Table
inspect_file_columns("../data/raw/pub_pcrevents_cp25.txt")

Columns in ../data/raw/FACTPCRPRIMARYIMPRESSION.txt:
- PcrKey
- eSituation_11
Columns in ../data/raw/FACTPCRSECONDARYIMPRESSION.txt:
- PcrKey
- eSituation_12
Columns in ../data/raw/FACTPCRPRIMARYSYMPTOM.txt:
- PcrKey
- eSituation_09
Columns in ../data/raw/FACTPCRADDITIONALSYMPTOM.txt:
- PcrKey
- eSituation_10
Columns in ../data/raw/FACTPCRCAUSEOFINJURY.txt:
- PcrKey
- eInjury_01
Columns in ../data/raw/FACTPCRINJURYRISKFACTOR.txt:
- PcrKey
- eInjury_04
Columns in ../data/raw/FACTPCRTRAUMACRITERIA.txt:
- PcrKey
- eInjury_03
Columns in ../data/raw/FACTPCRBARRIERTOCARE.txt:
- PcrKey
- eHistory_01
Columns in ../data/raw/FACTPCRDISPATCHDELAY.txt:
- PcrKey
- eResponse_08
Columns in ../data/raw/FACTPCRRESPONSEDELAY.txt:
- PcrKey
- eResponse_09
Columns in ../data/raw/FACTPCRSCENEDELAY.txt:
- PcrKey
- eResponse_10
Columns in ../data/raw/FACTPCRTRANSPORTDELAY.txt:
- PcrKey
- eResponse_11
Columns in ../data/raw/FACTPCRTURNAROUNDDELAY.txt:
- PcrKey
- eResponse_12
Columns in ../data/raw/FACTPCRDESTI

In [20]:
# Define target ICD-10 prefixes
target_prefixes = ("T40", "F11")

# Path to primary impressions file
primary_path = "../data/raw/FACTPCRPRIMARYIMPRESSION.txt"

# Prepare chunks
primary_chunks = []

with open(primary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_11"] = chunk["eSituation_11"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        mask = chunk["eSituation_11"].str.startswith(target_prefixes)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            primary_chunks.append(filtered)

primary_df = pd.concat(primary_chunks, ignore_index=True)
print("Primary impressions matched:", len(primary_df))

Processing Primary Impressions: 530it [00:21, 24.21it/s]

Primary impressions matched: 231354





In [21]:
# Path to secondary impressions file
secondary_path = "../data/raw/FACTPCRSECONDARYIMPRESSION.txt"

# Prepare chunks
secondary_chunks = []

with open(secondary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Secondary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_12"] = chunk["eSituation_12"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        mask = chunk["eSituation_12"].str.startswith(target_prefixes)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            secondary_chunks.append(filtered)

secondary_df = pd.concat(secondary_chunks, ignore_index=True)
print("Secondary impressions matched:", len(secondary_df))

Processing Secondary Impressions: 544it [00:22, 23.85it/s]

Secondary impressions matched: 42853





In [22]:
# Combine and deduplicate primary and secondary impression matches
opioid_cases = pd.concat([primary_df, secondary_df], ignore_index=True).drop_duplicates()

# Optionally, turn it into a set for fast lookups
opioid_pcr_keys = set(opioid_cases["PcrKey"])
print("Total unique opioid-related cases:", len(opioid_pcr_keys))

Total unique opioid-related cases: 271206


In [23]:
# Load core events table
events_path = "../data/raw/pub_pcrevents_cp25.txt"
events_chunks = []

with open(events_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Filtering pub_pcrevents_cp25.txt"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            events_chunks.append(filtered)

events_df = pd.concat(events_chunks, ignore_index=True)
print("Filtered events records:", len(events_df))

Filtering pub_pcrevents_cp25.txt: 542it [04:28,  2.02it/s]


Filtered events records: 271206


In [24]:
# Load and aggregate eSituation_09 (Primary Symptom)
symptom_path = "../data/raw/FACTPCRPRIMARYSYMPTOM.txt"
symptom_chunks = []

with open(symptom_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Symptom"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_09"] = chunk["eSituation_09"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            symptom_chunks.append(filtered[["PcrKey", "eSituation_09"]])

symptom_df = pd.concat(symptom_chunks, ignore_index=True)

# Merge into events_df (one-to-one)
events_df = events_df.merge(symptom_df, on="PcrKey", how="left")
print("Merged primary symptom. New shape:", events_df.shape)

Processing Primary Symptom: 534it [00:37, 14.20it/s]


Merged primary symptom. New shape: (271206, 48)


In [25]:
# Load and aggregate eSituation_10 (Other Associated Symptoms)
assoc_path = "../data/raw/FACTPCRADDITIONALSYMPTOM.txt"
assoc_chunks = []

with open(assoc_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Associated Symptoms"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_10"] = chunk["eSituation_10"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            assoc_chunks.append(filtered[["PcrKey", "eSituation_10"]])

assoc_df = pd.concat(assoc_chunks, ignore_index=True)

# Aggregate: count and nunique per PcrKey
assoc_agg = (
    assoc_df.groupby("PcrKey")
    .agg(
        assoc_symptom_count=("eSituation_10", "count"),
        assoc_symptom_unique=("eSituation_10", "nunique")
    )
    .reset_index()
)

# Merge
events_df = events_df.merge(assoc_agg, on="PcrKey", how="left")
print("Merged associated symptoms. New shape:", events_df.shape)

Processing Associated Symptoms: 580it [00:41, 13.94it/s]


Merged associated symptoms. New shape: (271206, 50)


In [26]:
# Reload and merge full Primary and Secondary Impressions

primary_chunks = []
secondary_chunks = []

# Reload Primary Impressions
with open("../data/raw/FACTPCRPRIMARYIMPRESSION.txt", "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Reloading Primary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_11"] = chunk["eSituation_11"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            primary_chunks.append(filtered[["PcrKey", "eSituation_11"]])

# Reload Secondary Impressions
with open("../data/raw/FACTPCRSECONDARYIMPRESSION.txt", "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Reloading Secondary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_12"] = chunk["eSituation_12"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            secondary_chunks.append(filtered[["PcrKey", "eSituation_12"]])

# Concatenate and merge
primary_imp_df = pd.concat(primary_chunks, ignore_index=True)
secondary_imp_df = pd.concat(secondary_chunks, ignore_index=True)

events_df = events_df.merge(primary_imp_df, on="PcrKey", how="left")
print("Merged primary impression. New shape:", events_df.shape)

events_df = events_df.merge(secondary_imp_df, on="PcrKey", how="left")
print("Merged secondary impression. New shape:", events_df.shape)


Reloading Primary Impressions: 530it [00:36, 14.59it/s]
Reloading Secondary Impressions: 544it [00:38, 14.02it/s]


Merged primary impression. New shape: (271206, 51)
Merged secondary impression. New shape: (281813, 52)


In [27]:
# Loading Injury Data

# Paths to source files
cause_path = "../data/raw/FACTPCRCAUSEOFINJURY.txt"
risk_path = "../data/raw/FACTPCRINJURYRISKFACTOR.txt"
trauma_path = "../data/raw/FACTPCRTRAUMACRITERIA.txt"

# Initialize lists
cause_chunks = []
risk_chunks = []
trauma_chunks = []

# Load Cause of Injury (eInjury_01)
with open(cause_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eInjury_01 (Cause of Injury)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eInjury_01"] = chunk["eInjury_01"].str.strip(" ~'")
        cause_chunks.append(chunk[["PcrKey", "eInjury_01"]])

cause_df = pd.concat(cause_chunks, ignore_index=True)

# Load Injury Risk Factor (eInjury_04)
with open(risk_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eInjury_04 (Risk Factor)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eInjury_04"] = chunk["eInjury_04"].str.strip(" ~'")
        risk_chunks.append(chunk[["PcrKey", "eInjury_04"]])

risk_df = pd.concat(risk_chunks, ignore_index=True)

# Load Trauma Criteria (eInjury_03)
with open(trauma_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eInjury_03 (Trauma Criteria)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eInjury_03"] = chunk["eInjury_03"].str.strip(" ~'")
        trauma_chunks.append(chunk[["PcrKey", "eInjury_03"]])

trauma_df = pd.concat(trauma_chunks, ignore_index=True)

# Aggregate each to 1 row per PcrKey
cause_agg = cause_df.groupby("PcrKey").agg(
    injury_cause_count=("eInjury_01", "nunique")
).reset_index()

risk_agg = (
    risk_df.dropna(subset=["eInjury_04"])
    .groupby("PcrKey")
    .size()
    .reset_index(name="injury_risk_flag")
)
risk_agg["injury_risk_flag"] = 1  # presence = 1

trauma_agg = trauma_df.groupby("PcrKey").agg(
    trauma_criteria_count=("eInjury_03", "nunique")
).reset_index()

# Merge into events_df
events_df = events_df.merge(cause_agg, on="PcrKey", how="left")
events_df = events_df.merge(risk_agg, on="PcrKey", how="left")
events_df = events_df.merge(trauma_agg, on="PcrKey", how="left")

# Fill missing with 0
events_df["injury_cause_count"] = events_df["injury_cause_count"].fillna(0).astype(int)
events_df["injury_risk_flag"] = events_df["injury_risk_flag"].fillna(0).astype(int)
events_df["trauma_criteria_count"] = events_df["trauma_criteria_count"].fillna(0).astype(int)

print("Injury-related features merged. New shape:", events_df.shape)

Loading eInjury_01 (Cause of Injury): 545it [00:19, 28.21it/s]
Loading eInjury_04 (Risk Factor): 543it [00:18, 28.80it/s]
Loading eInjury_03 (Trauma Criteria): 543it [00:18, 28.75it/s]


Injury-related features merged. New shape: (281813, 55)


In [28]:
# Load and process eHistory_01 (Barriers to Patient Care)
barrier_path = "../data/raw/FACTPCRBARRIERTOCARE.txt"
barrier_chunks = []

with open(barrier_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eHistory_01 (Barriers to Care)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eHistory_01"] = chunk["eHistory_01"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]

        if not filtered.empty:
            barrier_chunks.append(filtered[["PcrKey", "eHistory_01"]])

barrier_df = pd.concat(barrier_chunks, ignore_index=True)

# Aggregate
barrier_agg = (
    barrier_df.groupby("PcrKey")
    .agg(
        barrier_count=("eHistory_01", "count"),
        barrier_unique=("eHistory_01", "nunique")
    )
    .reset_index()
)
barrier_agg["barrier_present"] = 1

# Merge into main dataset
events_df = events_df.merge(barrier_agg, on="PcrKey", how="left")

# Fill NA with zeros
events_df["barrier_count"] = events_df["barrier_count"].fillna(0).astype(int)
events_df["barrier_unique"] = events_df["barrier_unique"].fillna(0).astype(int)
events_df["barrier_present"] = events_df["barrier_present"].fillna(0).astype(int)

print("Barriers to care merged. New shape:", events_df.shape)

Loading eHistory_01 (Barriers to Care): 550it [00:37, 14.59it/s]


Barriers to care merged. New shape: (281813, 58)


In [29]:
# Delay files and their corresponding column names
delay_sources = {
    "../data/raw/FACTPCRDISPATCHDELAY.txt": "eResponse_08",
    "../data/raw/FACTPCRRESPONSEDELAY.txt": "eResponse_09",
    "../data/raw/FACTPCRSCENEDELAY.txt": "eResponse_10",
    "../data/raw/FACTPCRTRANSPORTDELAY.txt": "eResponse_11",
    "../data/raw/FACTPCRTURNAROUNDDELAY.txt": "eResponse_12",
}

# Loop through each and join delay features
for file_path, col in delay_sources.items():
    print(f"Processing {col} from {file_path}")
    delay_chunks = []

    with open(file_path, "r") as f:
        for chunk in tqdm(
            pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
            desc=f"Loading {col}"
        ):
            chunk.columns = chunk.columns.str.strip(" ~'")
            chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
            chunk[col] = chunk[col].str.strip(" ~'")

            filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
            if not filtered.empty:
                delay_chunks.append(filtered[["PcrKey", col]])

    delay_df = pd.concat(delay_chunks, ignore_index=True)

    # Aggregate delay types per PcrKey
    delay_agg = (
        delay_df.groupby("PcrKey")[col]
        .agg([
            ("{}_count".format(col), "count"),
            ("{}_unique".format(col), "nunique")
        ])
        .reset_index()
    )
    delay_agg["{}_present".format(col)] = 1

    # Merge into main events_df
    events_df = events_df.merge(delay_agg, on="PcrKey", how="left")

    # Fill missing values
    events_df["{}_count".format(col)] = events_df["{}_count".format(col)].fillna(0).astype(int)
    events_df["{}_unique".format(col)] = events_df["{}_unique".format(col)].fillna(0).astype(int)
    events_df["{}_present".format(col)] = events_df["{}_present".format(col)].fillna(0).astype(int)

print("All delay types merged. New shape:", events_df.shape)

Processing eResponse_08 from ../data/raw/FACTPCRDISPATCHDELAY.txt


Loading eResponse_08: 543it [00:40, 13.56it/s]


Processing eResponse_09 from ../data/raw/FACTPCRRESPONSEDELAY.txt


Loading eResponse_09: 545it [00:41, 13.18it/s]


Processing eResponse_10 from ../data/raw/FACTPCRSCENEDELAY.txt


Loading eResponse_10: 546it [00:40, 13.37it/s]


Processing eResponse_11 from ../data/raw/FACTPCRTRANSPORTDELAY.txt


Loading eResponse_11: 544it [00:40, 13.57it/s]


Processing eResponse_12 from ../data/raw/FACTPCRTURNAROUNDDELAY.txt


Loading eResponse_12: 552it [00:40, 13.60it/s]


All delay types merged. New shape: (281813, 73)


In [30]:
# Load Destination Reason and Alert Info

# File paths
dest_reason_path = "../data/raw/FACTPCRDESTINATIONREASON.txt"
dest_team_path = "../data/raw/FACTPCRDESTINATIONTEAM.txt"

# Chunks
reason_chunks = []
team_chunks = []

# eDisposition_20 - Reason for choosing destination
print(f"Processing eDisposition_20 from {dest_reason_path}")
with open(dest_reason_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eDisposition_20"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eDisposition_20"] = chunk["eDisposition_20"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]
        if not filtered.empty:
            reason_chunks.append(filtered[["PcrKey", "eDisposition_20"]])

reason_df = pd.concat(reason_chunks, ignore_index=True)


# eDisposition_24 and eDisposition_25 - Alert type and timestamp
print(f"Processing eDisposition_24 & eDisposition_25 from {dest_team_path}")
with open(dest_team_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eDisposition_24 & 25"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eDisposition_24"] = chunk["eDisposition_24"].str.strip(" ~'")
        chunk["eDisposition_25"] = chunk["eDisposition_25"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]
        if not filtered.empty:
            team_chunks.append(filtered[["PcrKey", "eDisposition_24", "eDisposition_25"]])

team_df = pd.concat(team_chunks, ignore_index=True)

# Merge into events_df
events_df = events_df.merge(reason_df, on="PcrKey", how="left")
events_df = events_df.merge(team_df, on="PcrKey", how="left")

print("Destination-related features merged. New shape:", events_df.shape)

Processing eDisposition_20 from ../data/raw/FACTPCRDESTINATIONREASON.txt


Loading eDisposition_20: 582it [00:36, 16.15it/s]


Processing eDisposition_24 & eDisposition_25 from ../data/raw/FACTPCRDESTINATIONTEAM.txt


Loading eDisposition_24 & 25: 543it [00:45, 11.87it/s]


Destination-related features merged. New shape: (297762, 76)


In [31]:
# Load and aggregate vitals from FACTPCRVITAL.txt

vitals_path = "../data/raw/FACTPCRVITAL.txt"
vitals_chunks = []

# Vital fields of interest and new names
vital_fields = {
    "eVitals_10": "heart_rate",
    "eVitals_14": "resp_rate",
    "eVitals_06": "systolic_bp",
    "eVitals_12": "spo2",
    "eVitals_18": "bgl",
    "eVitals_16": "etco2"
}

with open(vitals_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Vitals"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk = chunk[["PcrKey"] + list(vital_fields.keys())]
        chunk = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        vitals_chunks.append(chunk)

vitals_df = pd.concat(vitals_chunks, ignore_index=True)

# Aggregate: take the first non-null value per PcrKey per vital
agg_dict = {col: "first" for col in vital_fields.keys()}
vitals_agg = (
    vitals_df.groupby("PcrKey")
    .agg(agg_dict)
    .rename(columns=vital_fields)
    .reset_index()
)

# Merge into main dataset
events_df = events_df.merge(vitals_agg, on="PcrKey", how="left")
print("Vitals merged. New shape:", events_df.shape)

Loading Vitals: 1648it [04:56,  5.55it/s]


Vitals merged. New shape: (297762, 82)


In [32]:
# Load and process medication administration data

med_path = "../data/raw/FACTPCRMEDICATION.txt"
med_chunks = []

with open(med_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Medications"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eMedications_03"] = chunk["eMedications_03"].str.strip(" ~'")
        chunk["eMedications_03Descr"] = chunk["eMedications_03Descr"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            med_chunks.append(filtered[["PcrKey", "eMedications_03", "eMedications_03Descr"]])

med_df = pd.concat(med_chunks, ignore_index=True)

# Aggregate
med_agg = (
    med_df.groupby("PcrKey")
    .agg(
        any_med_administered=("eMedications_03", lambda x: 1),
        med_count=("eMedications_03", "nunique"),
        naloxone_administered=("eMedications_03Descr", lambda x: int(x.str.contains("naloxone|narcan", case=False, na=False).any()))
    )
    .reset_index()
)

# Merge
events_df = events_df.merge(med_agg, on="PcrKey", how="left")
events_df[["any_med_administered", "med_count", "naloxone_administered"]] = events_df[["any_med_administered", "med_count", "naloxone_administered"]].fillna(0).astype(int)

print("Medications merged. New shape:", events_df.shape)

Loading Medications: 628it [01:49,  5.76it/s]


Medications merged. New shape: (297762, 85)


In [33]:
# Load and process procedure data

proc_path = "../data/raw/FACTPCRPROCEDURE.txt"
proc_chunks = []

with open(proc_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Procedures"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eProcedures_03"] = chunk["eProcedures_03"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            proc_chunks.append(filtered[["PcrKey", "eProcedures_03"]])

proc_df = pd.concat(proc_chunks, ignore_index=True)

# Aggregate
proc_agg = (
    proc_df.groupby("PcrKey")
    .agg(
        any_procedure=("eProcedures_03", lambda x: 1),
        proc_count=("eProcedures_03", "nunique")
    )
    .reset_index()
)

# Merge
events_df = events_df.merge(proc_agg, on="PcrKey", how="left")
events_df[["any_procedure", "proc_count"]] = events_df[["any_procedure", "proc_count"]].fillna(0).astype(int)

print("Procedures merged. New shape:", events_df.shape)

Loading Procedures: 934it [01:45,  8.84it/s]


Procedures merged. New shape: (297762, 87)


In [34]:
# Alcohol/Drug Use Indicator

alcohol_path = "../data/raw/FACTPCRALCOHOLDRUGUSEINDICATOR.txt"
alcohol_chunks = []

with open(alcohol_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Alcohol/Drug Use Indicator"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eHistory_17"] = chunk["eHistory_17"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            alcohol_chunks.append(filtered[["PcrKey"]])

if alcohol_chunks:
    alcohol_df = pd.concat(alcohol_chunks, ignore_index=True).drop_duplicates()
    alcohol_df["alcohol_drug_use_flag"] = 1

    events_df = events_df.merge(alcohol_df, on="PcrKey", how="left")
else:
    events_df["alcohol_drug_use_flag"] = 0  # add column if no matches

# Always ensure column exists and is filled
if "alcohol_drug_use_flag" not in events_df.columns:
    events_df["alcohol_drug_use_flag"] = 0
else:
    events_df["alcohol_drug_use_flag"] = events_df["alcohol_drug_use_flag"].fillna(0).astype(int)

print("Alcohol/drug use merged. New shape:", events_df.shape)

Loading Alcohol/Drug Use Indicator: 553it [00:32, 16.86it/s]


Alcohol/drug use merged. New shape: (297762, 88)


In [35]:
# Protocols Initiated

protocol_path = "../data/raw/FACTPCRPROTOCOL.txt"
protocol_chunks = []

with open(protocol_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Protocols"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eProtocols_01"] = chunk["eProtocols_01"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            protocol_chunks.append(filtered[["PcrKey", "eProtocols_01"]])

protocol_df = pd.concat(protocol_chunks, ignore_index=True)

# Group all protocol codes per PcrKey into comma-separated strings
protocol_agg = (
    protocol_df.groupby("PcrKey")["eProtocols_01"]
    .apply(lambda x: ",".join(sorted(set(x))))
    .reset_index()
    .rename(columns={"eProtocols_01": "protocols_used"})
)

# Also count how many unique protocols were used
protocol_agg["protocol_count"] = protocol_agg["protocols_used"].apply(lambda x: len(x.split(",")))

events_df = events_df.merge(protocol_agg, on="PcrKey", how="left")
events_df["protocol_count"] = events_df["protocol_count"].fillna(0).astype(int)

print("Protocols merged. New shape:", events_df.shape)

Loading Protocols: 556it [00:35, 15.88it/s]


Protocols merged. New shape: (297762, 90)


In [36]:
# Arrest-related Flags (Witnessed, Resuscitation, CPR)

def load_arrest_flags(file_path, field, flag_name):
    chunks = []
    with open(file_path, "r") as f:
        for chunk in tqdm(
            pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
            desc=f"Loading {field}"
        ):
            chunk.columns = chunk.columns.str.strip(" ~'")
            chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
            chunk[field] = chunk[field].str.strip(" ~'")
            filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

            if not filtered.empty:
                chunks.append(filtered[["PcrKey"]])

    if chunks:
        df = pd.concat(chunks, ignore_index=True).drop_duplicates()
        df[flag_name] = 1
        return df
    else:
        return pd.DataFrame(columns=["PcrKey", flag_name])

arrest_flags = []

arrest_flags.append(load_arrest_flags("../data/raw/FACTPCRARRESTWITNESS.txt", "eArrest_04", "arrest_witnessed_flag"))
arrest_flags.append(load_arrest_flags("../data/raw/FACTPCRARRESTRESUSCITATION.txt", "eArrest_03", "resuscitation_flag"))
arrest_flags.append(load_arrest_flags("../data/raw/FACTPCRARRESTCPRPROVIDED.txt", "eArrest_09", "cpr_provided_flag"))

# Merge each into events_df
for flag_df in arrest_flags:
    events_df = events_df.merge(flag_df, on="PcrKey", how="left")

# Fill missing with 0
for col in ["arrest_witnessed_flag", "resuscitation_flag", "cpr_provided_flag"]:
    events_df[col] = events_df[col].fillna(0).astype(int)

print("Arrest-related flags merged. New shape:", events_df.shape)

Loading eArrest_04: 542it [00:31, 17.46it/s]
Loading eArrest_03: 546it [00:31, 17.58it/s]
Loading eArrest_09: 546it [00:30, 17.72it/s]


Arrest-related flags merged. New shape: (297762, 93)


In [37]:
# ROSC Flag (Return of Spontaneous Circulation)

roscs = []
rosc_path = "../data/raw/FACTPCRARRESTROSC.txt"

with open(rosc_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eArrest_12 (ROSC)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eArrest_12"] = chunk["eArrest_12"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            roscs.append(filtered[["PcrKey"]])

if roscs:
    rosc_df = pd.concat(roscs, ignore_index=True).drop_duplicates()
    rosc_df["rosc_flag"] = 1
    events_df = events_df.merge(rosc_df, on="PcrKey", how="left")
    events_df["rosc_flag"] = events_df["rosc_flag"].fillna(0).astype(int)

print("ROSC merged. New shape:", events_df.shape)

Loading eArrest_12 (ROSC): 543it [00:31, 17.27it/s]


ROSC merged. New shape: (297762, 94)


In [38]:
# Arrest Rhythm at Destination

rhythm_chunks = []
rhythm_path = "../data/raw/FACTPCRARRESTRHYTHMDESTINATION.txt"

with open(rhythm_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eArrest_17 (Destination Rhythm)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eArrest_17"] = chunk["eArrest_17"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            rhythm_chunks.append(filtered[["PcrKey", "eArrest_17"]])

if rhythm_chunks:
    rhythm_df = pd.concat(rhythm_chunks, ignore_index=True)
    rhythm_agg = (
        rhythm_df.groupby("PcrKey")
        .agg(dest_rhythm_count=("eArrest_17", "nunique"))
        .reset_index()
    )
    events_df = events_df.merge(rhythm_agg, on="PcrKey", how="left")
    events_df["dest_rhythm_count"] = events_df["dest_rhythm_count"].fillna(0).astype(int)

print("Destination arrest rhythm merged. New shape:", events_df.shape)

Loading eArrest_17 (Destination Rhythm): 542it [00:30, 17.59it/s]


Destination arrest rhythm merged. New shape: (297762, 95)


In [39]:
# Patient Race Group

race_chunks = []
race_path = "../data/raw/PCRPATIENTRACEGROUP.txt"

with open(race_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading ePatient_14 (Race Group)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["ePatient_14"] = chunk["ePatient_14"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            race_chunks.append(filtered[["PcrKey", "ePatient_14"]])

if race_chunks:
    race_df = pd.concat(race_chunks, ignore_index=True)
    race_agg = (
        race_df.groupby("PcrKey")
        .agg(race_count=("ePatient_14", "nunique"))
        .reset_index()
    )
    events_df = events_df.merge(race_agg, on="PcrKey", how="left")
    events_df["race_count"] = events_df["race_count"].fillna(0).astype(int)

print("Patient race info merged. New shape:", events_df.shape)

Loading ePatient_14 (Race Group): 548it [00:39, 13.83it/s]


Patient race info merged. New shape: (297762, 96)


In [40]:
# Response Mode and Transport Mode

response_path = "../data/raw/FACTPCRADDITIONALRESPONSEMODE.txt"
transport_path = "../data/raw/FACTPCRADDITIONALTRANSPORTMODE.txt"

# Load response mode (eResponse_24)
resp_chunks = []
with open(response_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eResponse_24 (Response Mode)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eResponse_24"] = chunk["eResponse_24"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            resp_chunks.append(filtered[["PcrKey", "eResponse_24"]])

if resp_chunks:
    resp_df = pd.concat(resp_chunks, ignore_index=True).drop_duplicates("PcrKey")
    events_df = events_df.merge(resp_df, on="PcrKey", how="left")

# Load transport mode (eDisposition_18)
trans_chunks = []
with open(transport_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eDisposition_18 (Transport Mode)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eDisposition_18"] = chunk["eDisposition_18"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            trans_chunks.append(filtered[["PcrKey", "eDisposition_18"]])

if trans_chunks:
    trans_df = pd.concat(trans_chunks, ignore_index=True).drop_duplicates("PcrKey")
    events_df = events_df.merge(trans_df, on="PcrKey", how="left")

print("Response and transport mode merged. New shape:", events_df.shape)

Loading eResponse_24 (Response Mode): 612it [00:34, 17.79it/s]
Loading eDisposition_18 (Transport Mode): 578it [00:33, 17.13it/s]


Response and transport mode merged. New shape: (297762, 98)


In [41]:
# Final DataFrame Inspection

print("Final shape:", events_df.shape)

# Count of missing values per column
null_counts = events_df.isnull().sum().sort_values(ascending=False)
print("\nMissing values per column:")
print(null_counts[null_counts > 0])

# Count of unique values per column (helps spot single-value or ID-like fields)
print("\nUnique values per column (non-null):")
print(events_df.nunique().sort_values(ascending=False))

# Quick look at distributions of some key columns
print("\nValue counts for selected columns:")
for col in [
    "eOutcome_01", "eOutcome_02", "eMedications_03", 
    "eSituation_11", "eSituation_12", "injury_cause_count",
    "injury_risk_flag", "trauma_criteria_count", "alcohol_drug_use_flag"
]:
    if col in events_df.columns:
        print(f"\n--- {col} ---")
        print(events_df[col].value_counts(dropna=False).head(10))

Final shape: (297762, 98)

Missing values per column:
eSituation_11           5996
eSituation_09           5429
eSituation_12           4196
assoc_symptom_count     3715
assoc_symptom_unique    3715
etco2                    145
bgl                      145
heart_rate               145
resp_rate                145
systolic_bp              145
spo2                     145
dtype: int64

Unique values per column (non-null):
PcrKey                  271206
eTimes_05               264238
eTimes_06               263980
eTimes_03               261992
eTimes_13               258661
                         ...  
any_procedure                1
eResponse_09_present         1
eResponse_12_present         1
eResponse_11_present         1
barrier_present              1
Length: 98, dtype: int64

Value counts for selected columns:

--- eOutcome_01 ---
eOutcome_01
~7701003   ~    247485
~7701001   ~     44600
~01        ~      2240
~30        ~      1450
~09        ~       955
~07        ~       315
~02

In [42]:
print(events_df.columns)

Index(['PcrKey', 'eDispatch_01', 'eDispatch_02', 'eArrest_14', 'eArrest_01',
       'eArrest_02', 'eArrest_05', 'eArrest_07', 'eArrest_11', 'eArrest_16',
       'eArrest_18', 'eDisposition_12', 'eDisposition_19', 'eDisposition_16',
       'eDisposition_21', 'eDisposition_22', 'eDisposition_23', 'eOutcome_01',
       'eOutcome_02', 'ePatient_15', 'ePatient_16', 'ePayment_01',
       'ePayment_50', 'eResponse_05', 'eResponse_07', 'eResponse_15',
       'eResponse_23', 'eScene_01', 'eScene_06', 'eScene_07', 'eScene_08',
       'eScene_09', 'eSituation_02', 'eSituation_07', 'eSituation_08',
       'eSituation_13', 'eSituation_01', 'eTimes_01', 'eTimes_03', 'eTimes_05',
       'eTimes_06', 'eTimes_07', 'eTimes_09', 'eTimes_11', 'eTimes_12',
       'eTimes_13', 'eDisposition_17', 'eSituation_09', 'assoc_symptom_count',
       'assoc_symptom_unique', 'eSituation_11', 'eSituation_12',
       'injury_cause_count', 'injury_risk_flag', 'trauma_criteria_count',
       'barrier_count', 'barrier_uni

In [43]:
# Save the final dataset
events_df.to_csv("../data/interim/opioid_cases_full.csv", index=False)
print("Saved to ../data/interim/opioid_cases_full.csv")

Saved to ../data/interim/opioid_cases_full.csv
