In [14]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

In [15]:
def inspect_file_columns(file_path, n_rows=5):
    """
    Quickly read the first few rows of a pipe-delimited file
    and print cleaned column names.
    """
    df = pd.read_csv(
        file_path,
        delimiter="|",
        nrows=n_rows,
        dtype=str
    )
    # Clean columns
    df.columns = df.columns.str.strip(" ~'")
    print(f"Columns in {file_path}:")
    for col in df.columns:
        print(f"- {col}")

In [31]:
# Provider Impressions
inspect_file_columns("../data/raw/FACTPCRPRIMARYIMPRESSION.txt")
inspect_file_columns("../data/raw/FACTPCRSECONDARYIMPRESSION.txt")

# Symptoms
inspect_file_columns("../data/raw/FACTPCRPRIMARYSYMPTOM.txt")
inspect_file_columns("../data/raw/FACTPCRADDITIONALSYMPTOM.txt")

# Cause of Injury and Trauma
inspect_file_columns("../data/raw/FACTPCRCAUSEOFINJURY.txt")
inspect_file_columns("../data/raw/FACTPCRINJURYRISKFACTOR.txt")
inspect_file_columns("../data/raw/FACTPCRTRAUMACRITERIA.txt")

# Barriers to Care
inspect_file_columns("../data/raw/FACTPCRBARRIERTOCARE.txt")

# Delay Types
inspect_file_columns("../data/raw/FACTPCRDISPATCHDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRRESPONSEDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRSCENEDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRTRANSPORTDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRTURNAROUNDDELAY.txt")

# Destination Details
inspect_file_columns("../data/raw/FACTPCRDESTINATIONREASON.txt")
inspect_file_columns("../data/raw/FACTPCRDESTINATIONTEAM.txt")

# Vitals
inspect_file_columns("../data/raw/FACTPCRVITAL.txt")

# Arrest & CPR
inspect_file_columns("../data/raw/FACTPCRARRESTCPRPROVIDED.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTRESUSCITATION.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTRHYTHMDESTINATION.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTROSC.txt")
inspect_file_columns("../data/raw/FACTPCRARRESTWITNESS.txt")

# Medication & Procedure
inspect_file_columns("../data/raw/FACTPCRMEDICATION.txt")
inspect_file_columns("../data/raw/FACTPCRPROCEDURE.txt")
inspect_file_columns("../data/raw/FACTPCRPROTOCOL.txt")

# Additional Modes
inspect_file_columns("../data/raw/FACTPCRADDITIONALRESPONSEMODE.txt")
inspect_file_columns("../data/raw/FACTPCRADDITIONALTRANSPORTMODE.txt")

# Alcohol/Drug Use Indicator
inspect_file_columns("../data/raw/FACTPCRALCOHOLDRUGUSEINDICATOR.txt")

# Work Related Exposure
inspect_file_columns("../data/raw/FACTPCRWORKRELATEDEXPOSURE.txt")

# Patient Groupings
inspect_file_columns("../data/raw/PCRPATIENTRACEGROUP.txt")

# Procedure/ECG Groupings
inspect_file_columns("../data/raw/PCRPROCCOMPGROUP.txt")
inspect_file_columns("../data/raw/PCRMEDCOMPGROUP.txt")
inspect_file_columns("../data/raw/PCRVITALECGGROUP.txt")
inspect_file_columns("../data/raw/PCRVITALECGINTERPRETATIONGROUP.txt")
inspect_file_columns("../data/raw/PCRVITALGLASGOWQUALIFIERGROUP.txt")

# Core Events Table
inspect_file_columns("../data/raw/pub_pcrevents_cp25.txt")

Columns in ../data/raw/FACTPCRPRIMARYIMPRESSION.txt:
- PcrKey
- eSituation_11
Columns in ../data/raw/FACTPCRSECONDARYIMPRESSION.txt:
- PcrKey
- eSituation_12
Columns in ../data/raw/FACTPCRPRIMARYSYMPTOM.txt:
- PcrKey
- eSituation_09
Columns in ../data/raw/FACTPCRADDITIONALSYMPTOM.txt:
- PcrKey
- eSituation_10
Columns in ../data/raw/FACTPCRCAUSEOFINJURY.txt:
- PcrKey
- eInjury_01
Columns in ../data/raw/FACTPCRINJURYRISKFACTOR.txt:
- PcrKey
- eInjury_04
Columns in ../data/raw/FACTPCRTRAUMACRITERIA.txt:
- PcrKey
- eInjury_03
Columns in ../data/raw/FACTPCRBARRIERTOCARE.txt:
- PcrKey
- eHistory_01
Columns in ../data/raw/FACTPCRDISPATCHDELAY.txt:
- PcrKey
- eResponse_08
Columns in ../data/raw/FACTPCRRESPONSEDELAY.txt:
- PcrKey
- eResponse_09
Columns in ../data/raw/FACTPCRSCENEDELAY.txt:
- PcrKey
- eResponse_10
Columns in ../data/raw/FACTPCRTRANSPORTDELAY.txt:
- PcrKey
- eResponse_11
Columns in ../data/raw/FACTPCRTURNAROUNDDELAY.txt:
- PcrKey
- eResponse_12
Columns in ../data/raw/FACTPCRDESTI

In [17]:
# Define target ICD-10 prefixes
target_prefixes = ("T40", "F11")

# Path to primary impressions file
primary_path = "../data/raw/FACTPCRPRIMARYIMPRESSION.txt"

# Prepare chunks
primary_chunks = []

with open(primary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_11"] = chunk["eSituation_11"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        mask = chunk["eSituation_11"].str.startswith(target_prefixes)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            primary_chunks.append(filtered)

primary_df = pd.concat(primary_chunks, ignore_index=True)
print("Primary impressions matched:", len(primary_df))

Processing Primary Impressions: 530it [00:22, 23.91it/s]

Primary impressions matched: 231354





In [18]:
# Path to secondary impressions file
secondary_path = "../data/raw/FACTPCRSECONDARYIMPRESSION.txt"

# Prepare chunks
secondary_chunks = []

with open(secondary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Secondary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_12"] = chunk["eSituation_12"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        mask = chunk["eSituation_12"].str.startswith(target_prefixes)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            secondary_chunks.append(filtered)

secondary_df = pd.concat(secondary_chunks, ignore_index=True)
print("Secondary impressions matched:", len(secondary_df))

Processing Secondary Impressions: 544it [00:23, 23.08it/s]

Secondary impressions matched: 42853





In [19]:
# Combine and deduplicate primary and secondary impression matches
opioid_cases = pd.concat([primary_df, secondary_df], ignore_index=True).drop_duplicates()

# Optionally, turn it into a set for fast lookups
opioid_pcr_keys = set(opioid_cases["PcrKey"])
print("Total unique opioid-related cases:", len(opioid_pcr_keys))

Total unique opioid-related cases: 271206


In [20]:
# Load core events table
events_path = "../data/raw/pub_pcrevents_cp25.txt"
events_chunks = []

with open(events_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Filtering pub_pcrevents_cp25.txt"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            events_chunks.append(filtered)

events_df = pd.concat(events_chunks, ignore_index=True)
print("Filtered events records:", len(events_df))

Filtering pub_pcrevents_cp25.txt: 542it [04:18,  2.10it/s]

Filtered events records: 271206





In [21]:
# Load and aggregate eSituation_09 (Primary Symptom)
symptom_path = "../data/raw/FACTPCRPRIMARYSYMPTOM.txt"
symptom_chunks = []

with open(symptom_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Symptom"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_09"] = chunk["eSituation_09"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            symptom_chunks.append(filtered[["PcrKey", "eSituation_09"]])

symptom_df = pd.concat(symptom_chunks, ignore_index=True)

# Merge into events_df (one-to-one)
events_df = events_df.merge(symptom_df, on="PcrKey", how="left")
print("Merged primary symptom. New shape:", events_df.shape)

Processing Primary Symptom: 534it [00:35, 15.05it/s]


Merged primary symptom. New shape: (271206, 48)


In [22]:
# Load and aggregate eSituation_10 (Other Associated Symptoms)
assoc_path = "../data/raw/FACTPCRADDITIONALSYMPTOM.txt"
assoc_chunks = []

with open(assoc_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Associated Symptoms"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_10"] = chunk["eSituation_10"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            assoc_chunks.append(filtered[["PcrKey", "eSituation_10"]])

assoc_df = pd.concat(assoc_chunks, ignore_index=True)

# Aggregate: count and nunique per PcrKey
assoc_agg = (
    assoc_df.groupby("PcrKey")
    .agg(
        assoc_symptom_count=("eSituation_10", "count"),
        assoc_symptom_unique=("eSituation_10", "nunique")
    )
    .reset_index()
)

# Merge
events_df = events_df.merge(assoc_agg, on="PcrKey", how="left")
print("Merged associated symptoms. New shape:", events_df.shape)

Processing Associated Symptoms: 580it [00:38, 15.00it/s]


Merged associated symptoms. New shape: (271206, 50)


In [23]:
# Reload and merge full Primary and Secondary Impressions

primary_chunks = []
secondary_chunks = []

# Reload Primary Impressions
with open("../data/raw/FACTPCRPRIMARYIMPRESSION.txt", "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Reloading Primary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_11"] = chunk["eSituation_11"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            primary_chunks.append(filtered[["PcrKey", "eSituation_11"]])

# Reload Secondary Impressions
with open("../data/raw/FACTPCRSECONDARYIMPRESSION.txt", "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Reloading Secondary Impressions"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eSituation_12"] = chunk["eSituation_12"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
        if not filtered.empty:
            secondary_chunks.append(filtered[["PcrKey", "eSituation_12"]])

# Concatenate and merge
primary_imp_df = pd.concat(primary_chunks, ignore_index=True)
secondary_imp_df = pd.concat(secondary_chunks, ignore_index=True)

events_df = events_df.merge(primary_imp_df, on="PcrKey", how="left")
print("Merged primary impression. New shape:", events_df.shape)

events_df = events_df.merge(secondary_imp_df, on="PcrKey", how="left")
print("Merged secondary impression. New shape:", events_df.shape)


Reloading Primary Impressions: 530it [00:33, 15.79it/s]
Reloading Secondary Impressions: 544it [00:35, 15.37it/s]


Merged primary impression. New shape: (271206, 51)
Merged secondary impression. New shape: (281813, 52)


In [25]:
# Loading Injury Data

# Paths to source files
cause_path = "../data/raw/FACTPCRCAUSEOFINJURY.txt"
risk_path = "../data/raw/FACTPCRINJURYRISKFACTOR.txt"
trauma_path = "../data/raw/FACTPCRTRAUMACRITERIA.txt"

# Initialize lists
cause_chunks = []
risk_chunks = []
trauma_chunks = []

# Load Cause of Injury (eInjury_01)
with open(cause_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eInjury_01 (Cause of Injury)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eInjury_01"] = chunk["eInjury_01"].str.strip(" ~'")
        cause_chunks.append(chunk[["PcrKey", "eInjury_01"]])

cause_df = pd.concat(cause_chunks, ignore_index=True)

# Load Injury Risk Factor (eInjury_04)
with open(risk_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eInjury_04 (Risk Factor)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eInjury_04"] = chunk["eInjury_04"].str.strip(" ~'")
        risk_chunks.append(chunk[["PcrKey", "eInjury_04"]])

risk_df = pd.concat(risk_chunks, ignore_index=True)

# Load Trauma Criteria (eInjury_03)
with open(trauma_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eInjury_03 (Trauma Criteria)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eInjury_03"] = chunk["eInjury_03"].str.strip(" ~'")
        trauma_chunks.append(chunk[["PcrKey", "eInjury_03"]])

trauma_df = pd.concat(trauma_chunks, ignore_index=True)

# Aggregate each to 1 row per PcrKey
cause_agg = cause_df.groupby("PcrKey").agg(
    injury_cause_count=("eInjury_01", "nunique")
).reset_index()

risk_agg = (
    risk_df.dropna(subset=["eInjury_04"])
    .groupby("PcrKey")
    .size()
    .reset_index(name="injury_risk_flag")
)
risk_agg["injury_risk_flag"] = 1  # presence = 1

trauma_agg = trauma_df.groupby("PcrKey").agg(
    trauma_criteria_count=("eInjury_03", "nunique")
).reset_index()

# Merge into events_df
events_df = events_df.merge(cause_agg, on="PcrKey", how="left")
events_df = events_df.merge(risk_agg, on="PcrKey", how="left")
events_df = events_df.merge(trauma_agg, on="PcrKey", how="left")

# Fill missing with 0
events_df["injury_cause_count"] = events_df["injury_cause_count"].fillna(0).astype(int)
events_df["injury_risk_flag"] = events_df["injury_risk_flag"].fillna(0).astype(int)
events_df["trauma_criteria_count"] = events_df["trauma_criteria_count"].fillna(0).astype(int)

print("Injury-related features merged. New shape:", events_df.shape)

Loading eInjury_01 (Cause of Injury): 545it [00:18, 29.59it/s]
Loading eInjury_04 (Risk Factor): 543it [00:18, 29.93it/s]
Loading eInjury_03 (Trauma Criteria): 543it [00:18, 29.73it/s]


Injury-related features merged. New shape: (281813, 55)


In [26]:
# Load and process eHistory_01 (Barriers to Patient Care)
barrier_path = "../data/raw/FACTPCRBARRIERTOCARE.txt"
barrier_chunks = []

with open(barrier_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eHistory_01 (Barriers to Care)"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eHistory_01"] = chunk["eHistory_01"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]

        if not filtered.empty:
            barrier_chunks.append(filtered[["PcrKey", "eHistory_01"]])

barrier_df = pd.concat(barrier_chunks, ignore_index=True)

# Aggregate
barrier_agg = (
    barrier_df.groupby("PcrKey")
    .agg(
        barrier_count=("eHistory_01", "count"),
        barrier_unique=("eHistory_01", "nunique")
    )
    .reset_index()
)
barrier_agg["barrier_present"] = 1

# Merge into main dataset
events_df = events_df.merge(barrier_agg, on="PcrKey", how="left")

# Fill NA with zeros
events_df["barrier_count"] = events_df["barrier_count"].fillna(0).astype(int)
events_df["barrier_unique"] = events_df["barrier_unique"].fillna(0).astype(int)
events_df["barrier_present"] = events_df["barrier_present"].fillna(0).astype(int)

print("Barriers to care merged. New shape:", events_df.shape)

Loading eHistory_01 (Barriers to Care): 550it [00:38, 14.44it/s]


Barriers to care merged. New shape: (281813, 58)


In [27]:
# Delay files and their corresponding column names
delay_sources = {
    "../data/raw/FACTPCRDISPATCHDELAY.txt": "eResponse_08",
    "../data/raw/FACTPCRRESPONSEDELAY.txt": "eResponse_09",
    "../data/raw/FACTPCRSCENEDELAY.txt": "eResponse_10",
    "../data/raw/FACTPCRTRANSPORTDELAY.txt": "eResponse_11",
    "../data/raw/FACTPCRTURNAROUNDDELAY.txt": "eResponse_12",
}

# Loop through each and join delay features
for file_path, col in delay_sources.items():
    print(f"Processing {col} from {file_path}")
    delay_chunks = []

    with open(file_path, "r") as f:
        for chunk in tqdm(
            pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
            desc=f"Loading {col}"
        ):
            chunk.columns = chunk.columns.str.strip(" ~'")
            chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
            chunk[col] = chunk[col].str.strip(" ~'")

            filtered = chunk[chunk["PcrKey"].isin(opioid_pcr_keys)]
            if not filtered.empty:
                delay_chunks.append(filtered[["PcrKey", col]])

    delay_df = pd.concat(delay_chunks, ignore_index=True)

    # Aggregate delay types per PcrKey
    delay_agg = (
        delay_df.groupby("PcrKey")[col]
        .agg([
            ("{}_count".format(col), "count"),
            ("{}_unique".format(col), "nunique")
        ])
        .reset_index()
    )
    delay_agg["{}_present".format(col)] = 1

    # Merge into main events_df
    events_df = events_df.merge(delay_agg, on="PcrKey", how="left")

    # Fill missing values
    events_df["{}_count".format(col)] = events_df["{}_count".format(col)].fillna(0).astype(int)
    events_df["{}_unique".format(col)] = events_df["{}_unique".format(col)].fillna(0).astype(int)
    events_df["{}_present".format(col)] = events_df["{}_present".format(col)].fillna(0).astype(int)

print("All delay types merged. New shape:", events_df.shape)

Processing eResponse_08 from ../data/raw/FACTPCRDISPATCHDELAY.txt


Loading eResponse_08: 543it [00:37, 14.38it/s]


Processing eResponse_09 from ../data/raw/FACTPCRRESPONSEDELAY.txt


Loading eResponse_09: 545it [00:39, 13.63it/s]


Processing eResponse_10 from ../data/raw/FACTPCRSCENEDELAY.txt


Loading eResponse_10: 546it [00:41, 13.18it/s]


Processing eResponse_11 from ../data/raw/FACTPCRTRANSPORTDELAY.txt


Loading eResponse_11: 544it [00:40, 13.45it/s]


Processing eResponse_12 from ../data/raw/FACTPCRTURNAROUNDDELAY.txt


Loading eResponse_12: 552it [00:41, 13.23it/s]


All delay types merged. New shape: (281813, 73)


In [29]:
# Load Destination Reason and Alert Info

# File paths
dest_reason_path = "../data/raw/FACTPCRDESTINATIONREASON.txt"
dest_team_path = "../data/raw/FACTPCRDESTINATIONTEAM.txt"

# Chunks
reason_chunks = []
team_chunks = []

# eDisposition_20 - Reason for choosing destination
print(f"Processing eDisposition_20 from {dest_reason_path}")
with open(dest_reason_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eDisposition_20"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eDisposition_20"] = chunk["eDisposition_20"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]
        if not filtered.empty:
            reason_chunks.append(filtered[["PcrKey", "eDisposition_20"]])

reason_df = pd.concat(reason_chunks, ignore_index=True)


# eDisposition_24 and eDisposition_25 - Alert type and timestamp
print(f"Processing eDisposition_24 & eDisposition_25 from {dest_team_path}")
with open(dest_team_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading eDisposition_24 & 25"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eDisposition_24"] = chunk["eDisposition_24"].str.strip(" ~'")
        chunk["eDisposition_25"] = chunk["eDisposition_25"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]
        if not filtered.empty:
            team_chunks.append(filtered[["PcrKey", "eDisposition_24", "eDisposition_25"]])

team_df = pd.concat(team_chunks, ignore_index=True)

# Merge into events_df
events_df = events_df.merge(reason_df, on="PcrKey", how="left")
events_df = events_df.merge(team_df, on="PcrKey", how="left")

print("Destination-related features merged. New shape:", events_df.shape)

Processing eDisposition_20 from ../data/raw/FACTPCRDESTINATIONREASON.txt


Loading eDisposition_20: 582it [00:32, 18.08it/s]


Processing eDisposition_24 & eDisposition_25 from ../data/raw/FACTPCRDESTINATIONTEAM.txt


Loading eDisposition_24 & 25: 543it [00:40, 13.37it/s]


Destination-related features merged. New shape: (297762, 76)


In [32]:
# Load and aggregate vitals from FACTPCRVITAL.txt

vitals_path = "../data/raw/FACTPCRVITAL.txt"
vitals_chunks = []

# Vital fields of interest and new names
vital_fields = {
    "eVitals_10": "heart_rate",
    "eVitals_14": "resp_rate",
    "eVitals_06": "systolic_bp",
    "eVitals_12": "spo2",
    "eVitals_18": "bgl",
    "eVitals_16": "etco2"
}

with open(vitals_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Vitals"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk = chunk[["PcrKey"] + list(vital_fields.keys())]
        chunk = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        vitals_chunks.append(chunk)

vitals_df = pd.concat(vitals_chunks, ignore_index=True)

# Aggregate: take the first non-null value per PcrKey per vital
agg_dict = {col: "first" for col in vital_fields.keys()}
vitals_agg = (
    vitals_df.groupby("PcrKey")
    .agg(agg_dict)
    .rename(columns=vital_fields)
    .reset_index()
)

# Merge into main dataset
events_df = events_df.merge(vitals_agg, on="PcrKey", how="left")
print("Vitals merged. New shape:", events_df.shape)

Loading Vitals: 1648it [04:38,  5.93it/s]


Vitals merged. New shape: (297762, 82)


In [33]:
# Load and process medication administration data

med_path = "../data/raw/FACTPCRMEDICATION.txt"
med_chunks = []

with open(med_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Medications"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eMedications_03"] = chunk["eMedications_03"].str.strip(" ~'")
        chunk["eMedications_03Descr"] = chunk["eMedications_03Descr"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            med_chunks.append(filtered[["PcrKey", "eMedications_03", "eMedications_03Descr"]])

med_df = pd.concat(med_chunks, ignore_index=True)

# Aggregate
med_agg = (
    med_df.groupby("PcrKey")
    .agg(
        any_med_administered=("eMedications_03", lambda x: 1),
        med_count=("eMedications_03", "nunique"),
        naloxone_administered=("eMedications_03Descr", lambda x: int(x.str.contains("naloxone|narcan", case=False, na=False).any()))
    )
    .reset_index()
)

# Merge
events_df = events_df.merge(med_agg, on="PcrKey", how="left")
events_df[["any_med_administered", "med_count", "naloxone_administered"]] = events_df[["any_med_administered", "med_count", "naloxone_administered"]].fillna(0).astype(int)

print("Medications merged. New shape:", events_df.shape)

Loading Medications: 628it [01:44,  6.03it/s]


Medications merged. New shape: (297762, 85)


In [34]:
# Load and process procedure data

proc_path = "../data/raw/FACTPCRPROCEDURE.txt"
proc_chunks = []

with open(proc_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Procedures"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~'")
        chunk["eProcedures_03"] = chunk["eProcedures_03"].str.strip(" ~'")
        filtered = chunk[chunk["PcrKey"].isin(events_df["PcrKey"])]

        if not filtered.empty:
            proc_chunks.append(filtered[["PcrKey", "eProcedures_03"]])

proc_df = pd.concat(proc_chunks, ignore_index=True)

# Aggregate
proc_agg = (
    proc_df.groupby("PcrKey")
    .agg(
        any_procedure=("eProcedures_03", lambda x: 1),
        proc_count=("eProcedures_03", "nunique")
    )
    .reset_index()
)

# Merge
events_df = events_df.merge(proc_agg, on="PcrKey", how="left")
events_df[["any_procedure", "proc_count"]] = events_df[["any_procedure", "proc_count"]].fillna(0).astype(int)

print("Procedures merged. New shape:", events_df.shape)

Loading Procedures: 934it [01:41,  9.22it/s]


Procedures merged. New shape: (297762, 87)


In [7]:
cause_injury_path = "../data/raw/FACTPCRCAUSEOFINJURY.txt"

cause_injury_chunks = []

with open(cause_injury_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Cause of Injury"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        
        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            cause_injury_chunks.append(filtered)

cause_injury_df = pd.concat(cause_injury_chunks, ignore_index=True)
print("Cause of Injury records loaded:", len(cause_injury_df))

cause_injury_agg = (
    cause_injury_df.groupby("PcrKey")
    .agg(
        cause_injury_first=("eInjury_01", "first"),
        cause_injury_count=("eInjury_01", "count"),
        unique_causes=("eInjury_01", "nunique")
    )
    .reset_index()
)

Loading Cause of Injury: 545it [00:34, 15.57it/s]


Cause of Injury records loaded: 271756


In [11]:
def inspect_file_columns(file_path, n_rows=5):
    """
    Quickly read the first few rows of a pipe-delimited file
    and print cleaned column names.
    """
    df = pd.read_csv(
        file_path,
        delimiter="|",
        nrows=n_rows,
        dtype=str
    )
    # Clean columns
    df.columns = df.columns.str.strip(" ~'")
    print(f"Columns in {file_path}:")
    for col in df.columns:
        print(f"- {col}")

# Symptoms
inspect_file_columns("../data/raw/FACTPCRPRIMARYSYMPTOM.txt")
inspect_file_columns("../data/raw/FACTPCRADDITIONALSYMPTOM.txt")

# Cause of injury
inspect_file_columns("../data/raw/FACTPCRCAUSEOFINJURY.txt")
inspect_file_columns("../data/raw/FACTPCRINJURYRISKFACTOR.txt")

# Trauma criteria
inspect_file_columns("../data/raw/FACTPCRTRAUMACRITERIA.txt")

# Barriers to care
inspect_file_columns("../data/raw/FACTPCRBARRIERTOCARE.txt")

# Response and transport delays
inspect_file_columns("../data/raw/FACTPCRDISPATCHDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRRESPONSEDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRSCENEDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRTRANSPORTDELAY.txt")
inspect_file_columns("../data/raw/FACTPCRTURNAROUNDDELAY.txt")

# Destination details
inspect_file_columns("../data/raw/FACTPCRDESTINATIONREASON.txt")
inspect_file_columns("../data/raw/FACTPCRDESTINATIONTEAM.txt")

# Work related exposure
inspect_file_columns("../data/raw/FACTPCRWORKRELATEDEXPOSURE.txt")

Columns in ../data/raw/FACTPCRPRIMARYSYMPTOM.txt:
- PcrKey
- eSituation_09
Columns in ../data/raw/FACTPCRADDITIONALSYMPTOM.txt:
- PcrKey
- eSituation_10
Columns in ../data/raw/FACTPCRCAUSEOFINJURY.txt:
- PcrKey
- eInjury_01
Columns in ../data/raw/FACTPCRINJURYRISKFACTOR.txt:
- PcrKey
- eInjury_04
Columns in ../data/raw/FACTPCRTRAUMACRITERIA.txt:
- PcrKey
- eInjury_03
Columns in ../data/raw/FACTPCRBARRIERTOCARE.txt:
- PcrKey
- eHistory_01
Columns in ../data/raw/FACTPCRDISPATCHDELAY.txt:
- PcrKey
- eResponse_08
Columns in ../data/raw/FACTPCRRESPONSEDELAY.txt:
- PcrKey
- eResponse_09
Columns in ../data/raw/FACTPCRSCENEDELAY.txt:
- PcrKey
- eResponse_10
Columns in ../data/raw/FACTPCRTRANSPORTDELAY.txt:
- PcrKey
- eResponse_11
Columns in ../data/raw/FACTPCRTURNAROUNDDELAY.txt:
- PcrKey
- eResponse_12
Columns in ../data/raw/FACTPCRDESTINATIONREASON.txt:
- PcrKey
- eDisposition_20
Columns in ../data/raw/FACTPCRDESTINATIONTEAM.txt:
- eDisposition_25
- PcrKey
- eDisposition_24
Columns in ../dat

In [8]:
injury_risk_path = "../data/raw/FACTPCRINJURYRISKFACTOR.txt"

injury_risk_chunks = []

with open(injury_risk_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Injury Risk Factor"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        
        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            injury_risk_chunks.append(filtered)

injury_risk_df = pd.concat(injury_risk_chunks, ignore_index=True)
print("Injury Risk Factor records loaded:", len(injury_risk_df))

injury_risk_agg = (
    injury_risk_df.groupby("PcrKey")
    .agg(
        injury_risk_count=("eInjury_03", "count"),
        unique_injury_risks=("eInjury_03", "nunique")
    )
    .reset_index()
)

Loading Injury Risk Factor: 543it [00:35, 15.51it/s]


Injury Risk Factor records loaded: 271206


KeyError: "Column(s) ['eInjury_03'] do not exist"

In [12]:
# Path to event file
event_path = "../data/raw/pub_pcrevents_cp25.txt"

# Prepare chunks
event_chunks = []

with open(event_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Event Records"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]

        if not filtered.empty:
            event_chunks.append(filtered)

event_df = pd.concat(event_chunks, ignore_index=True)

print("Event records loaded:", len(event_df))

Loading Event Records: 542it [04:33,  1.98it/s]

Event records loaded: 271206





In [13]:
# Path to Vitals file
vitals_path = "../data/raw/FACTPCRVITAL.txt"

# Define vitals columns to extract and aggregate
vital_cols = {
    "HeartRate": "eVitals_10",
    "RespRate": "eVitals_14",
    "SystolicBP": "eVitals_06",
    "SpO2": "eVitals_12",
    "BGL": "eVitals_18",
    "ETCO2": "eVitals_16",
    "GCS_Eye": "eVitals_19",
    "GCS_Verbal": "eVitals_20",
    "GCS_Motor": "eVitals_21"
}

# Prepare chunks
vitals_chunks = []

with open(vitals_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Vitals"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)].copy()
        if not filtered.empty:
            for name, col in vital_cols.items():
                if col in filtered.columns:
                    extracted = filtered[col].str.extract(r"(\d+\.?\d*)")[0]
                    filtered[name] = pd.to_numeric(extracted, errors="coerce").where(lambda x: x < 1000)
            vitals_chunks.append(filtered[["PcrKey"] + list(vital_cols.keys())])

# Combine all
vitals_df = pd.concat(vitals_chunks, ignore_index=True)

# Aggregate per PcrKey
vitals_agg = (
    vitals_df.groupby("PcrKey")[list(vital_cols.keys())]
    .agg(["first", "last", "min", "max", "mean", "std", "count"])
)

# Flatten column names
vitals_agg.columns = ["_".join(col).strip() for col in vitals_agg.columns.values]
vitals_agg = vitals_agg.reset_index()

print("Vitals aggregated:", vitals_agg.shape)

Loading Vitals: 1648it [05:29,  5.00it/s]


Vitals aggregated: (271071, 64)


In [14]:
# Path to Medications file
meds_path = "../data/raw/FACTPCRMEDICATION.txt"

# Prepare chunks
meds_chunks = []

with open(meds_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Medications"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")
        chunk["eMedications_03Descr"] = chunk["eMedications_03Descr"].str.strip().str.lower()

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            meds_chunks.append(filtered)

# Combine all
meds_df = pd.concat(meds_chunks, ignore_index=True)

print("Medications records loaded:", len(meds_df))

Loading Medications: 628it [01:44,  6.02it/s]

Medications records loaded: 456070





In [15]:
# Flag Naloxone
naloxone_flag = meds_df["eMedications_03Descr"].str.contains("naloxone|narcan", na=False)

# Aggregate medication info
meds_agg = (
    meds_df.groupby("PcrKey")
    .agg(
        total_meds=("eMedications_03", "count"),
        unique_meds=("eMedications_03", "nunique"),
        naloxone_doses=("eMedications_03Descr", lambda x: x.str.contains("naloxone|narcan", na=False).sum()),
        naloxone_flag=("eMedications_03Descr", lambda x: x.str.contains("naloxone|narcan", na=False).any()),
        first_route=("eMedications_07", "first"),
        first_response=("eMedications_10", "first")
    )
    .reset_index()
)

print("Medications aggregated:", meds_agg.shape)

Medications aggregated: (271206, 7)


In [16]:
# Path to Procedures file
proc_path = "../data/raw/FACTPCRPROCEDURE.txt"

# Prepare chunks
proc_chunks = []

with open(proc_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Procedures"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            proc_chunks.append(filtered)

# Combine all
proc_df = pd.concat(proc_chunks, ignore_index=True)

print("Procedures records loaded:", len(proc_df))

Loading Procedures: 934it [01:50,  8.46it/s]

Procedures records loaded: 587755





In [17]:
# Aggregate procedures per PcrKey
proc_agg = (
    proc_df.groupby("PcrKey")
    .agg(
        procedure_count=("eProcedures_03", "count"),
        unique_procedures=("eProcedures_03", "nunique"),
        first_procedure=("eProcedures_03", "first"),
        all_procedures=("eProcedures_03", lambda x: list(x.dropna()))
    )
    .reset_index()
)

# Optionally stringify list for easier storage
proc_agg["all_procedures_str"] = proc_agg["all_procedures"].apply(lambda x: "|".join(x) if x else "")

print("Procedures aggregated:", proc_agg.shape)

Procedures aggregated: (271206, 6)


In [18]:
# Path to CPR records
cpr_path = "../data/raw/FACTPCRARRESTCPRPROVIDED.txt"

# Prepare chunks
cpr_chunks = []

with open(cpr_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading CPR Records"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            cpr_chunks.append(filtered)

cpr_df = pd.concat(cpr_chunks, ignore_index=True)

print("CPR records loaded:", len(cpr_df))

Loading CPR Records: 546it [00:34, 15.81it/s]

CPR records loaded: 275322





In [19]:
# Aggregate CPR info
cpr_agg = (
    cpr_df.groupby("PcrKey")
    .agg(
        cpr_given=("eArrest_09", lambda x: True),
        bystander_cpr=("eArrest_09", lambda x: x.str.contains("BYSTANDER", case=False, na=False).any()),
        ems_cpr=("eArrest_09", lambda x: x.str.contains("EMS|CREW|PROVIDER", case=False, na=False).any())
    )
    .reset_index()
)

print("CPR aggregated:", cpr_agg.shape)

CPR aggregated: (271206, 4)


In [20]:
# Path to ROSC records
rosc_path = "../data/raw/FACTPCRARRESTROSC.txt"

# Prepare chunks
rosc_chunks = []

with open(rosc_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading ROSC Records"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            rosc_chunks.append(filtered)

rosc_df = pd.concat(rosc_chunks, ignore_index=True)

print("ROSC records loaded:", len(rosc_df))

Loading ROSC Records: 543it [00:35, 15.51it/s]

ROSC records loaded: 271712





In [21]:
# Aggregate ROSC flag
rosc_agg = (
    rosc_df.groupby("PcrKey")
    .agg(
        rosc_achieved=("eArrest_12", lambda x: x.str.contains("YES", case=False, na=False).any())
    )
    .reset_index()
)

print("ROSC aggregated:", rosc_agg.shape)

ROSC aggregated: (271206, 2)


In [23]:
# Path to Computed Elements
computed_path = "../data/raw/ComputedElements.txt"

# Prepare chunks
computed_chunks = []

with open(computed_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Computed Demographics"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        # Check which columns exist
        cols_available = [col for col in ["PcrKey", "ePatient_15", "ePatient_16"] if col in chunk.columns]
        if not cols_available:
            continue

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)][cols_available]

        if not filtered.empty:
            computed_chunks.append(filtered)

# Combine
computed_df = pd.concat(computed_chunks, ignore_index=True)

# Deduplicate
demographics_agg = computed_df.drop_duplicates(subset="PcrKey").copy()

# Convert types if columns exist
if "ePatient_15" in demographics_agg.columns:
    demographics_agg["ePatient_15"] = pd.to_numeric(demographics_agg["ePatient_15"], errors="coerce")

if "ePatient_16" in demographics_agg.columns:
    demographics_agg["ePatient_16"] = demographics_agg["ePatient_16"].astype(str).str.strip(" ~")

print("Demographics aggregated:", demographics_agg.shape)

Loading Computed Demographics: 542it [01:13,  7.33it/s]

Demographics aggregated: (271206, 1)





In [24]:
# Path to Alcohol/Drug Use indicators
drug_path = "../data/raw/FACTPCRALCOHOLDRUGUSEINDICATOR.txt"

# Prepare chunks
drug_chunks = []

with open(drug_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Alcohol/Drug Use"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            drug_chunks.append(filtered)

drug_df = pd.concat(drug_chunks, ignore_index=True)

print("Alcohol/Drug Use records loaded:", len(drug_df))

Loading Alcohol/Drug Use: 553it [00:34, 16.03it/s]

Alcohol/Drug Use records loaded: 319541





In [25]:
# Aggregate flags per PcrKey
drug_agg = (
    drug_df.groupby("PcrKey")
    .agg(
        use_flag_count=("eHistory_17", "count"),
        unique_use_flags=("eHistory_17", "nunique"),
        all_use_flags=("eHistory_17", lambda x: list(x.dropna()))
    )
    .reset_index()
)

# Stringify list for storage
drug_agg["all_use_flags_str"] = drug_agg["all_use_flags"].apply(lambda x: "|".join(x) if x else "")

print("Alcohol/Drug Use aggregated:", drug_agg.shape)

Alcohol/Drug Use aggregated: (271206, 5)


In [26]:
# Path to Additional Symptoms
symptom_path = "../data/raw/FACTPCRADDITIONALSYMPTOM.txt"

# Prepare chunks
symptom_chunks = []

with open(symptom_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Additional Symptoms"
    ):
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)]
        if not filtered.empty:
            symptom_chunks.append(filtered)

symptom_df = pd.concat(symptom_chunks, ignore_index=True)

print("Additional Symptoms records loaded:", len(symptom_df))

Loading Additional Symptoms: 580it [00:34, 16.88it/s]

Additional Symptoms records loaded: 306286





In [27]:
# Aggregate per PcrKey
symptom_agg = (
    symptom_df.groupby("PcrKey")
    .agg(
        symptom_count=("eSituation_10", "count"),
        unique_symptoms=("eSituation_10", "nunique"),
        all_symptoms=("eSituation_10", lambda x: list(x.dropna()))
    )
    .reset_index()
)

# Stringify list for easier storage
symptom_agg["all_symptoms_str"] = symptom_agg["all_symptoms"].apply(lambda x: "|".join(x) if x else "")

print("Additional Symptoms aggregated:", symptom_agg.shape)

Additional Symptoms aggregated: (267904, 5)


In [28]:
# Start with the Event DataFrame (1 row per PcrKey)
df_merged = event_df.copy()

# List of all aggregated DataFrames to merge
feature_tables = {
    "vitals": vitals_agg,
    "medications": meds_agg,
    "procedures": proc_agg,
    "cpr": cpr_agg,
    "rosc": rosc_agg,
    "demographics": demographics_agg,
    "alcohol_drug_use": drug_agg,
    "additional_symptoms": symptom_agg
}

# Merge each
for name, table in feature_tables.items():
    df_merged = df_merged.merge(table, on="PcrKey", how="left")
    print(f"Merged {name}: now shape {df_merged.shape}")

# Save to CSV
df_merged.to_csv("../data/interim/opioid_cases_full.csv", index=False)

print("All data merged and saved to opioid_cases_full.csv")

Merged vitals: now shape (271206, 110)
Merged medications: now shape (271206, 116)
Merged procedures: now shape (271206, 121)
Merged cpr: now shape (271206, 124)
Merged rosc: now shape (271206, 125)
Merged demographics: now shape (271206, 125)
Merged alcohol_drug_use: now shape (271206, 129)
Merged additional_symptoms: now shape (271206, 133)
All data merged and saved to opioid_cases_full.csv
