In [9]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [10]:
# Define target ICD-10 prefixes.
# str.startswith accepts a tuple: a code matches if it begins with any entry.
target_prefixes = ("T40", "F11")

# Path to primary impressions file
primary_path = "../data/raw/FACTPCRPRIMARYIMPRESSION.txt"

# One single-column (PcrKey) frame per chunk that contained a match
primary_chunks = []

with open(primary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Impressions"
    ):
        # Strip " ", "~" (and "'" for headers) from both ends of names/values
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_11"] = chunk["eSituation_11"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        # na=False: a missing code would otherwise yield NaN in the mask, and
        # .loc refuses to index with a boolean mask that contains NaN.
        mask = chunk["eSituation_11"].str.startswith(target_prefixes, na=False)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            primary_chunks.append(filtered)

# pd.concat raises ValueError on an empty list; fall back to an empty frame
# with the expected column so the cell still completes when nothing matched.
if primary_chunks:
    primary_df = pd.concat(primary_chunks, ignore_index=True)
else:
    primary_df = pd.DataFrame(columns=["PcrKey"])
print("Primary impressions matched:", len(primary_df))

Processing Primary Impressions: 530it [00:22, 23.75it/s]

Primary impressions matched: 231354





In [11]:
# Path to secondary impressions file
secondary_path = "../data/raw/FACTPCRSECONDARYIMPRESSION.txt"

# One single-column (PcrKey) frame per chunk that contained a match
secondary_chunks = []

with open(secondary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Secondary Impressions"
    ):
        # Strip " ", "~" (and "'" for headers) from both ends of names/values
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_12"] = chunk["eSituation_12"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        # na=False: a missing code would otherwise yield NaN in the mask, and
        # .loc refuses to index with a boolean mask that contains NaN.
        # target_prefixes is defined in an earlier cell.
        mask = chunk["eSituation_12"].str.startswith(target_prefixes, na=False)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            secondary_chunks.append(filtered)

# pd.concat raises ValueError on an empty list; fall back to an empty frame
# with the expected column so the cell still completes when nothing matched.
if secondary_chunks:
    secondary_df = pd.concat(secondary_chunks, ignore_index=True)
else:
    secondary_df = pd.DataFrame(columns=["PcrKey"])
print("Secondary impressions matched:", len(secondary_df))

Processing Secondary Impressions: 544it [00:23, 22.94it/s]

Secondary impressions matched: 42853





In [5]:
# Define target ICD-10 prefixes
target_prefixes = ("T40", "F11")

# Load PRIMARY impressions from S3.
# NOTE(review): this cell assigns the same names (primary_chunks, primary_df)
# as the local-file cell and its execution count is out of order — confirm
# which of the two variants is meant to run on Restart & Run All.
primary_uri = "s3://{}/raw-data/FACTPCRPRIMARYIMPRESSION.txt".format(credentials.BUCKET_NAME)
primary_chunks = []

# NOTE(review): the builtin open() has no transport_params argument, so `open`
# must be shadowed by an import that is not visible here (presumably
# smart_open.open); `credentials` and `transport_params` are also defined
# outside this cell — confirm.
with open(primary_uri, transport_params=transport_params) as f:
    for chunk in tqdm(
        # dtype=str keeps every field as text, consistent with the local-file
        # cells, so type inference cannot reformat keys before astype(str).
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Impressions"
    ):
        # Clean columns: strip " ", "~" (and "'" for headers) from both ends
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_11"] = chunk["eSituation_11"].astype(str).str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].astype(str).str.strip(" ~")

        # Filter rows with target ICD codes; na=False keeps the mask strictly
        # boolean even if a missing value survives the string conversion.
        mask = chunk["eSituation_11"].str.startswith(target_prefixes, na=False)
        # Select only PcrKey while filtering instead of copying every column
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            primary_chunks.append(filtered)

# Combine all matching rows. pd.concat raises ValueError on an empty list, so
# fall back to an empty frame with the expected column when nothing matched.
if primary_chunks:
    primary_df = pd.concat(primary_chunks, ignore_index=True)
else:
    primary_df = pd.DataFrame(columns=["PcrKey"])
print("Primary impressions matched:", len(primary_df))


Processing Primary Impressions: 530it [34:28,  3.90s/it]

Primary impressions matched: 231354





In [6]:
# Load SECONDARY impressions from S3.
# NOTE(review): this cell assigns the same names (secondary_chunks,
# secondary_df) as the local-file cell and its execution count is out of
# order — confirm which of the two variants is meant to run.
secondary_uri = "s3://{}/raw-data/FACTPCRSECONDARYIMPRESSION.txt".format(credentials.BUCKET_NAME)
secondary_chunks = []

# NOTE(review): the builtin open() has no transport_params argument, so `open`
# must be shadowed by an import that is not visible here (presumably
# smart_open.open); `credentials` and `transport_params` are also defined
# outside this cell — confirm.
with open(secondary_uri, transport_params=transport_params) as f:
    for chunk in tqdm(
        # dtype=str keeps every field as text, consistent with the local-file
        # cells, so type inference cannot reformat keys before astype(str).
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Secondary Impressions"
    ):
        # Strip " ", "~" (and "'" for headers) from both ends of names/values
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_12"] = chunk["eSituation_12"].astype(str).str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].astype(str).str.strip(" ~")

        # na=False keeps the mask strictly boolean even if a missing value
        # survives the string conversion. target_prefixes comes from an
        # earlier cell.
        mask = chunk["eSituation_12"].str.startswith(target_prefixes, na=False)
        # Select only PcrKey while filtering instead of copying every column
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            secondary_chunks.append(filtered)

# pd.concat raises ValueError on an empty list; fall back to an empty frame
# with the expected column so the cell still completes when nothing matched.
if secondary_chunks:
    secondary_df = pd.concat(secondary_chunks, ignore_index=True)
else:
    secondary_df = pd.DataFrame(columns=["PcrKey"])
print("Secondary impressions matched:", len(secondary_df))

Processing Secondary Impressions: 544it [38:04,  4.20s/it]

Secondary impressions matched: 42853





In [12]:
# Stack the primary and secondary matches, then drop repeated rows so a
# record flagged in both files is kept once.
stacked_matches = pd.concat([primary_df, secondary_df])
opioid_pcr_df = stacked_matches.drop_duplicates()

print("Total unique opioid-related PcrKeys:", len(opioid_pcr_df))

# Materialise the keys as a Python set for the membership tests done later
pcr_key_set = set(opioid_pcr_df["PcrKey"].tolist())

Total unique opioid-related PcrKeys: 271206


In [13]:
# Location of the event file
event_path = "../data/raw/pub_pcrevents_cp25.txt"

# Holds the matching rows of each chunk until they are concatenated
event_chunks = []

with open(event_path) as f:
    # Stream the pipe-delimited file in 100k-row pieces, all fields as text
    event_reader = pd.read_csv(f, sep="|", chunksize=100_000, dtype=str)

    for chunk in tqdm(event_reader, desc="Loading Event Records"):
        # Strip " ", "~" (and "'" for headers) from both ends of names/keys
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        # Keep only events whose key is in the previously built key set
        key_hit = chunk["PcrKey"].isin(pcr_key_set)
        filtered = chunk[key_hit]

        if len(filtered) > 0:
            event_chunks.append(filtered)

event_df = pd.concat(event_chunks, ignore_index=True)

print("Event records loaded:", len(event_df))

Loading Event Records: 542it [04:45,  1.90it/s]

Event records loaded: 271206





In [14]:
# Path to Vitals file
vitals_path = "../data/raw/FACTPCRVITAL.txt"

# Define vitals columns to extract and aggregate:
# output column name -> source column in the vitals file
vital_cols = {
    "HeartRate": "eVitals_10",
    "RespRate": "eVitals_14",
    "SystolicBP": "eVitals_06",
    "SpO2": "eVitals_12",
    "BGL": "eVitals_18",
    "ETCO2": "eVitals_16",
    "GCS_Eye": "eVitals_19",
    "GCS_Verbal": "eVitals_20",
    "GCS_Motor": "eVitals_21"
}

# Parsed readings at or above this value are replaced with NaN.
# NOTE(review): presumably this drops large sentinel / "not recorded" codes —
# confirm against the data dictionary.
VITAL_MAX_EXCLUSIVE = 1000

# Prepare chunks
vitals_chunks = []

with open(vitals_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Loading Vitals"
    ):
        # Strip " ", "~" (and "'" for headers) from both ends of names/keys
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        # .copy() so the new columns are added to an independent frame
        filtered = chunk[chunk["PcrKey"].isin(pcr_key_set)].copy()
        if not filtered.empty:
            for name, col in vital_cols.items():
                if col in filtered.columns:
                    # First run of digits (optionally with a decimal part);
                    # anything that fails to parse becomes NaN.
                    extracted = filtered[col].str.extract(r"(\d+\.?\d*)")[0]
                    values = pd.to_numeric(extracted, errors="coerce")
                    filtered[name] = values.where(values < VITAL_MAX_EXCLUSIVE)
                else:
                    # Source column absent: still create the output column so
                    # the selection below cannot raise KeyError.
                    filtered[name] = np.nan
            vitals_chunks.append(filtered[["PcrKey"] + list(vital_cols.keys())])

# Combine all
vitals_df = pd.concat(vitals_chunks, ignore_index=True)

# Aggregate per PcrKey.
# "first"/"last" follow row order as read from the file.
# NOTE(review): confirm the file is ordered by measurement time if these are
# meant to be the earliest/latest readings.
vitals_agg = (
    vitals_df.groupby("PcrKey")[list(vital_cols.keys())]
    .agg(["first", "last", "min", "max", "mean", "std", "count"])
)

# Flatten the (vital, statistic) column pairs into names like "HeartRate_mean"
vitals_agg.columns = ["_".join(col).strip() for col in vitals_agg.columns.values]
vitals_agg = vitals_agg.reset_index()

print("Vitals aggregated:", vitals_agg.shape)

Loading Vitals: 1648it [05:40,  4.84it/s]


Vitals aggregated: (271071, 64)
