In [9]:
import pandas as pd
from tqdm import tqdm
import numpy as np

In [10]:
# Define target ICD-10 prefixes.
# str.startswith accepts a tuple, so a code matches if it begins with any entry.
target_prefixes = ("T40", "F11")

# Path to primary impressions file (pipe-delimited text)
primary_path = "../data/raw/FACTPCRPRIMARYIMPRESSION.txt"

# Matching PcrKey rows are collected per chunk and concatenated once at the end
primary_chunks = []

with open(primary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Impressions"
    ):
        # Strip surrounding spaces/tildes (and quotes on the header names)
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_11"] = chunk["eSituation_11"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        # na=False: with dtype=str a missing code stays NaN, and a NaN inside the
        # boolean mask makes .loc raise. Treat missing codes as non-matching.
        mask = chunk["eSituation_11"].str.startswith(target_prefixes, na=False)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            primary_chunks.append(filtered)

# pd.concat raises on an empty list, so fall back to an empty frame
# (same single column) when no row matched.
primary_df = (
    pd.concat(primary_chunks, ignore_index=True)
    if primary_chunks
    else pd.DataFrame(columns=["PcrKey"])
)
print("Primary impressions matched:", len(primary_df))

Processing Primary Impressions: 530it [00:22, 23.75it/s]

Primary impressions matched: 231354





In [11]:
# Path to secondary impressions file (pipe-delimited text)
secondary_path = "../data/raw/FACTPCRSECONDARYIMPRESSION.txt"

# Matching PcrKey rows are collected per chunk and concatenated once at the end
secondary_chunks = []

with open(secondary_path, "r") as f:
    for chunk in tqdm(
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Secondary Impressions"
    ):
        # Strip surrounding spaces/tildes (and quotes on the header names)
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_12"] = chunk["eSituation_12"].str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].str.strip(" ~")

        # na=False: with dtype=str a missing code stays NaN, and a NaN inside the
        # boolean mask makes .loc raise. Treat missing codes as non-matching.
        mask = chunk["eSituation_12"].str.startswith(target_prefixes, na=False)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            secondary_chunks.append(filtered)

# pd.concat raises on an empty list, so fall back to an empty frame
# (same single column) when no row matched.
secondary_df = (
    pd.concat(secondary_chunks, ignore_index=True)
    if secondary_chunks
    else pd.DataFrame(columns=["PcrKey"])
)
print("Secondary impressions matched:", len(secondary_df))

Processing Secondary Impressions: 544it [00:23, 22.94it/s]

Secondary impressions matched: 42853





In [5]:
# Define target ICD-10 prefixes
target_prefixes = ("T40", "F11")

# Load PRIMARY impressions from S3.
# NOTE(review): this cell assigns primary_df, the same name the local-file cell
# assigns; when run top to bottom this result replaces that one. Confirm which
# source is intended.
primary_uri = "s3://{}/raw-data/FACTPCRPRIMARYIMPRESSION.txt".format(credentials.BUCKET_NAME)
primary_chunks = []

# NOTE(review): the builtin open() does not accept transport_params, so `open`
# here is presumably smart_open.open imported in a cell not shown. The same
# goes for `credentials` and `transport_params`. Confirm they are defined.
with open(primary_uri, transport_params=transport_params) as f:
    for chunk in tqdm(
        # dtype=str keeps every field as text, so PcrKey is not type-inferred
        # per chunk (numeric coercion would alter the key before astype(str)).
        # This matches how the local-file cells read the same data.
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Impressions"
    ):
        # Clean columns: strip surrounding spaces/tildes (and quotes on headers)
        chunk.columns = chunk.columns.str.strip(" ~'")
        # astype(str) turns missing values into the text "nan", which never
        # matches a target prefix, so the mask below is always boolean.
        chunk["eSituation_11"] = chunk["eSituation_11"].astype(str).str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].astype(str).str.strip(" ~")

        # Filter rows with target ICD codes, keeping only the key column
        mask = chunk["eSituation_11"].str.startswith(target_prefixes)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            primary_chunks.append(filtered)

# Combine all matching rows. pd.concat raises on an empty list, so fall back
# to an empty frame (same single column) when no row matched.
primary_df = (
    pd.concat(primary_chunks, ignore_index=True)
    if primary_chunks
    else pd.DataFrame(columns=["PcrKey"])
)
print("Primary impressions matched:", len(primary_df))


Processing Primary Impressions: 530it [34:28,  3.90s/it]

Primary impressions matched: 231354





In [6]:
# Load SECONDARY impressions from S3.
# NOTE(review): this cell assigns secondary_df, the same name the local-file
# cell assigns; when run top to bottom this result replaces that one. Confirm
# which source is intended.
secondary_uri = "s3://{}/raw-data/FACTPCRSECONDARYIMPRESSION.txt".format(credentials.BUCKET_NAME)
secondary_chunks = []

# NOTE(review): the builtin open() does not accept transport_params, so `open`
# here is presumably smart_open.open imported in a cell not shown. The same
# goes for `credentials` and `transport_params`. Confirm they are defined.
with open(secondary_uri, transport_params=transport_params) as f:
    for chunk in tqdm(
        # dtype=str keeps every field as text, so PcrKey is not type-inferred
        # per chunk (numeric coercion would alter the key before astype(str)).
        # This matches how the local-file cells read the same data.
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Secondary Impressions"
    ):
        # Strip surrounding spaces/tildes (and quotes on the header names)
        chunk.columns = chunk.columns.str.strip(" ~'")
        # astype(str) turns missing values into the text "nan", which never
        # matches a target prefix, so the mask below is always boolean.
        chunk["eSituation_12"] = chunk["eSituation_12"].astype(str).str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].astype(str).str.strip(" ~")

        # Keep only the key column of rows whose code starts with a target prefix
        mask = chunk["eSituation_12"].str.startswith(target_prefixes)
        filtered = chunk.loc[mask, ["PcrKey"]]

        if not filtered.empty:
            secondary_chunks.append(filtered)

# pd.concat raises on an empty list, so fall back to an empty frame
# (same single column) when no row matched.
secondary_df = (
    pd.concat(secondary_chunks, ignore_index=True)
    if secondary_chunks
    else pd.DataFrame(columns=["PcrKey"])
)
print("Secondary impressions matched:", len(secondary_df))

Processing Secondary Impressions: 544it [38:04,  4.20s/it]

Secondary impressions matched: 42853





In [12]:
# Stack the primary and secondary matches, then drop repeated rows so each
# PcrKey appears once.
opioid_pcr_df = (
    pd.concat([primary_df, secondary_df])
    .drop_duplicates()
)

print("Total unique opioid-related PcrKeys:", len(opioid_pcr_df))

# Keep the keys in a set as well, for constant-time membership checks
pcr_key_set = set(opioid_pcr_df["PcrKey"])

Total unique opioid-related PcrKeys: 271206


In [8]:
# Write opioid_pcr_df to the interim data folder as CSV, without the index column
opioid_pcr_df.to_csv(
    path_or_buf="../data/interim/opioid_pcr_keys.csv",
    index=False,
)

print("Saved opioid_pcr_keys.csv")

Saved opioid_pcr_keys.csv
