In [2]:
from pathlib import Path

import boto3
import pandas as pd
from smart_open import open
from tqdm import tqdm

# Import MinIO credentials securely
import credentials

In [3]:
# Start a default boto3 session; the MinIO endpoint and keys are supplied
# per client, taken from the local ``credentials`` module.
session = boto3.session.Session()

# Mapping handed to ``open(..., transport_params=...)`` in later cells so that
# reads of ``s3://`` URIs go through this MinIO-configured client.
transport_params = dict(
    client=session.client(
        service_name="s3",
        endpoint_url=credentials.MINIO_URL,
        aws_access_key_id=credentials.ACCESS_KEY,
        aws_secret_access_key=credentials.SECRET_KEY,
    )
)

In [4]:
# Create S3 client
s3_client = session.client(
    "s3",
    endpoint_url=credentials.MINIO_URL,
    aws_access_key_id=credentials.ACCESS_KEY,
    aws_secret_access_key=credentials.SECRET_KEY
)

# List objects in your bucket under raw-data/
# A single list_objects_v2 call returns at most 1000 keys, so walk every page
# with a paginator instead of silently truncating the listing.
paginator = s3_client.get_paginator("list_objects_v2")

# Print filenames
print("Files in raw-data/:")
# `response` ends up bound to the last page, matching the single-call result
# whenever the listing fits in one page.
for response in paginator.paginate(
    Bucket=credentials.BUCKET_NAME,
    Prefix="raw-data/"
):
    # "Contents" is absent when a page holds no objects.
    for obj in response.get("Contents", []):
        print(" -", obj["Key"])

Files in raw-data/:
 - raw-data/ComputedElements.txt
 - raw-data/EINJURY_01REF.txt
 - raw-data/EPROCEDURES_03REF.txt
 - raw-data/ESITUATION_09REF.txt
 - raw-data/ESITUATION_10REF.txt
 - raw-data/ESITUATION_11REF.txt
 - raw-data/ESITUATION_12REF.txt
 - raw-data/FACTPCRADDITIONALRESPONSEMODE.txt
 - raw-data/FACTPCRADDITIONALSYMPTOM.txt
 - raw-data/FACTPCRADDITIONALTRANSPORTMODE.txt
 - raw-data/FACTPCRALCOHOLDRUGUSEINDICATOR.txt
 - raw-data/FACTPCRARRESTCPRPROVIDED.txt
 - raw-data/FACTPCRARRESTRESUSCITATION.txt
 - raw-data/FACTPCRARRESTRHYTHMDESTINATION.txt
 - raw-data/FACTPCRARRESTROSC.txt
 - raw-data/FACTPCRARRESTWITNESS.txt
 - raw-data/FACTPCRBARRIERTOCARE.txt
 - raw-data/FACTPCRCAUSEOFINJURY.txt
 - raw-data/FACTPCRDESTINATIONREASON.txt
 - raw-data/FACTPCRDESTINATIONTEAM.txt
 - raw-data/FACTPCRDISPATCHDELAY.txt
 - raw-data/FACTPCRINJURYRISKFACTOR.txt
 - raw-data/FACTPCRMEDICATION.txt
 - raw-data/FACTPCRPRIMARYIMPRESSION.txt
 - raw-data/FACTPCRPRIMARYSYMPTOM.txt
 - raw-data/FACTPCRPROCE

In [5]:
# Define target ICD-10 prefixes
# NOTE(review): in ICD-10-CM the T40 category also covers non-opioid
# substances (e.g. cocaine, cannabis, LSD) — confirm that a bare "T40" prefix
# is what is wanted for an opioid cohort.
target_prefixes = ("T40", "F11")

# Load PRIMARY impressions
primary_uri = "s3://{}/raw-data/FACTPCRPRIMARYIMPRESSION.txt".format(credentials.BUCKET_NAME)
primary_chunks = []

with open(primary_uri, transport_params=transport_params) as f:
    for chunk in tqdm(
        # dtype=str keeps every field as raw text. Without it pandas infers
        # dtypes per chunk, so the same PcrKey could be rendered as "123" in
        # one chunk and "123.0" in another, breaking later de-duplication.
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Primary Impressions"
    ):
        # Clean columns: strip the padding characters left around header
        # names and values when splitting on "|" alone.
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_11"] = chunk["eSituation_11"].astype(str).str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].astype(str).str.strip(" ~")

        # Filter rows with target ICD codes (missing values become the text
        # "nan" after astype(str) and therefore never match).
        mask = chunk["eSituation_11"].str.startswith(target_prefixes)
        filtered = chunk[mask]

        # Keep only the key column so the accumulated list stays small.
        if not filtered.empty:
            primary_chunks.append(filtered[["PcrKey"]])

# Combine all matching rows
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the expected column when nothing matched.
if primary_chunks:
    primary_df = pd.concat(primary_chunks, ignore_index=True)
else:
    primary_df = pd.DataFrame(columns=["PcrKey"])
print("Primary impressions matched:", len(primary_df))


Processing Primary Impressions: 530it [34:28,  3.90s/it]

Primary impressions matched: 231354





In [6]:
# Load SECONDARY impressions
secondary_uri = "s3://{}/raw-data/FACTPCRSECONDARYIMPRESSION.txt".format(credentials.BUCKET_NAME)
secondary_chunks = []

with open(secondary_uri, transport_params=transport_params) as f:
    for chunk in tqdm(
        # dtype=str keeps every field as raw text. Without it pandas infers
        # dtypes per chunk, so the same PcrKey could be rendered as "123" in
        # one chunk and "123.0" in another, breaking later de-duplication.
        pd.read_csv(f, delimiter="|", chunksize=100_000, dtype=str),
        desc="Processing Secondary Impressions"
    ):
        # Strip the padding characters left around header names and values
        # when splitting on "|" alone.
        chunk.columns = chunk.columns.str.strip(" ~'")
        chunk["eSituation_12"] = chunk["eSituation_12"].astype(str).str.strip(" ~")
        chunk["PcrKey"] = chunk["PcrKey"].astype(str).str.strip(" ~")

        # Keep rows whose secondary impression starts with a target prefix
        # (`target_prefixes` is defined in the primary-impressions cell).
        mask = chunk["eSituation_12"].str.startswith(target_prefixes)
        filtered = chunk[mask]

        # Keep only the key column so the accumulated list stays small.
        if not filtered.empty:
            secondary_chunks.append(filtered[["PcrKey"]])

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the expected column when nothing matched.
if secondary_chunks:
    secondary_df = pd.concat(secondary_chunks, ignore_index=True)
else:
    secondary_df = pd.DataFrame(columns=["PcrKey"])
print("Secondary impressions matched:", len(secondary_df))

Processing Secondary Impressions: 544it [38:04,  4.20s/it]

Secondary impressions matched: 42853





In [7]:
# Combine and drop duplicates
# Both inputs carry their own 0..n-1 index, so a plain concat would leave
# repeated index labels behind. Renumber after de-duplicating so every row of
# the result has a unique label.
opioid_pcr_df = (
    pd.concat([primary_df, secondary_df], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

print("Total unique opioid-related PcrKeys:", len(opioid_pcr_df))


Total unique opioid-related PcrKeys: 271206


In [8]:
# Save to interim data folder as CSV
output_path = Path("../data/interim/opioid_pcr_keys.csv")
# to_csv does not create missing parent folders, so make sure the interim
# directory exists before writing (a no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
opioid_pcr_df.to_csv(output_path, index=False)

print("Saved opioid_pcr_keys.csv")

Saved opioid_pcr_keys.csv
