# Data Cleaning
This Jupyter Notebook handles ingestion of the unstructured notes in the "MIMIC-III Clinical Database" dataset. Specifically, it combs through `NOTEEVENTS.csv`, matches each SUBJECT_ID against the "Curated Data for Describing Blood Glucose" dataset, and then filters for and extracts text containing the diabetes-specific keywords outlined below. Once the SUBJECT_IDs, HADM_IDs (hospital admissions), and matching TEXT are filtered and extracted, they are written to a .JSONL file, specifically [filtered_diabetes_notes_dask.jsonl]. We can then use this file to begin fine-tuning our imported BioBERT model, which we will also apply to the "MIMIC-IV Clinical Database" dataset. Once it is fine-tuned on our diabetes-specific data, we can generate "ground truth" summaries to compare against summaries generated from the MIMIC-III dataset (generated with BART). The BART summaries will be trained against the BioBERT model, and after sufficient training and fine-tuning we will begin generating summaries of the notes tied to anomalies detected by the RAE model we are using.

In [1]:
import os
import dask
import dask.dataframe as dd
import pandas as pd
import nltk
import torch
import textwrap
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from sentence_transformers import SentenceTransformer, util
from dask.diagnostics import ProgressBar

# ⚙️ Control threading to reduce memory pressure
# These variables are still exported for any library that has not initialised
# its thread pool yet, but torch was imported above, so they may be read too
# late to limit it.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Enforce the single-thread limit on the already-imported torch explicitly,
# so the Dask worker threads do not each fan out into a full CPU thread pool.
torch.set_num_threads(1)

# ⚙️ Thread-based Dask scheduler is safer with PyTorch
dask.config.set(scheduler='threads')

  from .autonotebook import tqdm as notebook_tqdm


<dask.config.set at 0x7f3e97398ee0>

In [2]:
# Make sure the Punkt resource is present locally.
# NOTE(review): the tokenizer below is built from blank parameters, so it does
# not appear to use the downloaded model — confirm the download is still needed.
nltk.download('punkt')

# Build the sentence splitter by hand instead of going through nltk's
# sent_tokenize helper, which avoids the punkt_tab bug
punkt_params = PunktParameters()
tokenizer = PunktSentenceTokenizer(punkt_params)

# Expose the bound method under the name the rest of the notebook calls
sent_tokenize = tokenizer.tokenize

# ✅ Prefer the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"📟 Using device: {device}")

# Sentence-embedding model, placed on the selected device
model = SentenceTransformer(
    "all-MiniLM-L6-v2",
    device=device,
)

# Diabetes-related reference terms.
# Used twice: as case-insensitive substring keywords to pre-filter sentences,
# and as the reference phrases that candidate sentences are compared against.
# Every entry must therefore be a single phrase — never a comma-separated list.
diabetes_terms = [
    # Diagnosis
    "diabetes", "diabetic", "DM1", "DM Type 1", "DM Type I", "type 1 diabetes", "type 2 diabetes",
    "diabetes diagnosis", "DKA", "ketoacidosis", "DKA history", "DKA episode", "DKA resolved", "DKA management",
    # Symptoms and presentation
    "hyperglycemia", "hypoglycemia", "hyperglycemia treatment",
    "polyuria", "polydipsia", "polyphagia", "HPI", "blood sugar", "blood sugar control",
    # Labs and measurements
    "glucose", "glucose-", "FS=", "FS:", "FS-", "FS(", "HbA1C", "HgbA1C", "HgA1C", "a1c", "A1C test results",
    "ketone", "anion gap", "bicarbonate", "blood glucose", "blood sugars",
    # Medication ("Novalog" is kept as a spelling variant next to the brand spelling "Novolog")
    "insulin", "insulin therapy", "Lantus", "Novalog", "Novolog", "Humalog",
    "sliding scale", "sliding scale insulin", "subcutaneously", "insulin drip", "insulin gtt", "insulin sliding scale", "amp D50",
    # Diet and control; the four phrases after "diabetic diet" used to be fused into one string
    "carbohydrate", "diabetic diet",
    "glucose control", "long acting insulin", "short acting insulin", "intermediate acting insulin",
]

# Embed every reference term once, up front, so that scoring a note later only
# has to encode that note's candidate sentences
ref_embeds = model.encode(
    diabetes_terms,
    convert_to_tensor=True,
    device=device,
)

# Hand-rolled batching helper, so no extra package is needed
def chunked(seq, size):
    """Yield consecutive slices of ``seq`` that are at most ``size`` items long.

    ``seq`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(seq)`` is not a multiple of ``size``; an empty ``seq`` yields
    nothing.
    """
    slice_starts = range(0, len(seq), size)
    for start in slice_starts:
        stop = start + size
        yield seq[start:stop]

[nltk_data] Downloading package punkt to /home/maurb/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


📟 Using device: cuda


In [3]:
# Load glucose-insulin dataset; IDs are kept as strings so they compare equal
# to the string-typed IDs read from NOTEEVENTS below
df_glucose = pd.read_csv(
    "curated-data-for-describing-blood-glucose-management-in-the-intensive-care-unit-1.0.1/Datasets/glucose_insulin_ICU.csv",
    dtype={'SUBJECT_ID': str, 'HADM_ID': str}
)
glucose_ids = set(df_glucose["SUBJECT_ID"])

# Clean NOTEEVENTS with pandas first: pandas parses the file as a whole, so the
# quoted, multi-line TEXT fields stay intact and malformed rows are skipped
df_cleaned = pd.read_csv(
    "mimic-iii-clinical-database-1.4/NOTEEVENTS.csv",
    dtype=str,
    on_bad_lines='skip',
    engine='python'
)
# The cleaned copy is still written out as a reusable artifact
df_cleaned.to_csv("clean_NOTEEVENTS.csv", index=False)

# Filter for valid TEXT and SUBJECT_ID
notes_for_cohort = df_cleaned.dropna(subset=["TEXT"])
notes_for_cohort = notes_for_cohort[notes_for_cohort["SUBJECT_ID"].isin(glucose_ids)]
notes_for_cohort = notes_for_cohort.reset_index(drop=True)

# Build the Dask frame from the already-parsed rows instead of re-reading the
# CSV in 100MB blocks: Dask cuts blocks at raw newlines, and the TEXT column
# contains newlines inside quoted strings, so block-wise parsing can split a
# note in half and the broken rows would then be skipped silently.
ddf = dd.from_pandas(notes_for_cohort, npartitions=32)

# Persist to memory
ddf = ddf.persist()

In [4]:
# Filtering function using SentenceTransformer and cosine sim
import traceback


def extract_matches(note_text, threshold=0.5, batch_size=16):
    """Return the diabetes-related sentences found in one clinical note.

    The note is split into sentences, sentences that contain none of the
    ``diabetes_terms`` (case-insensitive substring match) are discarded, and
    the remaining candidates are embedded and compared against ``ref_embeds``
    with cosine similarity. A sentence is kept when its best similarity to
    any reference term is strictly greater than ``threshold``.

    Args:
        note_text: Raw note text. Anything that is not a ``str`` (for example
            a missing value) produces an empty result.
        threshold: Minimum best-match cosine similarity, exclusive.
        batch_size: Number of sentences encoded per model call; kept small to
            limit memory use.

    Returns:
        A list of ``(sentence, score)`` tuples, with the score rounded to
        three decimals. The list is empty when nothing matches or when an
        error occurs; errors are printed with a traceback rather than raised,
        so one bad note cannot abort the whole run.
    """
    try:
        if not isinstance(note_text, str):
            return []

        # Lowercase the terms once per note, and each sentence once, instead
        # of once per (sentence, term) pair.
        lowered_terms = [term.lower() for term in diabetes_terms]
        candidates = []
        for sentence in sent_tokenize(note_text):
            lowered_sentence = sentence.lower()
            if any(term in lowered_sentence for term in lowered_terms):
                candidates.append(sentence)
        if not candidates:
            return []

        matches = []
        for chunk in chunked(candidates, batch_size):
            embs = model.encode(chunk, convert_to_tensor=True, device=device)
            sims = util.cos_sim(embs, ref_embeds)
            # Best similarity per sentence across all reference terms, pulled
            # back to Python floats in one transfer per batch.
            best_scores = torch.max(sims, dim=1).values.tolist()
            matches.extend(
                (sentence, round(score, 3))
                for sentence, score in zip(chunk, best_scores)
                if score > threshold
            )

        return matches
    except Exception:
        traceback.print_exc()
        return []

# Dask-compatible wrapper
def extract_partition(part):
    """Return a copy of ``part`` with a ``DIABETES_MATCHES`` column added.

    ``part`` is one pandas partition that has a ``TEXT`` column. Each row of
    the new column holds the value ``extract_matches`` returns for that row's
    text. The input frame is not modified: Dask may hand the same partition
    object to other tasks (the collection is persisted), so mutating it in
    place would leak the new column into them.
    """
    return part.assign(DIABETES_MATCHES=part["TEXT"].apply(extract_matches))

# 1. Describe the output schema to Dask: start from the frame's existing
#    metadata and add a placeholder for the column extract_partition creates.
#    (Plain item assignment is used deliberately; DataFrame.assign would treat
#    `object` as a callable and try to call it.)
meta = ddf._meta.copy()
meta["DIABETES_MATCHES"] = object

# 2. Attach the matches to every partition
ddf = ddf.map_partitions(extract_partition, meta=meta)

# 3. Keep only the notes whose match list is non-empty
match_lists = ddf["DIABETES_MATCHES"]
has_match = match_lists.map(lambda found: len(found) > 0, meta=("filter", bool))
ddf_filtered = ddf[has_match]

In [5]:
# Run the pipeline and materialise only the columns that get exported
print("⚙️ Computing full filtered results...")
output_columns = ["SUBJECT_ID", "HADM_ID", "DIABETES_MATCHES"]
with ProgressBar():
    df_result = ddf_filtered[output_columns].compute()

# 💾 Write one JSON record per line
df_result.to_json(
    "filtered_diabetes_notes_dask.jsonl",
    orient="records",
    lines=True,
)

⚙️ Computing full filtered results...
[########################################] | 100% Completed | 17m 28ss


In [6]:
# Show the matches of the first exported note as a quick sanity check
if df_result.empty:
    print("⚠️ No diabetes-relevant notes found.")
else:
    print("✅ Preview from full filtered output:\n")
    first_note_matches = df_result["DIABETES_MATCHES"].iloc[0]
    divider = "=" * 120
    for sentence, score in first_note_matches:
        wrapped = textwrap.fill(sentence, width=100)
        print(f"[score={score}] {wrapped}")
        print(divider)

✅ Preview from full filtered output:

[score=0.673] Hyperglycemia:  Patient was placed on insulin-sliding scale due to hyperglycemia, which was steroid
induced.
[score=0.549] This worked quite well and her glucose came back to normal levels once the steroids were tapered to
off.


## Debugging section

In [7]:
# # 🧪 Small-scale test on ~1000 rows from first partition
# print("🔍 Running preview compute on ~1000 rows from first partition...")

# sample_ddf = ddf.partitions[0].head(1000, compute=True)
# sample_ddf_filtered = sample_ddf[sample_ddf["DIABETES_MATCHES"].map(lambda x: len(x) > 0)]

# if not sample_ddf_filtered.empty:
#     print("✅ Preview from sample filtered notes:\n")
#     for sent, score in sample_ddf_filtered["DIABETES_MATCHES"].iloc[0]:
#         print(f"[score={score}] {textwrap.fill(sent, width=100)}")
#         print("=" * 120)
# else:
#     print("⚠️ No diabetes-relevant notes found in sample.")

In [8]:
# # Should show a nonzero number
# shared_ids = set(ddf["SUBJECT_ID"].compute()) & glucose_ids
# print(f"🔗 SUBJECT_IDs shared between datasets: {len(shared_ids)}")

In [9]:
# # Pull a note from SUBJECT_ID "11861"
# note_df = ddf[ddf["SUBJECT_ID"] == "11861"].compute()

# if note_df.empty:
#     print("❌ No notes found for SUBJECT_ID 11861.")
# else:
#     # Display basic info
#     print(f"🧾 Found {len(note_df)} note(s) for SUBJECT_ID 11861.")
    
#     note_text = note_df.iloc[0]["TEXT"]
#     print("\n📝 Sample TEXT preview:\n")
#     print(textwrap.fill(note_text[:1000], width=100))

#     # Run extract_matches
#     print("\n🔍 Matching diabetes-related sentences:\n")
#     matches = extract_matches(note_text)

#     if matches:
#         for sent, score in matches:
#             print(f"[score={score}] {textwrap.fill(sent, width=100)}")
#             print("-" * 120)
#     else:
#         print("⚠️ No matches found — consider lowering score threshold or revisiting diabetes_terms.")
