In [None]:
# Results frame: every column of df_discharge_text except the raw note text.

df_extract = df_discharge_text.drop(columns="text")
df_extract

def extract_sections(df_extract, column_name="text"):
    """Pull named sections out of free-text discharge notes.

    Every pattern in ``patterns`` is searched once per note. The stripped
    first capture group becomes the cell value; ``None`` is stored when the
    heading is not found. Columns listed in ``cleanup_steps`` are then
    flattened to a single line: the listed substitutions run in order,
    followed by whitespace collapsing and a final strip.

    Args:
        df_extract: DataFrame holding the notes.
        column_name: Name of the column with the note text. Each value is
            passed through ``str()`` before matching.

    Returns:
        A new DataFrame that shares ``df_extract``'s index and has one column
        per key of ``patterns``, in that order.
    """
    patterns = {
        "Chief Complaint": r'Chief Complaint:\s*(.*)',
        "Discharge Diagnosis": r'(?s)Discharge Diagnosis:\s*(.*?)\s*(?=Discharge Condition:|\Z)',
        "Present Illness History": r'(?s)History of Present Illness:\s*(.*?)\s*(Past Medical History:|\Z)',
        "Discharge Medications": r'(?s)Discharge Medications:\s*(.*?)\s*Discharge Disposition:',
        "Discharge Disposition": r'(?s)Discharge Disposition:\s*(.*?)\s*(?=Discharge Diagnosis:|\Z)',
        "Mental Status": r'(?s)Mental Status:\s*(.*?)\s*(?=Level of Consciousness:|\Z)',
        "Level of Consciousness": r'(?s)Level of Consciousness:\s*(.*?)\s*(?=Activity Status:|\Z)',
        "Discharge Instructions": r'(?s)Discharge Instructions:\s*(.*?)\s*(?=\Z)'
    }

    # Column-specific (regex, replacement) substitutions, applied in order.
    # Columns that are absent here keep the raw captured text. The former
    # "All Dx" branch was dropped: no pattern produces that column, so it
    # could never run.
    cleanup_steps = {
        "Present Illness History": (
            (r'(\s*\n\s*|_{3,}|={3,}|\+)', ' '),
            (r'\d+(\.\d+)?%?', ''),
        ),
        "Discharge Medications": (
            (r'\n', ' '),
        ),
        "Discharge Disposition": (
            (r'\n_+|\n|:', ' '),
        ),
        "Discharge Diagnosis": (
            (r'(\s*\n\s*|-|={3,}|-{3,}|_{3,})', ' '),
        ),
        "Discharge Instructions": (
            (r'Follow[-\s]*up Instructions.*', ' '),
            (r'\n|_{3,}| - ', ' '),
        ),
    }

    def first_capture(regex, text):
        # One search per note; None marks a section that is not present.
        found = regex.search(str(text))
        return found.group(1).strip() if found else None

    notes = df_extract[column_name]
    df_target = pd.DataFrame(index=df_extract.index)

    for col, pattern in patterns.items():
        regex = re.compile(pattern)
        section = notes.apply(lambda text, regex=regex: first_capture(regex, text))

        if col in cleanup_steps:
            for step_pattern, replacement in cleanup_steps[col]:
                section = section.str.replace(step_pattern, replacement, regex=True)
            # Every cleaned column ends with the same whitespace squeeze.
            section = section.str.replace(r'\s{2,}', ' ', regex=True).str.strip()

        df_target[col] = section
    return df_target

extracted_final = extract_sections(df_discharge_text, column_name="text")

# Fix missing values

# The history and instruction sections are the two columns embedded further
# down, so rows missing either one are dropped rather than filled.
# NOTE(review): reset_index(drop=True) discards the original row labels, so
# extracted_final no longer lines up with df_extract by index - confirm that
# nothing later joins the two frames on the index.
extracted_final = extracted_final.dropna(
    subset=["Present Illness History", "Discharge Instructions"]
).reset_index(drop=True)
extracted_final.isnull().sum()

# Any section still missing gets an explicit placeholder.
extracted_final = extracted_final.fillna("Not provided")
# Check for missing values
extracted_final.isnull().sum()

# Expand abbreviations

# Abbreviation -> expansion table, stored sorted by key. Each key appears once
# (the literal previously repeated HIV, DVT, RUQ, UA and WNL).
abbr_dict_manual = dict(sorted({
    "ALT": "alanine aminotransferase",
    "AM": "morning",
    "ASA": "aspirin",
    "AST": "aspartate aminotransferase",
    "BID": "twice a day",
    "BM": "bowel movement",
    "BNP": "brain natriuretic peptide",
    "BRBPR": "bright red blood per rectum",
    "BP": "blood pressure",
    "BUN": "blood urea nitrogen",
    "CABG": "coronary artery bypass graft",
    "CAD": "coronary artery disease",
    "CBC": "complete blood count",
    "CHF": "congestive heart failure",
    "CKD": "chronic kidney disease",
    "COPD": "chronic obstructive pulmonary disease",
    "CP": "chest pain",
    "CT": "computed tomography",
    "CTA": "computed tomography angiography",
    "CXR": "chest x-ray",
    "DM": "diabetes mellitus",
    "DOE": "dyspnea on exertion",
    "DVT": "deep vein thrombosis",
    "ECG": "electrocardiogram",
    "ED": "emergency department",
    "EF": "ejection fraction",
    "EGD": "esophagogastroduodenoscopy",
    "EKG": "electrocardiogram",
    "EMS": "emergency medical services",
    "ER": "emergency room",
    "ERCP": "endoscopic retrograde cholangiopancreatography",
    "ESRD": "end-stage renal disease",
    "GI": "gastrointestinal",
    "HA": "headache",
    "HAART": "highly active antiretroviral therapy",
    "HD": "hemodialysis",
    "HE": "hepatic encephalopathy",
    "H/H": "hemoglobin and hematocrit",
    "HCV": "hepatitis C virus",
    "HIV": "human immunodeficiency virus",
    "HLD": "hyperlipidemia",
    "HPI": "history of present illness",
    "HR": "heart rate",
    "HTN": "hypertension",
    "ICU": "intensive care unit",
    "INR": "international normalized ratio",
    "IV": "intravenous",
    "IVDU": "intravenous drug use",
    "IVF": "intravenous fluids",
    "LAD": "left anterior descending artery",
    "LOC": "loss of consciousness",
    "LLE": "left lower extremity",
    "MICU": "medical intensive care unit",
    "MI": "myocardial infarction",
    "MRI": "magnetic resonance imaging",
    "Na": "sodium",
    "NC": "nasal cannula",
    "NG": "nasogastric",
    "NS": "normal saline",
    "O2": "oxygen",
    "OSH": "outside hospital",
    "PE": "pulmonary embolism",
    "PCP": "primary care provider",
    "PNA": "pneumonia",
    "PMH": "past medical history",
    "PND": "paroxysmal nocturnal dyspnea",
    "PO": "by mouth",
    "PTSD": "post-traumatic stress disorder",
    "RA": "room air",
    "ROS": "review of systems",
    "RR": "respiratory rate",
    "RUQ": "right upper quadrant",
    "SBP": "spontaneous bacterial peritonitis",
    "SI": "suicidal ideation",
    "SOB": "shortness of breath",
    "ST": "sinus tachycardia",
    "TIA": "transient ischemic attack",
    "UA": "urinalysis",
    "US": "ultrasound",
    "UTI": "urinary tract infection",
    "VS": "vital signs",
    "WBC": "white blood cell",
    "WNL": "within normal limits",
    "ART": "antiretroviral therapy",
    "D/C": "discharge or discontinue",
    "F/C/N/V": "fever, chills, nausea, and vomiting",
    "C/B": "complicated by",
    "H/O": "history of",
    "N/V": "nausea and vomiting",
    "P/W": "presents with",
    "F/C": "fever and chills",
    "YO": "years old",
    "ABX": "antibiotics",
    "AODM": "adult onset diabetes mellitus",
    "ARDS": "acute respiratory distress syndrome",
    "AVR": "aortic valve replacement",
    "BIBA": "brought in by ambulance",
    "BPH": "benign prostatic hyperplasia",
    "CNS": "central nervous system",
    "CPAP": "continuous positive airway pressure",
    "CVA": "cerebrovascular accident",
    "DNI": "do not intubate",
    "DNR": "do not resuscitate",
    "Dx": "diagnosis",
    "ETOH": "alcohol",
    "FTT": "failure to thrive",
    "Fx": "fracture",
    "GSW": "gunshot wound",
    "HB": "hemoglobin",
    "HT": "height",
    "I&O": "intake and output",
    "IHD": "ischemic heart disease",
    "I/O": "input and output",
    "LFTs": "liver function tests",
    "LLQ": "left lower quadrant",
    "LUQ": "left upper quadrant",
    "MVA": "motor vehicle accident",
    "NAD": "no acute distress",
    "NPO": "nothing by mouth",
    "NTD": "no trauma detected",
    "NSR": "normal sinus rhythm",
    "ORIF": "open reduction internal fixation",
    "PERRLA": "pupils equal round reactive to light and accommodation",
    "RLE": "right lower extremity",
    "RLL": "right lower lobe",
    "RLQ": "right lower quadrant",
    "RUL": "right upper lobe",
    "S/P": "status post",
    "TID": "three times a day",
    "TPN": "total parenteral nutrition",
    "UOP": "urine output",
    "W/U": "workup",
    "XR": "x-ray",
}.items()))
# abbr_dict_manual

# Upper-cased view of the table. Matching is case-insensitive, so the lookup
# key has to be case-normalised as well; otherwise mixed-case entries such as
# "Na", "Dx", "Fx" and "LFTs" can never be found.
_ABBR_EXPANSIONS_UPPER = {
    abbr.upper(): expansion for abbr, expansion in abbr_dict_manual.items()
}

# Compile abbreviation pattern (case-insensitive).
# Longest keys come first so that "F/C/N/V" is tried before its prefix "F/C".
abbr_pattern = re.compile(
    r'\b('
    + '|'.join(
        re.escape(k) for k in sorted(abbr_dict_manual, key=len, reverse=True)
    )
    + r')\b',
    flags=re.IGNORECASE
)

# Remove common English words (select)
custom_stopwords_list = [
    "a", "about", "am", "an", "and",
    "are", "been",
    "but", "by", "did", "do", "does",
    "for", "from", "had", "has", "have",
    "he", "her", "here", "hers", "herself", "him", "himself", "his",
    "into", "is", "it", "its", "itself", "just", "me", "my",
    "myself", "of", "off", "on", "or",
    "our", "ours", "ourselves", "out", "s", "she",
    "so", "some", "t", "that", "the", "their", "theirs",
    "them", "themselves", "there", "these", "they", "this", "those",
    "to", "too", "under", "up", "was", "we", "were", "what", "when",
    "where", "which", "while", "who", "whom", "will", "you", "your",
    "yours", "yourself", "yourselves"
]

# Set view of the stopwords for constant-time membership tests.
_CUSTOM_STOPWORDS = frozenset(custom_stopwords_list)


def _expand_abbreviation(match):
    """Return the expansion for one ``abbr_pattern`` match, or the token itself."""
    token = match.group(0)
    # "he" and "am" are both abbreviations (HE, AM) and ordinary English words.
    # Unless written in capitals they are left alone, so the stopword filter
    # can drop them instead of them becoming "hepatic encephalopathy" or
    # "morning".
    # NOTE(review): other keys that double as English words but are not
    # stopwords (e.g. "us", "art") are still expanded in lower case - confirm
    # whether that is wanted.
    if not token.isupper() and token.lower() in _CUSTOM_STOPWORDS:
        return token
    return _ABBR_EXPANSIONS_UPPER.get(token.upper(), token)


def preprocess_PIH(text):
    """Expand clinical abbreviations in ``text`` and drop common English words.

    ``text`` is converted with ``str()`` first. Abbreviations are expanded
    before the stopword filter runs, so stopwords inside an expansion are
    removed too (for example "H/O" ends up as "history"). Words are split on
    whitespace only, so a stopword with punctuation attached is kept.

    Args:
        text: The text to normalise; any object accepted by ``str()``.

    Returns:
        The processed text with the remaining words joined by single spaces.
    """
    text = str(text)

    # Expand abbreviations (case-insensitive)
    text = abbr_pattern.sub(_expand_abbreviation, text)

    # Filter custom common words
    kept_words = [
        word for word in text.split() if word.lower() not in _CUSTOM_STOPWORDS
    ]
    return ' '.join(kept_words)

# Normalise every history entry: expand abbreviations, then drop stopwords.
extracted_final["Present Illness History"] = (
    extracted_final["Present Illness History"].map(preprocess_PIH)
)

# Run the tokenizer

def generate_bert_embeddings(texts, model, tokenizer, batch_size=100):
    """Embed each text as the attention-masked mean of the last hidden state.

    Texts are tokenized in batches (padded, truncated to 512 tokens), run
    through ``model`` without gradient tracking, and averaged over the token
    positions whose attention mask is non-zero.

    Args:
        texts: Iterable of texts; it is materialised into a list, so a pandas
            Series or a generator works as well as a list.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``last_hidden_state`` tensor.
            If it exposes a ``device`` attribute, the inputs are moved there.
        tokenizer: Callable producing a mapping of tensors that includes
            ``"attention_mask"``.
        batch_size: Number of texts per forward pass.

    Returns:
        A list with one embedding (list of floats) per input text, in input
        order. A text whose attention mask is entirely zero yields a zero
        vector instead of NaNs.
    """
    texts = list(texts)
    device = getattr(model, "device", None)

    all_embeddings = []
    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        tokenized = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        if device is not None:
            # Keep the inputs on the same device as the model weights.
            tokenized = {
                name: tensor.to(device) for name, tensor in tokenized.items()
            }
        with torch.no_grad():
            outputs = model(**tokenized)

        hidden = outputs.last_hidden_state
        mask = tokenized["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        # Token counts are whole numbers, so clamping at 1 only changes the
        # all-padding case, where it prevents a 0/0 division.
        token_counts = mask.sum(dim=1).clamp(min=1)
        all_embeddings.extend((summed / token_counts).cpu().tolist())
    return all_embeddings

# Embed the two free-text columns. Rows lacking either one were removed by the
# earlier dropna; gaps in the other columns were filled with "Not provided".
texts_PIH = extracted_final["Present Illness History"].tolist()
texts_DI = extracted_final["Discharge Instructions"].tolist()

extracted_final["bert_embedding_history"] = generate_bert_embeddings(
    texts_PIH, model, tokenizer
)
extracted_final["bert_embedding_instructions"] = generate_bert_embeddings(
    texts_DI, model, tokenizer
)