In [None]:

!pip install torch transformers pandas


In [None]:

import os
import re

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


In [None]:

# Load dataset.
# The location defaults to the original local MIMIC-III checkout and can be
# overridden by setting the MIMIC_NOTEEVENTS_PATH environment variable, so the
# notebook can run on another machine without editing this cell.
file_path = os.environ.get(
    "MIMIC_NOTEEVENTS_PATH",
    "/home/hq6375/Desktop/Code/MIMIC-III/physionet.org/files/mimiciii/1.4/NOTEEVENTS.csv",
)
df = pd.read_csv(file_path, low_memory=False)

# Display dataset info
df.info()
df.head()


In [None]:

# Note categories kept for the analysis
RELEVANT_CATEGORIES = ["Nursing", "Nursing/other", "Social Work"]
is_relevant = df["CATEGORY"].isin(RELEVANT_CATEGORIES)

# Restrict to those categories, then discard rows whose note text is missing
df = df.loc[is_relevant].dropna(subset=["TEXT"])

# Summarize the filtered frame and preview its first rows
df.info()
df.head()


In [None]:

def clean_text(text):
    """Normalize a raw note into a lowercase, single-spaced string.

    Args:
        text: Raw note text (str).

    Returns:
        The text with every character other than word characters,
        whitespace, '.' and ',' removed, each run of whitespace (including
        newlines) collapsed to a single space, leading/trailing whitespace
        stripped, and letters lowercased.
    """
    # Strip special characters first: deleting them can leave doubled or
    # leading/trailing spaces, which the whitespace pass below then cleans up.
    # NOTE(review): characters are deleted, not replaced by a space, so tokens
    # joined by a symbol are merged (e.g. "120/80" becomes "12080") - confirm
    # this is acceptable for downstream use.
    text = re.sub(r"[^\w\s.,]", "", text)
    # \s+ also matches newlines, so one pass flattens line breaks and
    # collapses repeated spaces.
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()

# Clean every note and store the result in a new column
df["CLEANED_TEXT"] = df["TEXT"].map(clean_text)

# Preview the category alongside the cleaned text
df.loc[:, ["CATEGORY", "CLEANED_TEXT"]].head()


In [None]:

MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"

# Pick the weight dtype explicitly. Loading an 8B-parameter model in the
# default float32 needs roughly 32 GB for the weights alone, so use a
# half-precision dtype on GPU (bfloat16 when the hardware supports it) and
# keep float32 on CPU.
if device == "cuda":
    model_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    model_dtype = torch.float32

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=model_dtype).to(device)

print("Model loaded successfully.")


In [None]:

def extract_information(note, max_input_tokens=2048, max_new_tokens=512):
    """Ask the language model to extract readmission-relevant facts from a note.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        note: Note text to analyse (str).
        max_input_tokens: Upper bound on prompt length in tokens; longer
            prompts are truncated by the tokenizer.
        max_new_tokens: Maximum number of tokens the model may generate.

    Returns:
        The text generated by the model only (str). The prompt, i.e. the
        instruction plus the note, is not included.
    """
    instruction = """Extract information relevant to hospital readmission:
    1. Past hospitalizations mentioned.
    2. Medications prescribed at discharge.
    3. Symptoms & conditions at discharge.
    4. Follow-up instructions and care plans.
    Provide a structured JSON output."""

    # NOTE(review): MODEL_NAME is an instruct-tuned checkpoint; consider
    # building the prompt with the tokenizer's chat template - confirm.
    input_text = f"{instruction}\n\n{note}"

    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_tokens,
    ).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            # Set the padding token explicitly instead of relying on generate()'s fallback
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns the prompt tokens followed by the new tokens; drop
    # the prompt so the result is the extraction, not an echo of the input.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]

    extracted_info = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return extracted_info.strip()

print("Function for extraction is ready.")


In [None]:

# Apply extraction on a reproducible sample (up to 100 notes for monitoring)
SAMPLE_SIZE = 100
SAMPLE_SEED = 42

# Cap the sample at the number of available rows (sampling more rows than
# exist raises ValueError) and fix the seed so re-runs pick the same notes.
sample_notes = df["CLEANED_TEXT"].sample(
    n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED
)

# Assignment aligns on the index: only the sampled rows receive a value,
# every other row is left as NaN.
df["EXTRACTED_INFO"] = sample_notes.apply(extract_information)

# Display extracted information for rows that were actually processed
df.loc[sample_notes.index, ["CATEGORY", "EXTRACTED_INFO"]].head()


In [None]:

# Save cleaned & extracted data.
# The destination defaults to the original local path and can be overridden
# with the MIMIC_PROCESSED_NOTES_PATH environment variable.
output_path = os.environ.get(
    "MIMIC_PROCESSED_NOTES_PATH",
    "/home/hq6375/Desktop/Code/MIMIC-III/processed_notes.csv",
)

# Create the destination directory if needed; to_csv does not create it.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df.to_csv(output_path, index=False)

print(f"Processed data saved at: {output_path}")
