<a href="https://colab.research.google.com/github/manikanta5315/Bayes_Assignments/blob/main/Clinical_notes_%26_SDOH_factors_prompt_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio -q
!pip install groq -q
!pip install pandas -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.1/57.1 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.1/320.1 kB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m89.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.2/73.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m130.2/130.2 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.8/108.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import json
import pandas as pd
import gradio as gr
from groq import Groq

In [3]:
# Colab-specific setup: read the Groq API key from the notebook's secrets and
# mount Google Drive.
from google.colab import userdata
from google.colab import drive
# Fetch the secret stored under the name 'groqAPI'.
groq_api_key = userdata.get('groqAPI')
# Export the key so later cells can read it from os.environ['GROQ_API_KEY'].
os.environ['GROQ_API_KEY'] = groq_api_key
# Mount Google Drive at /content/drive.
# NOTE(review): no visible cell reads from the mount — confirm it is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
def load_sdoh_codes(csv_path):
    """Load the SDOH factor -> code lookup table from a CSV file.

    The CSV must contain the columns ``'SDOH factor'`` and ``'Code'``.
    Whitespace around header names, factor names and string codes is ignored
    so that hand-edited spreadsheets still match.

    Args:
        csv_path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        dict: Maps each factor name (stripped and lower-cased) to its code.
        Rows with a missing factor or code are skipped. When the same factor
        appears more than once, the last row wins. An empty dict is returned
        (after printing the error) if the file cannot be read or a required
        column is absent.
    """
    required_columns = ['SDOH factor', 'Code']
    try:
        sdoh_df = pd.read_csv(csv_path)

        # Tolerate headers such as " Code" produced by "a, b" style CSVs.
        sdoh_df = sdoh_df.rename(
            columns=lambda col: col.strip() if isinstance(col, str) else col
        )
        if not all(col in sdoh_df.columns for col in required_columns):
            raise ValueError("Missing required columns in CSV: 'SDOH factor' or 'Code'")

        # Rows with a missing factor or code cannot be matched; drop them
        # instead of producing NaN keys/values in the lookup table.
        sdoh_df = sdoh_df.dropna(subset=required_columns)

        factors = sdoh_df['SDOH factor'].astype(str).str.strip().str.lower()
        # tolist() converts numpy scalars to plain Python values so the codes
        # stay JSON-serializable further down the pipeline.
        codes = [
            code.strip() if isinstance(code, str) else code
            for code in sdoh_df['Code'].tolist()
        ]

        return {factor: code for factor, code in zip(factors, codes) if factor}
    except Exception as e:
        # Best-effort loader: report the problem and fall back to an empty table.
        print(f"Error loading SDOH codes: {e}")
        return {}

In [173]:

def extract_patient_info(clinical_note, model="mixtral-8x7b-32768"):
    """Send the clinical note to the model and return the extracted patient information.

    Args:
        clinical_note: Full text of the clinical note.
        model: Groq model identifier used for the chat completion. The default
            is the value this notebook has always used.
            NOTE(review): Groq appears to have deprecated this model id —
            confirm against Groq's deprecations page and pass a supported
            model if requests are rejected.

    Returns:
        dict: On success, the parsed JSON object produced by the model; it is
        only returned if it contains a ``'patient_information'`` key.
        On failure, a dict with an ``'error'`` message. When the model output
        is not valid JSON the dict also carries the text under
        ``'raw_response'``.
    """
    try:
        # Treat an empty key like a missing one, matching the check done
        # before the app is launched.
        if not os.environ.get('GROQ_API_KEY'):
            raise ValueError("Groq API key not found in environment variables")

        # Do not spend an API call on a note that has no content.
        if clinical_note is None or not str(clinical_note).strip():
            return {"error": "Clinical note is empty"}

        client = Groq(api_key=os.environ['GROQ_API_KEY'])

        # Detailed prompt for comprehensive SDOH factor extraction
        prompt = f"""Extract patient information with precise focus on matchable Social Determinants of Health (SDOH) Factors:

EXTRACTION INSTRUCTIONS:
1. Extract patient name exactly as it appears
2. Identify hospital name and full address
3. List all allergies
4. List all major medical problems
5. SDOH Factors: Precise, extractable social determinants and dont give the sdoh factors which are not matching and also don't mention them.

        **Social Determinants of Health (SDOH) Factors:**

        **Exposure to Radiation**
        This includes subcategories such as occupational radiation exposure, medical diagnostic radiation (X-rays, CT scans), environmental radiation sources (nuclear fallout, radon gas), ultraviolet radiation (sun exposure), and proximity to nuclear facilities.

        **Lack of Access to Clean Water**
        This factor comprises insufficient potable water for drinking, contamination of household water supplies, agricultural irrigation challenges, and dependence on unsafe natural water sources.

        **Exposure to High Noise**
        Includes noise pollution from industrial work environments, urban traffic noise, excessive residential noise, recreational noise exposure (concerts, clubs).

        **Workplace-Related Stress**
        Subcategories include job insecurity, excessive workload, workplace harassment, lack of work-life balance, and poor relationships with coworkers or supervisors.

        **Limited Access to Healthcare Services**
        Encompasses physical distance from healthcare facilities, financial barriers (lack of insurance), long wait times for appointments, absence of specialized care in the area.

        **Poor Housing Conditions**
        Includes inadequate ventilation, overcrowding, exposure to mold or pests, unstable building structures, and lack of heating or cooling systems.

        **Inadequate Nutrition**
        Subcategories include food insecurity, reliance on low-nutrient processed foods, lack of access to fresh produce, malnutrition, and dietary imbalances.

        **Exposure to Nuclear Radiation**
        This involves incidents of nuclear accidents, exposure through medical therapies (radiation treatments), contamination in specific geographic regions, occupational exposure in nuclear plants, and exposure through contaminated food or water.

        **Exposure to Dust and Smoke**
        Includes industrial emissions, exposure to wildfire smoke, indoor air pollution from cooking fuels, occupational dust (construction, mining), and secondhand tobacco smoke.

        **Exposure to Environmental Pollutants**
        Encompasses exposure to chemical spills, pesticide exposure in agricultural settings, industrial waste in water bodies, urban air pollution (smog), and proximity to waste disposal sites.

        note: Dont consider all the SDOH factor. consider only if they match exact.

CLINICAL NOTE: Only match the SDOH factors which are exactly matched. and others dont pass and mention.
{clinical_note}

OUTPUT FORMAT (STRICT JSON):
{{
    "patient_information": {{
        "name": "",
        "hospital": {{
            "name": "",
            "address": ""
        }},
        "allergies": [],
        "major_medical_problems": []
    }},
          "sdoh_factors": [
            "Specific SDOH Factor 1",
            "Specific SDOH Factor 2"
    ]
}}


MATCHING CRITERIA:
- Only include SDOH factors with EXACT matches to predefined codes and dont give the sdoh factors which are not matched and also don't mention them in the output
- Prioritize high-confidence, specific social determinant extractions
- Avoid general or interpretative statements


CRITICAL REQUIREMENTS:
- NO unmatched SDOH factors in final output and also don't mention them
- For the SDOH factor pass only which are exactly matched
- Be specific and comprehensive
- Use verbatim text from the clinical note
- No interpretations, only direct extractions"""

        response = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a precise medical information and SDOH factor extractor. Output ONLY the requested JSON."
                },
                {"role": "user", "content": prompt}
            ],
            model=model,
            temperature=0.1,
            top_p = 0.9,
            max_tokens=5000,
            response_format={"type": "json_object"}
        )

        # A missing message body is handled as empty text so it surfaces as a
        # JSON parsing failure instead of an AttributeError on None.
        response_content = (response.choices[0].message.content or "").strip()

        try:
            data = json.loads(response_content)
        except json.JSONDecodeError as e:
            print(f"JSON Parsing Error: {e}")
            return {
                "error": f"Failed to parse JSON: {str(e)}",
                "raw_response": response_content
            }

        # Validate structure
        if not isinstance(data, dict) or 'patient_information' not in data:
            return {"error": "Invalid response structure"}

        return data

    except Exception as e:
        # Any other failure (missing key, network/API error, ...) is reported
        # to the notebook log and returned as an error dict.
        print(f"Critical Extraction Error: {e}")
        import traceback
        traceback.print_exc()
        return {"error": str(e)}

In [174]:
def match_sdoh_codes(extracted_info, sdoh_codes):
    """Match extracted SDOH factors with their corresponding codes.

    Args:
        extracted_info: Dict holding an ``'sdoh_factors'`` entry (a list of
            factor names; a single string or ``None`` is also tolerated) and,
            optionally, ``'patient_information'``.
        sdoh_codes: Dict mapping lower-cased factor names to codes.

    Returns:
        dict: ``{'patient_information': ..., 'sdoh_factors_with_codes': [...]}``
        where every list entry has the keys ``'original_factor'``,
        ``'matched_code'`` (``"CODE_NOT_FOUND"`` when there is no match) and
        ``'status'`` (``"Exact"`` or ``"Not Found"``).
        ``{'error': ...}`` is returned when ``extracted_info`` is not a dict
        with an ``'sdoh_factors'`` key, or when matching fails unexpectedly.
    """
    try:
        # Validate input
        if not isinstance(extracted_info, dict) or 'sdoh_factors' not in extracted_info:
            return {"error": "Invalid extracted information"}

        sdoh_factors = extracted_info['sdoh_factors']
        if sdoh_factors is None:
            sdoh_factors = []
        elif isinstance(sdoh_factors, str):
            # A bare string would otherwise be iterated character by character.
            sdoh_factors = [sdoh_factors]

        sdoh_with_codes = []
        for factor in sdoh_factors:
            matched_code = "CODE_NOT_FOUND"
            # Non-string entries cannot match any factor name; report them as
            # not found instead of aborting the whole batch.
            if isinstance(factor, str):
                lookup_key = factor.lower()
                if lookup_key not in sdoh_codes:
                    # Retry without surrounding whitespace.
                    lookup_key = lookup_key.strip()
                matched_code = sdoh_codes.get(lookup_key, "CODE_NOT_FOUND")

            sdoh_with_codes.append({
                "original_factor": factor,
                "matched_code": matched_code,
                "status": "Exact" if matched_code != "CODE_NOT_FOUND" else "Not Found"
            })

        # Prepare final result
        return {
            "patient_information": extracted_info.get('patient_information', {}),
            "sdoh_factors_with_codes": sdoh_with_codes
        }

    except Exception as e:
        print(f"Error matching SDOH codes: {e}")
        return {"error": str(e)}

In [175]:
def process_files(clinical_note_path, sdoh_csv_path):
    """Process a clinical note and match its SDOH factors against the code table.

    Args:
        clinical_note_path: Path of the text file holding the clinical note.
        sdoh_csv_path: Path of the CSV file holding the SDOH codes.

    Returns:
        str: JSON text. On success it is the output of ``match_sdoh_codes``,
        with an added ``'warning'`` entry when no SDOH codes could be loaded.
        On failure it is an object with an ``'error'`` entry; errors reported
        by ``extract_patient_info`` are passed through unchanged so the real
        cause stays visible.
    """
    try:
        # A missing upload arrives as None; report it explicitly instead of
        # letting open() fail with an opaque TypeError.
        if clinical_note_path is None:
            return json.dumps({"error": "No clinical note file provided"})
        if sdoh_csv_path is None:
            return json.dumps({"error": "No SDOH codes CSV file provided"})

        # Read clinical note
        with open(clinical_note_path, 'r') as file:
            clinical_note = file.read()

        # Load SDOH codes
        sdoh_codes = load_sdoh_codes(sdoh_csv_path)

        # Extract patient information
        extracted_info = extract_patient_info(clinical_note)

        # Surface extraction failures as-is; passing them to the matcher would
        # replace the message with a generic "Invalid extracted information".
        if isinstance(extracted_info, dict) and 'error' in extracted_info:
            return json.dumps(extracted_info, indent=2)

        # Match SDOH codes
        result = match_sdoh_codes(extracted_info, sdoh_codes)

        # load_sdoh_codes only prints its errors, so tell the caller when
        # every "Not Found" is really caused by an empty code table.
        if not sdoh_codes and isinstance(result, dict):
            result["warning"] = "No SDOH codes could be loaded from the CSV file; no factor can be matched"

        return json.dumps(result, indent=2)

    except Exception as e:
        print(f"Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return json.dumps({"error": str(e)})



In [176]:
def main():
    """Build the Gradio front end around ``process_files`` and launch it."""
    # The two uploads are handed to process_files positionally:
    # clinical note first, SDOH codes CSV second.
    upload_widgets = [
        gr.File(label="Clinical Note"),
        gr.File(label="SDOH Codes CSV"),
    ]
    result_box = gr.Textbox(label="Extracted Information")

    demo = gr.Interface(
        fn=process_files,
        inputs=upload_widgets,
        outputs=result_box,
        title="Clinical Note and SDOH Code Matcher",
        description="Extract patient information and match SDOH factors with codes",
    )

    # Start serving the interface.
    demo.launch()

if __name__ == "__main__":
    # Only start the app when a non-empty Groq API key is available.
    if os.environ.get("GROQ_API_KEY"):
        main()
    else:
        print("Please set your GROQ_API_KEY environment variable")

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://30cb636c372afcd342.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
