# 1. Setup & Libraries

In [None]:
!pip install transformers torch scikit-learn spacy scispacy keybert
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz

#Imports
import re
import json
import torch
import spacy
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, AutoModelForSequenceClassification, AutoModelForTokenClassification
from keybert import KeyBERT

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz (119.8 MB)
  Preparing metadata (setup.py) ... done


# 2. Data Preprocessing

In [None]:
def preprocess_transcript(raw_transcript: str):
    """Turn a raw transcript into a list of ``{'speaker', 'text'}`` turns.

    Blank lines are ignored. A line fully wrapped in square brackets becomes
    an ``'Event'`` turn with the brackets stripped. A line that starts with a
    recognised speaker label followed by a colon becomes a turn whose speaker
    is the capitalised label. Any other line is dropped.
    """
    speaker_line = re.compile(
        r"^(Doctor|Dr\.|Physician|Patient|Pt|Ms\.?\s*Jones?):\s*(.*)$",
        re.IGNORECASE,
    )

    turns = []
    for raw_line in raw_transcript.split('\n'):
        entry = raw_line.strip()
        if entry == '':
            continue

        # Bracketed stage directions such as "[Physical Examination Conducted]".
        if entry.startswith("[") and entry.endswith("]"):
            turns.append({'speaker': 'Event', 'text': entry.strip("[]")})
            continue

        parsed = speaker_line.match(entry)
        if parsed is None:
            # Lines without a known speaker label are skipped.
            continue

        who, said = parsed.groups()
        turns.append({'speaker': who.capitalize(), 'text': said.strip()})
    return turns

def combine_dialogue(dialogue):
    """Join every turn into one string, each prefixed by ``[SPEAKER]`` in upper case."""
    tagged_turns = []
    for turn in dialogue:
        tag = turn['speaker'].upper()
        tagged_turns.append("[{}] {}".format(tag, turn['text']))
    return ' '.join(tagged_turns)

def get_patient_text(dialogue):
    """Return the text of all turns whose speaker is 'patient' (case-insensitive), space-joined."""
    patient_lines = []
    for turn in dialogue:
        if turn['speaker'].lower() != 'patient':
            continue
        patient_lines.append(turn['text'])
    return ' '.join(patient_lines)

raw_text = """Physician: Good morning, Ms. Jones. How are you feeling today?
Patient: Good morning, doctor. I’m doing better, but I still have some discomfort now and then.
Physician: I understand you were in a car accident last September. Can you walk me through what happened?
Patient: Yes, it was on September 1st, around 12:30 in the afternoon. I was driving from Cheadle Hulme to Manchester when I had to stop in traffic. Out of nowhere, another car hit me from behind, which pushed my car into the one in front.
Physician: That sounds like a strong impact. Were you wearing your seatbelt?
Patient: Yes, I always do.
Physician: What did you feel immediately after the accident?
Patient: At first, I was just shocked. But then I realized I had hit my head on the steering wheel, and I could feel pain in my neck and back almost right away.
Physician: Did you seek medical attention at that time?
Patient: Yes, I went to Moss Bank Accident and Emergency. They checked me over and said it was a whiplash injury, but they didn’t do any X-rays. They just gave me some advice and sent me home.
Physician: How did things progress after that?
Patient: The first four weeks were rough. My neck and back pain were really bad—I had trouble sleeping and had to take painkillers regularly. It started improving after that, but I had to go through ten sessions of physiotherapy to help with the stiffness and discomfort.
Physician: That makes sense. Are you still experiencing pain now?
Patient: It’s not constant, but I do get occasional backaches. It’s nothing like before, though.
Physician: That’s good to hear. Have you noticed any other effects, like anxiety while driving or difficulty concentrating?
Patient: No, nothing like that. I don’t feel nervous driving, and I haven’t had any emotional issues from the accident.
Physician: And how has this impacted your daily life? Work, hobbies, anything like that?
Patient: I had to take a week off work, but after that, I was back to my usual routine. It hasn’t really stopped me from doing anything.
Physician: That’s encouraging. Let’s go ahead and do a physical examination to check your mobility and any lingering pain.
[Physical Examination Conducted]
Physician: Everything looks good. Your neck and back have a full range of movement, and there’s no tenderness or signs of lasting damage. Your muscles and spine seem to be in good condition.
Patient: That’s a relief!
Physician: Yes, your recovery so far has been quite positive. Given your progress, I’d expect you to make a full recovery within six months of the accident. There are no signs of long-term damage or degeneration.
Patient: That’s great to hear. So, I don’t need to worry about this affecting me in the future?
Physician: That’s right. I don’t foresee any long-term impact on your work or daily life. If anything changes or you experience worsening symptoms, you can always come back for a follow-up. But at this point, you’re on track for a full recovery.
Patient: Thank you, doctor. I appreciate it.
Physician: You’re very welcome, Ms. Jones. Take care, and don’t hesitate to reach out if you need anything."""
# Parse the sample transcript into speaker-tagged turns, then derive the two
# text views used by the rest of the notebook.
dialogue_data = preprocess_transcript(raw_text)
full_text = combine_dialogue(dialogue_data)  # every turn, prefixed with an upper-case [SPEAKER] tag
patient_text = get_patient_text(dialogue_data)  # patient utterances only, space-joined


# 3. Medical NLP Pipeline
## A. NER Extraction

In [None]:
# Load the scispaCy NER model installed in the setup cell; it is used by
# extract_medical_entities.
# NOTE(review): presumably this model only emits DISEASE / CHEMICAL labels —
# confirm against the scispaCy model card, since the label filter in
# extract_medical_entities lists many other label names.
nlp_scispaCy = spacy.load("en_ner_bc5cdr_md")

def extract_medical_entities(text):
    """Extract symptoms, treatments, diagnoses and prognosis from *text*.

    Two sources are combined: entities returned by the module-level
    ``nlp_scispaCy`` pipeline (routed by entity label) and a keyword fallback
    tuned to the whiplash scenario in this notebook.

    Returns a dict with the keys ``'Symptoms'``, ``'Treatment'``,
    ``'Diagnosis'`` and ``'Prognosis'``. Each value is a list of capitalised,
    de-duplicated strings in first-seen order, or ``["Not Mentioned"]`` when
    nothing was found for that category.
    """
    filtered_entities = {'Symptoms': [], 'Treatment': [], 'Diagnosis': [], 'Prognosis': []}

    # Entity label (lower-cased) -> output category.
    # NOTE(review): if the loaded model only emits DISEASE / CHEMICAL, only the
    # "disease" entry can ever match — confirm, and decide whether CHEMICAL
    # should feed 'Treatment'.
    label_to_category = {
        "symptom": 'Symptoms', "sign": 'Symptoms',
        "treatment": 'Treatment', "therapy": 'Treatment',
        "drug": 'Treatment', "medication": 'Treatment',
        "diagnosis": 'Diagnosis', "disease": 'Diagnosis', "condition": 'Diagnosis',
        "prognosis": 'Prognosis',
    }

    #1. scispaCy entities
    doc = nlp_scispaCy(text)
    for ent in doc.ents:
        category = label_to_category.get(ent.label_.lower())
        if category is None:
            continue
        ent_text = ent.text.strip()
        # avoid adding "pain" as a diagnosis
        if category == 'Diagnosis' and ent_text.lower() == "pain":
            continue
        filtered_entities[category].append(ent_text)

    #2. Regex / keyword fallback
    lower_text = text.lower()
    has_pain = "pain" in lower_text
    if "neck" in lower_text and has_pain:
        filtered_entities['Symptoms'].append("Neck pain")
    if "back" in lower_text and has_pain:
        filtered_entities['Symptoms'].append("Back pain")
    if "head" in lower_text and "hit" in lower_text:
        filtered_entities['Symptoms'].append("Head impact")
    if "whiplash" in lower_text:
        filtered_entities['Diagnosis'].append("Whiplash injury")
    if "physiotherapy" in lower_text:
        filtered_entities['Treatment'].append("Physiotherapy sessions")
    if "painkiller" in lower_text:
        filtered_entities['Treatment'].append("Painkillers")
    # "recover" also covers "recovery"; the emitted prognosis text is fixed and
    # not derived from the transcript.
    if "recover" in lower_text:
        filtered_entities['Prognosis'].append("Full recovery expected within six months")

    # De-duplicate while keeping first-seen order. A set would make the order
    # (and therefore anything that reads element [0]) vary between runs.
    result = {}
    for category, values in filtered_entities.items():
        unique = list(dict.fromkeys(value.strip().capitalize() for value in values))
        result[category] = unique if unique else ["Not Mentioned"]

    return result

### Medical NLP Summarization Output

In [None]:
#Extract from both sources
entities_patient = extract_medical_entities(patient_text)
entities_full = extract_medical_entities(full_text)

# Only the prognosis is taken from the full-transcript extraction; every other
# category stays patient-only.
entities_patient["Prognosis"] = entities_full["Prognosis"]
# medical_entities is an alias of entities_patient (same dict object), not a copy.
medical_entities = entities_patient

print(entities_patient)

{'Symptoms': ['Head impact', 'Back pain', 'Neck pain'], 'Treatment': ['Painkillers', 'Physiotherapy sessions'], 'Diagnosis': ['Whiplash injury', 'Backaches'], 'Prognosis': ['Full recovery expected within six months']}


## B. Text Summarization

In [None]:
from transformers import pipeline

#Load pretrained medical summarization model
# Created once at module level and reused by generate_medical_summary.
summarizer = pipeline("summarization", model="Falconsai/medical_summarization")

def chunk_text(text, max_chunk_len=3500):
    """Split *text* into consecutive chunks of at most *max_chunk_len* characters.

    Each chunk ends just after the last period found within the limit. When
    the window contains no period, the text is cut hard at the limit.
    Concatenating the returned chunks reproduces *text* exactly.

    Args:
        text: The string to split.
        max_chunk_len: Maximum number of characters per chunk; must be >= 1.

    Returns:
        A list of non-empty chunks (empty list for empty *text*).

    Raises:
        ValueError: If *max_chunk_len* is smaller than 1.
    """
    if max_chunk_len < 1:
        raise ValueError("max_chunk_len must be a positive integer")

    chunks = []
    while len(text) > max_chunk_len:
        last_period = text[:max_chunk_len].rfind(".")
        # Cut right after the period, or exactly at the limit when there is
        # none, so a chunk never exceeds max_chunk_len.
        cut = last_period + 1 if last_period != -1 else max_chunk_len
        chunks.append(text[:cut])
        text = text[cut:]
    if text:
        chunks.append(text)
    return chunks

def generate_medical_summary(dialogue, patient_only=True):
    """Summarise the dialogue with the module-level ``summarizer`` pipeline.

    Args:
        dialogue: List of ``{'speaker', 'text'}`` turns.
        patient_only: When true, only turns whose speaker is 'patient'
            (case-insensitive) are summarised; otherwise every turn except
            ``'Event'`` turns.

    Returns:
        The summary text. Long inputs are summarised chunk by chunk and the
        partial summaries are summarised again. Returns ``""`` when the
        selected turns contain no text, instead of failing on an empty list.
    """
    if patient_only:
        selected = [d["text"] for d in dialogue if d["speaker"].lower() == "patient"]
    else:
        selected = [d["text"] for d in dialogue if d["speaker"] != "Event"]
    raw_text = " ".join(selected)

    # Nothing to summarise: avoid calling the model on empty input and the
    # IndexError that indexing an empty result list would raise.
    if not raw_text.strip():
        return ""

    partial_summaries = []
    for chunk in chunk_text(raw_text):
        summary = summarizer(
            chunk,
            max_new_tokens=150,
            min_length=50,
            do_sample=False
        )[0]['summary_text']
        partial_summaries.append(summary)

    if len(partial_summaries) > 1:
        # Second pass: condense the per-chunk summaries into one.
        final_text = " ".join(partial_summaries)
        final_summary = summarizer(
            final_text,
            max_new_tokens=200,
            min_length=80,
            do_sample=False
        )[0]['summary_text']
        return final_summary

    return partial_summaries[0]

def enrich_summary_with_entities(summary_text, entities):
    """Append one sentence per populated entity category to the stripped summary.

    A category is skipped when it is missing, empty, or equal to
    ``["Not Mentioned"]``. Categories are appended in the order Symptoms,
    Treatment, Diagnosis, Prognosis.
    """
    # (entity key, sentence lead-in) pairs, in output order.
    sections = (
        ("Symptoms", " Reported symptoms include "),
        ("Treatment", " Treatments included "),
        ("Diagnosis", " Diagnosis: "),
        ("Prognosis", " Prognosis: "),
    )

    pieces = [summary_text.strip()]
    for key, lead_in in sections:
        values = entities.get(key)
        if not values or values == ["Not Mentioned"]:
            continue
        pieces.append(f"{lead_in}{', '.join(values)}.")
    return "".join(pieces)

# Summarise the patient's turns only, then append the extracted entities as
# plain sentences.
summary_raw = generate_medical_summary(dialogue_data, patient_only=True)
summary = enrich_summary_with_entities(summary_raw, entities_patient)

print("\n--- Enriched Medical Summary ---\n", summary)


Device set to use cpu



--- Enriched Medical Summary ---
 we report a case of a whiplash injury in a car accident . the first four weeks were rough . my neck and back pain were really bad . it started improving after that , but I had to go through ten sessions of physiotherapy to help with the stiffness and discomfort . Reported symptoms include Head impact, Back pain, Neck pain. Treatments included Painkillers, Physiotherapy sessions. Diagnosis: Whiplash injury, Backaches. Prognosis: Full recovery expected within six months.


## C. Keyword Extraction

In [None]:
from keybert import KeyBERT
from difflib import SequenceMatcher

# KeyBERT instance shared by extract_keywords.
kw_model = KeyBERT(model="distilbert-base-nli-mean-tokens")

# Conversation filler: a keyphrase is dropped by extract_keywords only when it
# equals one of these entries exactly (after lower-casing).
medical_stopwords = {"patient", "doctor", "physician", "ms", "ms jones", "thank", "morning"}

def extract_keywords(text, top_n=6, domain_stopwords=medical_stopwords):
    """Extract up to *top_n* keyphrases (1-3 words) from *text* with KeyBERT.

    Args:
        text: Text to analyse.
        top_n: Number of candidates requested from the module-level ``kw_model``.
        domain_stopwords: Lower-case phrases to discard when a keyphrase
            matches one of them exactly.

    Returns:
        Capitalised keyphrases in the order returned by the model, without
        stopword phrases and without repeats.
    """
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        top_n=top_n,
    )
    cleaned = []
    # Track lower-cased phrases separately: `cleaned` holds capitalised text,
    # so checking the lower-cased phrase against it would not catch repeats.
    seen = set()
    for kw, _score in keywords:
        kw_lower = kw.lower().strip()
        if kw_lower in domain_stopwords or kw_lower in seen:
            continue
        seen.add(kw_lower)
        cleaned.append(kw_lower.capitalize())
    return cleaned

# Keyphrases from three views of the consultation: the tagged full transcript,
# the patient's utterances, and the enriched summary.
keywords_full = extract_keywords(full_text, top_n=6)
keywords_patient = extract_keywords(patient_text, top_n=4)
keywords_summary = extract_keywords(summary, top_n=4)

# Flatten the extracted entities into one keyword list, skipping categories
# that hold only the "Not Mentioned" placeholder.
entity_keywords = [x.capitalize() for k,v in medical_entities.items() if v and v != ["Not Mentioned"] for x in v]

#Merge with prioritization
# Entity keywords go first so that a first-seen-wins deduplication keeps them
# over similar KeyBERT phrases.
merged = entity_keywords + keywords_full + keywords_patient + keywords_summary

#Deduplication (character-level string similarity via difflib.SequenceMatcher, not semantic similarity)
def deduplicate_keywords(keywords, threshold=0.75):
    """Drop keywords that closely resemble one already kept.

    Keywords are visited in order. A keyword is kept unless its
    ``SequenceMatcher`` ratio against some previously kept keyword is strictly
    greater than *threshold*.
    """
    kept = []
    for candidate in keywords:
        for existing in kept:
            if SequenceMatcher(None, candidate, existing).ratio() > threshold:
                # Too close to something already kept.
                break
        else:
            kept.append(candidate)
    return kept

# Collapse near-identical phrases in the merged list; earlier entries win.
final_keywords_clean = deduplicate_keywords(merged)

print("\n--- Final Balanced Keywords ---\n", final_keywords_clean)



--- Final Balanced Keywords ---
 ['Head impact', 'Back pain', 'Painkillers', 'Physiotherapy sessions', 'Whiplash injury', 'Backaches', 'Full recovery expected within six months', 'Car accident september', 'Nervous driving haven', 'Car accident', 'Traffic car hit', '30 afternoon driving', 'Feel nervous driving', 'Car hit pushed', 'Accident weeks rough', 'Injury car accident']


## D. Assemble Structured JSON Summary ⭐

In [None]:
import json

def construct_structured_summary(patient_name, entities):
    """Serialise the patient name and the four entity categories as indented JSON.

    Categories missing from *entities* default to ``["Not Mentioned"]``.
    The summary text and keywords are not part of the output.
    """
    record = {"Patient_Name": patient_name}
    for field in ("Symptoms", "Diagnosis", "Treatment", "Prognosis"):
        record[field] = entities.get(field, ["Not Mentioned"])
    return json.dumps(record, indent=2)


# The entities were already extracted (and the prognosis merged in) in the NER
# cell. Re-running the extraction on the patient text here would rebind
# medical_entities to a result without that merged prognosis and repeat the
# NER pass, so the existing entities_patient / medical_entities are reused.
# The summary text and keywords are not included in the structured output.
structured_summary = construct_structured_summary(
    patient_name="Ms. Jones",
    entities=entities_patient,
)

print("\n--- Structured JSON Output ---\n")
print(structured_summary)



--- Structured JSON Output ---

{
  "Patient_Name": "Ms. Jones",
  "Symptoms": [
    "Head impact",
    "Back pain",
    "Neck pain"
  ],
  "Diagnosis": [
    "Whiplash injury",
    "Backaches"
  ],
  "Treatment": [
    "Painkillers",
    "Physiotherapy sessions"
  ],
  "Prognosis": [
    "Full recovery expected within six months"
  ]
}


## Questions:
1. Handling ambiguous or missing data
In my code, if an entity category such as Symptoms, Treatment, Diagnosis, or Prognosis isn't detected, I default it to "Not Mentioned". This way, the summary is still complete and clearly shows where data is missing instead of making assumptions. I also take the prognosis from the full-transcript extraction and merge it into the patient-only entities, so that details stated only by the physician aren't lost.

2. Pre-trained NLP models for summarization
I'd use transformer-based models fine-tuned for medical or scientific text. For example:
  *   scispaCy for NER (disease/symptom/treatment extraction).
  *   T5 / BART variants fine-tuned on clinical notes for abstractive summarization.
  *   KeyBERT for keyword extraction to complement the summary.

This way, the structured summary is clinically relevant and still explainable.

# 4. Sentiment & Intent Analysis
## A. Sentiment Classification ⭐

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import json

#pre-trained sentiment model
model_name = "bhadresh-savani/bert-base-uncased-emotion"
sentiment_tokenizer = AutoTokenizer.from_pretrained(model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(model_name)

emotion_to_sentiment = {
    'fear': 'Anxious',
    'sadness': 'Anxious',
    'anger': 'Anxious',
    'joy': 'Reassured',
    'love': 'Reassured',
    'surprise': 'Neutral'
}
model_labels = ['anger', 'joy', 'sadness', 'fear', 'love', 'surprise']


def classify_sentiment(text):
    """Classify *text* as 'Anxious', 'Reassured' or 'Neutral'.

    Text shorter than three non-whitespace-trimmed characters is returned as
    'Neutral' without running the model. Otherwise the module-level emotion
    model predicts an emotion, which is mapped through ``emotion_to_sentiment``.
    Emotions missing from that mapping fall back to 'Neutral'.
    """
    if len(text.strip()) < 3:
        return "Neutral"
    inputs = sentiment_tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = sentiment_model(**inputs).logits
    # A single text is passed in, so row 0 holds its scores. Softmax is not
    # needed: it does not change which index is largest.
    predicted_idx = int(logits[0].argmax())
    # Resolve the label through the checkpoint's config rather than a
    # hard-coded list, whose order is not guaranteed to match the logits.
    predicted_label = sentiment_model.config.id2label[predicted_idx].lower()
    return emotion_to_sentiment.get(predicted_label, "Neutral")


def sentiment_by_speaker(dialogue):
    """Classify every non-blank turn and aggregate the labels per speaker.

    Returns:
        A 3-tuple ``(sentiment_results, distribution, patient_majority)``:
        ``sentiment_results`` maps speaker -> list of labels in turn order
        ("Patient" and "Physician" are always present; other speakers are
        added as encountered); ``distribution`` maps speaker -> {label: count};
        ``patient_majority`` is the patient's most frequent label, with ties
        resolved in favour of the label seen first, or "Neutral" when the
        patient has no classified turns.
    """
    from collections import Counter

    sentiment_results = {"Patient": [], "Physician": []}

    for turn in dialogue:
        if not turn["text"].strip():
            continue
        label = classify_sentiment(turn["text"])
        sentiment_results.setdefault(turn["speaker"], []).append(label)

    distribution = {
        speaker: dict(Counter(labels))
        for speaker, labels in sentiment_results.items()
    }

    # Counter.most_common orders equal counts by first occurrence, which keeps
    # the tie-break stable between runs (iterating a set of strings is not).
    patient_counts = Counter(sentiment_results.get("Patient", []))
    patient_majority = patient_counts.most_common(1)[0][0] if patient_counts else "Neutral"

    return sentiment_results, distribution, patient_majority


def get_patient_sentiment(sentiment_results):
    """
    Reduce the patient's per-turn labels to one label by precedence:
    - no patient labels at all → Neutral
    - any Anxious → Anxious
    - otherwise any Neutral → Neutral
    - otherwise → Reassured
    """
    observed = sentiment_results.get("Patient", [])
    if not observed:
        return "Neutral"

    # Highest-precedence label that occurs at least once wins.
    for level in ("Anxious", "Neutral"):
        if level in observed:
            return level
    return "Reassured"

def detect_intent(text):
    """Return the list of intents suggested by keywords in *text*.

    Possible intents, in output order: "Seeking reassurance",
    "Reporting symptoms", "Expressing gratitude", "Answering question".
    ``["General conversation"]`` is returned when none applies.

    The first three categories match word stems as substrings (so "backaches"
    and "thanks" are found). The short answer words are matched as whole words
    only; as substrings they also fire inside unrelated words such as "now",
    "pressure" or "defined". Negation is not handled: "not nervous" still
    counts as a concern keyword.
    """
    t = text.lower()

    def mentions(stems):
        """Return True if any stem occurs anywhere in the lower-cased text."""
        return any(stem in t for stem in stems)

    intents = []
    if mentions(('worry', 'concern', 'fear', 'anxious', 'nervous', 'scared', 'afraid')):
        intents.append("Seeking reassurance")
    if mentions(('pain', 'symptom', 'hurt', 'ache', 'discomfort', 'stiffness')):
        intents.append("Reporting symptoms")
    if mentions(('thank', 'appreciate', 'grateful', 'thanks')):
        intents.append("Expressing gratitude")
    if re.search(r"\b(?:okay|yes|no|fine|alright|sure)\b", t):
        intents.append("Answering question")

    return intents if intents else ["General conversation"]


# Classify every turn. Only the per-speaker label lists are used below; the
# majority-vote value (third element) is discarded.
sentiments, sentiment_dist, _ = sentiment_by_speaker(dialogue_data)
# The final patient label uses the Anxious > Neutral > Reassured precedence
# rule rather than the majority vote.
patient_sentiment = get_patient_sentiment(sentiments)
# Intent is detected once over all patient utterances joined together, so
# several intents can be reported at the same time.
patient_intent = detect_intent(patient_text)

sentiment_intent_output = json.dumps({
    "Sentiment": patient_sentiment,
    "Intent": patient_intent
}, indent=2)

print("--- Patient Sentiment & Intent ---")
print(sentiment_intent_output)


--- Patient Sentiment & Intent ---
{
  "Sentiment": "Anxious",
  "Intent": [
    "Seeking reassurance",
    "Reporting symptoms",
    "Expressing gratitude",
    "Answering question"
  ]
}


## Questions:
1. Fine-tuning BERT for medical sentiment detection
Right now, I'm using a pre-trained BERT emotion model and mapping its emotions into clinical categories like Anxious, Reassured, Neutral. If I were to fine-tune it, I'd take a BERT base model (or ClinicalBERT) and train it directly on labeled patient-doctor dialogues or clinical notes, with sentiment labels adjusted to the medical context. That way, instead of mapping generic emotions, the model would learn healthcare-specific sentiment patterns.

2. Datasets for healthcare-specific sentiment
I'd use datasets like:

*   MIMIC-III / MIMIC-IV clinical notes (these do not ship with sentiment labels, so patient emotional states would need to be annotated first).
*   i2b2 challenge datasets (for patient-doctor communication).
*   Any curated medical conversation datasets (telehealth or discharge summaries) with sentiment tags.

This would give the model domain grounding instead of just general emotional text.

# 5. SOAP Note Generation ⭐

In [None]:
def generate_soap_note(dialogue, entities=None):
    """Build a SOAP note (Subjective, Objective, Assessment, Plan) as indented JSON.

    Args:
        dialogue: Parsed dialogue turns. Currently unused; kept for interface
            compatibility.
        entities: Optional dict as produced by ``extract_medical_entities``.
            When falsy, every entity-derived field is left as an empty string.

    Returns:
        A JSON string with the keys "Subjective", "Objective", "Assessment"
        and "Plan".

    NOTE(review): the Objective section, the severity and the follow-up text
    are fixed strings and are not derived from ``dialogue`` — confirm this is
    acceptable before using the note outside this sample transcript.
    """
    placeholder = "Not Mentioned"

    def mentioned(key):
        """Return the entity values for *key* without the placeholder entry."""
        return [value for value in entities.get(key, []) if value != placeholder]

    subjective = {"Chief_Complaint": "", "History_of_Present_Illness": ""}
    objective = {"Physical_Exam": "", "Observations": ""}
    assessment = {"Diagnosis": "", "Severity": ""}
    plan = {"Treatment": "", "Follow_Up": ""}

    #SUBJECTIVE
    if entities:
        symptoms = mentioned("Symptoms")
        diagnoses = mentioned("Diagnosis")
        treatments = mentioned("Treatment")

        # Chief complaint = first symptom mentioned. Falls back to the
        # placeholder when the list is missing, empty, or placeholder-only,
        # instead of failing on an empty list.
        subjective["Chief_Complaint"] = symptoms[0] if symptoms else placeholder

        # HPI = short stitched story
        hpi_parts = []
        if "Back pain" in symptoms:
            hpi_parts.append("Patient reports neck and back pain following a car accident.")
        if "Whiplash injury" in diagnoses:
            hpi_parts.append("Initially diagnosed with whiplash injury.")
        if "Physiotherapy sessions" in treatments:
            hpi_parts.append("Completed physiotherapy with improvement.")
        if "Backaches" in diagnoses:
            hpi_parts.append("Still has occasional backaches.")

        subjective["History_of_Present_Illness"] = " ".join(hpi_parts)

    #OBJECTIVE
    objective["Physical_Exam"] = "Full range of motion, no tenderness, no lasting damage."
    objective["Observations"] = "Patient appears in normal health, normal gait."

    #ASSESSMENT
    if entities:
        assessment["Diagnosis"] = ", ".join(entities.get("Diagnosis", []))
    assessment["Severity"] = "Mild, improving with treatment."

    #PLAN
    if entities:
        if treatments:
            plan["Treatment"] = f"Continue {', '.join(treatments).lower()} as needed."
        else:
            # Avoid "Continue not mentioned as needed." / "Continue  as needed."
            plan["Treatment"] = placeholder
    plan["Follow_Up"] = "Return if symptoms worsen or persist beyond six months."

    soap_note = {
        "Subjective": subjective,
        "Objective": objective,
        "Assessment": assessment,
        "Plan": plan
    }
    return json.dumps(soap_note, indent=2)

# Build the SOAP note from the parsed dialogue and the extracted entities.
soap_note = generate_soap_note(dialogue_data, entities=medical_entities)
print("--- SOAP Note ---")
print(soap_note)



--- SOAP Note ---
{
  "Subjective": {
    "Chief_Complaint": "Head impact",
    "History_of_Present_Illness": "Patient reports neck and back pain following a car accident. Initially diagnosed with whiplash injury. Completed physiotherapy with improvement. Still has occasional backaches."
  },
  "Objective": {
    "Physical_Exam": "Full range of motion, no tenderness, no lasting damage.",
    "Observations": "Patient appears in normal health, normal gait."
  },
  "Assessment": {
    "Diagnosis": "Whiplash injury, Backaches",
    "Severity": "Mild, improving with treatment."
  },
  "Plan": {
    "Treatment": "Continue painkillers, physiotherapy sessions as needed.",
    "Follow_Up": "Return if symptoms worsen or persist beyond six months."
  }
}


## Questions:
1. Training an NLP model for SOAP mapping
In my pipeline, I'm using a rule-based approach (mapping extracted entities such as “Back pain” into the Subjective section, while the Objective section is currently filled from a fixed template). To train a model, I'd collect a dataset of annotated medical transcripts with corresponding SOAP notes and fine-tune a sequence-to-sequence model (like T5 or BioBART) so it learns to directly map raw dialogue into structured SOAP fields.

2. Improving accuracy with rule-based + deep learning
I'd combine my current keyword/rule-based extraction with deep learning summarization or classification. Rules ensure reliability for obvious patterns, while a transformer model can handle context, rephrasing, and missing details, giving more accurate and natural SOAP outputs.