In [None]:
import re
import unicodedata
import pandas as pd
from typing import List

In [None]:
# Raw HealthCareMagic dump, relative to the notebook's working directory.
# Forward slashes resolve on Windows as well as POSIX; with backslashes the whole
# string is treated as a single file name on Linux/macOS and the load fails.
health_care_path = "../data/raw/HealthCareMagic-100k.json"
df = pd.read_json(health_care_path)

# Preview of the raw frame before any cleaning.
df.head(200)

Unnamed: 0,instruction,input,output
0,"If you are a doctor, please answer the medical...",I woke up this morning feeling the whole room ...,"Hi, Thank you for posting your query. The most..."
1,"If you are a doctor, please answer the medical...",My baby has been pooing 5-6 times a day for a ...,Hi... Thank you for consulting in Chat Doctor....
2,"If you are a doctor, please answer the medical...","Hello, My husband is taking Oxycodone due to a...","Hello, and I hope I can help you today.First, ..."
3,"If you are a doctor, please answer the medical...",lump under left nipple and stomach pain (male)...,HI. You have two different problems. The lump ...
4,"If you are a doctor, please answer the medical...",I have a 5 month old baby who is very congeste...,Thank you for using Chat Doctor. I would sugge...
...,...,...,...
195,"If you are a doctor, please answer the medical...",i am taking glycomate sr igram at night before...,"Hi Mr. Mira, As you have not mentioned the dur..."
196,"If you are a doctor, please answer the medical...",My mother is having lightheadiness and heart p...,"Hi, Lightheadedness associated with palpitatio..."
197,"If you are a doctor, please answer the medical...",I am 27 years old female. I am having right lo...,Hi ! Good morning. I am Chat Doctor answering ...
198,"If you are a doctor, please answer the medical...","Respected Sir,My name is Nishanth singh gurung...","Thanks for your query, I have gone through you..."


In [12]:
# Patterns that identify a greeting/filler sentence. Each one is anchored at the
# start of a sentence and compiled case-insensitively below.
GREET_PATTERNS = [
    # common patterns (case-insensitive)
    # The bare salutations carry a trailing \b so that they only match the whole
    # word: without it "^hi" also matches "History ..." / "High ..." / "His ..."
    # and real content is treated as a greeting.
    r"^hi\b[,.\s]*",
    r"^hello\b[,.\s]*",
    r"^good\s+(morning|evening|day)[,.\s]*",
    r"^dear\b[\s\w:,-]*[,.\s]*",                    # Dear Name:
    r"^thank(s| you)( for)? (posting|your query|for your query|for the query|for asking|for contacting|for the query|for your question|for the query)[,.\s]*",
    r"^thanks( for)?( your)?( query| posting| for posting| for contacting| for asking)?[,.\s]*",
    r"^welcome( to)?( the)?( chat\s*doctor| chatdoctor| chat\s*doctor!| chatdoctor!| chat\s*doctor[,.\s]*)?",
    r"^welcome (and )?thank(s| you)[,.\s]*",
    r"^i\s+(have\s+)?(read|gone\s+through|studied)\s+(your|the)\s+(query|question|case)[,.\s]*",
    r"^i\s+understand\s+your\s+concern[s]?[.,\s]*",
    r"^i\s+can\s+understand\s+your\s+concern[s]?[.,\s]*",
    r"^thank you for writing( to us| about your query)?[,.\s]*",
    r"^thanks for posting( your query)?[,.\s]*",
    r"^thanks for contacting( us)?[,.\s]*",
    r"^thanks for choosing( chat\s*doctor)?[,.\s]*",
    r"^thank you for your query[,.\s]*",
    r"^thanks for your query[,.\s]*",
    r"^thank you for asking( chat\s*doctor)?[,.\s]*",
    r"^welcome to (chat\s*doctor|chatdoctor)[,.\s]*",
    r"^forum[.,\s]*$",
    r"^on chat\s*doctor[.,\s]*",
    r"^chat\s*doctor[.,\s]*",
    r"^thank-you for providing the brief history[,.\s]*",
    r"^dear\b[,\s]*",  # generic dear
    # short salutations like "Good morning.", "Good evening sir."
    r"^good (morning|evening|day|afternoon)[,.\s]*",
]

# Fragments removed wherever they occur in the text (not only at the start).
GLOBAL_REMOVE = [
    r"\bchat[-\s]*doctor\b",
    r"\bthanks for (posting|your query|the query|asking|contacting)\b",
    r"\bthank you for (posting|your query|asking|contacting)\b",
    r"\bthanks for your query\b",
    r"\bwelcome to chatdoctor\b",
    r"\bwelcome to chat doctor\b",
    r"\bthanks for posting\b",
    r"\bthanks for asking\b",
    r"\bthank you for asking\b",
    r"\bthanks for contacting\b",
    r"\bthanks\b\.?$",
    r"\bforum\b\.?",
    r"\bin\.?$",   # stray "in." tokens
]

# Compile once at import time; every pattern is matched case-insensitively.
GREET_RE = [re.compile(p, flags=re.I) for p in GREET_PATTERNS]
GLOBAL_RE = [re.compile(p, flags=re.I) for p in GLOBAL_REMOVE]

# small helper to collapse weird spaced letters like "T hank" -> "Thank"
_SPLICED_LETTER_RE = re.compile(r'\b(?:[A-Za-z]\s){2,}[A-Za-z]\b')

def _collapse_spliced_letters(s: str) -> str:
    """
    Collapse sequences like 'T hank' or 'W a n d a' into 'Thank' / 'Wanda' for short sequences.
    Only apply to sequences <= ~30 chars to be safe.
    """
    def repl(m):
        t = re.sub(r'\s+', '', m.group(0))
        return t
    
    return _SPLICED_LETTER_RE.sub(repl, s)

def _split_sentences(text: str) -> List[str]:
    # simple sentence splitter by punctuation
    parts = re.split(r'(?<=[\.\?\!])\s+', text)
    # also split on newlines
    exploded = []
    for p in parts:
        exploded.extend([s.strip() for s in p.splitlines() if s.strip()])
        
    return [s for s in exploded if s.strip()]

# Filler words that flag a short leading sentence as a greeting. They are matched
# as whole words: a plain substring test lets "hi" fire inside "this" or "which"
# and throws away real content. "thank\w*" still covers "thanks", "thanking", ...
_FILLER_TOKEN_RE = re.compile(
    r"\b(?:thank\w*|welcome|forum|dear|hi|hello|good morning|good evening)\b",
    flags=re.I,
)

# Leading "I have read / gone through ..." and "I (can) understand ..." openers.
_ACKNOWLEDGEMENT_RE = re.compile(
    r'(?i)^(i\s+(have\s+)?(read|gone\s+through|studied|seen)|i\s+(understand|can\s+understand))'
)

# A sentence with at most this many words is dropped when it contains a filler word.
_MAX_FILLER_WORDS = 6


def clean_healthcaremagic_text_v2(text: str, max_leading_sentences: int = 6) -> str:
    """
    Strip greeting/filler boilerplate from a HealthCareMagic-style doctor reply.

    Steps:
      1. Normalise unicode (NFKC), drop zero-width spaces, collapse whitespace.
      2. Re-join letter-by-letter spaced words and blank out the "chat doctor" /
         "welcome to" service markers.
      3. Drop leading sentences while they look like greetings or acknowledgements
         (at most ``max_leading_sentences`` are examined).
      4. Remove the GLOBAL_RE fragments anywhere in what is left, then tidy
         leftover connectors, whitespace and leading punctuation.

    Args:
        text: Reply to clean. Empty strings and non-string values (None, NaN) are
            returned unchanged.
        max_leading_sentences: How many sentences from the start may be inspected
            and dropped.

    Returns:
        The cleaned text. If every sentence was classified as filler, the
        normalised full text is used as the starting point for step 4 instead of
        an empty string.
    """
    # NaN is truthy, so a bare `if not text` would let it reach unicodedata and
    # raise TypeError; treat every non-string as "nothing to clean".
    if not isinstance(text, str) or not text:
        return text

    # normalize unicode and whitespace
    text = unicodedata.normalize("NFKC", text)
    text = text.replace('\r', ' ')
    text = re.sub(r'\u200b', '', text)  # zero-width
    text = re.sub(r'\s+', ' ', text).strip()

    # collapse letter-by-letter spaced words, e.g. "W a n d a" -> "Wanda"
    text = _collapse_spliced_letters(text)

    # Remove the service markers. `chat\s*doctor` already covers "chatdoctor", and
    # once it has run "on chat doctor" can no longer occur, so one pass suffices.
    text = re.sub(r'(?i)chat\s*doctor', ' ', text)
    text = re.sub(r'(?i)welcome\sto', ' ', text)

    # split into sentences and drop initial greeting/filler sentences
    sents = _split_sentences(text)

    keep_idx = 0
    for i, raw_sentence in enumerate(sents[:max_leading_sentences]):
        sentence = raw_sentence.strip()

        is_short_filler = (
            len(sentence.split()) <= _MAX_FILLER_WORDS
            and _FILLER_TOKEN_RE.search(sentence) is not None
        )
        is_greeting = any(pat.match(sentence) for pat in GREET_RE)
        is_acknowledgement = _ACKNOWLEDGEMENT_RE.match(sentence) is not None

        # stop at the first "real" sentence; everything from there on is kept
        if not (is_short_filler or is_greeting or is_acknowledgement):
            break
        keep_idx = i + 1

    # reconstruct remaining text; if nothing is left, fall back to the full
    # normalised text and rely on the global removals below
    remaining = " ".join(sents[keep_idx:])
    if not remaining:
        remaining = text

    # global removals of common filler fragments inside text
    for rep in GLOBAL_RE:
        remaining = rep.sub(" ", remaining)

    # remove common connectors left at the start e.g. "and thank you ..." or "and welcome ..."
    remaining = re.sub(r'(?i)^[\s,;:-]*(and\s+)?(thank(s| you)|welcome)[\s,;:-]*', "", remaining)

    # final cleanup: collapse whitespace and strip stray leading punctuation
    remaining = re.sub(r'\s+', ' ', remaining).strip()
    remaining = re.sub(r'^[\s,.:;!?-]+', '', remaining)
    remaining = remaining.strip()

    return remaining

# Build the cleaned frame without touching `df`: every column is carried over and
# only `output` is replaced by its cleaned version.
df_cleaned = df.assign(output=df['output'].apply(clean_healthcaremagic_text_v2))

In [13]:
# Spot-check the first 200 rows of the cleaned frame.
df_cleaned.head(200)

Unnamed: 0,instruction,input,output
0,"If you are a doctor, please answer the medical...",I woke up this morning feeling the whole room ...,The most likely cause for your symptoms is ben...
1,"If you are a doctor, please answer the medical...",My baby has been pooing 5-6 times a day for a ...,It seems your kid is having viral diarrhea. On...
2,"If you are a doctor, please answer the medical...","Hello, My husband is taking Oxycodone due to a...",Medications can only affect a fetus if you tak...
3,"If you are a doctor, please answer the medical...",lump under left nipple and stomach pain (male)...,You have two different problems. The lump unde...
4,"If you are a doctor, please answer the medical...",I have a 5 month old baby who is very congeste...,I would suggest that you see your doctor. Your...
...,...,...,...
195,"If you are a doctor, please answer the medical...",i am taking glycomate sr igram at night before...,"Mira, As you have not mentioned the duration o..."
196,"If you are a doctor, please answer the medical...",My mother is having lightheadiness and heart p...,Heart rate and blood pressure can be normal in...
197,"If you are a doctor, please answer the medical...",I am 27 years old female. I am having right lo...,"I am answering your query. From your history, ..."
198,"If you are a doctor, please answer the medical...","Respected Sir,My name is Nishanth singh gurung...",The sensitivity in the tooth can be because of...


In [14]:
# Spot-check the last 200 rows of the cleaned frame as well.
df_cleaned.tail(200)

Unnamed: 0,instruction,input,output
111965,"If you are a doctor, please answer the medical...",I have problem wih my gall bladder. I have che...,"I am , I understand your concerns and I will t..."
111966,"If you are a doctor, please answer the medical...",I have a question concerning my mother. She is...,"On the other hand, if your mother was simply i..."
111967,"If you are a doctor, please answer the medical...","Hi, I have a lump about the size of pea right ...",For how long this lump is there.? Is it decrea...
111968,"If you are a doctor, please answer the medical...","Hi, my mum has a persistent cough after having...","Spurs, by medical definition is a short pointe..."
111969,"If you are a doctor, please answer the medical...",I am having chest congestion for the past one ...,Maintain oral hygiene as sometime gum bleeding...
...,...,...,...
112160,"If you are a doctor, please answer the medical...",im 25 years old ..i started using mtp kit on 5...,If you self-medicated for inducing the abortio...
112161,"If you are a doctor, please answer the medical...",My 5 year old son has been coughing for a mont...,As you have mentioned in your history that you...
112162,"If you are a doctor, please answer the medical...",My toes on right foot more than left are numb ...,The numbness and blue discoloration could both...
112163,"If you are a doctor, please answer the medical...","I was diagnosis with pleurisy last Tuesday, an...",Treatment of pleurisy is depending on cause. T...


In [15]:
# Destination for the cleaned dataset, relative to the notebook's working directory.
# Forward slashes resolve on Windows as well as POSIX; with backslashes the whole
# string would become a single file name on Linux/macOS.
output_path = "../data/processed/HealthCareMagic-100k.json"
df_cleaned.to_json(output_path)