# Medical Note Extraction H2O.ai Competition Notebook - Winners

In [2]:
import ast
import json
import re

import numpy as np
import pandas as pd

In [3]:
# Load the training CSV; the rendered output shows the columns ID, Note and json.
train_df = pd.read_csv('/kaggle/input/medical-note-extraction-h-2-o-gen-ai-world-ny/train.csv')
# Bare name as the last expression renders the frame as the cell output.
train_df

Unnamed: 0,ID,Note,json
0,1,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 41, ""gender"": ""Male""}..."
1,2,Clinical Notes:\n\nPatient: 56-year-old male\n...,"{""patient_info"": {""age"": 56, ""gender"": ""Male""}..."
2,3,Clinical Notes:\n\nPatient: 73-year-old female...,"{""patient_info"": {""age"": 73, ""gender"": ""Female..."
3,4,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 32, ""gender"": ""Female..."
4,5,"**Clinical Notes**\n\n**Patient:** Male, age 5...","{""patient_info"": {""age"": 51, ""gender"": ""Male""}..."
...,...,...,...
10817,10818,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 52, ""gender"": ""Female..."
10818,10819,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 17, ""gender"": ""Male""}..."
10819,10820,Clinical Notes:\n\nPatient Profile:\nThe patie...,"{""patient_info"": {""age"": 36, ""gender"": ""Female..."
10820,10821,**Clinical Notes**\n\n**Patient Information:**...,"{""patient_info"": {""age"": 21, ""gender"": ""Female..."


In [4]:
# Remove every literal '*' (regex=False) from the notes, e.g. the markers around **Clinical Notes**.
train_df['Note'] = train_df['Note'].str.replace('*', '', regex=False)

In [5]:
# Collect every symptom label that appears in the training annotations.
# ast.literal_eval is used instead of eval: it parses the same dict literals
# but cannot execute code that might be embedded in the CSV.
symptoms_set = set()
for raw_annotation in train_df['json']:
    symptoms_set.update(ast.literal_eval(raw_annotation)['symptoms'])

In [6]:
symptoms_set

{'abdominal_pain',
 'anxiety',
 'blurred_vision',
 'chest_pain',
 'cough',
 'diarrhea',
 'difficulty_breathing',
 'difficulty_concentrating',
 'dizziness',
 'dry_skin',
 'ear_pain',
 'facial_pain',
 'fatigue',
 'fever',
 'frequent_urination',
 'headache',
 'heartburn',
 'increased_thirst',
 'itchy_eyes',
 'joint_pain',
 'loss_of_taste_smell',
 'nausea',
 'night_sweats',
 'painful_urination',
 'pale_skin',
 'rash',
 'restlessness',
 'runny_nose',
 'sadness',
 'sneezing',
 'sore_throat',
 'swollen_lymph_nodes',
 'vomiting',
 'weight_loss',
 'wheezing'}

In [7]:
# Collect every visit-motivation label that appears in the training annotations.
# ast.literal_eval is used instead of eval: it parses the same dict literals
# but cannot execute code that might be embedded in the CSV.
visit_motivation_set = set()
for raw_annotation in train_df['json']:
    visit_motivation_set.add(ast.literal_eval(raw_annotation)['visit_motivation'])

In [8]:
import re

def extract_symptoms(text, symptoms_set):
    """Return the symptom labels a clinical note mentions without a nearby negation.

    The part of the note before the first "vital"/"vitals" word is searched
    first; if that yields nothing, the whole note is searched instead.

    Args:
        text: The clinical note.
        symptoms_set: Iterable of snake_case symptom labels (e.g. "chest_pain").

    Returns:
        A list of labels. A "loss of taste and smell" hit comes first, then the
        labels from ``symptoms_set`` in sorted order, then labels found only
        through the alternate surface forms in ``partial_match_map``. Sorting
        makes the order reproducible; iterating the set directly gave an order
        that depends on string hashing.
    """
    # Alternate surface forms: label -> phrase that also counts as a mention.
    # These are matched without a negation check and even when the label is
    # absent from symptoms_set.
    partial_match_map = {
        "pale_skin": "pale",
        "sadness": "sad",
        "cough": "coughing",
        "fatigue": "fatigued",
        "dizziness": "dizzy",
        "headache": "headaches",
        "anxiety": "anxious",
        "frequent_urination": "frequent and painful urination",
        "painful_urination": "frequent and painful urination",
        "swollen_lymph_nodes": "lymph",
        "restlessness": "restless",
        "loss_of_taste_smell":"loss of taste/smell"
    }
    combined_label = "loss_of_taste_smell"

    # Compile everything once per call instead of once per text per symptom.
    combined_pattern = re.compile(r'\b' + re.escape("loss of taste and smell") + r'\b')
    negation_pattern = re.compile(r'\b(no|not|denies|denied|without|absence of)\b', re.IGNORECASE)
    symptom_patterns = []
    for symptom in sorted(symptoms_set):
        symptom_words = symptom.replace('_', ' ').lower().split()
        # Allow any run of whitespace between the words of a multi-word symptom.
        symptom_pattern = r'\b' + r'\s+'.join(map(re.escape, symptom_words)) + r'\b'
        symptom_patterns.append((symptom, re.compile(symptom_pattern, re.IGNORECASE)))
    partial_patterns = [
        (full_symptom, re.compile(r'\b' + re.escape(partial) + r'\b'))
        for full_symptom, partial in partial_match_map.items()
    ]

    def process_text(text_to_process):
        lowered = text_to_process.lower()
        symptoms_list = []

        # Special check for "loss of taste and smell"
        combined_found = combined_pattern.search(lowered) is not None
        if combined_found:
            symptoms_list.append(combined_label)

        # Insert spaces around punctuation marks for better tokenization,
        # then collapse all whitespace runs to a single space.
        text_processed = re.sub(r'([,.;()\-])', r' \1 ', lowered)
        text_processed = re.sub(r'\s+', ' ', text_processed)

        for symptom, pattern in symptom_patterns:
            # Already recorded through the combined phrase above.
            if combined_found and symptom == combined_label:
                continue
            for match in pattern.finditer(text_processed):
                # A negation word in the 40 characters before the mention
                # disqualifies this occurrence; later occurrences may still count.
                start = max(0, match.start() - 40)
                preceding_text = text_processed[start:match.start()]
                if not negation_pattern.search(preceding_text):
                    symptoms_list.append(symptom)
                    break  # Stop after the first valid occurrence

        # Handle partial matches
        for full_symptom, partial_pattern in partial_patterns:
            if full_symptom not in symptoms_list and partial_pattern.search(text_processed):
                symptoms_list.append(full_symptom)

        return symptoms_list

    # Match on the original string so the offset is valid for slicing `text`
    # (lower-casing can change the length of some non-ASCII text).
    vital_marker = re.search(r'\bvitals?\b', text, re.IGNORECASE)
    if vital_marker is None:
        # No marker: the full text is the only candidate, so process it once.
        return process_text(text)

    symptoms_list = process_text(text[:vital_marker.start()])
    # If nothing was found before the marker, fall back to the full note.
    if not symptoms_list:
        symptoms_list = process_text(text)

    return symptoms_list

In [9]:
# Preprocess visit motivations: keep bracket content and original as keys
# Alias table: maps both the text inside the parentheses of a label and the
# label with its parenthesised part removed back to the full original label.
preprocessed_visit_motivation = {}

# First pass: the bracket content, e.g. "Flu" for "Influenza (Flu)".
for vmotivation in visit_motivation_set:
    bracketed = re.search(r"\((.*?)\)", vmotivation)
    if bracketed:
        preprocessed_visit_motivation[bracketed.group(1)] = vmotivation

# Second pass: the label without its bracket; for an alias already present
# this overrides the value while keeping the alias's original position.
for vmotivation in visit_motivation_set:
    preprocessed_visit_motivation[re.sub(r"\s*\(.*?\)", "", vmotivation).strip()] = vmotivation


In [10]:
# Manual alias: the singular form 'Anxiety Disorder' maps to the label 'Anxiety Disorders'.
preprocessed_visit_motivation['Anxiety Disorder'] = 'Anxiety Disorders'

In [87]:
preprocessed_visit_motivation

{'Flu': 'Influenza (Flu)',
 'GERD': 'Gastroesophageal Reflux Disease (GERD)',
 'Type 2': 'Diabetes (Type 2)',
 'COPD': 'Chronic Obstructive Pulmonary Disease (COPD)',
 'Otitis Media': 'Ear Infection (Otitis Media)',
 'UTI': 'Urinary Tract Infection (UTI)',
 'High Blood Pressure': 'Hypertension (High Blood Pressure)',
 'Atopic Dermatitis': 'Eczema (Atopic Dermatitis)',
 'TB': 'Tuberculosis (TB)',
 'Coronary Artery Disease': 'Heart Disease (Coronary Artery Disease)',
 'Influenza': 'Influenza (Flu)',
 'Strep Throat': 'Strep Throat',
 'Common Cold': 'Common Cold',
 'Gastroesophageal Reflux Disease': 'Gastroesophageal Reflux Disease (GERD)',
 'Anemia': 'Anemia',
 'Asthma': 'Asthma',
 'Anxiety Disorders': 'Anxiety Disorders',
 'Diabetes': 'Diabetes (Type 2)',
 'Pneumonia': 'Pneumonia',
 'COVID-19': 'COVID-19',
 'Chronic Obstructive Pulmonary Disease': 'Chronic Obstructive Pulmonary Disease (COPD)',
 'Sinusitis': 'Sinusitis',
 'Depression': 'Depression',
 'Allergies': 'Allergies',
 'Ear Infec

In [52]:
import re
import json
import ast

def extract_age_gender(text):
    """Scan a note for the patient's age and gender.

    Lines starting at "Date:" are removed first. The note is then scanned left
    to right; each number of at most three digits with a value <= 100 replaces
    the current age, each "male"/"female" sets the gender, and scanning stops
    as soon as both are known.

    Returns:
        (age, gender): age is an int or None, gender is "Male", "Female" or None.
    """
    # Drop the date line so its digits are not considered
    cleaned = re.sub(r"Date:.*\n?", "", text).strip()

    age_gender_pattern = r"""
        (?:\b(?:[Tt]he\s+patient\s+is|[Aa]ge[:\s]*|[Aa]ge\s+is\s+)?   # Introductory phrases
        (?P<age>\d{1,3})\s*                                           # Capture 1-3 digit age
        (?:[- ]?(?:year[- ]old|years?|yr[- ]old|yrs?[- ]old|yrs?|old)?))?  # Valid age qualifiers
        (?:[- ]?(?P<gender>male|female)\b)?                           # Match gender explicitly
    """
    scanner = re.compile(age_gender_pattern, re.IGNORECASE | re.VERBOSE)

    age = None
    gender = None
    # Every part of the pattern is optional, so most hits are empty and carry
    # neither group; those simply fall through.
    for hit in scanner.finditer(cleaned):
        raw_age = hit.group("age")
        raw_gender = hit.group("gender")

        if raw_age and int(raw_age) <= 100:
            age = int(raw_age)

        if raw_gender:
            gender = raw_gender.capitalize()

        if age is None or gender is None:
            continue
        break

    return age, gender



def extract_visit_motivation(text, visit_motivation_set):
    """Return the visit-motivation label whose alias occurs earliest in the note.

    Aliases come from the notebook-level ``preprocessed_visit_motivation``
    mapping (alias -> label); the ``visit_motivation_set`` argument is accepted
    but not read. Any "Medical History:" line is removed before searching.
    Returns None when no alias is found; on equal positions the alias that
    comes first in the mapping wins.
    """
    searchable = re.sub(r"Medical History:.*\n?", "", text)

    # (start offset, label) for every alias that occurs as a whole word
    hits = []
    for alias, label in preprocessed_visit_motivation.items():
        found = re.search(rf"\b{re.escape(alias)}\b", searchable, re.IGNORECASE)
        if found is not None:
            hits.append((found.start(), label))

    if not hits:
        return None
    # min() keeps the first of several equally small offsets
    return min(hits, key=lambda hit: hit[0])[1]

def update_patient_info(medical_note, age, gender, symptoms, visit_motivation, extracted_vital_signs):
    """Fill a serialised record with extracted values and return it as JSON text.

    ``medical_note`` is the serialised template (decoded via
    ``parse_note_string``). Only keys already present in the template are
    written; absent keys are not created.
    """
    record = parse_note_string(medical_note)

    if "patient_info" in record:
        record["patient_info"].update({"age": age, "gender": gender})

    # Top-level fields that are overwritten wholesale when present
    replacements = {
        "symptoms": symptoms,
        "visit_motivation": visit_motivation,
        "vital_signs": extracted_vital_signs,
    }
    for field, value in replacements.items():
        if field in record:
            record[field] = value

    return json.dumps(record)

def parse_note_string(note_string):
    """Decode a serialised record, trying JSON first and a Python literal second."""
    try:
        parsed = json.loads(note_string)  # double-quoted JSON
    except json.JSONDecodeError:
        parsed = ast.literal_eval(note_string)  # e.g. single-quoted dict repr
    return parsed

In [76]:
import re

def extract_vital_signs(text):
    """Extract vital-sign readings from a note with unit-anchored regexes.

    Args:
        text: The clinical note.

    Returns:
        A dict holding only the readings that were found. Scalar readings are
        ``{'value': number, 'unit': str}``; ``blood_pressure`` is
        ``{'systolic': {...}, 'diastolic': {...}}``. Heart rate, respiratory
        rate and blood pressure values are ints, the others floats. Keys appear
        in the order blood_pressure, temperature, heart_rate, respiratory_rate,
        oxygen_saturation, glucose_level, cholesterol_level.
    """
    patterns = {
        'blood_pressure': r'(\d{2,3})/(\d{2,3})\s*mmhg',  # Matches 97/79 mmHg (case-insensitive)
        'temperature': r'([\d.]+)\s?°c',
        'heart_rate': r'([\d]+)\s*(?:bpm|beats\s*per\s*minute|beats\s*per\s*min|beat\s*per\s*minute)',  # Matches "**heart rate** 70 bpm" or "hr 70 bpm"
        'respiratory_rate': r'(\d+)\s*(?:breaths?/min|breaths\s*per\s*minute|breaths\s*per\s*min|breath\s*per\s*minute)',
        'oxygen_saturation': r'([\d.]+)\s?%',  # Matches "**O2 Sat**: 98%" or "oxygen saturation 98%"
        'glucose_level': r'glucose.*?([\d.]+)\s*mg/dl',  # Matches "glucose 139.8 mg/dL"
        'cholesterol_level': r'cholesterol.*?([\d.]+)\s*mg/dl'  # Matches "cholesterol 23.8 mg/dL"
    }

    units = {
        'blood_pressure': 'mmHg',
        'temperature': '°C',
        'heart_rate': 'bpm',
        'respiratory_rate': 'breaths/min',
        'oxygen_saturation': '%',
        'glucose_level': 'mg/dL',
        'cholesterol_level': 'mg/dL'
    }

    # Readings reported as whole numbers; every other scalar is a float.
    integer_keys = ('heart_rate', 'respiratory_rate')

    def first_parsable(pattern, cast):
        """Return the first captured value that `cast` accepts, or None.

        `[\\d.]+` can capture text such as "." or "1.2.3" (for example a
        sentence-ending period followed by " %"); converting that raised
        ValueError and aborted the whole extraction. Such candidates are now
        skipped in favour of the next match.
        """
        for candidate in re.finditer(pattern, text, re.IGNORECASE):
            try:
                return cast(candidate.group(1))
            except ValueError:
                continue
        return None

    vital_signs = {}

    # Blood pressure: prefer "sys/dia mmHg"; otherwise fall back to the first
    # two standalone "<n> mmHg" readings in the text.
    systolic = diastolic = None
    bp_match = re.search(patterns['blood_pressure'], text, re.IGNORECASE)
    if bp_match:
        systolic, diastolic = int(bp_match.group(1)), int(bp_match.group(2))
    else:
        mmhg_matches = re.findall(r'(\d+)\s*mmhg', text, re.IGNORECASE)
        if len(mmhg_matches) >= 2:
            systolic, diastolic = int(mmhg_matches[0]), int(mmhg_matches[1])
    if systolic is not None:
        vital_signs['blood_pressure'] = {
            'systolic': {'value': systolic, 'unit': units['blood_pressure']},
            'diastolic': {'value': diastolic, 'unit': units['blood_pressure']},
        }

    # Scalar readings, in the declaration order of `patterns`.
    for key, pattern in patterns.items():
        if key == 'blood_pressure':
            continue
        cast = int if key in integer_keys else float
        value = first_parsable(pattern, cast)
        if value is not None:
            vital_signs[key] = {'value': value, 'unit': units[key]}

    return vital_signs


In [12]:
# Load the test notes (columns ID, Note) and the sample submission (columns ID, json).
df = pd.read_csv("/kaggle/input/medical-note-extraction-h-2-o-gen-ai-world-ny/test.csv")
sub_df = pd.read_csv('/kaggle/input/medical-note-extraction-h-2-o-gen-ai-world-ny/sample_submission.csv')

In [13]:
# Start a prediction frame for the training set that holds only the ID column.
sub_train_df=pd.DataFrame(train_df["ID"])

In [14]:
# Use the first sample-submission JSON string as the template for every row (scalar is broadcast).
sub_train_df['json'] = sub_df['json'].iloc[0]

In [15]:
sub_train_df

Unnamed: 0,ID,json
0,1,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
1,2,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
2,3,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
3,4,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
4,5,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
...,...,...
10817,10818,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
10818,10819,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
10819,10820,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
10820,10821,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."


In [77]:
# Run every extractor over each training note and write the results into that
# row's JSON template from sub_train_df.
updated_dicts = []
for i, row in train_df.iterrows():
    note_text = row["Note"]  # the note lives in the 'Note' column
    age, gender = extract_age_gender(note_text)
    visit_motivation = extract_visit_motivation(note_text, visit_motivation_set)
    if visit_motivation:
        # Blank out the matched label before symptom / vital-sign extraction.
        label_pattern = rf"\b{re.escape(visit_motivation)}\b"
        note_text = re.sub(label_pattern, '', note_text, flags=re.IGNORECASE)
    symptoms = extract_symptoms(note_text, symptoms_set)
    extracted_vital_signs = extract_vital_signs(note_text)
    template_json = sub_train_df.loc[i, 'json']
    updated_dicts.append(
        update_patient_info(template_json, age, gender, symptoms, visit_motivation, extracted_vital_signs)
    )

# Replace the templates with the filled-in JSON strings
sub_train_df["json"] = updated_dicts

In [None]:
# To check where we are going wrong: compare predicted and labelled symptoms
# on the first 1000 training rows and print every mismatch.
import json

sum_matches = 0
unparsed_rows = []  # (row, error) pairs that could not be compared
for i in range(min(1000, len(sub_train_df))):
    predicted_raw = sub_train_df['json'].iloc[i]
    if not predicted_raw:
        continue
    try:
        # Parse the JSON text as is. Rewriting 'null' to 'None' made it invalid
        # JSON, so rows containing a null were silently dropped from the check.
        sub_json = json.loads(predicted_raw)
        train_json = json.loads(train_df['json'].iloc[i])

        # Ensure both 'symptoms' lists (if they are lists) are sorted before comparison
        sub_symptoms = sorted(sub_json['symptoms']) if isinstance(sub_json['symptoms'], list) else sub_json['symptoms']
        train_symptoms = sorted(train_json['symptoms']) if isinstance(train_json['symptoms'], list) else train_json['symptoms']
    except (KeyError, ValueError, TypeError) as err:
        # Invalid JSON or missing keys: record the row instead of hiding it
        unparsed_rows.append((i, repr(err)))
        continue

    if sub_symptoms == train_symptoms:
        sum_matches += 1
    else:
        print(i)
        print(sub_symptoms)
        print(train_symptoms)

if unparsed_rows:
    print(f"{len(unparsed_rows)} rows could not be compared, e.g. {unparsed_rows[:5]}")

sum_matches

In [None]:
import json

# Compare predicted and labelled age for every training row and print each mismatch.
sum_matches = 0
unparsed_rows = []  # (row, error) pairs that could not be compared
# Iterate over the actual row count; a hard-coded 10822 runs past the last row
# and .iloc raises IndexError, which the except below does not catch.
for i in range(len(sub_train_df)):
    predicted_raw = sub_train_df['json'].iloc[i]
    if not predicted_raw:
        continue
    try:
        # Parse the JSON text as is. Rewriting 'null' to 'None' made it invalid
        # JSON, so rows with a missing predicted age were silently dropped.
        sub_json = json.loads(predicted_raw)
        train_json = json.loads(train_df['json'].iloc[i])
        predicted_age = sub_json['patient_info']['age']
        actual_age = train_json['patient_info']['age']
    except (KeyError, ValueError, TypeError) as err:
        # Invalid JSON or missing keys: record the row instead of hiding it
        unparsed_rows.append((i, repr(err)))
        continue

    if predicted_age == actual_age:
        sum_matches += 1
    else:
        print(i)
        print(f"Predicted: {predicted_age}")
        print(f"Actual: {actual_age}\n")

if unparsed_rows:
    print(f"{len(unparsed_rows)} rows could not be compared, e.g. {unparsed_rows[:5]}")

sum_matches

In [None]:
import json

# Compare predicted and labelled gender for every training row and print each mismatch.
sum_matches = 0
unparsed_rows = []  # (row, error) pairs that could not be compared
# Iterate over the actual row count; a hard-coded 10822 runs past the last row
# and .iloc raises IndexError, which the except below does not catch.
for i in range(len(sub_train_df)):
    predicted_raw = sub_train_df['json'].iloc[i]
    if not predicted_raw:
        continue
    try:
        # Parse the JSON text as is. Rewriting 'null' to 'None' made it invalid
        # JSON, so rows with a missing predicted value were silently dropped.
        sub_json = json.loads(predicted_raw)
        train_json = json.loads(train_df['json'].iloc[i])
        predicted_gender = sub_json['patient_info']['gender']
        actual_gender = train_json['patient_info']['gender']
    except (KeyError, ValueError, TypeError) as err:
        # Invalid JSON or missing keys: record the row instead of hiding it
        unparsed_rows.append((i, repr(err)))
        continue

    if predicted_gender == actual_gender:
        sum_matches += 1
    else:
        print(i)
        print(f"Predicted: {predicted_gender}")
        print(f"Actual: {actual_gender}\n")

if unparsed_rows:
    print(f"{len(unparsed_rows)} rows could not be compared, e.g. {unparsed_rows[:5]}")

sum_matches

In [None]:
import json

# Compare predicted and labelled visit motivation for every training row and print each mismatch.
sum_matches = 0
unparsed_rows = []  # (row, error) pairs that could not be compared
# Iterate over the actual row count; a hard-coded 10822 runs past the last row
# and .iloc raises IndexError, which the except below does not catch.
for i in range(len(sub_train_df)):
    predicted_raw = sub_train_df['json'].iloc[i]
    if not predicted_raw:
        continue
    try:
        # Parse the JSON text as is. Rewriting 'null' to 'None' made it invalid
        # JSON, so rows with a missing predicted value were silently dropped.
        sub_json = json.loads(predicted_raw)
        train_json = json.loads(train_df['json'].iloc[i])
        predicted_motivation = sub_json['visit_motivation']
        actual_motivation = train_json['visit_motivation']
    except (KeyError, ValueError, TypeError) as err:
        # Invalid JSON or missing keys: record the row instead of hiding it
        unparsed_rows.append((i, repr(err)))
        continue

    if predicted_motivation == actual_motivation:
        sum_matches += 1
    else:
        print(i)
        print(f"Predicted: {predicted_motivation}")
        print(f"Actual: {actual_motivation}\n")

if unparsed_rows:
    print(f"{len(unparsed_rows)} rows could not be compared, e.g. {unparsed_rows[:5]}")

sum_matches

In [None]:
import json

# Compare predicted and labelled vital signs for every training row and print each mismatch.
sum_matches = 0
unparsed_rows = []  # (row, error) pairs that could not be compared
# Iterate over the actual row count; a hard-coded 10822 runs past the last row
# and .iloc raises IndexError, which the except below does not catch.
for i in range(len(sub_train_df)):
    predicted_raw = sub_train_df['json'].iloc[i]
    if not predicted_raw:
        continue
    try:
        # Parse the JSON text as is. Rewriting 'null' to 'None' made it invalid
        # JSON, so rows containing a null were silently dropped from the check.
        sub_json = json.loads(predicted_raw)
        train_json = json.loads(train_df['json'].iloc[i])

        # Sort 'vital_signs' if they are lists before comparison
        sub_vital_signs = sorted(sub_json['vital_signs']) if isinstance(sub_json['vital_signs'], list) else sub_json['vital_signs']
        train_vital_signs = sorted(train_json['vital_signs']) if isinstance(train_json['vital_signs'], list) else train_json['vital_signs']
    except (KeyError, ValueError, TypeError) as err:
        # Invalid JSON or missing keys: record the row instead of hiding it
        unparsed_rows.append((i, repr(err)))
        continue

    # Compare the (possibly sorted) values
    if sub_vital_signs == train_vital_signs:
        sum_matches += 1
    else:
        print(i)
        print(f"Predicted: {sub_vital_signs}\n")
        print(f"Actual: {train_vital_signs}\n")

if unparsed_rows:
    print(f"{len(unparsed_rows)} rows could not be compared, e.g. {unparsed_rows[:5]}")

sum_matches

In [43]:
sub_df

Unnamed: 0,ID,json
0,10823,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
1,10824,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
2,10825,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
3,10826,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
4,10827,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
...,...,...
3791,14614,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
3792,14615,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
3793,14616,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."
3794,14617,"{""patient_info"": {""age"": 0, ""gender"": ""Unknown..."


In [44]:
df

Unnamed: 0,ID,Note
0,10823,**Clinical Notes**\n\n**Patient Information:**...
1,10824,**Clinical Notes**\n\n**Patient Information:**...
2,10825,Clinical Note:\n\nPatient: 71-year-old male\nC...
3,10826,**Clinical Notes**\n\n**Patient Information:**...
4,10827,**Clinical Notes**\n\n**Patient Information:**...
...,...,...
3791,14614,**Clinical Notes**\n\n**Patient Information:**...
3792,14615,**Clinical Notes**\n\n**Patient Information:**...
3793,14616,**Clinical Notes:**\n\n**Patient Information:*...
3794,14617,**Clinical Notes**\n\n**Patient Information:**...


In [45]:
# Remove every literal '*' (regex=False) from the test notes, mirroring the training-note cleanup.
df['Note'] = df['Note'].str.replace('*', '', regex=False)

In [46]:
df

Unnamed: 0,ID,Note
0,10823,Clinical Notes\n\nPatient Information:\n\n Age...
1,10824,Clinical Notes\n\nPatient Information:\n- Age:...
2,10825,Clinical Note:\n\nPatient: 71-year-old male\nC...
3,10826,Clinical Notes\n\nPatient Information:\n- Age:...
4,10827,Clinical Notes\n\nPatient Information:\n- Age:...
...,...,...
3791,14614,Clinical Notes\n\nPatient Information:\nAge: 3...
3792,14615,Clinical Notes\n\nPatient Information:\n Age: ...
3793,14616,Clinical Notes:\n\nPatient Information:\n- Age...
3794,14617,Clinical Notes\n\nPatient Information:\n- Age:...


In [82]:
# Run every extractor over each test note and write the results into that
# row's JSON template from sub_df.
updated_dicts = []
for i, row in df.iterrows():
    note_text = row["Note"]  # the note lives in the 'Note' column
    age, gender = extract_age_gender(note_text)
    visit_motivation = extract_visit_motivation(note_text, visit_motivation_set)
    if visit_motivation:
        # Blank out the matched label before symptom / vital-sign extraction.
        label_pattern = rf"\b{re.escape(visit_motivation)}\b"
        note_text = re.sub(label_pattern, '', note_text, flags=re.IGNORECASE)
    symptoms = extract_symptoms(note_text, symptoms_set)
    extracted_vital_signs = extract_vital_signs(note_text)
    template_json = sub_df.loc[i, 'json']
    updated_dicts.append(
        update_patient_info(template_json, age, gender, symptoms, visit_motivation, extracted_vital_signs)
    )

# Replace the templates with the filled-in JSON strings
sub_df["json"] = updated_dicts

In [83]:
sub_df

Unnamed: 0,ID,json
0,10823,"{""patient_info"": {""age"": 48, ""gender"": ""Male""}..."
1,10824,"{""patient_info"": {""age"": 53, ""gender"": ""Male""}..."
2,10825,"{""patient_info"": {""age"": 71, ""gender"": ""Male""}..."
3,10826,"{""patient_info"": {""age"": 73, ""gender"": ""Male""}..."
4,10827,"{""patient_info"": {""age"": 48, ""gender"": ""Female..."
...,...,...
3791,14614,"{""patient_info"": {""age"": 35, ""gender"": ""Female..."
3792,14615,"{""patient_info"": {""age"": 59, ""gender"": ""Female..."
3793,14616,"{""patient_info"": {""age"": 67, ""gender"": ""Female..."
3794,14617,"{""patient_info"": {""age"": 55, ""gender"": ""Female..."


In [None]:
# Save the updated DataFrame; index=False keeps the row index out of the CSV.
sub_df.to_csv("submission.csv", index=False)

# Confirmation message once the file has been written
print("Updated file saved as 'submission.csv'")