In [1]:
import pandas as pd
import re

In [2]:
# Read the gzip-compressed MIMIC-III NOTEEVENTS table in full.
# low_memory=False asks pandas to parse the file without its internal
# chunking, which avoids mixed-type inference within a column.
ne = pd.read_csv(
    "~/mimic-iii/NOTEEVENTS.csv.gz",
    low_memory=False,
)

In [3]:
# Keep only the row identifier and the note body; .copy() gives a frame
# that is independent of `ne`, so later column additions never touch it.
data = ne.loc[:, ["ROW_ID", "TEXT"]].copy()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2083180 entries, 0 to 2083179
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   ROW_ID  int64 
 1   TEXT    object
dtypes: int64(1), object(1)
memory usage: 31.8+ MB


In [4]:
import pandas as pd
import re
import numpy as np

# --- Target section headings and the spellings accepted for each ---
# Keys are the canonical heading names used as output columns; each value
# lists the spellings of that heading to look for in the note text.
heading_variations = {
    'Admit Diagnosis': ['Admit diagnosis', 'Admitting Diagnosis', 'ADMIT DIAGNOSIS'],
    'Medical Condition': ['MEDICAL CONDITION', 'Medical Condition'],
    'History of Present Illness': [
        'History of Present Illness',
        'HISTORY OF PRESENT ILLNESS',
        'HISTORY OF THE PRESENT ILLNESS',
        'History of the present illness',
        'HPI',
    ],
    'Chief Complaint': ['Chief Complaint', 'chief complaint', 'CHIEF COMPLAINT'],
}

# Lookup table: lowercased spelling -> canonical heading.
# Spellings that differ only by case collapse onto a single key.
variation_to_standard = {
    variant.lower(): standard
    for standard, variations in heading_variations.items()
    for variant in variations
}

# --- Regex alternation over the target headings only ---
# Each spelling is regex-escaped, then the list is ordered longest-first so a
# longer heading is always tried before any shorter alternative.
escaped_headings = [re.escape(variant) for variant in variation_to_standard]
escaped_headings.sort(key=len, reverse=True)
headings_pattern = '|'.join(escaped_headings)

# A match is "<heading> : <content>". The content is captured lazily and ends
# just before a newline that is followed by a line of the form "<text>:", or at
# the end of the string. Any "<text>:" line ends the capture, not only target
# headings; when no such line follows, the capture runs to the end of the note.
pattern = re.compile(
    rf'(?P<heading>{headings_pattern})\s*:\s*'
    r'(?P<content>.*?)(?=\n\s*[^:\n]+:\s*|\Z)',
    flags=re.IGNORECASE | re.DOTALL,
)

# --- Revised Extraction Function ---
def _clean_section_content(content, first_line_only=False):
    """Normalise one captured section body; return None when it holds no usable text."""
    stripped = content.strip()
    if not stripped:
        return None
    # `stripped` starts with a non-blank character, so its first line is the
    # first non-empty line of the capture.
    first_line = stripped.splitlines()[0].strip()
    # A first line of the form "<text>:" with nothing after the colon
    # (e.g. "24 Hour Events:") means the section itself was empty and the
    # capture ran into the next heading.
    if re.match(r'^[^:]+:\s*$', first_line):
        return None
    return first_line if first_line_only else stripped


def extract_headings(text, pattern, variation_mapping):
    """
    Collect the content of each target heading found in ``text``.

    Parameters
    ----------
    text : str
        Note text to scan. Any non-string value (e.g. None or NaN from a
        missing cell) yields an empty result instead of raising.
    pattern : compiled regex
        Must define the named groups ``heading`` and ``content``; it is applied
        with ``finditer``, so it alone decides where each section starts and ends.
    variation_mapping : dict
        Maps a lowercased heading spelling to its standardized heading name.
        Matches whose heading is not in the mapping are ignored.

    Returns
    -------
    dict
        ``{standardized heading: content or None}``. The value is None when the
        captured content is empty or its first non-empty line looks like another
        heading. For "Admit Diagnosis" only the first line of the content is
        kept. When a heading occurs more than once, the non-None contents are
        joined with a single space in order of appearance.
    """
    # Missing notes arrive as None/NaN rather than str; there is nothing to scan.
    if not isinstance(text, str):
        return {}

    extracted = {}
    for match in pattern.finditer(text):
        heading_variant = match.group('heading').strip().lower()
        standard_heading = variation_mapping.get(heading_variant)
        if not standard_heading:
            continue

        # The group is None if it did not take part in the match.
        content = match.group('content') or ''
        # The one-line rule is applied together with the empty/heading-like
        # checks, so it can no longer turn a rejected capture back into a value.
        content_stripped = _clean_section_content(
            content,
            first_line_only=(standard_heading == "Admit Diagnosis"),
        )

        # Merge content if the heading appears more than once: a missing or
        # None entry is replaced, otherwise non-None text is appended.
        previous = extracted.get(standard_heading)
        if previous is None:
            extracted[standard_heading] = content_stripped
        elif content_stripped is not None:
            extracted[standard_heading] = previous + " " + content_stripped
    return extracted

# Run the extractor over every note; each element is a {heading: text-or-None} dict.
extracted_data = data['TEXT'].apply(lambda x: extract_headings(x, pattern, variation_to_standard))

# Expand the dicts into one column per heading, built on data's own index.
# DataFrame.join aligns on index labels, so the section frame must carry the
# same labels as `data`; relying on a fresh 0..n-1 index would attach sections
# to the wrong notes whenever `data` is filtered or re-indexed upstream.
extracted_df = pd.DataFrame(extracted_data.tolist(), index=data.index)
cdata = data.join(extracted_df)
print(cdata.head())

   ROW_ID                                               TEXT  \
0     174  Admission Date:  [**2151-7-16**]       Dischar...   
1     175  Admission Date:  [**2118-6-2**]       Discharg...   
2     176  Admission Date:  [**2119-5-4**]              D...   
3     177  Admission Date:  [**2124-7-21**]              ...   
4     178  Admission Date:  [**2162-3-3**]              D...   

                          History of Present Illness  \
0                                                NaN   
1  This is an 81-year-old female\nwith a history ...   
2  This 81 year old woman has a history of COPD. ...   
3  87 yo F with h/o CHF, COPD on 5 L oxygen at ba...   
4  Mr. [**Known lastname 1829**] is a 82 year old...   

                                     Chief Complaint Medical Condition  \
0                                                NaN               NaN   
1                                                NaN               NaN   
2  81 yo F smoker w/ COPD, severe TBM, s/p trache...    

In [5]:
# Sections that make up the clinical note, in the order they are concatenated.
clinical_columns = [
    "Chief Complaint",
    "Admit Diagnosis",
    "Medical Condition",
    "History of Present Illness"
]


def _combine_sections(row):
    """Join a row's non-empty section texts with spaces, ending each one with a period."""
    parts = []
    for value in row:
        # Missing sections are None/NaN rather than text; they contribute nothing.
        if not isinstance(value, str):
            continue
        text = value.strip()
        if text:
            parts.append(text if text.endswith('.') else text + '.')
    return ' '.join(parts)


# reindex(columns=...) instead of cdata[clinical_columns]: a heading that never
# occurred in any note has no column in cdata, and plain selection would raise
# KeyError. reindex supplies an all-missing column for it instead.
cdata['CLINICAL_NOTE'] = cdata.reindex(columns=clinical_columns).apply(_combine_sections, axis=1)

# Create the new DataFrame with the desired columns
new_df = cdata[['ROW_ID', 'TEXT', 'CLINICAL_NOTE']]

# Display the first few rows of the new DataFrame
print(new_df.head())

# Show one complete combined note (third row) as a sanity check.
new_df["CLINICAL_NOTE"].iloc[2]

   ROW_ID                                               TEXT  \
0     174  Admission Date:  [**2151-7-16**]       Dischar...   
1     175  Admission Date:  [**2118-6-2**]       Discharg...   
2     176  Admission Date:  [**2119-5-4**]              D...   
3     177  Admission Date:  [**2124-7-21**]              ...   
4     178  Admission Date:  [**2162-3-3**]              D...   

                                       CLINICAL_NOTE  
0                                                     
1  This is an 81-year-old female\nwith a history ...  
2  81 yo F smoker w/ COPD, severe TBM, s/p trache...  
3  COPD exacerbation/Shortness of Breath. 87 yo F...  
4  Mr. [**Known lastname 1829**] was seen at [**H...  


'81 yo F smoker w/ COPD, severe TBM, s/p tracheobronchoplasty [**5-5**]\ns/p perc trach [**5-13**]. This 81 year old woman has a history of COPD. Over the past five\n\nyears she has had progressive difficulties with her breathing.\nIn\n[**2118-6-4**] she was admitted to [**Hospital1 18**] for respiratory failure\ndue\nto a COPD exacerbation. Due to persistent hypoxemia, she\nrequired\nintubation and a eventual bronchoscopy on [**2118-6-9**] revealed marked\n\nnarrowing of the airways on expiration consistent with\ntracheomalacia.\nShe subsequently underwent placement of two\nsilicone stents, one in the left main stem and one in the\ntrachea. During the admission the patient had complaints of\nchest\npain and ruled out for an MI. She was subsequently discharged to\n\n[**Hospital1 **] for physical and pulmonary rehab. Repeat bronchoscopy\non\n[**2118-8-1**] revealed granulation tissue at the distal right lateral\nwall of the tracheal stent. There was significant malacia of the\n\nperiphe

In [6]:
# Rich-display preview of the first five rows of the assembled frame.
new_df.head(n=5)

Unnamed: 0,ROW_ID,TEXT,CLINICAL_NOTE
0,174,Admission Date: [**2151-7-16**] Dischar...,
1,175,Admission Date: [**2118-6-2**] Discharg...,This is an 81-year-old female\nwith a history ...
2,176,Admission Date: [**2119-5-4**] D...,"81 yo F smoker w/ COPD, severe TBM, s/p trache..."
3,177,Admission Date: [**2124-7-21**] ...,COPD exacerbation/Shortness of Breath. 87 yo F...
4,178,Admission Date: [**2162-3-3**] D...,Mr. [**Known lastname 1829**] was seen at [**H...
