In [12]:
import os
import pandas as pd
import json

# Function to extract entities from the JSON data
def extract_entities(json_data):
    entity_data = []  # Initialize an empty list to store entity information
    hadm_id = json_data['hadm_id']  # Extract HADM_ID
    # Iterate through each note in the JSON data
    for note in json_data['notes']:
        note_id = note['note_id']  # Extract Note ID
        # Iterate through each annotation in the note
        for annotation in note['annotations']:
            # Extract entity information
            entity_info = {
                'HADM_ID': hadm_id,
                'Note_ID': note_id,
                'Entity_Text': annotation['covered_text'],
                'Entity_Type': annotation['type'],
                'Code': annotation['code'],
                'Code_System': annotation['code_system']
            }
            # Append entity information to the list
            entity_data.append(entity_info)
    return entity_data

In [13]:
# Function to extract information from JSON files
def extract_info_from_json(json_file_path, entity_data):
    # Load JSON data from the file
    with open(json_file_path, 'r') as file:
        json_data = json.load(file)
        # Extract entities from the JSON data
        entities = extract_entities(json_data)
        # Extend entity_data list with the extracted entities
        entity_data.extend(entities)


In [14]:
# Function to search for JSON files in a folder and its subfolders
def search_files(folder_path):
    entity_data = []  # Initialize an empty list to store entity information
    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, entity_data)
                # Print a separator after processing each file
                print("=" * 50)

    # Create a DataFrame from the entity_data list
    entity_df = pd.DataFrame(entity_data)
    # Print the DataFrame
    #print(entity_df)
    # Print the count of unique HADM_ID values
    print("Total unique HADM_ID count:", entity_df['HADM_ID'].nunique())

    return entity_df  # Return the DataFrame

In [27]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
entity_df = search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json
Total unique HADM_ID count: 2


In [28]:
print(entity_df)

    HADM_ID  Note_ID                    Entity_Text     Entity_Type      Code  \
0    100197    25762                            IPH  MapType.APPROX       431   
1    100197    25762     intraventricular extension  MapType.APPROX       431   
2    100197    25762                  hydrocephalus  MapType.APPROX     331.4   
3    100197    25762    intraparenchymal hemorrhage  MapType.APPROX       431   
4    100197    25762                            DNR           Human    V49.86   
..      ...      ...                            ...             ...       ...   
81   101525   566978                   WBC elevated           Human   D72.829   
82   101525  1071556  fracture of the left clavicle           Human  S42.002A   
83   101525  1071962                      neck pain           Human     M54.2   
84   101525  1071963                      neck pain           Human     M54.2   
85   101525   240524                   Sinus rhythm           Human     93000   

   Code_System  
0     ICD-

In [30]:
data = entity_df.drop(['HADM_ID','Note_ID','Entity_Type'], axis=1)

In [31]:
print(data)

                      Entity_Text      Code Code_System
0                             IPH       431    ICD-9-CM
1      intraventricular extension       431    ICD-9-CM
2                   hydrocephalus     331.4    ICD-9-CM
3     intraparenchymal hemorrhage       431    ICD-9-CM
4                             DNR    V49.86    ICD-9-CM
..                            ...       ...         ...
81                   WBC elevated   D72.829   ICD-10-CM
82  fracture of the left clavicle  S42.002A   ICD-10-CM
83                      neck pain     M54.2   ICD-10-CM
84                      neck pain     M54.2   ICD-10-CM
85                   Sinus rhythm     93000         CPT

[86 rows x 3 columns]


In [33]:
# Function to split text and create new rows with B or I values

def split_text_to_rows(row):
    words = row['Entity_Text'].split()
    rows = []
    for i, word in enumerate(words):
        if i == 0:
            tag = 'B'
        else:
            tag = 'I'
        rows.append({'Entity_Text': word, 'Tag': tag, 'Code': row['Code'], 'Code_System': row['Code_System']})
    return rows

# Apply the function to each row and concatenate the results
new_rows = []
for index, row in data.iterrows():
    new_rows.extend(split_text_to_rows(row))

# Create a new dataframe with the split rows
new_df = pd.DataFrame(new_rows)

print(new_df)

          Entity_Text Tag   Code Code_System
0                 IPH   B    431    ICD-9-CM
1    intraventricular   B    431    ICD-9-CM
2           extension   I    431    ICD-9-CM
3       hydrocephalus   B  331.4    ICD-9-CM
4    intraparenchymal   B    431    ICD-9-CM
..                ...  ..    ...         ...
217              pain   I  M54.2   ICD-10-CM
218              neck   B  M54.2   ICD-10-CM
219              pain   I  M54.2   ICD-10-CM
220             Sinus   B  93000         CPT
221            rhythm   I  93000         CPT

[222 rows x 4 columns]


In [85]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(new_df)

          Entity_Text Tag      Code Code_System
0                 IPH   B       431    ICD-9-CM
1    intraventricular   B       431    ICD-9-CM
2           extension   I       431    ICD-9-CM
3       hydrocephalus   B     331.4    ICD-9-CM
4    intraparenchymal   B       431    ICD-9-CM
5          hemorrhage   I       431    ICD-9-CM
6                 DNR   B    V49.86    ICD-9-CM
7            DIABETES   B    250.00    ICD-9-CM
8                TYPE   I    250.00    ICD-9-CM
9                  II   I    250.00    ICD-9-CM
10     HYPERLIPIDEMIA   B     272.4    ICD-9-CM
11           GLAUCOMA   B     365.9    ICD-9-CM
12     OSTEOARTHRITIS   B    715.90    ICD-9-CM
13            CAROTID   B    433.10    ICD-9-CM
14           STENOSIS   I    433.10    ICD-9-CM
15               left   I    433.10    ICD-9-CM
16            60-69%,   I    433.10    ICD-9-CM
17                 rt   I    433.10    ICD-9-CM
18                 50   I    433.10    ICD-9-CM
19          extubated   B     96.71   IC

##################################################################################################################

##################################################################################################################

In [34]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 3.17MB/s]                    
2024-03-18 17:52:22 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-18 17:52:22 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| pretrain        | mimic          |
| backward_charlm | mimic          |
| forward_charlm  | mimic          |

2024-03-18 17:52:22 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-18 17:52:22 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-18 17:52:22 INFO: File 

In [35]:
# Global variable to store all extracted texts
extracted_texts = []

In [36]:
# Function to process text using Stanza
def process_text_with_stanza(text):
    doc = nlp(text)
    return doc

In [37]:
# Function to extract information from JSON files
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Count number of sentences and store it
                num_sentences = len(processed_text.sentences)
                note_info['num_sentences'] = num_sentences

                # Extract sentence information and generate labels
                sentence_info = []
                for sent_id, sent in enumerate(processed_text.sentences):
                    tokens = [word.text for word in sent.words]
                    labels = generate_labels(tokens, note_info['annotations'])
                    sentence_info.append({'sentence_id': sent_id, 'words': tokens, 'labels': labels})

                note_info['sentence_info'] = sentence_info

            extracted_texts.append(note_info)  # Append note information to the global variable



In [38]:
def generate_labels(words, annotations):
    labels = ['O'] * len(words)

    for annotation in annotations:
        begin = annotation['begin']
        end = annotation['end']
        code = annotation['code']
        covered_text = annotation['covered_text']

        # Find the word indices corresponding to the annotation
        word_indices = find_word_indices(words, begin, end, covered_text)

        # Update labels for the matched words
        for idx in word_indices:
            labels[idx] = f'B-{code}' if idx == word_indices[0] else f'I-{code}'

    return labels

In [39]:
# Function to search for JSON files in a given folder and its subfolders

def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))



In [40]:
# Function to extract information about sentences and named entities

def search_json_files(folder_path):
    # Initialize a list to store all JSON file paths
    json_files = []

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.json'):
                json_files.append(os.path.join(root, file))

    return json_files

In [41]:
# Function to generate labels for tokens from JSON files

def generate_labels_from_json(json_file):
    # Open the JSON file
    with open(json_file, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_text = note.get('text', None)
            if note_text:
                # Tokenize the note text
                tokens = tokenize_text(note_text)

                # Extract annotations from the note
                annotations = note.get('annotations', [])

                # Initialize labels for tokens
                labels = ['O'] * len(tokens)

                # Match annotations with tokens
                for annotation in annotations:
                    begin = annotation['begin']
                    end = annotation['end']
                    code = annotation['code']
                    covered_text = annotation['covered_text']

                    # Find the token indices corresponding to the annotation
                    token_indices = find_word_indices(tokens, begin, end, covered_text)

                    # Update labels for the matched tokens
                    for idx in token_indices:
                        labels[idx] = f'B-{code}' if idx == token_indices[0] else f'I-{code}'

                

In [42]:
# Function to tokenize text using Stanza

def tokenize_text(text):
    # Process the text with the initialized pipeline
    doc = nlp(text)
    # Extract tokens from the document
    tokens = [word.text for sent in doc.sentences for word in sent.words]
    print(doc.ents)

    return tokens

In [44]:
# Function to find the word indices corresponding to the annotation
def find_word_indices(words, begin, end, covered_text):
    word_indices = []
    char_counter = 0

    for idx, word in enumerate(words):
        if char_counter + len(word) >= begin and char_counter <= end:
            word_indices.append(idx)
        char_counter += len(word) + 1  # Add 1 for space between words

    return word_indices

In [45]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\ICD-10\1.0\101525-ICD-10.json
Total unique hadm_id count: 2


In [48]:
data = []
for note_info in extracted_texts:
    for sent_info in note_info['sentence_info']:
        for word, label in zip(sent_info['words'], sent_info['labels']):
            data.append({'sentence_id': sent_info['sentence_id'], 'Entity_Text': word, 'Tag': label})

word_df = pd.DataFrame(data)

In [49]:
word_df

Unnamed: 0,sentence_id,Entity_Text,Tag
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O
...,...,...,...
9868,2,tracing,I-93000
9869,2,available,O
9870,2,for,O
9871,2,comparison,O


In [82]:
# Merge data frames
merged_df = pd.merge(word_df, new_df, on='Entity_Text', how='left', indicator=True)

# Fill NaN values in the 'Tag' column with 'o'
merged_df.loc[merged_df['_merge'] == 'left_only', 'Tag'] = 'o'

merged_df = merged_df.drop('_merge', axis=1)


# Fill NaN values in the 'Code' and 'Code_System' columns with NaN
merged_df['Code'] = merged_df['Code'].fillna(pd.NA)
merged_df['Code_System'] = merged_df['Code_System'].fillna(pd.NA)

print(merged_df)

       sentence_id   Entity_Text    Tag_x Tag_y  Code Code_System Tag
0                0     Admission        O   NaN  <NA>        <NA>   o
1                0          Date        O   NaN  <NA>        <NA>   o
2                0             :        O   NaN  <NA>        <NA>   o
3                0             [        O   NaN  <NA>        <NA>   o
4                0  **2136-10-23        O   NaN  <NA>        <NA>   o
...            ...           ...      ...   ...   ...         ...  ..
13705            2       tracing  I-93000   NaN  <NA>        <NA>   o
13706            2     available        O   NaN  <NA>        <NA>   o
13707            2           for        O   NaN  <NA>        <NA>   o
13708            2    comparison        O   NaN  <NA>        <NA>   o
13709            2             .        O   NaN  <NA>        <NA>   o

[13710 rows x 7 columns]


In [83]:
merged_df= merged_df.drop(['sentence_id','Code', 'Code_System'], axis=1)

In [84]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(merged_df)

                                             Entity_Text       Tag_x Tag_y  \
0                                              Admission           O   NaN   
1                                                   Date           O   NaN   
2                                                      :           O   NaN   
3                                                      [           O   NaN   
4                                           **2136-10-23           O   NaN   
5                                                    **]           O   NaN   
6                                              Discharge           O   NaN   
7                                                   Date           O   NaN   
8                                                      :           O   NaN   
9                                                      [           O   NaN   
10                                          **2136-10-24           O   NaN   
11                                                   **]        