In [22]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 3.10MB/s]                    
2024-03-15 10:46:36 INFO: Downloaded file to C:\Users\okechukwu chude\stanza_resources\resources.json
2024-03-15 10:46:36 INFO: Downloading these customized packages for language: en (English)...
| Processor       | Package        |
------------------------------------
| tokenize        | mimic          |
| pos             | mimic_charlm   |
| lemma           | mimic_nocharlm |
| depparse        | mimic_charlm   |
| ner             | i2b2           |
| backward_charlm | mimic          |
| pretrain        | mimic          |
| forward_charlm  | mimic          |

2024-03-15 10:46:36 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\tokenize\mimic.pt
2024-03-15 10:46:36 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\en\pos\mimic_charlm.pt
2024-03-15 10:46:36 INFO: File exists: C:\Users\okechukwu chude\stanza_resources\

In [23]:
def extract_info_from_json(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    
    hadm_id = data.get("hadm_id", None)
    comment = data.get("comment", None)
    notes = data.get("notes", [])
    
    extracted_data = []
    for note in notes:
        note_id = note.get("note_id", None)
        category = note.get("category", None)
        description = note.get("description", None)
        annotations = note.get("annotations", [])
        text = note.get("text", None)
        
        for annotation in annotations:
            begin = annotation.get("begin", None)
            end = annotation.get("end", None)
            code = annotation.get("code", None)
            code_system = annotation.get("code_system", None)
            description = annotation.get("description", None)
            covered_text = annotation.get("covered_text", None)
            
            extracted_data.append({
                "hadm_id": hadm_id,
                "comment": comment,
                "note_id": note_id,
                "category": category,
                "description": description,
                "begin": begin,
                "end": end,
                "code": code,
                "code_system": code_system,
                "covered_text": covered_text,
                "text": text
            })
    
    return extracted_data

In [24]:
# Function to search folders and subfolders for JSON files
def search_json_files(root_folder):
    json_files = []
    for foldername, _, filenames in os.walk(root_folder):
        for filename in filenames:
            if filename.endswith('.json'):
                json_files.append(os.path.join(foldername, filename))
    return json_files

In [25]:
# Main function to process all JSON files
def process_json_files(root_folder):
    all_data = []
    json_files = search_json_files(root_folder)
    for file_path in json_files:
        all_data.extend(extract_info_from_json(file_path))
    return all_data

In [26]:
# Path to the root folder containing JSON files
root_folder = r"C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2"

In [28]:
# Process JSON files and store extracted information in a DataFrame
data = process_json_files(root_folder)
df = pd.DataFrame(data)

In [31]:
df.head()


Unnamed: 0,hadm_id,comment,note_id,category,description,begin,end,code,code_system,covered_text,text
0,100197,,25762,Discharge summary,Intracerebral hemorrhage,374,377,431,ICD-9-CM,IPH,Admission Date: [**2136-10-23**] ...
1,100197,,25762,Discharge summary,Intracerebral hemorrhage,383,409,431,ICD-9-CM,intraventricular extension,Admission Date: [**2136-10-23**] ...
2,100197,,25762,Discharge summary,Obstructive hydrocephalus,430,443,331.4,ICD-9-CM,hydrocephalus,Admission Date: [**2136-10-23**] ...
3,100197,,25762,Discharge summary,Intracerebral hemorrhage,835,862,431,ICD-9-CM,intraparenchymal hemorrhage,Admission Date: [**2136-10-23**] ...
4,100197,,25762,Discharge summary,Do not resuscitate status,1577,1580,V49.86,ICD-9-CM,DNR,Admission Date: [**2136-10-23**] ...


In [32]:
# Function to tokenize text using Stanza
def tokenize_with_stanza(text):
    doc = nlp(text)
    tokens = [word.text for sent in doc.sentences for word in sent.words]
    return tokens


In [33]:
# Tokenize the words in the text column
df['tokenized_text'] = df['text'].apply(tokenize_with_stanza)

In [35]:
df.head()

Unnamed: 0,hadm_id,comment,note_id,category,description,begin,end,code,code_system,covered_text,text,tokenized_text
0,100197,,25762,Discharge summary,Intracerebral hemorrhage,374,377,431,ICD-9-CM,IPH,Admission Date: [**2136-10-23**] ...,"[Admission, Date, :, [, **2136-10-23, **], Dis..."
1,100197,,25762,Discharge summary,Intracerebral hemorrhage,383,409,431,ICD-9-CM,intraventricular extension,Admission Date: [**2136-10-23**] ...,"[Admission, Date, :, [, **2136-10-23, **], Dis..."
2,100197,,25762,Discharge summary,Obstructive hydrocephalus,430,443,331.4,ICD-9-CM,hydrocephalus,Admission Date: [**2136-10-23**] ...,"[Admission, Date, :, [, **2136-10-23, **], Dis..."
3,100197,,25762,Discharge summary,Intracerebral hemorrhage,835,862,431,ICD-9-CM,intraparenchymal hemorrhage,Admission Date: [**2136-10-23**] ...,"[Admission, Date, :, [, **2136-10-23, **], Dis..."
4,100197,,25762,Discharge summary,Do not resuscitate status,1577,1580,V49.86,ICD-9-CM,DNR,Admission Date: [**2136-10-23**] ...,"[Admission, Date, :, [, **2136-10-23, **], Dis..."


In [36]:
print(df['tokenized_text'])

0     [Admission, Date, :, [, **2136-10-23, **], Dis...
1     [Admission, Date, :, [, **2136-10-23, **], Dis...
2     [Admission, Date, :, [, **2136-10-23, **], Dis...
3     [Admission, Date, :, [, **2136-10-23, **], Dis...
4     [Admission, Date, :, [, **2136-10-23, **], Dis...
5     [Admission, Date, :, [, **2136-10-23, **], Dis...
6     [Admission, Date, :, [, **2136-10-23, **], Dis...
7     [Admission, Date, :, [, **2136-10-23, **], Dis...
8     [Admission, Date, :, [, **2136-10-23, **], Dis...
9     [Admission, Date, :, [, **2136-10-23, **], Dis...
10    [Admission, Date, :, [, **2136-10-23, **], Dis...
Name: tokenized_text, dtype: object


In [37]:
# Function to perform named entity recognition using Stanza
def ner_with_stanza(text):
    doc = nlp(text)
    entities = []
    for sent in doc.sentences:
        for entity in sent.ents:
            entities.append({
                "text": entity.text,
                "type": entity.type
            })
    return entities

# Perform named entity recognition on the text column
df['ner_results'] = df['text'].apply(ner_with_stanza)

# Display the DataFrame with NER results
print(df)

    hadm_id comment  note_id           category  \
0    100197            25762  Discharge summary   
1    100197            25762  Discharge summary   
2    100197            25762  Discharge summary   
3    100197            25762  Discharge summary   
4    100197            25762  Discharge summary   
5    100197            25762  Discharge summary   
6    100197            25762  Discharge summary   
7    100197            25762  Discharge summary   
8    100197            25762  Discharge summary   
9    100197            25762  Discharge summary   
10   100197            25762  Discharge summary   

                                          description  begin   end    code  \
0                            Intracerebral hemorrhage    374   377     431   
1                            Intracerebral hemorrhage    383   409     431   
2                           Obstructive hydrocephalus    430   443   331.4   
3                            Intracerebral hemorrhage    835   862     431 

In [38]:
print(df['ner_results'])

0     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
1     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
2     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
3     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
4     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
5     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
6     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
7     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
8     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
9     [{'text': 'Known Allergies', 'type': 'PROBLEM'...
10    [{'text': 'Known Allergies', 'type': 'PROBLEM'...
Name: ner_results, dtype: object
