<a href="https://colab.research.google.com/github/okechukwuchude/Automating-Medical-Coding/blob/main/Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import json
import stanza
import csv
import pandas as pd

# Build an English pipeline
stanza.download('en', package='mimic', processors={'ner': 'i2b2'}) # download English model
nlp = stanza.Pipeline('en', package='mimic', processors={'ner': 'i2b2'}) # initialize English neural pipeline

In [71]:
# Global variable to store all extracted texts
extracted_texts = []

# Global variable to store cumulative sentence count
global_sentence_count = 0

In [72]:
# Inside the function process_text_with_stanza, update the global_sentence_count
def process_text_with_stanza(text):
    global global_sentence_count
    doc = nlp(text)
    global_sentence_count += len(doc.sentences)  # Increment global_sentence_count by the number of sentences in the current document
    return doc

In [73]:
def extract_info_from_json(json_file_path, hadm_id_set):
    global extracted_texts
    global global_sentence_count  # Access the global global_sentence_count variable
    last_sentence_id = global_sentence_count  # Initialize last assigned sentence ID to global_sentence_count

    # Open the JSON file
    with open(json_file_path, 'r') as file:
        # Load JSON data from the file
        data = json.load(file)

        # Extract 'hadm_id' and 'comment' from the JSON data
        hadm_id = data.get('hadm_id', None)
        comment = data.get('comment', None)

        # If 'hadm_id' is not found, print a warning message
        if hadm_id is None:
            print(f"Warning: 'hadm_id' not found in {json_file_path}")
            return None

        # Add 'hadm_id' to the set
        hadm_id_set.add(hadm_id)

        # Extract 'notes' from the JSON data
        notes = data.get('notes', [])

        # Iterate through each note
        for note in notes:
            note_info = {}  # Initialize a dictionary to store note information

            # Add 'hadm_id' to the note information
            note_info['hadm_id'] = hadm_id

            # Extract information from the note
            note_info['note_id'] = note.get('note_id', None)
            note_info['category'] = note.get('category', None)
            note_info['description'] = note.get('description', None)

            # Extract annotations from the note
            annotations = note.get('annotations', [])
            annotations_info = []  # Initialize a list to store annotation information

            # Iterate through each annotation in the note
            for annotation in annotations:
                annotation_info = {}  # Initialize a dictionary to store annotation information

                # Extract information from the annotation
                annotation_info['begin'] = annotation.get('begin', None)
                annotation_info['end'] = annotation.get('end', None)
                annotation_info['code'] = annotation.get('code', None)
                annotation_info['code_system'] = annotation.get('code_system', None)
                annotation_info['description'] = annotation.get('description', None)
                annotation_info['type'] = annotation.get('type', None)
                annotation_info['covered_text'] = annotation.get('covered_text', None)

                annotations_info.append(annotation_info)  # Append annotation information to the list

            note_info['annotations'] = annotations_info  # Add annotations information to the note
            note_info['text'] = note.get('text', None)

            # Process text with Stanza
            if note_info['text']:
                processed_text = process_text_with_stanza(note_info['text'])
                note_info['processed_text'] = processed_text

                # Extract sentence information and generate labels
                sentence_info = []
                for sent in processed_text.sentences:
                    tokens = [word.text for word in sent.words]
                    labels = ['O'] * len(tokens)  # Initialize labels as 'O'

                    for annotation in note_info['annotations']:
                        begin = annotation['begin']
                        end = annotation['end']

                        word_indices = find_word_indices(sent.words, begin, end)

                        # Assign labels
                        if word_indices:
                            start_idx, end_idx = word_indices[0], word_indices[-1]
                            labels[start_idx] = 'B'
                            for idx in range(start_idx + 1, end_idx + 1):
                                labels[idx] = 'I'

                    # Assign a unique sentence ID
                    sentence_id = last_sentence_id + sent.index  # Calculate sentence ID based on last assigned ID and current sentence index

                    # Print the result for each sentence
                    print('Sentence ID:', sentence_id)
                    print('Sentence:', ' '.join(tokens))
                    print('Labels:', ' '.join(labels))
                    print()

                    sentence_info.append({
                        'sentence_id': sentence_id,
                        'words': tokens,
                        'labels': labels
                    })

                note_info['sentence_info'] = sentence_info
                last_sentence_id += len(processed_text.sentences)  # Update last_sentence_id

            extracted_texts.append(note_info)  # Append note information to the global variable


In [74]:
def find_word_indices(words, begin, end):
    # Create an empty list to store the indices of words found within the given range
    word_indices = []

    # Iterate through each token in the list of tokens
    for idx, word in enumerate(words):
        # Check if the start character of the token matches the beginning of the range
        # or if the token spans the beginning of the range
        if word.start_char == begin or (word.start_char < begin and word.end_char > begin):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
        # Check if the end character of the token matches the end of the range
        # or if the token spans the end of the range
        if word.end_char == end or (word.start_char < end and word.end_char > end):
            # If it matches, add the index of the token to the list of word indices
            word_indices.append(idx)
            break

    # Return the list of word indices found within the given range
    return word_indices

In [75]:
def generate_labels(words, annotations, start_token_idx, end_token_idx):
    labels = ['O'] * len(words)

    for annotation in annotations:
        code = annotation['code']

        # # Update labels for the matched words
        # if start_token_idx is not None and end_token_idx is not None:
        #     labels[start_token_idx] = f'B-{code}'
        #     for idx in range(start_token_idx + 1, end_token_idx + 1):
        #         labels[idx] = f'I-{code}'

    return labels

In [76]:
# Function to search for JSON files in a given folder and its subfolders
def search_files(folder_path):
    # Set to store unique hadm_id values
    hadm_id_set = set()

    # Recursively search for JSON files in the folder and its subfolders
    for root, dirs, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.json'):
                # Construct the full path to the JSON file
                json_file_path = os.path.join(root, filename)
                print("Processing:", json_file_path)
                # Call extract_info_from_json function to extract information from the JSON file
                extract_info_from_json(json_file_path, hadm_id_set)
                # Print a separator after processing each file
                print("=" * 50)

    # Print the count of unique hadm_id values
    print("Total unique hadm_id count:", len(hadm_id_set))




In [77]:
# Specify the path to the main folder containing subfolders with JSON files
main_folder_path = r'C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2'

# Call the search_files function to start searching for JSON files in the main folder and its subfolders
search_files(main_folder_path)

Processing: C:\Users\okechukwu chude\Documents\NLP\text extraction\Automating-Medical-Coding\2\100197-ICD-9.json
Sentence ID: 0
Sentence: Admission Date : [ **2136-10-23 **] Discharge Date : [ **2136-10-24 **]
Labels: O O O O O O O O O O O O

Sentence ID: 1
Sentence: Date of Birth : [ ** 2056-7-14 **] Sex : M
Labels: O O O O O O O O O O O

Sentence ID: 2
Sentence: Service : NEUROSURGERY
Labels: O O O

Sentence ID: 3
Sentence: Allergies : No Known Allergies / Adverse Drug Reactions
Labels: O O O O O O O O O

Sentence ID: 4
Sentence: Attending :[** First Name3 ( LF ) 1835 **] Chief Complaint : Found down
Labels: O O O O O O O O O O O O O O

Sentence ID: 5
Sentence: Major Surgical or Invasive Procedure : None
Labels: O O O O O O O

Sentence ID: 6
Sentence: History of Present Illness : 80M p/w a large R IPH with intraventricular extension , midline shift , and hydrocephalus .
Labels: O O O O O O O O O O B O B I O O O O O B O

Sentence ID: 7
Sentence: The patient was found lying face up in 

In [78]:
import pandas as pd

# Function to create a DataFrame from the extracted sentence information
def create_dataframe(extracted_texts):
    data = []

    for note_info in extracted_texts:
        if 'sentence_info' in note_info:
            hadm_id = note_info['hadm_id']
            note_id = note_info['note_id']
            for sentence_info in note_info['sentence_info']:
                words = sentence_info['words']
                labels = sentence_info['labels']
                sentence_id = sentence_info['sentence_id']

                for word, label in zip(words, labels):
                    data.append({
                        'hadm_id': hadm_id,
                        'note_id': note_id,
                        'sentence_id': sentence_id,
                        'word': word,
                        'label': label
                    })

    df = pd.DataFrame(data)
    return df

df = create_dataframe(extracted_texts)
print(df)

      hadm_id  note_id  sentence_id          word label
0      100197    25762            0     Admission     O
1      100197    25762            0          Date     O
2      100197    25762            0             :     O
3      100197    25762            0             [     O
4      100197    25762            0  **2136-10-23     O
...       ...      ...          ...           ...   ...
9868   101525   240524          480       tracing     O
9869   101525   240524          480     available     O
9870   101525   240524          480           for     O
9871   101525   240524          480    comparison     O
9872   101525   240524          480             .     O

[9873 rows x 5 columns]


In [79]:
label_counts = df['label'].value_counts()
print(label_counts)

label
O    9647
I     140
B      86
Name: count, dtype: int64


In [80]:
token_df= df.drop(['hadm_id', 'note_id',], axis=1)

In [81]:
token_df

Unnamed: 0,sentence_id,word,label
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O
...,...,...,...
9868,480,tracing,O
9869,480,available,O
9870,480,for,O
9871,480,comparison,O


In [82]:
token_df.rename(columns = {'word':'words', 'label':'labels'}, inplace = True)

In [83]:
token_df.head()

Unnamed: 0,sentence_id,words,labels
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O


In [84]:
token_df.to_csv('test_tokens.csv', index=False)

In [5]:
import pandas as pd

token_df= pd.read_csv('test_tokens.csv')

In [6]:
token_df.count()

sentence_id    9873
words          9852
labels         9873
dtype: int64

In [7]:
#checking for null values
token_df.isnull().sum()

sentence_id     0
words          21
labels          0
dtype: int64

In [8]:
data = token_df.fillna(method='ffill')
data.head()

  data = token_df.fillna(method='ffill')


Unnamed: 0,sentence_id,words,labels
0,0,Admission,O
1,0,Date,O
2,0,:,O
3,0,[,O
4,0,**2136-10-23,O


In [9]:
# let's create a new column called "sentence" which groups the words by sentence
data['sentence'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['words'].transform(lambda x: ' '.join(x))
# let's also create a new column called "word_labels" which groups the tags by sentence
data['word_labels'] = data[['sentence_id','words','labels']].groupby(['sentence_id'])['labels'].transform(lambda x: ','.join(x))
data.head()

Unnamed: 0,sentence_id,words,labels,sentence,word_labels
0,0,Admission,O,Admission Date : [ **2136-10-23 **] Discharge ...,"O,O,O,O,O,O,O,O,O,O,O,O"
1,0,Date,O,Admission Date : [ **2136-10-23 **] Discharge ...,"O,O,O,O,O,O,O,O,O,O,O,O"
2,0,:,O,Admission Date : [ **2136-10-23 **] Discharge ...,"O,O,O,O,O,O,O,O,O,O,O,O"
3,0,[,O,Admission Date : [ **2136-10-23 **] Discharge ...,"O,O,O,O,O,O,O,O,O,O,O,O"
4,0,**2136-10-23,O,Admission Date : [ **2136-10-23 **] Discharge ...,"O,O,O,O,O,O,O,O,O,O,O,O"


In [10]:
label2id = {k: v for v, k in enumerate(data.labels.unique())}
id2label = {v: k for v, k in enumerate(data.labels.unique())}
label2id

{'O': 0, 'B': 1, 'I': 2}

In [11]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Admission Date : [ **2136-10-23 **] Discharge ...,"O,O,O,O,O,O,O,O,O,O,O,O"
1,Date of Birth : [ ** 2056-7-14 **] Sex : M,"O,O,O,O,O,O,O,O,O,O,O"
2,Service : NEUROSURGERY,"O,O,O"
3,Allergies : No Known Allergies / Adverse Drug ...,"O,O,O,O,O,O,O,O,O"
4,Attending :[** First Name3 ( LF ) 1835 **] Chi...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [17]:
data.head(7)

Unnamed: 0,sentence,word_labels
0,Admission Date : [ **2136-10-23 **] Discharge ...,"O,O,O,O,O,O,O,O,O,O,O,O"
1,Date of Birth : [ ** 2056-7-14 **] Sex : M,"O,O,O,O,O,O,O,O,O,O,O"
2,Service : NEUROSURGERY,"O,O,O"
3,Allergies : No Known Allergies / Adverse Drug ...,"O,O,O,O,O,O,O,O,O"
4,Attending :[** First Name3 ( LF ) 1835 **] Chi...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O"
5,Major Surgical or Invasive Procedure : :,"O,O,O,O,O,O,O"
6,History of Present Illness : 80M p/w a large R...,"O,O,O,O,O,O,O,O,O,O,B,O,B,I,O,O,O,O,O,B,O"


In [12]:
len(data)

358

In [14]:
data.iloc[6].sentence

'History of Present Illness : 80M p/w a large R IPH with intraventricular extension , midline shift , and hydrocephalus .'

In [15]:
data.iloc[6].word_labels

'O,O,O,O,O,O,O,O,O,O,B,O,B,I,O,O,O,O,O,B,O'

In [19]:
filtered_df = data[~data['word_labels'].apply(lambda x: all(label == 'O' for label in x.split(',')))]


In [20]:
filtered_df.head()

Unnamed: 0,sentence,word_labels
6,History of Present Illness : 80M p/w a large R...,"O,O,O,O,O,O,O,O,O,O,B,O,B,I,O,O,O,O,O,B,O"
11,He was taken to [ ** Hospital1 18 ** ] [ ** Lo...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
15,He also had signed a DNR / DNI order .,"O,O,O,O,O,B,O,O,O,O"
20,History : - DIABETES TYPE II - HYPERLIPIDEMIA ...,"O,O,O,B,I,I,O,B,O,B,O,B,O,B,I,I,I,I,I,I,I,I,O,..."
37,[ ** Name2 ( NI ) ** ] was extubated on [ * * ...,"O,O,O,O,O,O,O,O,O,B,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [21]:
filtered_df.iloc[1].sentence

'He was taken to [ ** Hospital1 18 ** ] [ ** Location ( un ) 620 ** ] and head CT was performed , revealing a large intraparenchymal hemorrhage extending from the lower midbrain into the hypothalamus , thalamus and basal ganglia on the right , with significant mass effect , intraventricular extension with casting of the right ventricle and some blood product in the posterior [ ** Doctor Last Name 534 ** ] of the left lateral ventricle .'

In [23]:
filtered_df.iloc[1].word_labels

'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B,I,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'