In [1]:
# Probe that MedCAT can be imported before the rest of the notebook runs.
try:
    from medcat.cat import CAT
except Exception as import_error:
    # Kept broad on purpose: any failure while importing MedCAT should trigger
    # the restart path. Unlike a bare `except:`, this does not intercept
    # KeyboardInterrupt or SystemExit.
    print("WARNING: Runtime will restart automatically and please run other cells thereafter.")
    # Surface the underlying cause instead of hiding it behind the warning.
    print("MedCAT import failed with: {!r}".format(import_error))
    exit()

  from tqdm.autonotebook import tqdm, trange


In [2]:
import pandas as pd
import numpy as np
import json
import os
import spacy
import logging

from medcat.cat import CAT
from medcat.cdb import CDB
from medcat.config import Config
from medcat.vocab import Vocab
from medcat.meta_cat import MetaCAT
from medcat.preprocessing.tokenizers import TokenizerWrapperBPE
from tokenizers import ByteLevelBPETokenizer
import dask.dataframe as dd

# Remove pandas' display limits: full column text and every row are rendered.
# These are global settings, so display only small slices of large frames.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# Timestamped, INFO-level logging for everything that logs in this notebook.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)

In [3]:
# Widen the notebook by injecting CSS that sets the ".container" element to 100% width.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
# Run the notebook from the project's src directory so the relative paths used
# in later cells resolve.
# Colab alternative: os.chdir('/content/drive/My Drive/Colab Notebooks/mimic-iii-clinical-database-1.4/src')
path = "/home/vmadmin/Anchor_and_Span/src/"
# os.getcwd() never ends with a path separator while `path` does, so compare
# normalized forms; otherwise the check is always true.
if os.path.normpath(os.getcwd()) != os.path.normpath(path):
    os.chdir(path)

In [5]:
!ls .

__pycache__	explore.ipynb	  mydask.png	   test_linking.py
anchor_span.py	medcat_ann.ipynb  scisp_ann.ipynb  utils.py
annotate.py	mstext_ann.ipynb  scisp_ann.py	   visualize_annotations.ipynb
creds		mstext_ann.py	  subscription


In [6]:
# Read the cleaned notes CSV as a dask dataframe, forcing CLEANED_TEXT to str.
data_read_dask = dd.read_csv('../../mimic-iii/cleaned/notes/NOTESEVENTS_0.csv', 
                        encoding="utf-8", engine="python", dtype={"CLEANED_TEXT": str})
# Preview the first two rows (compute=True evaluates them for display).
data_read_dask.head(n=2, compute=True)

Unnamed: 0,index,CLEANED_TEXT
0,0,"Admission Date: 2151-7-16 Discharge Date: 2151-8-4 Service: ADDENDUM: RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. This also moderate-sized left pleural effusion. HEAD CT: Head CT showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history. ABDOMINAL CT: Abdominal CT showed lesions of T10 and sacrum most likely secondary to osteoporosis. These can be followed by repeat imaging as an outpatient. First Name8 (NamePattern2) First Name4 (NamePattern1) 1775 Last Name (NamePattern1) , M.D. MD Number(1) 1776 Dictated By:Hospital 1807 MEDQUIST36 D: 2151-8-5 12:11 T: 2151-8-5 12:21 JOB#: Job Number 1808"
1,1,"Admission Date: 2118-6-2 Discharge Date: 2118-6-14 Date of Birth: Sex: F Service: MICU and then to Doctor Last Name Medicine HISTORY OF PRESENT ILLNESS: This is an 81-year-old female with a history of emphysema (not on home O2), who presents with three days of shortness of breath thought by her primary care doctor to be a COPD flare. Two days prior to admission, she was started on a prednisone taper and one day prior to admission she required oxygen at home in order to maintain oxygen saturation greater than 90%. She has also been on levofloxacin and nebulizers, and was not getting better, and presented to the Hospital1 18 Emergency Room. In the Hospital3 Emergency Room, her oxygen saturation was 100% on CPAP. She was not able to be weaned off of this despite nebulizer treatment and Solu-Medrol 125 mg IV x2. Review of systems is negative for the following: Fevers, chills, nausea, vomiting, night sweats, change in weight, gastrointestinal complaints, neurologic changes, rashes, palpitations, orthopnea. Is positive for the following: Chest pressure occasionally with shortness of breath with exertion, some shortness of breath that is positionally related, but is improved with nebulizer treatment. PAST MEDICAL HISTORY: 1. COPD. Last pulmonary function tests in 2117-11-3 demonstrated a FVC of 52% of predicted, a FEV1 of 54% of predicted, a MMF of 23% of predicted, and a FEV1:FVC ratio of 67% of predicted, that does not improve with bronchodilator treatment. The FVC, however, does significantly improve with bronchodilator treatment consistent with her known reversible air flow obstruction in addition to an underlying restrictive ventilatory defect. The patient has never been on home oxygen prior to this recent episode. She has never been on steroid taper or been intubated in the past. 2. Lacunar CVA. 
MRI of the head in 2114-11-4 demonstrates ""mild degree of multiple small foci of high T2 signal within the white matter of both cerebral hemispheres as well as the pons, in the latter region predominantly to the right of midline. The abnormalities, while nonspecific in etiology, are most likely secondary to chronic microvascular infarction. There is no mass, lesion, shift of the normal midline strictures or hydrocephalus. The major vascular flow patterns are preserved. There is moderate right maxillary, moderate bilateral ethmoid, mild left maxillary, minimal right sphenoid, and frontal sinus mucosal thickening. These abnormalities could represent an allergic or some other type of inflammatory process. Additionally noted is a moderately enlarged subtotally empty sella turcica"". 3. Angina: Most recent stress test was in 2118-1-3 going for four minutes with a rate pressure product of 10,000, 64% of maximum predicted heart rate without evidence of ischemic EKG changes or symptoms. The imaging portion of the study demonstrated no evidence of myocardial ischemia and a calculated ejection fraction of 84%. The patient denies angina at rest and gets angina with walking a few blocks. Are alleviated by sublingual nitroglycerin. 4. Hypothyroidism on Synthroid. 5. Depression on Lexapro. 6. Motor vehicle accident with head injury approximately 10 years ago. MEDICATIONS ON ADMISSION: 1. Hydrochlorothiazide 25 q.d. 2. Prednisone 60 mg, 50 mg, 40 mg, 20 mg. 3. Levofloxacin 500 mg q.d. 4. Imdur 60 mg q.d. 5. Synthroid 75 mcg q.d. 6. Pulmicort nebulizer b.i.d. 7. Albuterol nebulizer q.4. prn. 8. Lexapro 10 mg q.d. 9. Protonix 40 mg q.d. 10. Aspirin 81 mg q.d. ALLERGIES: Norvasc leads to lightheadedness and headache. FAMILY HISTORY: Noncontributory. SOCIAL HISTORY: Lives with her husband, Dr. Known lastname 1809 an eminent Pediatric Neurologist at Hospital3 1810. The patient is a prior smoker, but has not smoked in over 10 years. She has no known alcohol use and she is a full code. 
PHYSICAL EXAM AT TIME OF ADMISSION: Blood pressure 142/76, heart rate 100 and regular, respirations at 17-21, and 97% axillary temperature. She was saturating at 100% on CPAP with dry mucous membranes. An elderly female in no apparent distress. Pupils are equal, round, and reactive to light and accommodation. Extraocular movements are intact. Oropharynx difficult to assess due to CPAP machine. No evidence of jugular venous pressure, however, the strap from the CPAP machine obscures the neck exam. Cranial nerves II through XII are grossly intact. Neck is supple without lymphadenopathy. Heart exam: Tachycardic, regular, obscured by loud bilateral wheezing with increase in the expiratory phase as well as profuse scattered rhonchi throughout the lung fields. Positive bowel sounds, soft, nontender, nondistended, obese, no masses. Mild edema of the lower extremities without clubbing or cyanosis, no rashes. There is a right hand hematoma. Strength is assessed as 5-9 in the lower extremities, 5-9 in the upper extremities with a normal mental status and cognition. LABORATORY STUDIES: White count 19, hematocrit 41, platelets 300. Chem-7: 127, 3.6, 88, 29, 17, 0.6, 143. Troponin was negative. CKs were negative times three. Initial blood gas showed a pH of 7.4, pO2 of 66, pCO2 of 54. Chest x-ray demonstrates a moderate sized hiatal hernia, segmental atelectasis, left lower lobe infiltrate versus segmental atelectasis. EKG shows normal sinus rhythm at 113 beats per minute, normal axis, no evidence of ST-T wave changes. BRIEF SUMMARY OF HOSPITAL COURSE: 1. COPD/dyspnea/pneumonia: The patient was initially placed on an aggressive steroid taper and admitted to the Medical Intensive Care Unit due to her difficulty with oxygenation despite CPAP machine. She was also given nebulizer treatments q.4h. as well as chest PT. The nebulizers were increased to q.1h. due to the fact that she continued to have labored breathing. 
Due to persistent respiratory failure and labored breathing, the patient was intubated on 2118-6-7 in order to improve oxygenation, ventilation, and ability to suction. A bronchoscopy was performed on 2118-6-7, which demonstrated marked narrowing of the airways with expiration consistent with tracheomalacia. On 2118-6-9, two silicone stents were placed, one in the left main stem (12 x 25 and one in the trachea 16 x 40) by Dr. First Name (STitle) Name (STitle) under rigid bronchoscopy with general anesthesia. On 2118-6-11, the patient was extubated to a cool mist shovel mask and her oxygen was titrated down to 2 liters nasal cannula at which time she was transferred to the medical floor. On the medical floor, the steroids were weaned to off on 2118-6-14, and the patient was saturating at 97% on 2 liters, 92% on room air. On 2118-6-14, the patient was seen again by the Interventional Pulmonology service, who agreed that she looked much improved and recommended that she go to pulmonary rehabilitation with followup within six weeks' time status post placement of stents in respiratory failure. 2. Cardiovascular: The patient was ruled out for a MI. She did have another episode on the medical floor of chest pain, which showed no evidence of EKG changes and negative troponin, negative CKs x3. She was continued on aspirin, Imdur, and diltiazem for rate control per her outpatient regimen. 3. Hypertension: She was maintained on diltiazem and hydrochlorothiazide with adequate blood pressure control and normalization of electrolytes. 4. Hematuria: The patient had intermittent hematuria likely secondary to Foley placement. The Foley catheter was discontinued on 2118-6-14. She had serial urinalyses, which were all negative for signs of infection. 5. Hyperglycemia: Patient was placed on insulin-sliding scale due to hyperglycemia, which was steroid induced. This worked quite well and her glucose came back to normal levels once the steroids were tapered to off. 6. 
Leukocytosis: Patient did have a profound leukocytosis of 20 to 22 during much of her hospital course. As the steroids were tapered to off, her white blood cell count on 2118-6-14 was 15,000. It was felt that the leukocytosis was secondary to both steroids as well as question of a left lower lobe pneumonia. 7. For the left lower lobe pneumonia, the patient had initially received a course of levofloxacin 500 p.o. q.d. from 2118-6-4 to 2118-6-10. This was restarted on 2118-6-12 for an additional seven day course given the fact that she still had the leukocytosis and still had marked rales at the left lower lobe. 8. Hypothyroidism: The patient was continued on outpatient medical regimen. 9. Depression: The patient was continued on Lexapro per outpatient regimen. It is recommended that she follow up with a therapist as an outpatient due to the fact that she did have a blunted affect throughout much of the hospital course, and did appear clinically to be depressed. 10. Prophylaxis: She was maintained on proton-pump inhibitor with subQ Heparin. 11. Sore throat: The patient did have a sore throat for much of the hospital course post extubation. This was treated with Cepacol lozenges as well as KBL liquid (a solution containing Kaopectate, Bismuth, and lidocaine) at bedtime. 12. Communication/code status: The patient was full code throughout her hospital course, and communication was maintained with the patient and her husband. 13. Muscle weakness: The patient did have profound muscle weakness and was evaluated by Physical Therapy, and was found to have impaired functional mobility, impaired musculoskeletal performance, impaired gas exchange, impaired endurance, impaired ventilation, and needed help with supine to sit. However, she was able to tolerate sitting in a chair for approximately one hour. On motor exam, her flexors and extensors of the lower extremities were 4-8 at the knee, 4-8 at the ankle, 4-8 at the elbows, and 4-8 hips. 
It was felt that this weakness was most likely due to a combination of steroid myopathy as well as muscle atrophy secondary to deconditioning after a prolonged hospital course. 14. Speech/swallow: The patient had a Speech and Swallow evaluation showing no evidence of dysphagia, no evidence of vocal cord damage status post tracheal stent placement. DISCHARGE CONDITION: The patient was able to oxygenate on room air at 93% at the time of discharge. She was profoundly weak, but was no longer tachycardic and had a normal blood pressure. Her respirations were much improved albeit with transmitted upper airway sounds. DISCHARGE STATUS: The patient will be discharged to Hospital1 for both pulmonary and physical rehabilitation. DISCHARGE MEDICATIONS: 1. Levothyroxine 75 mcg p.o. q.d. 2. Citalopram 10 mg p.o. q.d. 3. Aspirin 81 mg p.o. q.d. 4. Fluticasone 110 mcg two puffs inhaled b.i.d. 5. Salmeterol Diskus one inhalation b.i.d. 6. Acetaminophen 325-650 mg p.o. q.4-6h. prn. 7. Ipratropium bromide MDI two puffs inhaled q.2h. prn. 8. Albuterol 1-2 puffs inhaled q.2h. prn. 9. Zolpidem tartrate 5 mg p.o. q.h.s. prn. 10. Isosorbide dinitrate 10 mg p.o. t.i.d. 11. Diltiazem 60 mg p.o. q.i.d. 12. Pantoprazole 40 mg p.o. q.24h. 13. Trazodone 25 mg p.o. q.h.s. prn. 14. SubQ Heparin 5000 units subcutaneous b.i.d. until such time that the patient is able to get out of bed twice a day. 15. Cepacol lozenges q.2h. prn. 16. Levofloxacin 500 mg p.o. q.d. for a seven day course to be completed on 2118-6-21. 17. Kaopectate/Benadryl/lidocaine 5 mL p.o. b.i.d. prn, not to be given around mealtimes for concern of dysphagia induced by lidocaine. 18. Lorazepam 0.5-2 mg IV q.6h. prn. FOLLOW-UP PLANS: The patient is recommended to followup with Dr. First Name4 (NamePattern1) Last Name (NamePattern1) 1407, Telephone/Fax (1) 1408 within two weeks of leaving of the hospital. She is also recommended to followup with the Interventional Pulmonary service for followup status post stent placement. 
She is also recommended to followup with a neurologist if her muscle weakness does not improve within one week on physical therapy with concern for steroid-induced myopathy. FINAL DIAGNOSES: 1. Tracheomalacia status post tracheal and left main stem bronchial stent placement. 2. Hypertension. 3. Hypothyroidism. 4. Restrictive lung defect. 5. Depression. DR.Last Name (STitle) ,First Name3 (LF) 12-207 Dictated By:Last Name (NamePattern1) 1811 MEDQUIST36 D: 2118-6-14 11:30 T: 2118-6-14 11:33 JOB#: Job Number 1812"


In [7]:
# Load the spaCy 'en_core_web_sm' pipeline; spacyProcess reads this module-level `nlp`.
nlp = spacy.load('en_core_web_sm')

In [8]:
# Run the CLEANED_TEXT column through the module-level spaCy pipeline `nlp`.
def spacyProcess(data, dask=False):
    """Process the 'CLEANED_TEXT' column of `data` with spaCy.

    Args:
        data: dataframe that has a 'CLEANED_TEXT' column.
        dask: when False (default), the texts are handed to ``nlp.pipe`` as a
            list and its result is returned. When True, ``nlp`` is applied to
            each text, the results are stored in a 'PROCESSED_TEXT' column of
            `data`, and `data` itself is returned.

    Returns:
        Whatever ``nlp.pipe`` returns (dask=False), or `data` with the added
        'PROCESSED_TEXT' column (dask=True).
    """
    if not dask:
        texts = data['CLEANED_TEXT'].tolist()
        return nlp.pipe(texts)

    # NOTE(review): confirm `.loc` column assignment is supported by the
    # dataframe type actually passed when dask=True.
    processed = data['CLEANED_TEXT'].apply(lambda text: nlp(text))
    data.loc[:, "PROCESSED_TEXT"] = processed
    return data

In [9]:
# Row window [start, end) of notes to annotate; both values are also written
# into the name of the saved annotation file.
start, end = 0, 10

In [10]:
# Evaluate the dask dataframe; the next cell shows the result is a pandas DataFrame.
data_read_ = data_read_dask.compute()

In [11]:
"data_read_das is of type {} and data_read is of type {}".format(type(data_read_dask), type(data_read_))

"data_read_das is of type <class 'dask.dataframe.core.DataFrame'> and data_read is of type <class 'pandas.core.frame.DataFrame'>"

In [12]:
# Clamp the window to the rows that exist, then take the [start, end) slice.
end = min(end, len(data_read_))
data_read = data_read_[start:end]

In [13]:
data_read.shape

(10, 2)

In [14]:
# `dask` is left at its default (False), so this holds what nlp.pipe(...) returns.
# NOTE(review): presumably a one-shot generator — re-run this cell before
# iterating it a second time; confirm.
spacy_processed_data = spacyProcess(data=data_read)

In [15]:
# Split every spaCy document into sentences and record which sentence ids
# belong to which document.
document_sent_map = {}  # doc_id -> (first_sent_id, last_sent_id + 1), a half-open range
sentences = []          # space-joined token text; list position is the sentence id
sent_id = 0
for doc_id, doc in enumerate(spacy_processed_data):
    first_sent_id = sent_id
    for sent in doc.sents:
        sentences.append(" ".join([m.text for m in sent]))
        sent_id += 1
    # The running counter gives the same bounds as indexing a per-document id
    # list, but a document that yields no sentences maps to an empty range
    # instead of raising IndexError.
    document_sent_map[doc_id] = (first_sent_id, sent_id)
# One row per sentence; the default integer index equals the sentence id.
data_read_sp = pd.DataFrame({'CLEANED_TEXT':sentences})

In [16]:
dict([[k,v] for k,v in document_sent_map.items()][:5])

{0: (0, 11), 1: (11, 184), 2: (184, 304), 3: (304, 510), 4: (510, 706)}

In [17]:
data_read_sp.head()

Unnamed: 0,CLEANED_TEXT
0,"Admission Date : 2151 - 7 - 16 Discharge Date : 2151 - 8 - 4 Service : ADDENDUM : RADIOLOGIC STUDIES : Radiologic studies also included a chest CT , which confirmed cavitary lesions in the left lung apex consistent with infectious process / tuberculosis ."
1,This also moderate - sized left pleural effusion .
2,HEAD CT :
3,"Head CT showed no intracranial hemorrhage or mass effect , but old infarction consistent with past medical history ."
4,ABDOMINAL CT :


In [18]:
# format the df to match: required input data for multiprocessing = [(doc_id, doc_text), (doc_id, doc_text), ...]
def data_iterator_df(data):
    """Yield ``(index_label, text)`` pairs from the 'CLEANED_TEXT' column.

    Args:
        data: pandas DataFrame with a 'CLEANED_TEXT' column.

    Yields:
        Tuples of the row's index label and its 'CLEANED_TEXT' value passed
        through ``str``.
    """
    # Iterating the column directly avoids building a Series per row (as
    # iterrows does) and avoids shadowing the builtin `id`.
    for row_id, text in data['CLEANED_TEXT'].items():
        yield (row_id, str(text))

In [19]:
# Load the MedCAT model pack used for annotation.
# NOTE(review): the load log below reports MedCAT 1.8.0 installed while the CDB
# was exported by 1.3.1.dev315 — confirm the pack is compatible.
cat = CAT.load_model_pack("../models/umls_sm_pt2ch_533bab5115c6c2d6.zip")

2023-07-18 19:10:16,355 - Found an existing unziped model pack at: ../models/umls_sm_pt2ch_533bab5115c6c2d6, the provided zip will not be touched.
2023-07-18 19:10:16,362 - Loading model pack with dill format
2023-07-18 19:10:16,363 - Reading CDB data from ../models/umls_sm_pt2ch_533bab5115c6c2d6/cdb.dat
2023-07-18 19:10:37,443 - You have MedCAT version '1.8.0' installed while the CDB was exported by MedCAT version '1.3.1.dev315',
which may or may not work. If you experience any compatibility issues, please reinstall MedCAT
or download the compatible model.
2023-07-18 19:10:44,730 - {
  "Model ID": "533bab5115c6c2d6",
  "Last Modified On": "07 June 2023",
  "History (from least to most recent)": [],
  "Description": "No description",
  "Source Ontology": null,
  "Location": null,
  "MetaCAT models": [
    {
      "Category Name": "Status",
      "Description": "No description",
      "Classes": {
        "Affirmed": 0,
        "Other": 1
      },
      "Model": "lstm"
    }
  ],
  "Bas

In [20]:
# Set a batch size to control for the variability between document sizes.
# NOTE: batch_size_chars is defined here but is NOT passed to the call below
# (the keyword argument is commented out), so it currently has no effect.
batch_size_chars = 500000 # Batch size (BS) in number of characters

# Run model: annotate every sentence; the (id, text) pairs come from data_iterator_df.
results = cat.multiprocessing(data_iterator_df(data_read_sp),  # Formatted data
                              # batch_size_chars = batch_size_chars,  (disabled, see note above)
                              nproc=8) # Number of processors

2023-07-18 19:11:00,307 - Annotated until now: 0 docs; Current BS: 1172 docs; Elapsed time: 0.00 minutes


In [21]:
len(results)

1172

In [22]:
# Find the token positions of an entity, given its char offset from the start of a sentence and the sentence.
def fetch_entitis_span_pos(entity, sentence, extractor):
    """Map an entity's character offsets to token positions in `sentence`.

    The sentence is tokenized with ``str.split()`` and tokens are assumed to
    be separated by exactly one character, so token ``i`` starts at the summed
    length of the preceding tokens plus one per token.

    Args:
        entity: for ``'MEDCAT'`` an indexable ``(start, end, text)``; for
            ``'MSTA4H'`` an object with ``offset``, ``length`` and ``text``
            attributes.
        sentence: the sentence text the offsets refer to.
        extractor: ``'MEDCAT'`` or ``'MSTA4H'``.

    Returns:
        ``(entity_text, span)``. On success ``span`` is
        ``[start_token, end_token]`` with ``end_token`` exclusive. If the
        entity does not start on a token boundary, ``span`` holds fewer than
        two positions and a "Failed to detect span position" line is printed
        once the end has been located.

    Raises:
        ValueError: if `extractor` is not one of the supported names.
    """
    if extractor == 'MSTA4H':
        char_offset, entity_len, entity_text = entity.offset, entity.length, entity.text
    elif extractor == 'MEDCAT':
        char_offset, entity_len, entity_text = entity[0], entity[1] - entity[0], entity[2]
    else:
        # Previously this fell through and failed later with an unbound local.
        raise ValueError("Unsupported extractor: {!r}".format(extractor))

    tokens = sentence.split()
    entity_end = char_offset + entity_len

    span = []
    if char_offset == 0:
        # An entity starting at character 0 starts at token 0.
        span.append(0)

    curr_offset = 0  # character offset at which the current token starts
    end_span_found = False
    for i, token in enumerate(tokens):
        if curr_offset >= entity_end:
            # First token that starts at or after the entity's end: exclusive end.
            span.append(i)
            end_span_found = True
            break
        # Advance to the start of the next token (token length + one separator).
        curr_offset += len(token) + 1
        if curr_offset == char_offset:
            span.append(i + 1)

    # An entity that reaches into the last token has no following token to
    # trigger the check above; close the span at the token count instead of
    # returning it without an end position.
    if not end_span_found and tokens and curr_offset >= entity_end:
        span.append(len(tokens))
        end_span_found = True

    if end_span_found and len(span) != 2:
        print("Failed to detect span position {} {}".format(entity_text, span))
    return entity_text, span

In [23]:
# Rebuild the results dict with its keys in ascending order.
results_copy = {key: results[key] for key in sorted(results)}

In [27]:
# Assemble one annotation record per document: its tokenized sentences plus
# every entity MedCAT found in those sentences.
dataset_ann = []
# A sentence id is a position in the key-ordered results. Resolve position -> key
# once instead of re-enumerating the whole dict for every document.
result_keys = list(results_copy)
for doc_id, document_sent_range in document_sent_map.items():
    doc_ann = {}
    doc_entities = []
    sents = []
    first_sent, last_sent = document_sent_range[0], document_sent_range[-1]
    # Half-open sentence range of this document, limited to the results available.
    for sent_id in range(first_sent, min(last_sent, len(result_keys))):
        sents.append(sentences[sent_id].split())
        sentence_result = results_copy[result_keys[sent_id]]
        for v in sentence_result['entities'].values():
            _entity_ = (v['start'], v['end'], v['source_value'])
            # entity_span_pos may hold fewer than two positions when the entity
            # does not align with token boundaries; it is stored as returned.
            entity_text, entity_span_pos = fetch_entitis_span_pos(_entity_, sentences[sent_id], "MEDCAT")
            entity = {
                'name': entity_text,
                'sent_id': sent_id,
                'char_position': [v['start'], v['end']],
                'token_pos': entity_span_pos,
                'score': v['acc'],
                # A fresh dict per semantic type: reusing one dict made every
                # list entry alias the same object, so an entity with several
                # types repeated only the last one.
                'linked_entities': [
                    {'cui': v['cui'], 'type': m, 'type_id': n}
                    for m, n in zip(v['types'], v['type_ids'])
                ],
            }
            doc_entities.append(entity)
    doc_ann['Entities'] = doc_entities
    doc_ann['Sents'] = sents
    dataset_ann.append(doc_ann)

Failed to detect span position 25PM [22]
Failed to detect span position 30AM [20]
Failed to detect span position 26PM [9]
Failed to detect span position 17AM [9]
Failed to detect span position 26PM [18]
Failed to detect span position 26PM [9]
Failed to detect span position 26PM [18]
Failed to detect span position 11PM [12]
Failed to detect span position 26PM [31]
Failed to detect span position 26PM [39]
Failed to detect span position 26PM [52]
Failed to detect span position 11PM [7]
Failed to detect span position 32PM [8]
Failed to detect span position 5 - mm [24]
Failed to detect span position 13PM [25]
Failed to detect span position 13PM [9]
Failed to detect span position 13PM [18]
Failed to detect span position 13PM [27]
Failed to detect span position 13PM [37]


In [28]:
dataset_ann

[{'Entities': [{'name': 'chest',
    'sent_id': 0,
    'char_position': [138, 143],
    'token_pos': [28, 29],
    'score': 0.8211847196206128,
    'linked_entities': [{'cui': 'C0817096',
      'type': 'Body Location or Region',
      'type_id': 'T029'}]},
   {'name': 'lesions',
    'sent_id': 0,
    'char_position': [174, 181],
    'token_pos': [34, 35],
    'score': 0.8631982261939173,
    'linked_entities': [{'cui': 'C0221198',
      'type': 'Finding',
      'type_id': 'T033'}]},
   {'name': 'lung',
    'sent_id': 0,
    'char_position': [194, 198],
    'token_pos': [38, 39],
    'score': 0.4046709274009144,
    'linked_entities': [{'cui': 'C0024109',
      'type': 'Body Part, Organ, or Organ Component',
      'type_id': 'T023'}]},
   {'name': 'infectious process',
    'sent_id': 0,
    'char_position': [220, 238],
    'token_pos': [42, 44],
    'score': 1.0,
    'linked_entities': [{'cui': 'C0745283',
      'type': 'Pathologic Function',
      'type_id': 'T046'}]},
   {'name': 'tub

In [24]:
def createDir(path):
    """Ensure the directory `path` exists and return `path`.

    Missing parent directories are created as well. The path is returned
    whether the directory was just created or already existed; previously the
    freshly-created case returned None because ``os.makedirs`` returns None.

    Args:
        path: directory path to create if it does not exist.

    Returns:
        `path`, unchanged.
    """
    if not os.path.exists(path):
        # exist_ok tolerates another process creating the directory in between.
        os.makedirs(path, exist_ok=True)
    return path

In [25]:
# Directory that receives the pickled MedCAT annotations; create it if missing.
dest_dir = '../anns/medcat/'
createDir(dest_dir)

'../anns/medcat/'

In [26]:
import pickle

# Persist the annotations; the file name records the start/end window values.
annotation_file = 'anns_mult_{}_{}.pkl'.format(start, end)
with open(os.path.join(dest_dir, annotation_file), 'wb') as a:
    # The with-block closes the file on exit, so no explicit close() is needed.
    pickle.dump(dataset_ann, a, protocol=pickle.HIGHEST_PROTOCOL)