In [1]:
import json
import os
import pickle
import re
import sys
from pprint import pprint
from time import time

import numpy as np
import pandas as pd
import scispacy
import spacy
import yaml
from azure.ai.textanalytics import TextAnalyticsClient, HealthcareEntityRelation
from azure.core.credentials import AzureKeyCredential
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from scispacy.umls_utils import UmlsKnowledgeBase

Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


In [2]:
# Start (or reuse) a Spark session on the "local" master with 15g of driver memory,
# then restrict Spark's own logging to errors.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local").appName("Mimic-II").config("spark.driver.memory", "15g").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/18 09:12:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
import logging

# Log records are rendered as "<process id>-<level name>-<message>".
logging.basicConfig(format='%(process)d-%(levelname)s-%(message)s')

In [4]:
# Do not truncate column contents when pandas displays a DataFrame.
pd.set_option('display.max_colwidth', None)

In [5]:
# Widen the notebook container to 90% of the page by injecting a CSS rule.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [6]:
# Load the cleaned notes file NOTESEVENTS_0.csv into pandas and preview its first row.
data_read = pd.read_csv('../../mimic-iii/cleaned/notes/NOTESEVENTS_0.csv', low_memory=False)
data_read.iloc[:1,:]

Unnamed: 0,index,CLEANED_TEXT
0,0,"Admission Date: 2151-7-16 Discharge Date: 2151-8-4 Service: ADDENDUM: RADIOLOGIC STUDIES: Radiologic studies also included a chest CT, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. This also moderate-sized left pleural effusion. HEAD CT: Head CT showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history. ABDOMINAL CT: Abdominal CT showed lesions of T10 and sacrum most likely secondary to osteoporosis. These can be followed by repeat imaging as an outpatient. First Name8 (NamePattern2) First Name4 (NamePattern1) 1775 Last Name (NamePattern1) , M.D. MD Number(1) 1776 Dictated By:Hospital 1807 MEDQUIST36 D: 2151-8-5 12:11 T: 2151-8-5 12:21 JOB#: Job Number 1808"


In [7]:
data_read.shape

(12422, 2)

In [8]:
# Build a Spark DataFrame from the pandas frame and preview five rows.
# Unused alternative that reads a CSV directly with Spark:
# data_read_spark = spark.read.format('csv').option("header", "true").load("/content/drive/My Drive/Colab Notebooks/mimic-iii-clinical-database-1.4/src/NOTESEVENTS_CLEANED_.csv")
data_read_spark = spark.createDataFrame(data_read)
data_read_spark.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------------------+
|index|        CLEANED_TEXT|
+-----+--------------------+
|    0|Admission Date: 2...|
|    1|Admission Date: 2...|
|    2|Admission Date: 2...|
|    3|Admission Date: 2...|
|    4|Admission Date: 2...|
+-----+--------------------+
only showing top 5 rows



                                                                                

In [9]:
type(data_read_spark)

pyspark.sql.dataframe.DataFrame

In [10]:
# spacy_model: spaCy pipeline used further down to split notes into sentences.
# kb / st: scispaCy UMLS knowledge base and its semantic type tree, used further down
# to look up a CUI and resolve its semantic type codes to canonical names.
spacy_model = spacy.load("en_core_web_sm")
kb = UmlsKnowledgeBase()
st = kb.semantic_type_tree

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Build the Microsoft Text Analytics (language service) client from a local
# credentials file that is expected to hold Subscription.endpoint and Subscription.key1.
# The file is opened in a `with` block so the handle is closed once parsed.
with open('creds/text_analytics_credentials.yml', 'r') as creds_file:
    creds = yaml.safe_load(creds_file)
endpoint = creds['Subscription']['endpoint']
key = creds['Subscription']['key1']
text_analytics_client = TextAnalyticsClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

In [20]:
def fetch_entitis_span_pos(entity, sentence, extractor):
    """Translate an entity's character span into a token span inside ``sentence``.

    The sentence is tokenised with ``str.split()``. Token start offsets are
    rebuilt on the assumption that tokens are separated by exactly one character.

    Args:
        entity: For ``extractor == 'MSTA4H'`` an object exposing ``offset``,
            ``length`` and ``text``. For ``extractor == 'MEDCAT'`` an indexable
            ``(start_char, end_char, text)``.
        sentence: The sentence text that the character offsets refer to.
        extractor: Either ``'MSTA4H'`` or ``'MEDCAT'``.

    Returns:
        ``(entity_text, span)``. When the entity lines up with token boundaries,
        ``span`` is ``[first_token_index, end_token_index]`` with the end index
        exclusive. Otherwise ``span`` may hold fewer than two indices.

    Raises:
        ValueError: If ``extractor`` is not a supported name.
    """
    if extractor == 'MSTA4H':
        char_offset, entity_len, entity_text = entity.offset, entity.length, entity.text
    elif extractor == 'MEDCAT':
        char_offset, entity_len, entity_text = entity[0], entity[1] - entity[0], entity[2]
    else:
        # Previously this fell through and failed later with an unbound-variable error.
        raise ValueError("Unsupported extractor: {!r}".format(extractor))

    tokens = sentence.split()
    entity_end = char_offset + entity_len
    # An entity starting at character 0 starts at token 0.
    span = [0] if char_offset == 0 else []

    curr_offset = 0  # character offset at which the current token starts
    for token_idx, token in enumerate(tokens):
        if curr_offset >= entity_end:
            # First token starting at or after the entity end: exclusive end index.
            span.append(token_idx)
            return entity_text, span
        # Advance to the start of the next token (token length + one separator).
        curr_offset += len(token) + 1
        if curr_offset == char_offset:
            span.append(token_idx + 1)

    # The entity runs up to the last token, so no later token could close the span.
    if len(span) == 1 and span[0] < len(tokens):
        span.append(len(tokens))
    return entity_text, span

In [13]:
# Create (start, end) batch spans used to select chunks of documents small enough for Text Analytics to process; without batching, requests run into rate-limiting errors.
def batching_for_textanalyticsclinet(batch, size):
    """Split the index range ``[0, size)`` into consecutive spans of ``batch`` items.

    Args:
        batch: Number of items per span.
        size: Total number of items to cover.

    Returns:
        A list of ``(start, end)`` tuples with exclusive ends. The final span is
        cut off at ``size``. The list is empty when the range is empty.
    """
    spans = []
    for lower in range(0, size, batch):
        upper = lower + batch
        if upper >= size:
            # Last span: stop exactly at `size` and finish.
            spans.append((lower, size))
            break
        spans.append((lower, upper))
    return spans

In [14]:
# Row range of the notes handled in this run; used below to slice data_read and
# to name the annotation output file.
start, end = 0, 2500

In [15]:
# Keep only the rows in [start, end). NOTE: this rebinds data_read, so the full
# frame is only available again after re-running the load cell.
data_read = data_read[start:end]

In [16]:
# Sentence-split every note with spaCy and time the whole pass.
# documents_segmented holds one list per note; each item is a sentence rebuilt by
# joining its token texts with single spaces.
start_time = time()
data = data_read["CLEANED_TEXT"].tolist()
documents = spacy_model.pipe(data)
documents_segmented = []
for parsed_note in documents:
    sentence_texts = [
        " ".join(token.text for token in sentence)
        for sentence in parsed_note.sents
    ]
    documents_segmented.append(sentence_texts)
end_time = time()
print(end_time - start_time)

443.51146507263184


In [17]:
# Collect the Spark DataFrame back into a pandas DataFrame.
data_df = data_read_spark.toPandas()

                                                                                

In [18]:
def analyze_health(x):
    """Run healthcare entity analysis on a single text and return the finished result.

    ``x`` is wrapped in a one-element list and submitted through the notebook-level
    ``text_analytics_client``. The call blocks until the poller's result is available.
    """
    return text_analytics_client.begin_analyze_healthcare_entities([x]).result()

In [None]:
# Annotate every segmented document with the Text Analytics healthcare endpoint.
# For each document one record is appended to dataset_ann:
#   "Entities": entity annotations (character span, token span, sentence id, score, KB links)
#   "Sents":    the document's sentences, whitespace-tokenised
#   "Labels":   relation records ("r", optional "h"/"t", "evi")
# errorsInAnnotation maps an exception class name to the messages seen during UMLS
# lookups, and a document index to the sentences the service failed to process.
errorsInAnnotation = {}
un_linked_codes = {}
dataset_ann = []
for d, document in enumerate(documents_segmented):
    # Sentences are sent to the service in chunks of 25.
    document_batches = batching_for_textanalyticsclinet(25, len(document))
    doc_ann = {}
    document_batch_entities = []
    document_batch_sents = []
    document_batch_labels = []
    print(d,"\n**************************************************************************************\n")
    sent_id = 0  # running sentence index within the current document
    for document_batch in document_batches:
        print(document_batch)
        # Dedicated names: unpacking into `start`/`end` would overwrite the notebook-level
        # row range that is used later to name the output file.
        batch_start, batch_end = document_batch
        document_sents = document[batch_start:batch_end]
        poller = text_analytics_client.begin_analyze_healthcare_entities(document_sents)
        result = poller.result()
        errorSentences = []
        sent_entities = []
        document_labels = []
        for i, doc in enumerate(result):
            if not doc.is_error:
                for entity in doc.entities:
                    entity_span_pos = fetch_entitis_span_pos(entity, document_sents[i], extractor='MSTA4H')
                    entity_annotation = {"name": "{}".format(entity.text),
                                         "char_pos": [entity.offset, entity.offset+entity.length],
                                         "Category": entity.category,
                                         "token_pos": entity_span_pos[1],
                                         "sent_id": sent_id,
                                         "score": entity.confidence_score,
                                         "linked_entities": []}
                    if entity.data_sources is not None:
                        for data_source in entity.data_sources:
                            linked_entity = {}
                            if data_source.name.lower() == 'umls':
                                try:
                                    umls_ent = kb.cui_to_entity[data_source.entity_id]
                                    stycodes = [(stycode, st.get_canonical_name(stycode)) for stycode in umls_ent.types]
                                    linked_entity["kb"] = data_source.name
                                    # NOTE(review): every iteration writes the same keys, so only
                                    # the last semantic type is kept -- confirm this is intended.
                                    for stycode, styname in stycodes:
                                        linked_entity["id_code"] = data_source.entity_id
                                        linked_entity["type"] = styname
                                        linked_entity["type_id"] = stycode
                                except Exception as e:
                                    # The lookup failed (e.g. CUI missing from the local KB):
                                    # keep the raw code and record the failure by class name.
                                    linked_entity["id_code"] = data_source.entity_id
                                    exc_type, exc_obj, exc_tb = sys.exc_info()
                                    errorStr = re.sub(r"<class|>", "", str(exc_type)).strip()
                                    if errorStr not in errorsInAnnotation:
                                        errorsInAnnotation[errorStr] = [str(e)]
                                    else:
                                        errorsInAnnotation[errorStr].append(str(e))
                                        logging.warning("There must be an entity whose code didn't get linked to the UMLS KB")
                            if data_source.name.lower() == 'snomedct_us':
                                linked_entity["kb"] = data_source.name
                                linked_entity["id_code"] = data_source.entity_id
                            if linked_entity:
                                entity_annotation["linked_entities"].append(linked_entity)
                    sent_entities.append(entity_annotation)
                for relation in doc.entity_relations:
                    rel = {}
                    try:
                        rel["r"] = relation.relation_type
                        # Split e.g. "<Head>Of<Tail>" / "<Head>Finds<Tail>" into its two role names.
                        rt = re.split(r"Of|Finds", relation.relation_type)
                        # NOTE(review): "h"/"t" store the role *name*, not the entity filling
                        # the role -- confirm against the consumer of these labels.
                        for role in relation.roles:
                            if role.name == rt[0]:
                                rel["h"] = role.name
                            elif role.name == rt[1]:
                                rel["t"] = role.name
                        rel["evi"] = [sent_id]
                        document_labels.append(rel)
                    except Exception as e:
                        # Best effort: a relation that cannot be decomposed is skipped,
                        # but the reason stays available at debug level.
                        logging.debug("Skipping relation %r: %s", rel.get("r"), e)
            else:
                errorSentences.append((i, doc))
                logging.warning("Text analytics api didn't process this sentence")

            sent_id += 1

        if errorSentences:
            # Accumulate across batches; plain assignment kept only the last batch's errors.
            errorsInAnnotation.setdefault(d, []).extend(errorSentences)
        document_batch_entities.extend(sent_entities)
        # Only the sentences of this batch: extending with the whole document on every
        # batch duplicated each sentence once per batch.
        document_batch_sents.extend([sentence.split() for sentence in document_sents])
        document_batch_labels.extend(document_labels)
    doc_ann["Entities"] = document_batch_entities
    doc_ann["Sents"] = document_batch_sents
    doc_ann["Labels"] = document_batch_labels
    dataset_ann.append(doc_ann)

0 
**************************************************************************************

(0, 6)
1 
**************************************************************************************

(0, 25)
(25, 50)
(50, 75)




(75, 100)
(100, 125)
(125, 150)




(150, 175)
(175, 192)
2 
**************************************************************************************

(0, 25)
(25, 50)
(50, 75)
(75, 100)




(100, 122)
3 
**************************************************************************************

(0, 25)




(25, 50)




(50, 75)
(75, 100)




(100, 125)




(125, 150)
(150, 175)




(175, 197)
4 
**************************************************************************************

(0, 25)
(25, 50)
(50, 75)
(75, 100)
(100, 125)




(125, 150)




(150, 175)
(175, 193)
5 
**************************************************************************************

(0, 25)
(25, 50)
(50, 75)




(75, 85)
6 
**************************************************************************************

(0, 25)




(25, 50)




(50, 60)
7 
**************************************************************************************

(0, 25)
(25, 50)
(50, 75)




(75, 100)
(100, 125)




(125, 132)
8 
**************************************************************************************

(0, 25)




(25, 50)




(50, 62)
9 
**************************************************************************************

(0, 25)


In [31]:
end

2500

In [24]:
def createDir(path):
    """Create the directory ``path`` (with any missing parents) if it does not exist.

    Args:
        path: Directory path to create.

    Returns:
        ``path`` itself, whether the directory was just created or already existed.
    """
    if not os.path.exists(path):
        # os.makedirs returns None, so its result must not be used as the return value.
        # exist_ok covers the directory appearing between the check and the call.
        os.makedirs(path, exist_ok=True)
    return path

In [25]:
# Output locations for the annotations and for the error log; createDir makes
# them when they do not exist yet.
dest_dir = '../anns/microsoft_text_analytics/'
error_dir = '../anns/microsoft_text_analytics/errors'
createDir(dest_dir)
createDir(error_dir)

'../anns/microsoft_text_analytics/errors'

In [26]:
# Persist the annotations as a pickle named after the processed row range.
# The original call used json.dump with a pickle-only `protocol` argument on a
# binary handle, which can never succeed; the .pkl target calls for pickle.dump.
annotations_path = os.path.join(dest_dir, 'ms_ann_{}_{}.pkl'.format(start, end))
with open(annotations_path, 'wb') as ms:
    pickle.dump(dataset_ann, ms, protocol=pickle.HIGHEST_PROTOCOL)

# Best effort: append this run's error summary as one JSON document per line.
# default=str keeps the dump from failing on values json cannot encode natively.
try:
    with open(os.path.join(error_dir, 'errors_in_annotation.json'), 'a') as er:
        json.dump(errorsInAnnotation, er, default=str)
        er.write("\n")
except Exception as exc:
    # Still non-fatal, but no longer silent.
    logging.warning("Could not write annotation errors: %s", exc)

In [None]:
# Scratch check of what sys.exc_info() reports for a caught AssertionError.
# The dangling `if` that ended this cell was removed; it made the cell a SyntaxError.
import re
l = {}
try:
  assert 1 == 2
except Exception as e:
  print(1,e)
  exc_type, exc_obj, exc_tb = sys.exc_info()
  # File name of the frame in which the exception was raised.
  fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
  print(2, exc_type)

1 
2 <class 'AssertionError'>


In [None]:
errorsInAnnotation

{"'KeyError'": [KeyError('C0743416'),
  KeyError('C1999162'),
  KeyError('C2237216'),
  KeyError('C0471050'),
  KeyError('C0470485'),
  KeyError('C0517391'),
  KeyError('C2183459'),
  KeyError('C0470485'),
  KeyError('C0742532'),
  KeyError('C3508933'),
  KeyError('C4029497'),
  KeyError('C2041208')]}