In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, "..")

In [3]:
import medspacy

In [4]:
# Read the whole discharge summary into memory; later cells run the pipeline on this string.
# The encoding is pinned so that decoding (and therefore the character offsets
# reported further down) does not depend on the machine's locale default.
with open("./discharge_summary.txt", encoding="utf-8") as f:
    text = f.read()

In [5]:
# Names of the pipeline components requested from medspacy.load() below,
# listed one per line in the order they are passed.
enable = [
    "sentencizer",
    "tagger",
    "parser",
    "ner",
    "target_matcher",
    "context",
    "sectionizer",
    "doc_consumer",
]

In [6]:
# Load the "en_info_3700_i2b2_2012" model, passing the component list built above as `enable`.
nlp = medspacy.load("en_info_3700_i2b2_2012", enable=enable)



In [7]:
# Check which components actually ended up in the loaded pipeline.
# NOTE(review): the recorded output lists every name from `enable` except
# "doc_consumer" — confirm whether load() is expected to add it.
nlp.pipe_names

['sentencizer',
 'tagger',
 'parser',
 'ner',
 'target_matcher',
 'context',
 'sectionizer']

In [74]:
# Run the loaded pipeline over the discharge summary text.
doc = nlp(text)

# I. DocConsumer

In [9]:
from medspacy.io import DocConsumer

In [10]:
# Build a DocConsumer for `nlp` that covers all four data types used in this
# notebook: "ent", "context", "section" and "doc".
doc_consumer = DocConsumer(nlp, dtypes=("ent", "context", "section", "doc"))

In [11]:
# Show the attribute names configured for each data type (no dtype_attrs were passed above).
# NOTE(review): in the recorded output the 'section' list contains
# section_title_text / section_title_start_char / section_title_end_char twice,
# while section_data.keys() further down shows each of them once — confirm
# whether the duplication is intended.
doc_consumer.dtype_attrs

{'ent': ['text',
  'start_char',
  'end_char',
  'label_',
  'is_negated',
  'is_uncertain',
  'is_historical',
  'is_hypothetical',
  'is_family',
  'section_category',
  'section_parent'],
 'context': ['ent_text',
  'ent_label_',
  'ent_start_char',
  'ent_end_char',
  'modifier_text',
  'modifier_category',
  'modifier_direction',
  'modifier_start_char',
  'modifier_end_char',
  'modifier_scope_start_char',
  'modifier_scope_end_char'],
 'section': ['section_category',
  'section_title_text',
  'section_title_start_char',
  'section_title_end_char',
  'section_title_text',
  'section_title_start_char',
  'section_title_end_char',
  'section_text',
  'section_text_start_char',
  'section_text_end_char',
  'section_parent'],
 'doc': ['text']}

In [75]:
# Apply the consumer directly to the already-processed doc; the cells below
# read the `doc._.*_data` attributes from the result.
# NOTE(review): `doc` is rebound to the result of processing `doc`, so running
# this cell twice feeds it its own output — presumably harmless, but confirm.
doc = doc_consumer(doc)

## Ents data

In [13]:
# Entity-level data attached to the doc; per the keys shown below it is keyed by attribute name.
ent_data = doc._.ent_data

In [14]:
# The keys match the 'ent' attribute list reported by doc_consumer.dtype_attrs.
ent_data.keys()

odict_keys(['text', 'start_char', 'end_char', 'label_', 'is_negated', 'is_uncertain', 'is_historical', 'is_hypothetical', 'is_family', 'section_category', 'section_parent'])

In [15]:
# Same entity data requested with as_rows=True; the first element is displayed below.
ent_data_rows = doc._.get_data("ent", as_rows=True)

In [16]:
# First row: a tuple holding one value per entity attribute, in attribute order.
ent_data_rows[0]

('Hydrochlorothiazide',
 163,
 182,
 'TREATMENT',
 False,
 False,
 False,
 False,
 False,
 'allergies',
 None)

In [18]:
# Entity data again, this time as a DataFrame (one row per entity in the output below).
ents_df = doc._.to_dataframe("ent")

In [19]:
# Preview the first rows; the columns follow the order of the 'ent' attribute list.
ents_df.head()

Unnamed: 0,text,start_char,end_char,label_,is_negated,is_uncertain,is_historical,is_hypothetical,is_family,section_category,section_parent
0,Hydrochlorothiazide,163,182,TREATMENT,False,False,False,False,False,allergies,
1,Abdominal pain,239,253,PROBLEM,False,False,False,False,False,chief_complaint,
2,Invasive Procedure,273,291,TREATMENT,False,False,False,False,False,chief_complaint,
3,PICC line,293,302,TREATMENT,False,False,False,False,False,chief_complaint,
4,ERCP,314,318,TEST,False,False,False,False,False,chief_complaint,


In [20]:
# Keep only the entities whose is_negated flag is True.
negated_mask = ents_df["is_negated"].eq(True)
ents_df.loc[negated_mask]

Unnamed: 0,text,start_char,end_char,label_,is_negated,is_uncertain,is_historical,is_hypothetical,is_family,section_category,section_parent
9,metastasis,519,529,PROBLEM,True,False,False,False,False,history_of_present_illness,
19,alcohol or drug use,788,807,PROBLEM,True,False,False,False,False,social_history,


## Context data

In [21]:
# Modifier-level data: per the keys shown below, each record describes an
# entity together with one modifier and that modifier's scope.
context_data = doc._.context_data

In [22]:
# The keys match the 'context' attribute list reported by doc_consumer.dtype_attrs.
context_data.keys()

odict_keys(['ent_text', 'ent_label_', 'ent_start_char', 'ent_end_char', 'modifier_text', 'modifier_category', 'modifier_direction', 'modifier_start_char', 'modifier_end_char', 'modifier_scope_start_char', 'modifier_scope_end_char'])

In [23]:
# Context data as a DataFrame (one row per entity/modifier pair in the output below).
context_df = doc._.to_dataframe("context")

In [24]:
# Preview the first entity/modifier pairs.
# NOTE(review): in every recorded row modifier_scope_end_char equals
# modifier_end_char and is smaller than modifier_scope_start_char — looks like
# the scope end offset may be computed incorrectly upstream; confirm.
context_df.head()

Unnamed: 0,ent_text,ent_label_,ent_start_char,ent_end_char,modifier_text,modifier_category,modifier_direction,modifier_start_char,modifier_end_char,modifier_scope_start_char,modifier_scope_end_char
0,metastasis,PROBLEM,519,529,no evidence of,NEGATED_EXISTENCE,FORWARD,504,518,519,518
1,alcohol or drug use,PROBLEM,788,807,No,NEGATED_EXISTENCE,FORWARD,785,787,788,787
2,stroke,PROBLEM,838,844,Mother,FAMILY,FORWARD,826,832,833,832
3,aspiration respiratory distress,PROBLEM,1478,1509,h/o,HISTORICAL,FORWARD,1474,1477,1478,1477
4,fever,PROBLEM,1652,1657,if,HYPOTHETICAL,FORWARD,1613,1615,1616,1615


## Section data

In [25]:
# Section-level data: per the keys shown below, each record carries a section's
# category, title span, text span and parent.
section_data = doc._.section_data

In [26]:
# List the section attributes that were actually collected.
section_data.keys()

odict_keys(['section_category', 'section_title_text', 'section_title_start_char', 'section_title_end_char', 'section_text', 'section_text_start_char', 'section_text_end_char', 'section_parent'])

In [27]:
# Same section data requested with as_rows=True; the first element is displayed below.
section_data_rows = doc._.get_data("section", as_rows=True)

In [28]:
# First section: in the recorded output it has no category or title (both None)
# and spans characters 0-134, i.e. the header text before the "Service:" title.
section_data_rows[0]

(None,
 None,
 0,
 0,
 'Admission Date:  [**2573-5-30**]              Discharge Date:   [**2573-7-1**]\n\nDate of Birth:  [**2498-8-19**]             Sex:   F\n\n',
 0,
 134,
 None)

In [29]:
# Section data as a DataFrame (one row per section in the output below).
section_df = doc._.to_dataframe("section")

In [30]:
# Preview the first sections; in the recorded rows each section's text span
# starts at the same offset as its title.
section_df.head()

Unnamed: 0,section_category,section_title_text,section_title_start_char,section_title_end_char,section_text,section_text_start_char,section_text_end_char,section_parent
0,,,0,0,Admission Date: [**2573-5-30**] ...,0,134,
1,other,Service:,134,142,Service: SURGERY\n\n,134,152,
2,allergies,Allergies:,152,162,Allergies:\nHydrochlorothiazide\n\nAttending:[...,152,222,
3,chief_complaint,Chief Complaint:,222,238,Chief Complaint:\nAbdominal pain\n\nMajor Surg...,222,350,
4,history_of_present_illness,History of Present Illness:,350,377,History of Present Illness:\n74y female with t...,350,532,


## Doc data

In [31]:
# Document-level data; per the keys shown below it only holds the 'text' attribute.
doc_data = doc._.doc_data

In [32]:
# Confirm which document-level attributes were collected.
doc_data.keys()

odict_keys(['text'])

In [33]:
# Document-level data as a DataFrame.
doc_df = doc._.to_dataframe("doc")

In [34]:
# The recorded output is a single row holding the document text.
doc_df

Unnamed: 0,text
0,Admission Date: [**2573-5-30**] ...


## Customizing attributes

In [42]:
# Load a second pipeline with the same model and `enable` list; the customized
# consumer defined below is attached to this one rather than to `nlp`.
nlp2 = medspacy.load("en_info_3700_i2b2_2012", enable=enable)



In [43]:
# Consumer restricted to entity data, exporting only four hand-picked
# entity attributes instead of the default attribute list.
doc_consumer2 = DocConsumer(
    nlp2,
    dtypes=("ent",),
    dtype_attrs={
        "ent": ["lower_", "label_", "is_negated", "section_category"],
    },
)

In [44]:
# Register the customized consumer as a component of nlp2, so that calling
# nlp2(text) below yields a doc whose entity data can be exported directly.
# NOTE(review): re-running this cell would call add_pipe again on the same
# pipeline — presumably needs a fresh nlp2; confirm.
nlp2.add_pipe(doc_consumer2)

In [72]:
# Process the same text with the second pipeline.
doc2 = nlp2(text)

In [73]:
# Only the four attributes configured for doc_consumer2 appear as columns.
doc2._.to_dataframe("ent")

Unnamed: 0,lower_,label_,is_negated,section_category
0,hydrochlorothiazide,TREATMENT,False,allergies
1,abdominal pain,PROBLEM,False,chief_complaint
2,invasive procedure,TREATMENT,False,chief_complaint
3,picc line,TREATMENT,False,chief_complaint
4,ercp,TEST,False,chief_complaint
5,sphincterotomy,TREATMENT,False,chief_complaint
6,a recent stroke,PROBLEM,False,history_of_present_illness
7,abdominal pain,PROBLEM,False,history_of_present_illness
8,imaging,TEST,False,history_of_present_illness
9,metastasis,PROBLEM,True,history_of_present_illness


# II. Writer and Reader

In [57]:
from medspacy.io.db import DbWriter, DbReader, DbConnect

In [51]:
import tempfile, os
# Despite its name, `tmpdirname` is the TemporaryDirectory object itself, not a
# path string; its `.name` attribute is the directory path. Keeping the object
# referenced keeps the directory around while the notebook uses the database.
tmpdirname = tempfile.TemporaryDirectory()
# Path of the SQLite database file used by the writer/reader demo below.
db = os.path.join(tmpdirname.name, "test.db")



## DbConnect

In [52]:
import sqlite3

In [53]:
# Open a SQLite connection to the database path built above
# (sqlite3 creates the file if it does not exist yet).
sq3_conn = sqlite3.connect(db)

In [59]:
# Wrap the existing sqlite3 connection in a DbConnect; this wrapper is what
# DbWriter and DbReader receive as their first argument below.
conn = DbConnect(conn=sq3_conn)

Opened connection to None.None


## DbWriter

In [67]:
# TODO: Provide this as a default
# SQL column types for the "ents" table, positionally matched to
# doc_consumer.dtype_attrs["ent"] (the pairing is printed a few cells below).
col_types = (
    ["varchar(1000)"]         # text
    + ["int"] * 2             # start_char, end_char
    + ["varchar(100)"]        # label_
    + ["int"] * 5             # is_negated, is_uncertain, is_historical, is_hypothetical, is_family
    + ["varchar(100)"] * 2    # section_category, section_parent
)

In [77]:
# Entity attribute names that become the table's column names; col_types must follow this order.
doc_consumer.dtype_attrs["ent"]

['text',
 'start_char',
 'end_char',
 'label_',
 'is_negated',
 'is_uncertain',
 'is_historical',
 'is_hypothetical',
 'is_family',
 'section_category',
 'section_parent']

In [68]:
# Sanity check: print each entity attribute next to the SQL type it is paired with.
for attr_name, sql_type in zip(doc_consumer.dtype_attrs["ent"], col_types):
    print(attr_name, sql_type)

text varchar(1000)
start_char int
end_char int
label_ varchar(100)
is_negated int
is_uncertain int
is_historical int
is_hypothetical int
is_family int
section_category varchar(100)
section_parent varchar(100)


In [69]:
# Writer bound to the "ents" table: column names come from the entity attribute
# list, column types from col_types, and the table is created on construction
# (see the CREATE TABLE statement logged below).
writer = DbWriter(
    conn,
    "ents",
    cols=doc_consumer.dtype_attrs["ent"],
    col_types=col_types,
    create_table=True,
)

Created table ents with query: CREATE TABLE ents (text varchar(1000), start_char int, end_char int, label_ varchar(100), is_negated int, is_uncertain int, is_historical int, is_hypothetical int, is_family int, section_category varchar(100), section_parent varchar(100))


In [76]:
# Write the entity DataFrame through the writer; the logged statement is an
# INSERT into ents with one placeholder per column.
writer.write(doc._.to_dataframe("ent"))

Wrote 38 rows with query: INSERT INTO ents (text, start_char, end_char, label_, is_negated, is_uncertain, is_historical, is_hypothetical, is_family, section_category, section_parent) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)


## DbReader
DbReader is probably meant for reading source documents rather than stored results; reading the `ents` table back here simply demonstrates that the write above worked.

In [99]:
# Query that reads every column of every row from the ents table.
read_query = """
SELECT *
FROM ents
"""

In [100]:
# Bind the query to the same DbConnect wrapper that the writer uses.
reader = DbReader(conn, read_query)

In [101]:
# Execute the query; judging by the row displayed below, each result is a plain tuple.
rslts = reader.read()

Read 38 rows with query: 
SELECT *
FROM ents



In [102]:
# First row read back: same values as the first entity row shown earlier, except
# that the boolean flags come back as 0 (their columns were declared as int).
rslts[0]

('Hydrochlorothiazide',
 163,
 182,
 'TREATMENT',
 0,
 0,
 0,
 0,
 0,
 'allergies',
 None)

## Pipeline
**TODO:** This section still needs to be refactored.