# Exploring Text Data with a Consumer Lens

### We need to install some software

In [None]:
# Silence all warnings up front. These filters run before the heavy imports
# below, so warnings raised at import time are presumably hidden as well.
import warnings
warnings.simplefilter("ignore")
warnings.filterwarnings('ignore')

import ibis
import ipywidgets as ipw
import spacy
from spacy.pipeline import EntityRuler
from medspacy.context import ConTextComponent, ConTextItem
from medspacy.visualization import visualize_ent
from spacy_readability import Readability
from IPython.display import display, HTML
# Load the spaCy pipeline named "en_info_3700_i2b2_2012" once; the cells
# below all share this `nlp` object.
nlp = spacy.load("en_info_3700_i2b2_2012")
# getpass is used in the connection cell to prompt for the database password.
import getpass


In [None]:
# Address of the MySQL server that hosts the `mimic2` database.
MIMICHOST = "35.233.174.193"

# Credentials are prompted for at run time (username first, then the
# password via getpass) rather than being written into the notebook.
con = ibis.mysql.connect(
    host=MIMICHOST,
    database="mimic2",
    user=input("Enter username"),
    password=getpass.getpass("Enter password"),
)

In [None]:
# Turn on ibis interactive mode. Per the ibis documentation, expressions are
# then executed when displayed instead of only showing the expression tree.
ibis.options.interactive = True


### Types of Notes available

- 'DISCHARGE_SUMMARY'
- 'MD Notes'
- 'Nursing/Other'
- 'RADIOLOGY_REPORT'

### We are only going to look at the `noteevents` table and three of its columns: `subject_id`, `text`, and `category`

In [None]:
# Project the `noteevents` table down to the three columns used below.
# NOTE: view_results() reads the result columns by position, so keep this order.
table = con.table('noteevents')['subject_id', 'text', 'category']

### Think of some terms to search for sensitive information

In [None]:
# `filters` is the list of conditions handed to `table.filter(...)` below.
# Keep exactly ONE assignment active: a later assignment silently replaces an
# earlier one, so the alternatives are kept as commented-out examples.
#filters = [table.category == 'Nursing/Other', table.text.re_search('sexual|illicit drug|abuse')]
#filters = [table.subject_id == 7524]
filters = [table.category == 'Nursing/Other', table.text.re_search('sexual')]


In [None]:
# Apply the conditions in `filters` to the projected table; the resulting
# expression is run when `.execute()` is called on it below.
tmp = table.filter(filters)

In [None]:
def view_results(tbl):
    """Browse the rows of *tbl* one at a time with a slider widget.

    Args:
        tbl: An executed result table; it must provide ``shape`` and ``iloc``
            (a pandas DataFrame is assumed). Columns are read by position:
            the first and last columns are shown on the header line and the
            second column is printed beneath it.

    Returns:
        None. Output is produced by the interactive widget (or by a short
        message when there are no rows).
    """
    # An empty result would give the slider max=-1 (below its minimum of 0)
    # and leave no row to index, so report it instead of failing.
    if tbl.shape[0] == 0:
        print("No matching notes to display.")
        return

    @ipw.interact(x=ipw.fixed(tbl), n=ipw.IntSlider(max=tbl.shape[0] - 1))
    def _view(x, n):
        # Header: first column and last column of the selected row.
        print("%s: %s" % (x.iloc[n, 0], x.iloc[n, -1]))
        # Body: second column of the selected row.
        print(x.iloc[n, 1])

In [None]:
# Run the filtered query and browse the returned notes.
view_results(tmp.execute())

## Document Complexity

In this section you can explore the complexity of the documents. We are using the spaCy NLP package. If the documents get too large or too numerous, the kernel will die, so be cautious.

In [None]:
# Entity ruler bound to `nlp`. Phrase patterns are matched on the LOWER token
# attribute, and overwrite_ents=False leaves existing entities in place.
# NOTE(review): no patterns are added to the ruler in the cells shown here —
# confirm whether that is intended.
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", overwrite_ents=False)

In [None]:
%%capture
# Build the medspaCy ConText component with its "default" rule set. The
# %%capture magic at the top of this cell swallows the cell's output.
context = ConTextComponent(nlp, rules="default")

In [None]:
# Append the extra components to the pipeline in this order:
# entity ruler, then ConText, then the readability scorer.
for component in (ruler, context, Readability()):
    nlp.add_pipe(component)


#### [Spacy-Readability](https://github.com/mholtzscher/spacy_readability)

The Readability pipeline computes various readability scores. I'm arbitrarily pulling out the SMOG score and sorting by it. You can change this to one of the other scores computed.

We are also using spaCy to mark up the clinical concepts that it identifies. The number of clinical concepts may also relate to the difficulty of understanding the document.

In [None]:
# Projection of `noteevents` for this section, with the same three columns
# as before. view_doc_results() reads the first two by position.
t = con.table('noteevents')['subject_id', 'text', 'category']

### Add/Modify filters as desired

In [None]:
# `filters2` is the list of conditions handed to `t.filter(...)` below.
# Keep exactly ONE assignment active: a later assignment silently replaces an
# earlier one, so the alternative is kept as a commented-out example.
#filters2 = [t.category == 'DISCHARGE_SUMMARY']
filters2 = [t.category == 'Nursing/Other', t.text.re_search("SOCIAL WORK")]


In [None]:
# Fetch a bounded batch of matching notes (limit 100, offset 200) so the
# NLP step below stays small enough for the kernel.
data2 = (
    t.filter(filters2)
    .limit(100, offset=200)
    .execute()
)

In [None]:
# Run every note text through the spaCy pipeline and keep the parsed
# documents alongside their rows.
data2["docs"] = list(nlp.pipe(data2["text"]))

In [None]:
# Read the SMOG readability score off each parsed document's `._.smog`
# extension attribute.
data2["smog"] = data2["docs"].apply(lambda doc: doc._.smog)

In [None]:
def view_doc_results(tbl):
    """Browse scored documents one at a time, as plain text or entity markup.

    Args:
        tbl: A table providing ``shape`` and ``iloc`` (a pandas DataFrame is
            assumed). Columns are read by position: the first column is shown
            as the subject, the last column as the "smog score", the
            second-to-last column is passed to ``visualize_ent`` in MARKUP
            mode, and the second column is printed in PLAIN mode.

    Returns:
        None. Output is produced by the interactive widget (or by a short
        message when there are no rows).
    """
    # An empty table would give the slider max=-1 (below its minimum of 0)
    # and leave no row to index, so report it instead of failing.
    if tbl.shape[0] == 0:
        print("No documents to display.")
        return

    @ipw.interact(
        x=ipw.fixed(tbl),
        n=ipw.IntSlider(max=tbl.shape[0] - 1),
        mode=['PLAIN', 'MARKUP'],
    )
    def _view(x, n, mode):
        # Header: first column (subject) and last column (score) of the row.
        display(HTML("<h3>subject: %s-- smog score: %s</h3>" % (x.iloc[n, 0], x.iloc[n, -1])))
        if mode == 'MARKUP':
            # Second-to-last column holds the object rendered with entity markup.
            visualize_ent(x.iloc[n, -2])
        else:
            # Second column holds the raw text.
            print(x.iloc[n, 1])


In [None]:
# Browse the documents with the highest SMOG score first.
view_doc_results(data2.sort_values(by="smog", ascending=False))