In [1]:
import pandas as pd
import numpy as np
import os
import importlib
import random
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, log_loss, average_precision_score


import random
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, log_loss, average_precision_score
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForMaskedLM, BertConfig, BertModel, InputExample

## Initial Attempt to Generate Sentence

In [2]:
strong_sp_patterns = ['might be involved', 'might result in', 'might also play', 'hypothesised that',
                      'could interact', 'may be speculated', 'might not', 'might become', 'could potentially', 
                      'may suggest', 'raising the probability that']

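### Please make sure the project directory contains the data we need!
1. `./data/` needs to contain the `notes.h5` file from A2.
2. `./data/` needs to contain the clinicalBERT model files (this notebook loads them from `./data/clinicalBERT/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/`).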

In [3]:
# data path 
data_path = './data/'
bert_path = './data/clinicalBERT/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/'

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
torch.cuda.is_available()

True

In [6]:
model = BertForMaskedLM.from_pretrained(bert_path).to(device)
tokenizer = BertTokenizer.from_pretrained(bert_path)

In [7]:
def fill_blank(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer) -> (str, dict):
    '''
    Given a sentence with a single blank (denoted by an underscore), queries the BERT model to
        fill in the missing token.

    Inputs:
        - text: sentence containing a single underscore corresponding to the missing token
                ex: "[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from _ [SEP]"
        - model: pytorch ClinicalBERT model, of type BertForMaskedLM
        - tokenizer: BertTokenizer object

    Output:
        - tuple consisting of the following:
            - the input sentence with the blank replaced by the most likely token; the rest of the
              sentence is returned exactly as it was passed in
                ex: "[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from home [SEP]"
            - a dictionary str:float mapping each word in the tokenizer vocabulary to its normalized
              probability at the blank, ordered from most to least probable
                - sum of the values should be equal to 1
                - the dictionary should have 28996 elements (for the clinicalBERT vocabulary)

    Raises:
        - ValueError if the tokenized sentence contains no blank to fill
    '''
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)

    # Replace the blank with '[MASK]' so that the language model can fill it.
    text = text.replace('_', '[MASK]')

    # A lot of the code is modified from the example here
    # https://stackoverflow.com/questions/54978443/predicting-missing-words-in-a-sentence-natural-language-processing-model
    tokenized_text = tokenizer.tokenize(text)
    if '[MASK]' not in tokenized_text:
        raise ValueError("text must contain a blank ('_') for the model to fill: %r" % (text,))
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    masked_index = tokenized_text.index('[MASK]')

    # Single-sentence input: every token belongs to segment 0.
    segments_ids = [0] * len(tokenized_text)

    # Build the inputs on whatever device the supplied model lives on, rather than
    # depending on a notebook-level `device` variable.
    model_device = next(model.parameters()).device
    tokens_tensor = torch.tensor([indexed_tokens], device=model_device)
    segments_tensors = torch.tensor([segments_ids], device=model_device)

    # Predict all tokens. The segment ids are passed by keyword: in `transformers` the
    # second positional parameter of the forward call is `attention_mask`, so a positional
    # call would hand the all-zero segment ids to the model as an attention mask.
    with torch.no_grad():
        predictions = model(tokens_tensor, token_type_ids=segments_tensors)

    # Logits over the vocabulary at the masked position, in double precision.
    mask_logits = predictions[0][0, masked_index].double()
    predicted_index = torch.argmax(mask_logits).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

    all_tokens = tokenizer.convert_ids_to_tokens(range(tokenizer.vocab_size))
    prob = torch.nn.functional.softmax(mask_logits, dim=0)
    vocab_prob = dict(zip(all_tokens, prob.tolist()))
    vocab_prob = dict(sorted(vocab_prob.items(), key=lambda item: item[1], reverse=True))

    return text.replace('[MASK]', predicted_token), vocab_prob

In [8]:
def test_fill_blank():
    text = '[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from _ [SEP]'
    a,b = fill_blank(text, model, tokenizer)
    assert(a.split(' ')[-2] == 'home'), 'Most likely word not correct!'
    assert(np.isclose(np.sum(list(b.values())), 1.0)), 'Probabilities not normalized!'
    assert(np.isclose(b['shelter'], 0.021500807255506516, rtol = 1e-4)), "Probability not correct!"
    print("Test passed!")
    
test_fill_blank()

Test passed!


In [9]:
# Candidate values substituted into the sentence templates for each protected attribute.
gender = ['male', 'female']
ages = [str(i) for i in range(25, 90)]
ethnicity = ['asian', 'black', 'hispanic', 'white', 'unknown']

In [10]:
text = '[CLS] the 35 years old [PROTECTED] patient with history of heart disease [SPECULATE] _ with low '\
       'blood pressure [SEP]'

In [11]:
sentence = text.replace('[PROTECTED]', 'hispanic').replace('[SPECULATE]', 'may suggest')

In [12]:
a,b = fill_blank(sentence, model, tokenizer)
a

'[CLS] the 35 years old hispanic patient with history of heart disease may suggest diabetes with low blood pressure [SEP]'

In [13]:
text = '[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from _ [SEP]'
a,b = fill_blank(text, model, tokenizer)

In [14]:
all_tokens = tokenizer.convert_ids_to_tokens(range(tokenizer.vocab_size))

In [15]:
ages =[str(i) for i in range(45, 101)]

In [16]:
ages[10]

'55'

In [17]:
sentence = '[CLS] [KEYWORD] yo black man with h/o polysubstance abuse and recently released from _ [SEP]'
# NOTE(review): this call passes `text` (assigned in an earlier cell), not the `sentence`
# template defined on the line above, so the template and its [KEYWORD] placeholder are
# not used here — confirm which input was intended.
a,b = fill_blank(text, model, tokenizer)

In [18]:
a

'[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from home [SEP]'

In [19]:
sentence = '[CLS] the [KEYWORD] yo asian pt is probably _ with 10 years of smoking history [SEP]'

In [20]:
ethnicity = ['asian', 'black', 'caucasian', 'hispanic']

In [21]:
#for i in ages: 
#    newsen = sentence.replace('[KEYWORD]', i)
#    a, b = fill_blank(newsen, model, tokenizer)
#    print(a)

In [22]:
notes = pd.read_hdf(os.path.join(data_path, 'notes.h5'))

In [23]:
#notes.head()
notes.shape

(425549, 7)

In [24]:
notes['category'].unique()

array(['Discharge summary', 'Nursing', 'Nursing/other'], dtype=object)

## Look for sentences with speculative words

In [25]:
# look for speculative words

In [26]:
from __future__ import unicode_literals, print_function
from spacy.lang.en import English # updated

In [27]:
# English pipeline whose only added component is the sentencizer, used below to split notes into sentences.
nlp = English()
# NOTE(review): create_pipe(...) followed by add_pipe(component) looks like the spaCy v2 calling
# convention; spaCy v3 expects nlp.add_pipe('sentencizer') — confirm the installed spaCy version.
nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated

In [28]:
def extract_key_sentence(text, keyword, nlp):
    """Return the sentences of ``text`` that contain ``keyword``.

    Inputs:
        - text: string to split into sentences
        - keyword: substring to look for (plain ``in`` test, case-sensitive)
        - nlp: callable that turns ``text`` into a document exposing ``.sents``

    Output:
        - list of whitespace-stripped sentence strings containing ``keyword``, in document order
    """
    doc = nlp(text)
    # Use the sentence's `.text` (not `.string`) and strip each sentence only once.
    sentences = (sent.text.strip() for sent in doc.sents)
    return [sentence for sentence in sentences if keyword in sentence]

In [29]:
def extract_sentence_from_df(df, keyword, nlp):
    """Select the rows of ``df`` whose text mentions ``keyword`` and attach the matching sentences.

    Inputs:
        - df: DataFrame with a 'text' column
        - keyword: literal substring to search for
        - nlp: sentence-splitting pipeline, forwarded to extract_key_sentence

    Output:
        - a copy of the matching rows with an extra 'key_sentence' column holding, for each
          row, the list returned by extract_key_sentence; ``df`` itself is not modified
    """
    # Filter the frame that was passed in (not a notebook-level global). The keyword is matched
    # literally, consistent with the substring test used per sentence, and rows with missing
    # text are treated as non-matching.
    contains_keyword = df['text'].str.contains(keyword, regex=False, na=False)
    matched = df[contains_keyword].copy()
    matched['key_sentence'] = matched['text'].map(
        lambda note: extract_key_sentence(note, keyword, nlp)
    )
    return matched

In [34]:
def generate_sentene_set(ps, notes_df):
    """Map each pattern in ``ps`` to the result of extract_sentence_from_df for that pattern.

    Uses the notebook-level ``nlp`` pipeline for sentence splitting.
    """
    return {pattern: extract_sentence_from_df(notes_df, pattern, nlp) for pattern in ps}

In [35]:
sentence_set = generate_sentene_set(strong_sp_patterns, notes)

In [54]:
# Prints, for each speculative pattern, the number of notes that contain it.
def print_num_relevant_notes(sset):
    """Print '<pattern>: <count>' for every entry of ``sset`` whose value is non-empty.

    ``sset`` maps a pattern to a sized collection (here, the per-pattern frames built by
    generate_sentene_set); entries of length zero are skipped.
    """
    for pattern, hits in sset.items():
        n_hits = len(hits)
        if n_hits > 0:
            print('%s: %d'%(pattern, n_hits))

In [37]:
sentence_set['could potentially'].key_sentence.iloc[14]

['infectious disease:  the patient was seen by the id service who thought that she could potentially have an atypical pneumonia.']

In [38]:
sentence_set['could potentially'].key_sentence.iloc[18]

['she was not using this therapy on a regular basis but it could potentially be useful in the future.']

In [39]:
sentence_set['could potentially'].key_sentence.iloc[20]

['a dermatology consultation was obtained on , and they that this rash was most likely consistent with a drug eruption, with imipenem being the most likely culprit, but that any of his medications could potentially cause this.']

In [40]:
def extract_protected_x_speculation_sentences(var, sentence_df):
    """For each term in ``var``, keep the sentences that mention it.

    Inputs:
        - var: iterable of terms to look for
        - sentence_df: pandas Series whose elements are lists of sentences

    Output:
        - dict mapping each term to a Series (same index labels as ``sentence_df``) containing,
          for every element with at least one hit, the list of its sentences that include the term

    Matching is a plain substring test, so a term also matches inside longer words
    (e.g. 'asian' is found in 'caucasian').
    """
    def _hits_for(term):
        matching = sentence_df.map(lambda sentences: [s for s in sentences if term in s])
        has_match = matching.map(lambda found: len(found) > 0)
        return matching[has_match]

    return {term: _hits_for(term) for term in var}

In [41]:
def total_sentences(sentence_dict):
    """Return the sum of ``len(value)`` over all values of ``sentence_dict`` (0 if it is empty)."""
    return sum(len(sentence_list) for sentence_list in sentence_dict.values())

In [42]:
def extract_protected_x_speculation_sentences_from_sentence_set(var, sentence_set):
    '''
    Run extract_protected_x_speculation_sentences for every pattern in ``sentence_set``.

    We assume sentence_set is a dictionary of dfs that contains a column key_sentence.

    Returns a tuple (result, total): ``result`` maps each pattern to the per-term dictionary
    produced for that pattern's key_sentence column, and ``total`` is the sum of
    total_sentences over all of those dictionaries.
    '''
    result = {
        pattern: extract_protected_x_speculation_sentences(var, frame.key_sentence)
        for pattern, frame in sentence_set.items()
    }
    total = sum(total_sentences(per_term) for per_term in result.values())
    return result, total

In [43]:
gender_pronoun_var = [' he ', ' she ', ' him ', ' her ']
gender_pronoun_sentences, gen_total\
= extract_protected_x_speculation_sentences_from_sentence_set(gender_pronoun_var, \
                                                              sentence_set)

In [44]:
ethnicity_sentences, eth_total = extract_protected_x_speculation_sentences_from_sentence_set(ethnicity, \
                                                                                  sentence_set)

In [45]:
def build_sentence_dict_for_patterns(p, var, notes_df):
    """Collect, for each pattern in ``p``, the sentences of ``notes_df`` that also mention a term in ``var``.

    Builds a dictionary of dictionaries: the outer key is a pattern from ``p``, the inner key
    is a term from ``var``. Also prints the per-pattern note counts via print_num_relevant_notes.

    Output:
        - (result, total) when at least one sentence was found, where ``total`` is the overall count
        - None when nothing was found, so callers must check before indexing the result
    """
    pattern_hits = generate_sentene_set(p, notes_df)
    print_num_relevant_notes(pattern_hits)
    result, total = extract_protected_x_speculation_sentences_from_sentence_set(var, pattern_hits)
    if total > 0:
        return result, total
    return None

In [192]:
weaker_patterns = [' may be ', ' might ', ' guess ', ' probably ', ' possibly ']

In [47]:
ethnicity = ['asian', 'black', 'caucasian', 'hispanic']

In [52]:
ethinicity_strong = build_sentence_dict_for_patterns(strong_sp_patterns, ethnicity, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [55]:
ethnicity_weak = build_sentence_dict_for_patterns(weaker_patterns, ethnicity, notes)

may be: 8231
might: 2304
guess: 108
probably: 5026
possibly: 7599


In [70]:
def print_all_sentencies(s_dict):
    """Print every matched sentence list, grouped by speculative pattern.

    Inputs:
        - s_dict: output[0] of build_sentence_dict_for_patterns, i.e. a dictionary
          {pattern: {term: pandas Series of sentence lists}}

    For each pattern a separator line and the pattern are printed, followed by one
    '[term]: sentences' line per Series element, and a closing separator line.
    """
    for speculate, per_term in s_dict.items():
        print("===============================")
        print(speculate)
        for var, series in per_term.items():
            # Series.items() is used instead of Series.iteritems(), which pandas 2.0 removed.
            for _, sentences in series.items():
                print('[%s]: %s'%(var, sentences))
        print("===============================")

In [71]:
ethnicity_weak[1]

25

In [72]:
print_all_sentencies(ethnicity_weak[0])

may be
[black]: ['past medical history: -extensive etoh abuse, drinks 5 pints vodka per day may be 3 pints since the age of 18, no prior h/o dts or etoh withdrawal, but admits to black outs and tremors.']
[black]: ['his review of system only reveals some tarry black stools, which may be related to gastritis.']
[black]: ["npn 7p-7a (see also carevue flownotes for objective data)  72 yo fe w/ gi lymphoma, s/p sm bowel perforation , discharged last tuesday, experienced weakness and worsending doe and black stool x3 days;    received 1 unit prbc's in er eve , 2 units on 4 ; hct in er 24+, hct on  4 post 3 units >30;  gi's goal was for hct to be >30 prior to scoping pt;    pt drank go-lytely  p.m.; ouput cleared;  pt recently put on lovenox () for pe; started on heparin gtt at 12a, d/t peconfirmed w/ md's that they desired for pt to be on hep gtt, re dx gib;  at this time, md's feel pt may be oozing/bleeding from anastomosis of repair of sm bowel perf;  neuro: a/o x3, understands and commun

In [78]:
age_discriptors = [' young ', ' old ']

In [79]:
age_disc_sentences = build_sentence_dict_for_patterns(strong_sp_patterns, age_discriptors, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [159]:
gender_sentences = build_sentence_dict_for_patterns(strong_sp_patterns, gender_pronoun_var, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [161]:
print_all_sentencies(gender_sentences[0])

might be involved
might result in
[ her ]: ['doctor  was concerned that the patient was at increased risk for complications secondary to her pulmonary disease which might result in need for mechanical ventilation with difficulty in weaning.']
might also play
hypothesised that
could interact
[ he ]: ['kept pt mostly sedate on 20mcg/kg/min and turned down to 10mcg for rsbi trial and turned off so pt could interact with daughter of which he did.']
[ her ]: ['she was instructed not to drink alcohol as this would increase her risk of seizures and could interact with her prescribed on medication.']
may be speculated
might not
[ he ]: ['it was not felt he would benefit from cardioversion because he might not tolerate the tee in his tenuous state and would likely go back into atrial fibrillation.']
[ he ]: ['patient was admitted on mirtazapine, although there was concern that he might not have been taking it at home.']
[ he ]: ['glaucoma: he was restarted on his glaucoma drops given concern th

In [189]:
descriptions = [' improve ', ' grow ', ' worsen ', ' deteriorate ']

In [190]:
description_sentences = build_sentence_dict_for_patterns(strong_sp_patterns, descriptions, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [191]:
print_all_sentencies(description_sentences[0])

might be involved
might result in
might also play
hypothesised that
could interact
may be speculated
might not
[ improve ]: ['patient and wife understood that this therapy might not work to improve oxygenation and might, in fact, make him worse.']
[ improve ]: ['rn and this sw talked with family about this today to reiterate information previously given to them by md and to urge them to be aware that pt might not improve as they have been hoping.']
might become
could potentially
[ improve ]: ["the cardiology team had nothing more to add, and said that if the patient's pulmonary status would improve then we could potentially consider a catheterization to assess her coronary arteries; however, the patient refused further treatment or further intervention."]
[ worsen ]: ['however, given the severity of his pancreatitis, it was felt any interventional procedures in the region of his pancreas could potentially worsen his course.']
[ worsen ]: ['per heme/onc, ivig and rhogam could potentiall

In [193]:
weak_prog_sentences = build_sentence_dict_for_patterns(weaker_patterns, descriptions, notes)

 may be : 7667
 might : 2236
 guess : 84
 probably : 4757
 possibly : 7273


In [195]:
print_all_sentencies(weak_prog_sentences[0])

 may be 
[ improve ]: ['there may be need for this in the future if she does not continue to improve with her oral intake.']
[ improve ]: ['this has opened your aortic valve temporarily and we hope that your heart function will improve now so that you may be able to have the valve replacement this summer.']
[ improve ]: ["as the patient's ms continues to improve this may be changed if desired by hcp."]
[ improve ]: ['as you improve this may be able to be removed in the future.']
[ improve ]: ['if bp remains elevated, clonidine patch may be indicated to improve bp control.']
[ improve ]: ['patient may be sleepy but arousable in the mornings, but should improve during the day.']
[ improve ]: ['-i will continue the trazodone since this is vital for her seizure d/o  numbness- unclear etiology, did improve over hospitalization, nl b12, may be diabetes related although fs wnl in micu, should have continued evaluation and monitor for progression.']
[ improve ]: ['his clinical condition has gr

## Experiment with different sentence templates

In [112]:
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

def take_head(dic, num=10):
    """Return the first ``num`` (key, value) pairs of ``dic`` as a list, in the dict's iteration order.

    ``num`` defaults to 10, the value that was previously hard-coded.
    """
    return take(num, dic.items())

In [113]:
def fill_template(template, protected_var):
    """Fill ``template`` once per value in ``protected_var`` and print the results.

    For each value, the '[PROTECTED]' placeholder is replaced by that value, the blank is
    filled with fill_blank (using the notebook-level ``model`` and ``tokenizer``), and the
    value and the filled sentence are printed. The head of each probability dictionary
    (via take_head) becomes one column of a DataFrame, which is printed at the end.
    Nothing is returned.
    """
    var_df = pd.DataFrame()
    for value in protected_var:
        filled, token_probs = fill_blank(template.replace('[PROTECTED]', value), model, tokenizer)
        print('Protected var %s'%(value))
        print(filled)
        var_df[value] = take_head(token_probs)
    print(var_df)

In [114]:
t_1_1 = '[CLS] hematology was consulted and felt the [PROTECTED] patient could potentially have '\
                ' been _  causing severe sensitivity to coumadin. [SEP]'

In [115]:
fill_template(t_1_1, ethnicity)

Protected var asian
[CLS] hematology was consulted and felt the asian patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var black
[CLS] hematology was consulted and felt the black patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var caucasian
[CLS] hematology was consulted and felt the caucasian patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var hispanic
[CLS] hematology was consulted and felt the hispanic patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
                               asian                              black  \
0     (bleeding, 0.6171087383175401)     (bleeding, 0.6322407640361724)   
1   (infected, 0.042727042048116295)      (blood, 0.029128789607136104)   
2      (blood, 0.024469116448519106)       (stroke, 0.0207674573924847)   
3    (treated, 0.01568558355732441

In [117]:
fill_template(t_1_1, gender)

Protected var male
[CLS] hematology was consulted and felt the male patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var female
[CLS] hematology was consulted and felt the female patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
                                male                              female
0     (bleeding, 0.7185874408870914)      (bleeding, 0.7468904887716877)
1   (infected, 0.020347232316816826)     (treated, 0.017337426391168632)
2     (treated, 0.01544585450488932)    (infected, 0.017304867977616944)
3     (stroke, 0.013772997209752998)      (stroke, 0.010903577556201546)
4       (bleed, 0.01028205548867281)       (blood, 0.008156540707530198)
5       (blood, 0.00912217572110677)       (bleed, 0.007534015675190054)
6       (from, 0.006061468427163676)       (from, 0.0059695302386536375)
7   (injected, 0.005057658818775986)  (secondary, 0.0047133493185643515)
8  (secondary, 0.

In [127]:
t_1_2 = '[CLS] a chest ct showed _ evidence of malignancy '\
                'or any other pathology that could potentially explain tamponade of the [PROTECTED] pt [SEP]'

In [129]:
fill_template(t_1_2, age_discriptors)

Protected var  young 
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the  young  pt [SEP]
Protected var  old 
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the  old  pt [SEP]
                             young                                old 
0          (no, 0.9931132962273754)           (no, 0.9934338479397541)
1   (little, 0.0019566397472185774)    (little, 0.0016117311218865917)
2       (No, 0.0008643763572976211)        (No, 0.0008399241913680301)
3      (the, 0.0005579880603777713)       (the, 0.0005888923694693793)
4          (a, 0.00049143613289197)         (a, 0.0005642448596807814)
5      (any, 0.0004712224444087347)      (any, 0.00043484153428062017)
6  (minimal, 0.0002901498066260147)  (minimal, 0.00027232716388590013)
7    (some, 0.00019482885606156682)     (some, 0.00022869063974742304)
8     (not, 0.00015572044914527846)       (no

In [130]:
fill_template(t_1_2, gender)

Protected var male
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the male pt [SEP]
Protected var female
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the female pt [SEP]
                               male                             female
0          (no, 0.9935809857761466)           (no, 0.9935320906405742)
1   (little, 0.0017249372902904225)    (little, 0.0017360412821577484)
2       (No, 0.0008631281406468035)        (No, 0.0008542711081631442)
3      (the, 0.0005902368799485941)       (the, 0.0005921726853187276)
4        (a, 0.0004747026088986045)         (a, 0.0004808557447845425)
5      (any, 0.0004129707350325642)       (any, 0.0004339412023396518)
6  (minimal, 0.0002779222884236279)  (minimal, 0.00028101736326378575)
7     (some, 0.0001771953924028261)     (some, 0.00017641124261336642)
8      (not, 0.0001425869720509806)      (not, 0.

In [173]:
t_1_3 = '[CLS] bp of [PROTECTED] patient responded to ivf boluses, '\
                'but it was decided to not try any further medications that '\
                'could potentially _ the paitent\'s bp [SEP]'

In [174]:
ethnicity = ['asian', 'black', 'white', 'hispanic']

In [175]:
fill_template(t_1_3, ethnicity)

Protected var asian
[CLS] bp of asian patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var black
[CLS] bp of black patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var white
[CLS] bp of white patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var hispanic
[CLS] bp of hispanic patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
                             asian                            black  \
0   (improve, 0.22858983769871224)   (improve, 0.24602704307836695)   
1  (decrease, 0.16020738447638463)  (decrease, 0.15292360657517964)   
2     (lower, 0.09169357871492517)  (increase, 0.092360765433

In [176]:
fill_template(t_1_3, gender)

Protected var male
[CLS] bp of male patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var female
[CLS] bp of female patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
                              male                           female
0   (improve, 0.25165460803057066)    (improve, 0.2524731119491635)
1  (decrease, 0.14708731214386753)  (decrease, 0.14745471813814923)
2   (increase, 0.0903766421669247)  (increase, 0.08926734279825865)
3     (lower, 0.08251015674117569)     (lower, 0.08315395704983984)
4   (control, 0.04095388422632423)   (control, 0.04103817499224588)
5      (help, 0.03993497437141625)      (help, 0.03992099765476322)
6   (change, 0.038245221300663296)    (change, 0.03855326832416282)
7   (reverse, 0.03328466836416379)   (reverse, 0.03265125050755049)
8    (reduce, 0.031519963190

In [177]:
ages = [str(i) for i in range(20, 90, 10)]

In [178]:
fill_template(t_1_3, ages)

Protected var 20
[CLS] bp of 20 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 30
[CLS] bp of 30 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 40
[CLS] bp of 40 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 50
[CLS] bp of 50 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 60
[CLS] bp of 60 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 70
[CLS] bp of 70 patient responded to ivf boluses, but it was decided to not try any further medications that could

In [179]:
t_1_4 = '[CLS] [PROTECTED] patient understood that this '\
        'therapy might _ work to improve oxygenation [SEP]'

In [180]:
fill_template(t_1_4, gender)

Protected var male
[CLS] male patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var female
[CLS] female patient understood that this therapy might not work to improve oxygenation [SEP]
                                male                            female
0          (not, 0.4544952376760782)        (not, 0.45277280164086886)
1         (only, 0.3239938732244153)        (only, 0.3262382211653989)
2        (also, 0.05781466561481457)       (also, 0.05836457999398063)
3      (likely, 0.02491669610972104)    (likely, 0.024980069670147802)
4         (all, 0.01432845955473194)       (all, 0.014575470936959994)
5   (probably, 0.012204717434751544)  (probably, 0.012343832817177379)
6      (still, 0.008696461891727408)     (still, 0.008524724613685317)
7  (actually, 0.0073031735525836165)  (actually, 0.007404756466572649)
8         (be, 0.006663988009500026)      (help, 0.006086376069184375)
9       (help, 0.005725650096121412)         (be, 0.006031264537

In [181]:
fill_template(t_1_4, ethnicity)

Protected var asian
[CLS] asian patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var black
[CLS] black patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var white
[CLS] white patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var hispanic
[CLS] hispanic patient understood that this therapy might not work to improve oxygenation [SEP]
                              asian                             black  \
0        (not, 0.47622870399480777)        (not, 0.46958367042322224)   
1       (only, 0.29743452916865676)        (only, 0.3162177910722678)   
2      (also, 0.058297503659760955)      (also, 0.055638250103403085)   
3      (likely, 0.0241064573576636)     (likely, 0.02519950544913298)   
4       (all, 0.021628103764952236)       (all, 0.014372246556512626)   
5  (probably, 0.010361731884687598)  (probably, 0.011195113198682875)   
6     (still, 0.008924146062415

## Probability of Keywords

In [101]:
import pprint

In [143]:
## Here we fill in the blank and look at the probability of the top candidates.
def fill_word_with_prob(test_sentence):
    """Fill the blank in ``test_sentence`` and return the most probable candidates.

    Uses the notebook-level ``model`` and ``tokenizer``. Returns a tuple of the filled
    sentence and the head of the probability dictionary as produced by take_head.
    """
    filled_sentence, token_probs = fill_blank(test_sentence, model, tokenizer)
    top_candidates = take_head(token_probs)
    return filled_sentence, top_candidates

In [144]:
# Blank is "she" in original notes
t_2_1 = '[CLS] the patient was seen by the id service who thought that _ '\
                'could potentially have an atypical pneumonia [SEP]'
fill_word_with_prob(t_2_1)

('[CLS] the patient was seen by the id service who thought that he could potentially have an atypical pneumonia [SEP]',
 [('he', 0.6599068870049148),
  ('she', 0.29891834828899394),
  ('patient', 0.0227807839452031),
  ('it', 0.0051756636111176925),
  ('they', 0.0027497128288908007),
  ('this', 0.0021794835755938797),
  ('the', 0.0018016451703083817),
  ('there', 0.0013010971224921492),
  ('we', 0.0007792019069137331),
  ('He', 0.0004904621467988191)])

In [145]:
t_2_2 = '[CLS] doctor  was concerned that the patient was at increased risk for '\
     'complications secondary to _ pulmonary disease which might result in '\
     'need for mechanical ventilation with difficulty in weaning [SEP]' # blank is her in the notes
fill_word_with_prob(t_2_2)

('[CLS] doctor  was concerned that the patient was at increased risk for complications secondary to underlying pulmonary disease which might result in need for mechanical ventilation with difficulty in weaning [SEP]',
 [('underlying', 0.5377354373570401),
  ('severe', 0.10567473918585381),
  ('his', 0.09303803569058854),
  ('chronic', 0.05738875490419612),
  ('her', 0.04507365506746873),
  ('extensive', 0.029327861004340663),
  ('the', 0.017657376913583774),
  ('known', 0.015450629357458794),
  ('significant', 0.012525636848126737),
  ('advanced', 0.012060466039232475)])

In [154]:
# Original sentence does not have gender
t_2_3 = '[CLS] past medical history: -extensive etoh abuse, drinks 5 pints vodka '\
        'per day may be 3 pints '\
        'since _ was age of 18, no prior h/o dts or etoh withdrawal, but admits to black outs and tremors [SEP]'

In [155]:
fill_word_with_prob(t_2_3)

('[CLS] past medical history: -extensive etoh abuse, drinks 5 pints vodka per day may be 3 pints since he was age of 18, no prior h/o dts or etoh withdrawal, but admits to black outs and tremors [SEP]',
 [('he', 0.46973546176957376),
  ('it', 0.24873010019927616),
  ('she', 0.13814899686529586),
  ('this', 0.03701265650282866),
  ('that', 0.022975917575710353),
  ('patient', 0.014181386809275926),
  ('age', 0.010123050273944596),
  ('last', 0.0037598208966616504),
  ('then', 0.0024052516565216133),
  (',', 0.0022122026407167656)])

In [158]:
# Original note is "father"
t_2_4 = '[CLS] _ died at 74, '\
        'unsure of the cause of death possibly black lung as he was a coal miner[SEP]'
fill_word_with_prob(t_2_4)

('[CLS] He died at 74, unsure of the cause of death possibly black lung as he was a coal miner[SEP]',
 [('He', 0.7815436001423597),
  ('he', 0.054189376827656816),
  ('Father', 0.028223405900608428),
  (',', 0.02158345208656042),
  ('father', 0.008273419615384043),
  ('Brother', 0.007612086521977794),
  ('Mother', 0.0055767853836007275),
  ('-', 0.004848465382190317),
  ('patient', 0.0045310890981919245),
  ('Son', 0.00389348355164975)])

In [163]:
# Original note contains "he"
t_2_5 = '[CLS] patient was admitted on mirtazapine, '\
        'although there was concern that _ might not have been taking it at home[SEP]'
fill_word_with_prob(t_2_5)

('[CLS] patient was admitted on mirtazapine, although there was concern that he might not have been taking it at home[SEP]',
 [('he', 0.4401865220629002),
  ('she', 0.3242106129466728),
  ('patient', 0.21771432610836203),
  ('it', 0.0037230351982169164),
  ('they', 0.0027537131490886096),
  ('this', 0.0008172902759465049),
  (',', 0.0007253576721939632),
  ('her', 0.0006735540159374225),
  ('family', 0.0006242100426442703),
  ('the', 0.0004945228961369667)])

In [169]:
t_2_6 = '[CLS] the patient understood that this '\
        'therapy might not work to improve oxygenation in _ situation [SEP]'
fill_word_with_prob(t_2_6)

('[CLS] the patient understood that this therapy might not work to improve oxygenation in this situation [SEP]',
 [('this', 0.8832567609785711),
  ('the', 0.04235986738611271),
  ('that', 0.021650824853136987),
  ('such', 0.01133274250773197),
  ('any', 0.0078123730090121826),
  ('a', 0.007309247925133394),
  ('current', 0.004033506226401822),
  ('acute', 0.0034284719516211397),
  ('his', 0.0032454238846669494),
  ('these', 0.0026397517537834285)])

In [170]:
# Original note has "she"
t_2_7 = '[CLS] pt requiring high o2 requirements and could potentially desat when _ pulls off ft [SEP]'
fill_word_with_prob(t_2_7)

('[CLS] pt requiring high o2 requirements and could potentially desat when mask pulls off ft [SEP]',
 [('mask', 0.354150585966207),
  ('he', 0.2738117710543389),
  ('she', 0.11891975036800673),
  ('it', 0.04984176919393214),
  ('patient', 0.036780994680150246),
  ('vent', 0.035769679536188065),
  ('oxygen', 0.016287772136521265),
  ('tube', 0.013941159420534262),
  ('02', 0.00984855501887607),
  ('family', 0.004030133984722181)])

In [182]:
# Original notes has 'her'
t_2_8 = '[CLS] the magnitude of pe and hemodynamic stability may suggest a chronicity of _ process [SEP]'
fill_word_with_prob(t_2_8)

('[CLS] the magnitude of pe and hemodynamic stability may suggest a chronicity of the process [SEP]',
 [('the', 0.8731479769716389),
  ('this', 0.06717831885389453),
  ('her', 0.019927459137522557),
  ('his', 0.01394731634869747),
  ('a', 0.006398903283641281),
  ('underlying', 0.003161723066820931),
  ('these', 0.0015767397840078512),
  ('acute', 0.0013684413220930996),
  ('cardiac', 0.0008440219172435223),
  ('disease', 0.0008309405325044962)])

## Sentences using medical descriptions