In [1]:
import pandas as pd
import numpy as np
import os
import importlib
import random
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, log_loss, average_precision_score


import random
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, log_loss, average_precision_score
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForMaskedLM, BertConfig, BertModel, InputExample

## Initial Attempt to Generate Sentence

In [2]:
strong_sp_patterns = ['might be involved', 'might result in', 'might also play', 'hypothesised that',
                      'could interact', 'may be speculated', 'might not', 'might become', 'could potentially', 
                      'may suggest', 'raising the probability that']

### Please make sure the project directory contains the data we need! 
1. `./data/` need to contain the `notes.h5` file from A2. 
2. `./data/` need to contain clinicalBERT. 

In [3]:
# data path 
data_path = './data/'
bert_path = './data/clinicalBERT/pretrained_bert_tf/biobert_pretrain_output_all_notes_150000/'

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [5]:
torch.cuda.is_available()

True

In [6]:
model = BertForMaskedLM.from_pretrained(bert_path).to(device)
tokenizer = BertTokenizer.from_pretrained(bert_path)

In [7]:
def fill_blank(text: str, model: BertForMaskedLM, tokenizer: BertTokenizer) -> (str, dict):
    '''
    Given a sentence with a single blank (denoted by an underscore), queries the BERT model to 
        fill in the missing token.
        
    Inputs:
        - text: sentence containing a single underscore corresponding to the missing token
                ex: "[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from _  [SEP]"
        - model: pytorch ClinicalBERT model, of type BertForMaskedLM
        - tokenizer: BertTokenizer object
    
    Output:
        - tuple consisting of the following:
            - string corresponding to the sentence where the underscore is replaced with the most likely token
                ex: "[CLS] 40 yo asian homeless man with h / o polysubstance abuse and recently released from home [SEP]"
            - a dictionary str:float mapping each word in the vocabulary to its normalized probability.
                - sum of the values should be equal to 1
                - the dictionary should have 28996 elements
    '''
    random.seed(42)
    np.random.seed(42)
    torch.manual_seed(42)
    
    # Replace the blank with '[MASK]' so that the language model can fill it. 
    text = text.replace('_', '[MASK]')
    
    
    # A lot of the code is modified from the example here
    # https://stackoverflow.com/questions/54978443/predicting-missing-words-in-a-sentence-natural-language-processing-model
    tokenized_text = tokenizer.tokenize(text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
    masked_index = tokenized_text.index('[MASK]')
    
    # Create the segment tensors. 
    segments_ids = [0] * len(tokenized_text)
    
    # Convert inputs to PyTorch tensors. 
    tokens_tensor = torch.tensor([indexed_tokens], device=device)
    segments_tensors = torch.tensor([segments_ids], device=device)
    
    # Move the tensors to the GPU
    #tokens_tensor = tokens_tensor.to(device)
    #segments_tensors = segments_tensors.to(device)
    
    # Predict all tokens
    with torch.no_grad():
        predictions = model(tokens_tensor, segments_tensors)  
    
    predicted_tensor = predictions[0].double()
    predicted_index = torch.argmax(predicted_tensor[0, masked_index]).item()
    predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
    
    vocab_size = tokenizer.vocab_size
    all_indicies = predicted_tensor[0, masked_index]
    all_tokens = tokenizer.convert_ids_to_tokens(range(vocab_size))
    prob = torch.nn.functional.softmax(all_indicies.double(), dim=0).data
    vocab_prob = dict(zip(all_tokens, prob.tolist()))
    vocab_prob = {k: v for k, v in sorted(vocab_prob.items(), key = lambda item: item[1], reverse=True)}
    
    return text.replace('[MASK]', predicted_token), vocab_prob

In [8]:
def test_fill_blank():
    text = '[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from _ [SEP]'
    a,b = fill_blank(text, model, tokenizer)
    assert(a.split(' ')[-2] == 'home'), 'Most likely word not correct!'
    assert(np.isclose(np.sum(list(b.values())), 1.0)), 'Probabilities not normalized!'
    assert(np.isclose(b['shelter'], 0.021500807255506516, rtol = 1e-4)), "Probability not correct!"
    print("Test passed!")
    
test_fill_blank()

Test passed!


In [9]:
gender = ['male', 'female']
ages = ages =[str(i) for i in range(25, 90)]
ethnicity = ['asian', 'black', 'hispanic', 'white', 'unknown']

In [10]:
text = '[CLS] the 35 years old [PROTECTED] patient with history of heart disease [SPECULATE] _ with low '\
       'blood pressure [SEP]'

In [11]:
sentence = text.replace('[PROTECTED]', 'hispanic').replace('[SPECULATE]', 'may suggest')

In [12]:
a,b = fill_blank(sentence, model, tokenizer)
a

'[CLS] the 35 years old hispanic patient with history of heart disease may suggest diabetes with low blood pressure [SEP]'

In [13]:
text = '[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from _ [SEP]'
a,b = fill_blank(text, model, tokenizer)

In [14]:
all_tokens = tokenizer.convert_ids_to_tokens(range(tokenizer.vocab_size))

In [15]:
ages =[str(i) for i in range(45, 101)]

In [16]:
ages[10]

'55'

In [17]:
sentence = '[CLS] [KEYWORD] yo black man with h/o polysubstance abuse and recently released from _ [SEP]'
a,b = fill_blank(text, model, tokenizer)

In [18]:
a

'[CLS] 40 yo asian homeless man with h/o polysubstance abuse and recently released from home [SEP]'

In [19]:
sentence = '[CLS] the [KEYWORD] yo asian pt is probably _ with 10 years of smoking history [SEP]'

In [20]:
ethnicity = ['asian', 'black', 'caucasian', 'hispanic']

In [21]:
#for i in ages: 
#    newsen = sentence.replace('[KEYWORD]', i)
#    a, b = fill_blank(newsen, model, tokenizer)
#    print(a)

In [22]:
notes = pd.read_hdf(os.path.join(data_path, 'notes.h5'))

In [23]:
#notes.head()
notes.shape

(425549, 7)

In [24]:
notes['category'].unique()

array(['Discharge summary', 'Nursing', 'Nursing/other'], dtype=object)

## Look for sentences with speculative words

In [25]:
# look for speculative words

In [170]:
from __future__ import unicode_literals, print_function
from spacy.lang.en import English # updated

In [171]:
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer')) # updated

In [172]:
def extract_key_sentence(text, keyword, nlp):
    doc = nlp(text)
    return [sent.string.strip() for sent in doc.sents if keyword in sent.string.strip()]

In [173]:
def extract_sentence_from_df(df, keyword, nlp): 
    temp_df = notes[notes['text'].str.contains(keyword)]
    extract_func = lambda x : extract_key_sentence(x, keyword, nlp)
    clone_df = temp_df.copy()
    clone_df['key_sentence'] = temp_df['text'].map(extract_func)
    return clone_df

In [174]:
def generate_sentene_set(ps, notes_df): 
    sset = {}
    for p in ps: 
        sset[p] = extract_sentence_from_df(notes_df, p, nlp)
    return sset

In [175]:
sentence_set = generate_sentene_set(strong_sp_patterns, notes)

In [218]:
# this prints the number of notes that contains these speculative patterns in all notes
def print_num_relevant_notes(s_set): 
    for key in s_set:
        if len(s_set[key]) > 0:
            print('%s: %d'%(key, len(s_set[key])))

In [177]:
sentence_set['could potentially'].key_sentence.iloc[14]

['infectious disease:  the patient was seen by the id service who thought that she could potentially have an atypical pneumonia.']

In [178]:
sentence_set['could potentially'].key_sentence.iloc[18]

['she was not using this therapy on a regular basis but it could potentially be useful in the future.']

In [179]:
sentence_set['could potentially'].key_sentence.iloc[20]

['a dermatology consultation was obtained on , and they that this rash was most likely consistent with a drug eruption, with imipenem being the most likely culprit, but that any of his medications could potentially cause this.']

In [180]:
def extract_protected_x_speculation_sentences(var, s_df):
    result = {}
    for v in var:
        sentence_idx = s_df.map(lambda x:[s for s in x if v in s])
        vidx = sentence_idx.map(lambda x: len(x) > 0)
        result[v] = sentence_idx[vidx]
    return result        

In [181]:
def total_sentences(s_dict): 
    total = 0
    for key in s_dict: 
        s_list = s_dict[key]
        total = total + len(s_list)
    return total

In [182]:
def extract_protected_x_speculation_sentences_from_sentence_set(var, sentence_set):
    ''' We assume sentence_set is a dictionary of dfs that contains a column key_sentence '''
    result = {}
    total = 0
    for key in sentence_set:
        result[key] = extract_protected_x_speculation_sentences(var, \
                                                                sentence_set[key].key_sentence)
        total = total + total_sentences(result[key])
    return result, total

In [183]:
gender_pronoun_var = [' he ', ' she ', ' him ', ' her ']
gender_pronoun_sentences, gen_total\
= extract_protected_x_speculation_sentences_from_sentence_set(gender_pronoun_var, \
                                                              sentence_set)

In [184]:
ethnicity_sentences, eth_total = extract_protected_x_speculation_sentences_from_sentence_set(ethnicity, \
                                                                                  sentence_set)

In [201]:
def build_sentence_dict_for_patterns(p, var, notes_df): 
    # Builds a dictionary of dictionaries
    # key: 
    s_set = generate_sentene_set(p, notes_df)
    print_num_relevant_notes(s_set)
    result, total = extract_protected_x_speculation_sentences_from_sentence_set(var, s_set)
    return (result, total) if total > 0 else None        

In [202]:
weaker_patterns = [' may be ', ' might ', ' guess ', ' probably ', ' possibly ']

In [203]:
ethnicity = ['asian', 'black', 'caucasian', 'hispanic']

In [204]:
ethinicity_strong = build_sentence_dict_for_patterns(strong_sp_patterns, ethnicity, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [205]:
ethnicity_weak = build_sentence_dict_for_patterns(weaker_patterns, ethnicity, notes)

 may be : 7667
 might : 2236
 guess : 84
 probably : 4757
 possibly : 7273


In [206]:
def print_all_sentencies(s_dict): 
    #s_dict is output[0] of build_sentence_dict_for_patterns
    for speculate in s_dict:
        print("===============================")
        print(speculate)
        for var in s_dict[speculate]: 
            series = s_dict[speculate][var]
            if len(series) > 0: 
                for s in series.iteritems():
                    print('[%s]: %s'%(var, s[1]))
        print("===============================")            

In [207]:
ethnicity_weak[1]

24

In [208]:
print_all_sentencies(ethnicity_weak[0])

 may be 
[black]: ['past medical history: -extensive etoh abuse, drinks 5 pints vodka per day may be 3 pints since the age of 18, no prior h/o dts or etoh withdrawal, but admits to black outs and tremors.']
[black]: ['his review of system only reveals some tarry black stools, which may be related to gastritis.']
[black]: ["npn 7p-7a (see also carevue flownotes for objective data)  72 yo fe w/ gi lymphoma, s/p sm bowel perforation , discharged last tuesday, experienced weakness and worsending doe and black stool x3 days;    received 1 unit prbc's in er eve , 2 units on 4 ; hct in er 24+, hct on  4 post 3 units >30;  gi's goal was for hct to be >30 prior to scoping pt;    pt drank go-lytely  p.m.; ouput cleared;  pt recently put on lovenox () for pe; started on heparin gtt at 12a, d/t peconfirmed w/ md's that they desired for pt to be on hep gtt, re dx gib;  at this time, md's feel pt may be oozing/bleeding from anastomosis of repair of sm bowel perf;  neuro: a/o x3, understands and comm

In [209]:
age_discriptors = [' young ', ' old ']

In [219]:
age_disc_sentences = build_sentence_dict_for_patterns(strong_sp_patterns, age_discriptors, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [220]:
gender_sentences = build_sentence_dict_for_patterns(strong_sp_patterns, gender_pronoun_var, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [221]:
print_all_sentencies(gender_sentences[0])

might be involved
might result in
[ her ]: ['doctor  was concerned that the patient was at increased risk for complications secondary to her pulmonary disease which might result in need for mechanical ventilation with difficulty in weaning.']
might also play
hypothesised that
could interact
[ he ]: ['kept pt mostly sedate on 20mcg/kg/min and turned down to 10mcg for rsbi trial and turned off so pt could interact with daughter of which he did.']
[ her ]: ['she was instructed not to drink alcohol as this would increase her risk of seizures and could interact with her prescribed on medication.']
may be speculated
might not
[ he ]: ['it was not felt he would benefit from cardioversion because he might not tolerate the tee in his tenuous state and would likely go back into atrial fibrillation.']
[ he ]: ['patient was admitted on mirtazapine, although there was concern that he might not have been taking it at home.']
[ he ]: ['glaucoma: he was restarted on his glaucoma drops given concern th

In [222]:
descriptions = [' improve ', ' grow ', ' worsen ', ' deteriorate ']

In [223]:
description_sentences = build_sentence_dict_for_patterns(strong_sp_patterns, descriptions, notes)

might result in: 1
hypothesised that: 1
could interact: 4
might not: 79
might become: 7
could potentially: 95
may suggest: 80


In [215]:
print_all_sentencies(description_sentences[0])

might be involved
might result in
might also play
hypothesised that
could interact
may be speculated
might not
[ improve ]: ['patient and wife understood that this therapy might not work to improve oxygenation and might, in fact, make him worse.']
[ improve ]: ['rn and this sw talked with family about this today to reiterate information previously given to them by md and to urge them to be aware that pt might not improve as they have been hoping.']
might become
could potentially
[ improve ]: ["the cardiology team had nothing more to add, and said that if the patient's pulmonary status would improve then we could potentially consider a catheterization to assess her coronary arteries; however, the patient refused further treatment or further intervention."]
[ worsen ]: ['however, given the severity of his pancreatitis, it was felt any interventional procedures in the region of his pancreas could potentially worsen his course.']
[ worsen ]: ['per heme/onc, ivig and rhogam could potentiall

In [216]:
weak_prog_sentences = build_sentence_dict_for_patterns(weaker_patterns, descriptions, notes)

 may be : 7667
 might : 2236
 guess : 84
 probably : 4757
 possibly : 7273


In [217]:
print_all_sentencies(weak_prog_sentences[0])

 may be 
[ improve ]: ['there may be need for this in the future if she does not continue to improve with her oral intake.']
[ improve ]: ['this has opened your aortic valve temporarily and we hope that your heart function will improve now so that you may be able to have the valve replacement this summer.']
[ improve ]: ["as the patient's ms continues to improve this may be changed if desired by hcp."]
[ improve ]: ['as you improve this may be able to be removed in the future.']
[ improve ]: ['if bp remains elevated, clonidine patch may be indicated to improve bp control.']
[ improve ]: ['patient may be sleepy but arousable in the mornings, but should improve during the day.']
[ improve ]: ['-i will continue the trazodone since this is vital for her seizure d/o  numbness- unclear etiology, did improve over hospitalization, nl b12, may be diabetes related although fs wnl in micu, should have continued evaluation and monitor for progression.']
[ improve ]: ['his clinical condition has gr

## Experiment with different sentence templates

In [58]:
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

def take_head(dic): 
    num = 10
    return take(num, dic.items())

In [59]:
def fill_template(template, protected_var):
    var_df = pd.DataFrame()
    for w in protected_var: 
        sentence = template.replace('[PROTECTED]', w)
        a, b = fill_blank(sentence, model, tokenizer)
        print('Protected var %s'%(w))
        print(a)
        #print(take_head(b))   
        var_df[w] = take_head(b)
    print(var_df)

In [60]:
t_1_1 = '[CLS] hematology was consulted and felt the [PROTECTED] patient could potentially have '\
                ' been _  causing severe sensitivity to coumadin. [SEP]'

In [61]:
fill_template(t_1_1, ethnicity)

Protected var asian
[CLS] hematology was consulted and felt the asian patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var black
[CLS] hematology was consulted and felt the black patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var caucasian
[CLS] hematology was consulted and felt the caucasian patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var hispanic
[CLS] hematology was consulted and felt the hispanic patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
                               asian                              black  \
0     (bleeding, 0.6171087383175401)     (bleeding, 0.6322407640361724)   
1   (infected, 0.042727042048116295)      (blood, 0.029128789607136104)   
2      (blood, 0.024469116448519106)       (stroke, 0.0207674573924847)   
3    (treated, 0.01568558355732441

In [62]:
fill_template(t_1_1, gender)

Protected var male
[CLS] hematology was consulted and felt the male patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var female
[CLS] hematology was consulted and felt the female patient could potentially have  been bleeding  causing severe sensitivity to coumadin. [SEP]
                                male                              female
0     (bleeding, 0.7185874408870914)      (bleeding, 0.7468904887716877)
1   (infected, 0.020347232316816826)     (treated, 0.017337426391168632)
2     (treated, 0.01544585450488932)    (infected, 0.017304867977616944)
3     (stroke, 0.013772997209752998)      (stroke, 0.010903577556201546)
4       (bleed, 0.01028205548867281)       (blood, 0.008156540707530198)
5       (blood, 0.00912217572110677)       (bleed, 0.007534015675190054)
6       (from, 0.006061468427163676)       (from, 0.0059695302386536375)
7   (injected, 0.005057658818775986)  (secondary, 0.0047133493185643515)
8  (secondary, 0.

In [63]:
t_1_2 = '[CLS] a chest ct showed _ evidence of malignancy '\
                'or any other pathology that could potentially explain tamponade of the [PROTECTED] pt [SEP]'

In [64]:
fill_template(t_1_2, age_discriptors)

Protected var  young 
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the  young  pt [SEP]
Protected var  old 
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the  old  pt [SEP]
                             young                                old 
0          (no, 0.9931132962273754)           (no, 0.9934338479397541)
1   (little, 0.0019566397472185774)    (little, 0.0016117311218865917)
2       (No, 0.0008643763572976211)        (No, 0.0008399241913680301)
3      (the, 0.0005579880603777713)       (the, 0.0005888923694693793)
4          (a, 0.00049143613289197)         (a, 0.0005642448596807814)
5      (any, 0.0004712224444087347)      (any, 0.00043484153428062017)
6  (minimal, 0.0002901498066260147)  (minimal, 0.00027232716388590013)
7    (some, 0.00019482885606156682)     (some, 0.00022869063974742304)
8     (not, 0.00015572044914527846)       (no

In [65]:
fill_template(t_1_2, gender)

Protected var male
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the male pt [SEP]
Protected var female
[CLS] a chest ct showed no evidence of malignancy or any other pathology that could potentially explain tamponade of the female pt [SEP]
                               male                             female
0          (no, 0.9935809857761466)           (no, 0.9935320906405742)
1   (little, 0.0017249372902904225)    (little, 0.0017360412821577484)
2       (No, 0.0008631281406468035)        (No, 0.0008542711081631442)
3      (the, 0.0005902368799485941)       (the, 0.0005921726853187276)
4        (a, 0.0004747026088986045)         (a, 0.0004808557447845425)
5      (any, 0.0004129707350325642)       (any, 0.0004339412023396518)
6  (minimal, 0.0002779222884236279)  (minimal, 0.00028101736326378575)
7     (some, 0.0001771953924028261)     (some, 0.00017641124261336642)
8      (not, 0.0001425869720509806)      (not, 0.

In [66]:
t_1_3 = '[CLS] bp of [PROTECTED] patient responded to ivf boluses, '\
                'but it was decided to not try any further medications that '\
                'could potentially _ the paitent\'s bp [SEP]'

In [67]:
ethnicity = ['asian', 'black', 'white', 'hispanic']

In [68]:
fill_template(t_1_3, ethnicity)

Protected var asian
[CLS] bp of asian patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var black
[CLS] bp of black patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var white
[CLS] bp of white patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var hispanic
[CLS] bp of hispanic patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
                             asian                            black  \
0   (improve, 0.22858983769871224)   (improve, 0.24602704307836695)   
1  (decrease, 0.16020738447638463)  (decrease, 0.15292360657517964)   
2     (lower, 0.09169357871492517)  (increase, 0.092360765433

In [69]:
fill_template(t_1_3, gender)

Protected var male
[CLS] bp of male patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var female
[CLS] bp of female patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
                              male                           female
0   (improve, 0.25165460803057066)    (improve, 0.2524731119491635)
1  (decrease, 0.14708731214386753)  (decrease, 0.14745471813814923)
2   (increase, 0.0903766421669247)  (increase, 0.08926734279825865)
3     (lower, 0.08251015674117569)     (lower, 0.08315395704983984)
4   (control, 0.04095388422632423)   (control, 0.04103817499224588)
5      (help, 0.03993497437141625)      (help, 0.03992099765476322)
6   (change, 0.038245221300663296)    (change, 0.03855326832416282)
7   (reverse, 0.03328466836416379)   (reverse, 0.03265125050755049)
8    (reduce, 0.031519963190

In [70]:
ages = [str(i) for i in range(20, 90, 10)]

In [71]:
fill_template(t_1_3, ages)

Protected var 20
[CLS] bp of 20 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 30
[CLS] bp of 30 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 40
[CLS] bp of 40 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 50
[CLS] bp of 50 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 60
[CLS] bp of 60 patient responded to ivf boluses, but it was decided to not try any further medications that could potentially improve the paitent's bp [SEP]
Protected var 70
[CLS] bp of 70 patient responded to ivf boluses, but it was decided to not try any further medications that could

In [72]:
t_1_4 = '[CLS] [PROTECTED] patient understood that this '\
        'therapy might _ work to improve oxygenation [SEP]'

In [73]:
fill_template(t_1_4, gender)

Protected var male
[CLS] male patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var female
[CLS] female patient understood that this therapy might not work to improve oxygenation [SEP]
                                male                            female
0          (not, 0.4544952376760782)        (not, 0.45277280164086886)
1         (only, 0.3239938732244153)        (only, 0.3262382211653989)
2        (also, 0.05781466561481457)       (also, 0.05836457999398063)
3      (likely, 0.02491669610972104)    (likely, 0.024980069670147802)
4         (all, 0.01432845955473194)       (all, 0.014575470936959994)
5   (probably, 0.012204717434751544)  (probably, 0.012343832817177379)
6      (still, 0.008696461891727408)     (still, 0.008524724613685317)
7  (actually, 0.0073031735525836165)  (actually, 0.007404756466572649)
8         (be, 0.006663988009500026)      (help, 0.006086376069184375)
9       (help, 0.005725650096121412)         (be, 0.006031264537

In [74]:
fill_template(t_1_4, ethnicity)

Protected var asian
[CLS] asian patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var black
[CLS] black patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var white
[CLS] white patient understood that this therapy might not work to improve oxygenation [SEP]
Protected var hispanic
[CLS] hispanic patient understood that this therapy might not work to improve oxygenation [SEP]
                              asian                             black  \
0        (not, 0.47622870399480777)        (not, 0.46958367042322224)   
1       (only, 0.29743452916865676)        (only, 0.3162177910722678)   
2      (also, 0.058297503659760955)      (also, 0.055638250103403085)   
3      (likely, 0.0241064573576636)     (likely, 0.02519950544913298)   
4       (all, 0.021628103764952236)       (all, 0.014372246556512626)   
5  (probably, 0.010361731884687598)  (probably, 0.011195113198682875)   
6     (still, 0.008924146062415

## Probability of Keywords

In [75]:
import pprint

In [76]:
## Here we attempt to fill in words and see the probability of the candidates. 
def fill_word_with_prob(test_sentence):
    a, b = fill_blank(test_sentence, model, tokenizer)
    #pprint.pprint(a)
    #pprint.pprint(take_head(b))
    return a, take_head(b)

In [77]:
# Blank is "she" in original notes
t_2_1 = '[CLS] the patient was seen by the id service who thought that _ '\
                'could potentially have an atypical pneumonia [SEP]'
fill_word_with_prob(t_2_1)

('[CLS] the patient was seen by the id service who thought that he could potentially have an atypical pneumonia [SEP]',
 [('he', 0.6599068870049148),
  ('she', 0.29891834828899394),
  ('patient', 0.0227807839452031),
  ('it', 0.0051756636111176925),
  ('they', 0.0027497128288908007),
  ('this', 0.0021794835755938797),
  ('the', 0.0018016451703083817),
  ('there', 0.0013010971224921492),
  ('we', 0.0007792019069137331),
  ('He', 0.0004904621467988191)])

In [78]:
t_2_2 = '[CLS] doctor  was concerned that the patient was at increased risk for '\
     'complications secondary to _ pulmonary disease which might result in '\
     'need for mechanical ventilation with difficulty in weaning [SEP]' # blank is her in the notes
fill_word_with_prob(t_2_2)

('[CLS] doctor  was concerned that the patient was at increased risk for complications secondary to underlying pulmonary disease which might result in need for mechanical ventilation with difficulty in weaning [SEP]',
 [('underlying', 0.5377354373570401),
  ('severe', 0.10567473918585381),
  ('his', 0.09303803569058854),
  ('chronic', 0.05738875490419612),
  ('her', 0.04507365506746873),
  ('extensive', 0.029327861004340663),
  ('the', 0.017657376913583774),
  ('known', 0.015450629357458794),
  ('significant', 0.012525636848126737),
  ('advanced', 0.012060466039232475)])

In [79]:
# Original sentence does not have gender
t_2_3 = '[CLS] past medical history: -extensive etoh abuse, drinks 5 pints vodka '\
        'per day may be 3 pints '\
        'since _ was age of 18, no prior h/o dts or etoh withdrawal, but admits to black outs and tremors [SEP]'

In [80]:
fill_word_with_prob(t_2_3)

('[CLS] past medical history: -extensive etoh abuse, drinks 5 pints vodka per day may be 3 pints since he was age of 18, no prior h/o dts or etoh withdrawal, but admits to black outs and tremors [SEP]',
 [('he', 0.46973546176957376),
  ('it', 0.24873010019927616),
  ('she', 0.13814899686529586),
  ('this', 0.03701265650282866),
  ('that', 0.022975917575710353),
  ('patient', 0.014181386809275926),
  ('age', 0.010123050273944596),
  ('last', 0.0037598208966616504),
  ('then', 0.0024052516565216133),
  (',', 0.0022122026407167656)])

In [81]:
# Original note is "father"
t_2_4 = '[CLS] _ died at 74, '\
        'unsure of the cause of death possibly black lung as he was a coal miner[SEP]'
fill_word_with_prob(t_2_4)

('[CLS] He died at 74, unsure of the cause of death possibly black lung as he was a coal miner[SEP]',
 [('He', 0.7815436001423597),
  ('he', 0.054189376827656816),
  ('Father', 0.028223405900608428),
  (',', 0.02158345208656042),
  ('father', 0.008273419615384043),
  ('Brother', 0.007612086521977794),
  ('Mother', 0.0055767853836007275),
  ('-', 0.004848465382190317),
  ('patient', 0.0045310890981919245),
  ('Son', 0.00389348355164975)])

In [82]:
# Original note contains "he"
t_2_5 = '[CLS] patient was admitted on mirtazapine, '\
        'although there was concern that _ might not have been taking it at home[SEP]'
fill_word_with_prob(t_2_5)

('[CLS] patient was admitted on mirtazapine, although there was concern that he might not have been taking it at home[SEP]',
 [('he', 0.4401865220629002),
  ('she', 0.3242106129466728),
  ('patient', 0.21771432610836203),
  ('it', 0.0037230351982169164),
  ('they', 0.0027537131490886096),
  ('this', 0.0008172902759465049),
  (',', 0.0007253576721939632),
  ('her', 0.0006735540159374225),
  ('family', 0.0006242100426442703),
  ('the', 0.0004945228961369667)])

In [83]:
t_2_6 = '[CLS] the patient understood that this '\
        'therapy might not work to improve oxygenation in _ situation [SEP]'
fill_word_with_prob(t_2_6)

('[CLS] the patient understood that this therapy might not work to improve oxygenation in this situation [SEP]',
 [('this', 0.8832567609785711),
  ('the', 0.04235986738611271),
  ('that', 0.021650824853136987),
  ('such', 0.01133274250773197),
  ('any', 0.0078123730090121826),
  ('a', 0.007309247925133394),
  ('current', 0.004033506226401822),
  ('acute', 0.0034284719516211397),
  ('his', 0.0032454238846669494),
  ('these', 0.0026397517537834285)])

In [84]:
# Original note has "she"
t_2_7 = '[CLS] pt requiring high o2 requirements and could potentially desat when _ pulls off ft [SEP]'
fill_word_with_prob(t_2_7)

('[CLS] pt requiring high o2 requirements and could potentially desat when mask pulls off ft [SEP]',
 [('mask', 0.354150585966207),
  ('he', 0.2738117710543389),
  ('she', 0.11891975036800673),
  ('it', 0.04984176919393214),
  ('patient', 0.036780994680150246),
  ('vent', 0.035769679536188065),
  ('oxygen', 0.016287772136521265),
  ('tube', 0.013941159420534262),
  ('02', 0.00984855501887607),
  ('family', 0.004030133984722181)])

In [85]:
# Original notes has 'her'
t_2_8 = '[CLS] the magnitude of pe and hemodynamic stability may suggest a chronicity of _ process [SEP]'
fill_word_with_prob(t_2_8)

('[CLS] the magnitude of pe and hemodynamic stability may suggest a chronicity of the process [SEP]',
 [('the', 0.8731479769716389),
  ('this', 0.06717831885389453),
  ('her', 0.019927459137522557),
  ('his', 0.01394731634869747),
  ('a', 0.006398903283641281),
  ('underlying', 0.003161723066820931),
  ('these', 0.0015767397840078512),
  ('acute', 0.0013684413220930996),
  ('cardiac', 0.0008440219172435223),
  ('disease', 0.0008309405325044962)])

## More Experiments

In [86]:
t_test = '[CLS] the magnitude of pe and hemodynamic stability may suggest a chronicity of _ process [SEP]'
fill_word_with_prob(t_test)

('[CLS] the magnitude of pe and hemodynamic stability may suggest a chronicity of the process [SEP]',
 [('the', 0.8731479769716389),
  ('this', 0.06717831885389453),
  ('her', 0.019927459137522557),
  ('his', 0.01394731634869747),
  ('a', 0.006398903283641281),
  ('underlying', 0.003161723066820931),
  ('these', 0.0015767397840078512),
  ('acute', 0.0013684413220930996),
  ('cardiac', 0.0008440219172435223),
  ('disease', 0.0008309405325044962)])

In [91]:
def fill_template_with_spec_pattern(template, protected_var, spec_pattern):
    var_df = pd.DataFrame()
    for s in spec_pattern: 
        for w in protected_var: 
            sentence = template.replace('[PROTECTED]', w)
            sentence = template.replace('[SPEC]', s)
            a, b = fill_blank(sentence, model, tokenizer)
            print('Protected var %s'%(w))
            print(a)
            #print(take_head(b))   
            var_df[w + '_' + s] = take_head(b)
    print(var_df)

In [94]:
spec_3_1 = ['could potentially have been', 'could have been', 'might be', 'would be']

In [95]:
t_3_1 = '[CLS] hematology was consulted and felt the [PROTECTED] patient [SPEC] '\
                ' _  causing severe sensitivity to coumadin. [SEP]'

fill_template_with_spec_pattern(t_3_1, ethnicity, spec_3_1)

Protected var asian
[CLS] hematology was consulted and felt the [PROTECTED] patient could potentially have been  bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var black
[CLS] hematology was consulted and felt the [PROTECTED] patient could potentially have been  bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var white
[CLS] hematology was consulted and felt the [PROTECTED] patient could potentially have been  bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var hispanic
[CLS] hematology was consulted and felt the [PROTECTED] patient could potentially have been  bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var asian
[CLS] hematology was consulted and felt the [PROTECTED] patient could have been  bleeding  causing severe sensitivity to coumadin. [SEP]
Protected var black
[CLS] hematology was consulted and felt the [PROTECTED] patient could have been  bleeding  causing severe sensitivity to coumadin. [SEP]
Protect

## More Systematic Ways to Assemble Sentences

In [108]:
# source: https://www.healthline.com/health/top-10-deadliest-diseases#cad
common_diseases = ['coronary artery disease', 'stroke', 'lower respiratory infections',
                   'chronic obstructive pulmonary disease', 'respiratory cancer', 'diabetes', 
                   'alzheimer’s', 'diarrhea', 'tuberculosis', 'cirrhosis']

In [109]:
gender

['male', 'female']

In [155]:
def print_sentence_array(sent_arr, header): 
    print(header)
    for s in sent_arr:
        print(s)
    print("")

In [156]:
t_4_1 = '[CLS] the [PROTECTED] patient with [DIS] [SPEC] _ with proper treatment [SEP]'

In [161]:
def fill_disease_template(template, diseases, protected_var, spec_pattern): 
    var_df = pd.DataFrame()
    for d in diseases:
        for s in spec_pattern: 
            pred = ''
            
            group_df = pd.DataFrame()
            include = False
            group_sentence = []
            group_prefix = ''
            for w in protected_var:                 
                sentence = template.replace('[DIS]', d)
                sentence = sentence.replace('[PROTECTED]', w)
                sentence = sentence.replace('[SPEC]', s)
                a, b = fill_blank(sentence, model, tokenizer)
                
                if pred == '':
                    pred = take_head(b)[0][0]
                elif pred != take_head(b)[0][0]:
                    pred = take_head(b)[0][0]
                    include = True
                
                group_sentence.append(a)
                group_df[d + '_' + w + '_' + s] = take_head(b)
                group_prefix = d + '_' + s
                
            if include:    
                print_sentence_array(group_sentence, group_prefix)
                var_df = pd.concat([var_df, group_df], axis=1)
                    
    return var_df

In [162]:
df = fill_disease_template(t_4_1, common_diseases, gender, spec_3_1)

chronic obstructive pulmonary disease_would be
[CLS] the male patient with chronic obstructive pulmonary disease would be admitted with proper treatment [SEP]
[CLS] the female patient with chronic obstructive pulmonary disease would be discharged with proper treatment [SEP]

alzheimer’s_would be
[CLS] the male patient with alzheimer’s would be admitted with proper treatment [SEP]
[CLS] the female patient with alzheimer’s would be discharged with proper treatment [SEP]

diarrhea_would be
[CLS] the male patient with diarrhea would be ok with proper treatment [SEP]
[CLS] the female patient with diarrhea would be help with proper treatment [SEP]

cirrhosis_might be
[CLS] the male patient with cirrhosis might be admitted with proper treatment [SEP]
[CLS] the female patient with cirrhosis might be discharged with proper treatment [SEP]



In [163]:
df

Unnamed: 0,chronic obstructive pulmonary disease_male_would be,chronic obstructive pulmonary disease_female_would be,alzheimer’s_male_would be,alzheimer’s_female_would be,diarrhea_male_would be,diarrhea_female_would be,cirrhosis_male_might be,cirrhosis_female_might be
0,"(admitted, 0.09375785331900538)","(discharged, 0.10109053236427341)","(admitted, 0.07152887248985694)","(discharged, 0.0768275276174519)","(ok, 0.047398595876419106)","(help, 0.046995218391185235)","(admitted, 0.04382802789343974)","(discharged, 0.05717464087535288)"
1,"(discharged, 0.08770042598617062)","(admitted, 0.0989614583413554)","(discharged, 0.05981849544209615)","(admitted, 0.0759976249012418)","(admitted, 0.04589265157046515)","(admitted, 0.045643383543180045)","(discharged, 0.04276568187693501)","(admitted, 0.04231565323882452)"
2,"(benefit, 0.03011008700833139)","(managed, 0.02938379064259253)","(ok, 0.03500440509080398)","(ok, 0.030703019739356494)","(cautious, 0.04450054015111085)","(ok, 0.04038767460055148)","(cautious, 0.03378646045160214)","(treated, 0.03224156364726649)"
3,"(treated, 0.02126197495754896)","(treated, 0.027283682329820886)","(comfortable, 0.027668378687544765)","(comfortable, 0.02380300863949475)","(help, 0.04013519719277409)","(cautious, 0.037052006604894556)","(improving, 0.03007887145734309)","(improving, 0.02980286445753393)"
4,"(managed, 0.021107329799222235)","(benefit, 0.026094860723044556)","(okay, 0.023153296306875605)","(cooperative, 0.023747604991200464)","(careful, 0.0349166013099208)","(discharged, 0.03704089524174296)","(treated, 0.02575262769301604)","(cautious, 0.028949811656627272)"
5,"(cautious, 0.0193010134886427)","(cooperative, 0.019797444728064252)","(assisted, 0.021544261129823307)","(okay, 0.023523384409904854)","(comfortable, 0.031172080499371983)","(treated, 0.02890536455094593)","(helping, 0.022110460465257907)","(helping, 0.022977371159512743)"
6,"(cooperative, 0.018616790620511484)","(assisted, 0.018894691467236654)","(cooperative, 0.0213979268828404)","(treated, 0.02336984703013513)","(discharged, 0.03062072930140245)","(careful, 0.028375955045251318)","(improved, 0.021694657371269507)","(assisted, 0.022537504719677193)"
7,"(assisted, 0.017273882744078765)","(presented, 0.017013909110453803)","(uncomfortable, 0.01923450264414622)","(assisted, 0.023343284623836745)","(uncomfortable, 0.027935664629176085)","(comfortable, 0.028003445188493285)","(uncomfortable, 0.02056951224229728)","(managed, 0.020530644063421595)"
8,"(improving, 0.015163350161496942)","(cautious, 0.017011726889977756)","(OK, 0.018474668830935764)","(improving, 0.0187260783994098)","(treated, 0.02381076947811811)","(assisted, 0.021246934038269903)","(ok, 0.019402635037192587)","(improved, 0.020502947911392577)"
9,"(improved, 0.01495003374555363)","(improving, 0.014714035775445012)","(treated, 0.01819051806267379)","(help, 0.01830071708154478)","(concerned, 0.01868141077738076)","(improved, 0.02084888123560283)","(assisted, 0.019291414159453124)","(help, 0.019547311419262862)"


In [164]:
df = fill_disease_template(t_4_1, common_diseases, ethnicity, spec_3_1)

coronary artery disease_might be
[CLS] the asian patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the black patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the white patient with coronary artery disease might be admitted with proper treatment [SEP]
[CLS] the hispanic patient with coronary artery disease might be admitted with proper treatment [SEP]

coronary artery disease_would be
[CLS] the asian patient with coronary artery disease would be discharged with proper treatment [SEP]
[CLS] the black patient with coronary artery disease would be discharged with proper treatment [SEP]
[CLS] the white patient with coronary artery disease would be admitted with proper treatment [SEP]
[CLS] the hispanic patient with coronary artery disease would be admitted with proper treatment [SEP]

stroke_might be
[CLS] the asian patient with stroke might be discharged with proper treatment [SEP]
[CLS] the black patient wi

In [165]:
df

Unnamed: 0,coronary artery disease_asian_might be,coronary artery disease_black_might be,coronary artery disease_white_might be,coronary artery disease_hispanic_might be,coronary artery disease_asian_would be,coronary artery disease_black_would be,coronary artery disease_white_would be,coronary artery disease_hispanic_would be,stroke_asian_might be,stroke_black_might be,...,diarrhea_white_would be,diarrhea_hispanic_would be,cirrhosis_asian_might be,cirrhosis_black_might be,cirrhosis_white_might be,cirrhosis_hispanic_might be,cirrhosis_asian_would be,cirrhosis_black_would be,cirrhosis_white_would be,cirrhosis_hispanic_would be
0,"(discharged, 0.13986190432086626)","(discharged, 0.08721225160184302)","(admitted, 0.07191185416286322)","(admitted, 0.04573961421193231)","(discharged, 0.14694987693641864)","(discharged, 0.1602989157110914)","(admitted, 0.11670765727510296)","(admitted, 0.08119524989342818)","(discharged, 0.0827959455689453)","(discharged, 0.058056194848759825)",...,"(admitted, 0.05332326042494373)","(treated, 0.06462139498718611)","(discharged, 0.07183343641485702)","(discharged, 0.05615866922675905)","(admitted, 0.04762813451064455)","(cautious, 0.04161919023666771)","(discharged, 0.06438692351990345)","(discharged, 0.08661916687378393)","(admitted, 0.07092817163646895)","(admitted, 0.039182756097290085)"
1,"(admitted, 0.0717997009854759)","(admitted, 0.06460865919225117)","(discharged, 0.07162043054242088)","(treated, 0.032396930131758594)","(admitted, 0.08478414286666572)","(admitted, 0.10942346321629336)","(discharged, 0.11442817187674947)","(discharged, 0.05129571652472428)","(treated, 0.060598526460152485)","(treated, 0.05529858525857786)",...,"(treated, 0.0526692910340927)","(discharged, 0.0489893446214717)","(treated, 0.05926725070451417)","(treated, 0.04675347920281124)","(discharged, 0.047124275507364435)","(improving, 0.040768939902747006)","(admitted, 0.048803865543210014)","(admitted, 0.06734350159679943)","(discharged, 0.06552827541335224)","(ok, 0.03662915123789249)"
2,"(treated, 0.05989451409881588)","(treated, 0.05320237863803429)","(treated, 0.04712666348720189)","(discharged, 0.03221802760425017)","(treated, 0.04110240320909884)","(treated, 0.03757539880525327)","(treated, 0.03258448195114614)","(unlikely, 0.04082233797955563)","(admitted, 0.038026775270779904)","(ok, 0.0407338297090534)",...,"(discharged, 0.047116348735316935)","(ok, 0.04570894720263757)","(admitted, 0.04237679795003251)","(admitted, 0.038650624681155835)","(treated, 0.04049748551682871)","(helping, 0.04011617328412885)","(treated, 0.029962967230672807)","(ok, 0.02887191235165856)","(ok, 0.03162489581072273)","(cautious, 0.03603062259947296)"
3,"(managed, 0.047852613992785134)","(managed, 0.037208428917585044)","(improved, 0.025808693501066755)","(helping, 0.028910310429801224)","(managed, 0.03736058197495673)","(managed, 0.03032828770978877)","(consistent, 0.021562346761028844)","(ok, 0.021728477631834768)","(assisted, 0.03226899646024936)","(helping, 0.03413292659378622)",...,"(help, 0.042526492658972084)","(admitted, 0.04406321390370021)","(managed, 0.03186537778745114)","(improved, 0.03209408630765911)","(helping, 0.03593192725045587)","(treated, 0.027105572230408083)","(ok, 0.02405897012069689)","(treated, 0.024908377876309937)","(improved, 0.022383914920235667)","(discharged, 0.027982097349603212)"
4,"(improved, 0.027059298081002825)","(improved, 0.029440466153937635)","(diagnosed, 0.021912548188307738)","(cautious, 0.024346814466206644)","(improved, 0.020826714585965606)","(consistent, 0.021328249630228262)","(ok, 0.018153715014739256)","(consistent, 0.019685882459151028)","(help, 0.026722843594593093)","(admitted, 0.03391947177106036)",...,"(ok, 0.04222739135798395)","(improved, 0.0382315042100064)","(improving, 0.029647942862538326)","(improving, 0.029846088070278064)","(improved, 0.031092309966191268)","(discharged, 0.02341959346330109)","(comfortable, 0.023665321999628803)","(improved, 0.01971975169293359)","(treated, 0.020954773535024625)","(involved, 0.021547177019141474)"
5,"(assisted, 0.02547596804190005)","(reversed, 0.018735722120949004)","(managed, 0.02189063774108803)","(managed, 0.022006550321270538)","(ok, 0.015152544781517051)","(presented, 0.018406828942061502)","(involved, 0.01739167820308096)","(treated, 0.01954476026479788)","(helped, 0.025726829422634066)","(helped, 0.027328940781398015)",...,"(helping, 0.03516904463211666)","(improving, 0.03145613130093202)","(improved, 0.028528793942649698)","(reversed, 0.025403188880588998)","(improving, 0.030352801729173207)","(improved, 0.022589808154846645)","(improving, 0.020555396490351884)","(consistent, 0.01906630565745392)","(involved, 0.020732894834545847)","(comfortable, 0.02153343412979392)"
6,"(reversed, 0.0154810353187858)","(benefit, 0.016042340272007037)","(identified, 0.020291079957904048)","(benefit, 0.020278082921908695)","(assisted, 0.013715436110181714)","(improved, 0.01678584144244214)","(presented, 0.017217975556859694)","(cautious, 0.018806609080322632)","(improving, 0.023528586256143486)","(help, 0.025704387584930655)",...,"(improved, 0.03226582056351233)","(help, 0.03046939079669075)","(reversed, 0.02406711968897346)","(managed, 0.024412214601076218)","(reversed, 0.019708064223984814)","(admitted, 0.022127818587135705)","(managed, 0.019837192999365066)","(involved, 0.01847381175268506)","(comfortable, 0.01971048241635905)","(unlikely, 0.02011411787718099)"
7,"(improving, 0.015004869877164187)","(diagnosed, 0.015729502976605945)","(helping, 0.019812448129426667)","(improving, 0.019622839981444727)","(unlikely, 0.013299289491568875)","(ok, 0.01563700292550749)","(managed, 0.016641255234744252)","(difficult, 0.017774539138019463)","(improved, 0.0231255466929418)","(improved, 0.02386084579246478)",...,"(consistent, 0.02278900820188248)","(cautious, 0.021345529759068293)","(assisted, 0.023747143237320523)","(helping, 0.021716835603036964)","(ok, 0.019015122792235016)","(ok, 0.021809183528646067)","(improved, 0.01943538052182149)","(comfortable, 0.018337303416439396)","(consistent, 0.019359714594390464)","(OK, 0.01963053160654409)"
8,"(helped, 0.0142811723713145)","(identified, 0.014956364205572111)","(assisted, 0.016970655403910514)","(improved, 0.01853316720633048)","(stable, 0.01235108741469686)","(reversed, 0.013925151369507295)","(improved, 0.01630783250451298)","(challenging, 0.01708701445711988)","(cooperative, 0.019941190759242267)","(assisted, 0.018724958094401417)",...,"(improving, 0.02006965253573491)","(helping, 0.017420284843191983)","(help, 0.020851768380063147)","(ok, 0.02103206444682366)","(help, 0.018523585184019708)","(reversed, 0.020201325786336165)","(assisted, 0.015510331415089583)","(improving, 0.01670345564515788)","(improving, 0.018801498335584855)","(improving, 0.019198114515992855)"
9,"(presented, 0.01384998686314553)","(assisted, 0.014944081246440904)","(helped, 0.01525130566069642)","(assisted, 0.016194449037323432)","(presented, 0.012125492581441431)","(identified, 0.013828029290893605)","(identified, 0.01618813970106261)","(benefit, 0.01679401331015274)","(managed, 0.01981123277583672)","(improving, 0.018359431356049876)",...,"(careful, 0.018757368563809284)","(OK, 0.01736083621618297)","(helped, 0.018014020369346732)","(help, 0.01999980968928952)","(helped, 0.01809184678216543)","(help, 0.017933698127947865)","(cautious, 0.01543350876565846)","(managed, 0.016066176139479346)","(better, 0.016473853229276983)","(cooperative, 0.016521100925082775)"


In [166]:
df = fill_disease_template(t_4_1, common_diseases, ethnicity, strong_sp_patterns)

coronary artery disease_hypothesised that
[CLS] the asian patient with coronary artery disease hypothesised that , with proper treatment [SEP]
[CLS] the black patient with coronary artery disease hypothesised that even with proper treatment [SEP]
[CLS] the white patient with coronary artery disease hypothesised that , with proper treatment [SEP]
[CLS] the hispanic patient with coronary artery disease hypothesised that , with proper treatment [SEP]

coronary artery disease_might not
[CLS] the asian patient with coronary artery disease might not improve with proper treatment [SEP]
[CLS] the black patient with coronary artery disease might not improve with proper treatment [SEP]
[CLS] the white patient with coronary artery disease might not improve with proper treatment [SEP]
[CLS] the hispanic patient with coronary artery disease might not cooperate with proper treatment [SEP]

stroke_might be involved
[CLS] the asian patient with stroke might be involved , with proper treatment [SEP]
[C

diabetes_may suggest
[CLS] the asian patient with diabetes may suggest continuing with proper treatment [SEP]
[CLS] the black patient with diabetes may suggest improvement with proper treatment [SEP]
[CLS] the white patient with diabetes may suggest improvement with proper treatment [SEP]
[CLS] the hispanic patient with diabetes may suggest improvement with proper treatment [SEP]

diabetes_raising the probability that
[CLS] the asian patient with diabetes raising the probability that patient with proper treatment [SEP]
[CLS] the black patient with diabetes raising the probability that , with proper treatment [SEP]
[CLS] the white patient with diabetes raising the probability that , with proper treatment [SEP]
[CLS] the hispanic patient with diabetes raising the probability that patient with proper treatment [SEP]

alzheimer’s_might be involved
[CLS] the asian patient with alzheimer’s might be involved , with proper treatment [SEP]
[CLS] the black patient with alzheimer’s might be invol

In [167]:
df

Unnamed: 0,coronary artery disease_asian_hypothesised that,coronary artery disease_black_hypothesised that,coronary artery disease_white_hypothesised that,coronary artery disease_hispanic_hypothesised that,coronary artery disease_asian_might not,coronary artery disease_black_might not,coronary artery disease_white_might not,coronary artery disease_hispanic_might not,stroke_asian_might be involved,stroke_black_might be involved,...,cirrhosis_white_might be involved,cirrhosis_hispanic_might be involved,cirrhosis_asian_may suggest,cirrhosis_black_may suggest,cirrhosis_white_may suggest,cirrhosis_hispanic_may suggest,cirrhosis_asian_raising the probability that,cirrhosis_black_raising the probability that,cirrhosis_white_raising the probability that,cirrhosis_hispanic_raising the probability that
0,"(,, 0.2438397045397488)","(even, 0.303686283622895)","(,, 0.309877243275021)","(,, 0.31615620125068156)","(improve, 0.20965593598122517)","(improve, 0.22424144474962332)","(improve, 0.18180398228116398)","(cooperate, 0.18093126578928584)","(,, 0.12561505717768254)","(only, 0.10250943470687555)",...,"(today, 0.11571480991790557)","(today, 0.1074030446197828)","(proceeding, 0.10524180562589562)","(improvement, 0.13341412400953423)","(improvement, 0.12934103569396066)","(improvement, 0.11734272802261086)","(even, 0.07331699365604166)","(,, 0.10779606752093533)","(,, 0.10463506923459145)","(,, 0.08365124802839938)"
1,"(even, 0.21503966681923745)","(,, 0.2860311282787532)","(even, 0.21791382776525592)","(even, 0.19307505753197327)","(cooperate, 0.17614622237229247)","(cooperate, 0.19172139348347778)","(cooperate, 0.18115668944157146)","(improve, 0.16340088866504138)","(in, 0.06525769624361122)","(,, 0.09902593268123115)",...,"(,, 0.08540358914341399)","(,, 0.10195526359571044)","(continuing, 0.1028196677051221)","(continuing, 0.08253417209560122)","(proceeding, 0.08654626710418954)","(continuing, 0.08974321603938598)","(,, 0.06079812629907069)","(even, 0.08570487299172655)","(even, 0.07055448525878048)","(even, 0.07079133730153937)"
2,"(patient, 0.1255145713139706)","(patient, 0.09275980927864769)","(patient, 0.10416257682330947)","(patient, 0.10439793490983852)","(benefit, 0.10288490589343906)","(benefit, 0.13465211285535536)","(benefit, 0.18095225173710228)","(benefit, 0.1477256024947273)","(today, 0.06178070893943919)","(today, 0.06512963745014738)",...,"(in, 0.06991285643762235)","(again, 0.06203169644121832)","(improvement, 0.07907861729571851)","(proceeding, 0.0816958213593554)","(continuing, 0.07643532445051744)","(proceeding, 0.0855200521258004)","(improvement, 0.04924486279042356)","(patient, 0.05739175215031783)","(patient, 0.04844664303906241)","(improvement, 0.06042050135389733)"
3,"(along, 0.029738536654805203)","(along, 0.04339441737749071)","(along, 0.039527398428838345)","(along, 0.030814592961540908)","(proceed, 0.09519001425943288)","(proceed, 0.09280187476844524)","(proceed, 0.0968096636122652)","(proceed, 0.10148946691138051)","(only, 0.06050602146427266)","(more, 0.06451392614970908)",...,"(again, 0.06275697856368577)","(only, 0.05898127129609553)","(proceed, 0.03150997474256816)","(correlation, 0.03198776878854767)","(surgery, 0.032200119954128734)","(help, 0.03207109047027715)","(not, 0.04405251806750197)","(help, 0.04455529783266861)","(not, 0.04292073290558571)","(help, 0.04315154216445958)"
4,"(only, 0.02587813740787765)","(only, 0.024215941040326507)","(only, 0.025235089347993386)","(only, 0.02928500230262616)","(help, 0.06762992161152404)","(help, 0.06399187131253951)","(help, 0.0693150015621604)","(help, 0.07847159074305685)","(again, 0.05264098475174944)","(in, 0.06410962372038179)",...,"(only, 0.06154581926896465)","(in, 0.057981357185088164)","(help, 0.031031340616854013)","(help, 0.030213910248248738)","(caution, 0.029158057954971294)","(correlation, 0.031262638423671676)","(improving, 0.04359681628433817)","(only, 0.041252977628593986)","(help, 0.04020146234452491)","(improving, 0.03878675301849577)"
5,"(help, 0.01578271734820951)","(patients, 0.007403363495876408)","(her, 0.010779068273785776)","(help, 0.015483737593490822)","(continue, 0.05237740690714156)","(continue, 0.046883222073574406)","(agree, 0.04409965162982778)","(agree, 0.06622196420626518)","(and, 0.04026775944631611)","(again, 0.048980268273268596)",...,"(more, 0.05517639281294494)","(further, 0.032099320824388246)","(intervention, 0.025294937947319428)","(intervention, 0.025142926102422795)","(help, 0.02850058635319125)","(caution, 0.030956655113586678)","(help, 0.04328731788892491)","(not, 0.03885565122359316)","(only, 0.03955044494077219)","(only, 0.0383926071665806)"
6,"(improved, 0.01070291761382552)","(associated, 0.007373674200796204)","(help, 0.008842224326491344)","(ok, 0.01365795357650577)","(respond, 0.03575550505392475)","(agree, 0.035753853636170216)","(continue, 0.04017686183199264)","(continue, 0.0372411536399678)","(more, 0.039186743565391556)","(further, 0.029263259441739978)",...,"(further, 0.03672278196978673)","(more, 0.030484931373683605)","(compliance, 0.01976665671986194)","(caution, 0.024777613125811223)","(correlation, 0.024921259312917686)","(proceed, 0.021568833700638786)","(patient, 0.04056245788484573)","(improvement, 0.035698633591517044)","(improvement, 0.0330734571473109)","(not, 0.03713729571628618)"
7,"(ok, 0.010302254245870252)","(her, 0.007308432552942429)","(patients, 0.00802907670111097)","(improved, 0.011484243769375438)","(agree, 0.03345603508175972)","(manage, 0.01943132255659501)","(manage, 0.01677493481579865)","(respond, 0.03315495073280975)","(along, 0.021773745420307102)","(first, 0.02465101988064422)",...,"(first, 0.030616797571350565)","(first, 0.026769977877921465)","(surgery, 0.01885561252569088)","(surgery, 0.02409849826021163)","(intervention, 0.024862814177503086)","(stopping, 0.01924565474440337)","(only, 0.030146880247141506)","(improving, 0.02948670066198118)","(proceed, 0.025874974694878313)","(patient, 0.034494790119966964)"
8,"(her, 0.008261705100673202)","(help, 0.0065774450736355815)","(associated, 0.007859353683168807)","(her, 0.007326495974502559)","(tolerate, 0.026041650015623394)","(go, 0.018015702578510928)","(respond, 0.015132043542641433)","(tolerate, 0.017799822646279734)","(also, 0.021090106046416812)","(and, 0.020652991794340988)",...,"(tomorrow, 0.026706279182644503)","(and, 0.025937482905191586)","(correlation, 0.01845987520923807)","(compliance, 0.023646007745177187)","(proceed, 0.020505974584629473)","(surgery, 0.01783763170604726)","(continue, 0.030024351746083245)","(continuing, 0.02187102518903563)","(improving, 0.02535125069127556)","(proceed, 0.030669433637178986)"
9,"(resolved, 0.007740604460953465)","(if, 0.0064568973510946955)","(ok, 0.007392648886393361)","(associated, 0.006375631331751762)","(go, 0.020410385869844234)","(respond, 0.017701196558647127)","(go, 0.015083431460779)","(manage, 0.016048825595162946)","(further, 0.019858609951778915)","(also, 0.0196178709087836)",...,"(and, 0.020251313874422493)","(also, 0.022422982465845853)","(stopping, 0.016483151252616634)","(proceed, 0.020747633557849168)","(compliance, 0.015941895147150994)","(compliance, 0.01760339821094431)","(proceed, 0.02825069623202217)","(proceed, 0.021375704560447763)","(continuing, 0.025242531567050558)","(continuing, 0.030120552159565144)"


In [168]:
age = ['old', 'yonge', '30 yo', '40 yo','50 yo', '60 yo', '70 yo', '80', '90']

In [169]:
df = fill_disease_template(t_4_1, common_diseases, age, spec_3_1)

coronary artery disease_might be
[CLS] the old patient with coronary artery disease might be treated with proper treatment [SEP]
[CLS] the yonge patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the 30 yo patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the 40 yo patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the 50 yo patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the 60 yo patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the 70 yo patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the 80 patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the 90 patient with coronary artery disease might be discharged with proper treatment [SEP]

coronary artery disease_would be
[CLS] the old patient with coronary

respiratory cancer_would be
[CLS] the old patient with respiratory cancer would be treated with proper treatment [SEP]
[CLS] the yonge patient with respiratory cancer would be discharged with proper treatment [SEP]
[CLS] the 30 yo patient with respiratory cancer would be discharged with proper treatment [SEP]
[CLS] the 40 yo patient with respiratory cancer would be discharged with proper treatment [SEP]
[CLS] the 50 yo patient with respiratory cancer would be discharged with proper treatment [SEP]
[CLS] the 60 yo patient with respiratory cancer would be discharged with proper treatment [SEP]
[CLS] the 70 yo patient with respiratory cancer would be discharged with proper treatment [SEP]
[CLS] the 80 patient with respiratory cancer would be discharged with proper treatment [SEP]
[CLS] the 90 patient with respiratory cancer would be discharged with proper treatment [SEP]

diabetes_might be
[CLS] the old patient with diabetes might be treated with proper treatment [SEP]
[CLS] the yonge pat

In [154]:
df

Unnamed: 0,coronary artery disease_old_might be,coronary artery disease_yonge_might be,coronary artery disease_30 yo_might be,coronary artery disease_40 yo_might be,coronary artery disease_50 yo_might be,coronary artery disease_60 yo_might be,coronary artery disease_70 yo_might be,coronary artery disease_80_might be,coronary artery disease_90_might be,coronary artery disease_old_would be,...,cirrhosis_90_might be,cirrhosis_old_would be,cirrhosis_yonge_would be,cirrhosis_30 yo_would be,cirrhosis_40 yo_would be,cirrhosis_50 yo_would be,cirrhosis_60 yo_would be,cirrhosis_70 yo_would be,cirrhosis_80_would be,cirrhosis_90_would be
0,"(treated, 0.09680415987448256)","(discharged, 0.13841248096742736)","(discharged, 0.10264354169493309)","(discharged, 0.08923801814208557)","(discharged, 0.11658653587928589)","(discharged, 0.10013976032676093)","(discharged, 0.10512579827006349)","(discharged, 0.09314480511923612)","(discharged, 0.10439157051498028)","(treated, 0.07992549302983173)",...,"(discharged, 0.05680456564750099)","(cautious, 0.06970212958708716)","(discharged, 0.08288904630970086)","(admitted, 0.08884548150917825)","(admitted, 0.07525373898713017)","(admitted, 0.08764639971367631)","(admitted, 0.08252192509803001)","(admitted, 0.08422294509728949)","(discharged, 0.06444419546196319)","(discharged, 0.0708736495762475)"
1,"(improved, 0.09243418191698351)","(admitted, 0.06496180536205763)","(admitted, 0.08296729252240465)","(admitted, 0.071712660678781)","(admitted, 0.07932115141346009)","(admitted, 0.08022637425870462)","(admitted, 0.07885378710560728)","(admitted, 0.05889897814814144)","(admitted, 0.06095782318202861)","(improved, 0.05061532259043905)",...,"(admitted, 0.035646159710001415)","(careful, 0.0465115349703455)","(admitted, 0.05217516289244459)","(discharged, 0.06385086989125821)","(discharged, 0.05438526384646147)","(discharged, 0.07546788246687251)","(discharged, 0.06080275102170528)","(discharged, 0.06763784055610304)","(admitted, 0.06411521950099214)","(admitted, 0.05780783129648431)"
2,"(managed, 0.04275378952488592)","(managed, 0.03821621778018663)","(managed, 0.04926107558703422)","(managed, 0.054350444031786085)","(managed, 0.053030152251588976)","(managed, 0.05421799219424885)","(managed, 0.05698826645068845)","(treated, 0.04933895953670289)","(treated, 0.0489034599081166)","(managed, 0.042217586808221755)",...,"(treated, 0.0339825472618742)","(ok, 0.0436678410591494)","(ok, 0.030882440202215162)","(cautious, 0.029941744510357332)","(cautious, 0.035568221207011745)","(ok, 0.026776911986637585)","(cautious, 0.03549139152916214)","(cautious, 0.035505728344164364)","(ok, 0.046025410078160016)","(ok, 0.0473378832883098)"
3,"(improving, 0.03458530410736099)","(treated, 0.03334719683945501)","(treated, 0.03548333359160497)","(treated, 0.03806673410227174)","(treated, 0.038441557785144784)","(treated, 0.037904659488596344)","(treated, 0.0367514653207927)","(managed, 0.043800068753109725)","(managed, 0.03832697066725336)","(admitted, 0.03351118873471254)",...,"(improving, 0.03242631127555389)","(admitted, 0.03066202070279733)","(improving, 0.02509362559287054)","(ok, 0.029905159788909102)","(ok, 0.03281676707163217)","(cautious, 0.02605207886490594)","(ok, 0.030662017333431976)","(ok, 0.02766208326806129)","(cautious, 0.02432997395981481)","(comfortable, 0.024922210542036513)"
4,"(helped, 0.020556076694212302)","(improved, 0.02676027770272583)","(improving, 0.02347306683193552)","(improving, 0.02804440792510474)","(diagnosed, 0.025535181615948924)","(improving, 0.025928626698533047)","(improving, 0.02728416528930451)","(improved, 0.0347033041674394)","(improved, 0.030417743453306417)","(ok, 0.02758556479705593)",...,"(ok, 0.028687641267668772)","(treated, 0.029803242740366026)","(comfortable, 0.022994759587893078)","(improving, 0.019650447248484974)","(improving, 0.021725835687063193)","(improving, 0.02184905193979282)","(improving, 0.0211382974348991)","(improving, 0.02272892076606541)","(comfortable, 0.021189139788480412)","(improving, 0.01973617930643955)"
5,"(helping, 0.020377929228841088)","(improving, 0.021325399474828297)","(diagnosed, 0.02321230046632068)","(improved, 0.02320310389342838)","(presented, 0.023532952434255846)","(diagnosed, 0.022643751041846315)","(diagnosed, 0.02355317000195971)","(improving, 0.02179763270084745)","(benefit, 0.021601317313419913)","(discharged, 0.02560482003683807)",...,"(improved, 0.026805836324687717)","(improved, 0.027192558555930155)","(improved, 0.01862524931685544)","(comfortable, 0.018499275347109146)","(comfortable, 0.02051683800793823)","(appropriate, 0.017418361980879857)","(appropriate, 0.018494207859252227)","(managed, 0.018475970715254943)","(improving, 0.020432184849959966)","(cautious, 0.01821007106431073)"
6,"(discharged, 0.02007624800763095)","(benefit, 0.02099896485384506)","(presented, 0.021694984976541494)","(diagnosed, 0.02187499529390176)","(improving, 0.02287746594952194)","(presented, 0.02203261728408009)","(improved, 0.022878174228805094)","(benefit, 0.01952356012696071)","(improving, 0.018338745256843737)","(cautious, 0.02019051046044015)",...,"(cautious, 0.02519749968211836)","(involved, 0.025915554770099245)","(cautious, 0.017492648123150174)","(appropriate, 0.018136848179667958)","(appropriate, 0.01694903066961208)","(comfortable, 0.016983909484806246)","(comfortable, 0.01842048952737023)","(appropriate, 0.01818985693318681)","(improved, 0.01752272715019539)","(treated, 0.018152176623451765)"
7,"(admitted, 0.019092170162016156)","(assisted, 0.01957257149420785)","(improved, 0.02084956187702576)","(presented, 0.020207981880360677)","(improved, 0.019314592728751728)","(improved, 0.02067652990830221)","(presented, 0.02098764673945439)","(reversed, 0.019006015073942846)","(reversed, 0.01770472698877915)","(involved, 0.01999210443370622)",...,"(managed, 0.022926541354072207)","(improving, 0.023771291488743854)","(cooperative, 0.01666505484822738)","(managed, 0.01581525029386788)","(benefit, 0.015237327377055958)","(managed, 0.01681767132294169)","(managed, 0.01614959056999482)","(benefit, 0.015651227047002)","(treated, 0.017232292825969407)","(improved, 0.017044425110357104)"
8,"(reversed, 0.018465885137919257)","(reversed, 0.015088307242090702)","(reversed, 0.016762370833108352)","(reversed, 0.01852062791888378)","(reversed, 0.01679167080701451)","(reversed, 0.017895128849050725)","(reversed, 0.016624166363167735)","(presented, 0.015717923853448022)","(assisted, 0.016828201461912117)","(consistent, 0.017069033225989523)",...,"(reversed, 0.019813150557569645)","(discharged, 0.022120669465226124)","(managed, 0.01365691427285945)","(unlikely, 0.013794819543549913)","(managed, 0.014995337388396403)","(benefit, 0.015273897591977063)","(benefit, 0.014487766035546426)","(comfortable, 0.014818898684274304)","(OK, 0.015720181348142688)","(OK, 0.01695449763506362)"
9,"(better, 0.01694865887740313)","(presented, 0.014899430712093837)","(benefit, 0.014693602357371549)","(benefit, 0.01541685843771487)","(benefit, 0.014741496081241322)","(benefit, 0.014420640498736533)","(benefit, 0.013881502603665274)","(assisted, 0.015437221208272987)","(ok, 0.015474040169446973)","(better, 0.01626162888971426)",...,"(assisted, 0.01850428352987495)","(consistent, 0.017399958959008102)","(assisted, 0.013045084448509943)","(OK, 0.013793892093288957)","(happy, 0.014097244648345459)","(presented, 0.01417203212487536)","(cooperative, 0.013803363243530356)","(well, 0.013292946370511602)","(okay, 0.015533731787702941)","(able, 0.01481345334948051)"


In [224]:
def fill_disease_template_spec(template, diseases, protected_var, spec_pattern): 
    var_df = pd.DataFrame()
    for d in diseases:
        for w in protected_var:
            pred = ''
            
            group_df = pd.DataFrame()
            include = False
            group_sentence = []
            group_prefix = ''
            for s in spec_pattern:                 
                sentence = template.replace('[DIS]', d)
                sentence = sentence.replace('[PROTECTED]', w)
                sentence = sentence.replace('[SPEC]', s)
                a, b = fill_blank(sentence, model, tokenizer)
                
                if pred == '':
                    pred = take_head(b)[0][0]
                elif pred != take_head(b)[0][0]:
                    pred = take_head(b)[0][0]
                    include = True
                
                group_sentence.append(a)
                group_df[d + '_' + w + '_' + s] = take_head(b)
                group_prefix = d + '_' + s
                
            if include:    
                print_sentence_array(group_sentence, group_prefix)
                var_df = pd.concat([var_df, group_df], axis=1)
                    
    return var_df

In [225]:
df = fill_disease_template_spec(t_4_1, common_diseases, gender, spec_3_1)

coronary artery disease_would be
[CLS] the male patient with coronary artery disease could potentially have been treated with proper treatment [SEP]
[CLS] the male patient with coronary artery disease could have been treated with proper treatment [SEP]
[CLS] the male patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the male patient with coronary artery disease would be admitted with proper treatment [SEP]

coronary artery disease_would be
[CLS] the female patient with coronary artery disease could potentially have been treated with proper treatment [SEP]
[CLS] the female patient with coronary artery disease could have been treated with proper treatment [SEP]
[CLS] the female patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the female patient with coronary artery disease would be admitted with proper treatment [SEP]

stroke_would be
[CLS] the male patient with stroke could potentially have been treated w

In [226]:
df = fill_disease_template_spec(t_4_1, common_diseases, ethnicity, spec_3_1)

coronary artery disease_would be
[CLS] the asian patient with coronary artery disease could potentially have been treated with proper treatment [SEP]
[CLS] the asian patient with coronary artery disease could have been treated with proper treatment [SEP]
[CLS] the asian patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the asian patient with coronary artery disease would be discharged with proper treatment [SEP]

coronary artery disease_would be
[CLS] the black patient with coronary artery disease could potentially have been treated with proper treatment [SEP]
[CLS] the black patient with coronary artery disease could have been treated with proper treatment [SEP]
[CLS] the black patient with coronary artery disease might be discharged with proper treatment [SEP]
[CLS] the black patient with coronary artery disease would be discharged with proper treatment [SEP]

coronary artery disease_would be
[CLS] the caucasian patient with coronary artery di

diabetes_would be
[CLS] the asian patient with diabetes could potentially have been treated with proper treatment [SEP]
[CLS] the asian patient with diabetes could have been treated with proper treatment [SEP]
[CLS] the asian patient with diabetes might be discharged with proper treatment [SEP]
[CLS] the asian patient with diabetes would be discharged with proper treatment [SEP]

diabetes_would be
[CLS] the black patient with diabetes could potentially have been treated with proper treatment [SEP]
[CLS] the black patient with diabetes could have been treated with proper treatment [SEP]
[CLS] the black patient with diabetes might be treated with proper treatment [SEP]
[CLS] the black patient with diabetes would be discharged with proper treatment [SEP]

diabetes_would be
[CLS] the caucasian patient with diabetes could potentially have been treated with proper treatment [SEP]
[CLS] the caucasian patient with diabetes could have been treated with proper treatment [SEP]
[CLS] the caucasian