# Investigation into diagnoses

Proof of concept to see whether hospitals can move from free-text diagnoses written by doctors to a dropdown menu of choices.

This notebook is one of the main examples of using NLP in this project.

Start with loading all necessary libraries.

In [None]:
import sys
!{sys.executable} -m pip install --upgrade --user pip
!{sys.executable} -m pip install pandas pyodbc numpy sklearn nltk wordcloud matplotlib --user

In [None]:
import pandas as pd # for manipulating data in dataframes
import pyodbc # for reading sql into pandas
import numpy as np # for numerical calculations
from collections import Counter # for counting the number of words in dictionaries
import re # for finding regular expressions in text

# Import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk import word_tokenize # to break sentences into words
from nltk.stem import WordNetLemmatizer # to find the lemmas of words

# Import NMF
from sklearn.decomposition import NMF

from wordcloud import WordCloud # to visualise wordclous
import matplotlib.pyplot as plt # other visualisations

Define medical dictionary

In [None]:
# Build a word-frequency table from the SNOMED CT description file. The counts are
# merged into the spelling-correction vocabulary further down.
snomedct = pd.read_csv('sct2_Description_Snapshot-en_INT_20190731.txt',sep="\t",usecols=['term'])
medical_terms_series = snomedct['term'].str.lower().str.split().dropna()

# Strip surrounding brackets BEFORE the alphabetic filter. The previous order filtered
# with isalpha() first, which had already thrown away every word that carried a
# bracket (e.g. "(disorder)"), so the later strip() calls never did anything.
medical_terms_list = [
    stripped_word
    for term_words in medical_terms_series
    for stripped_word in (word.strip("()") for word in term_words)
    if stripped_word.isalpha()
]
medical_terms_counts = Counter(medical_terms_list)

# Show only the most frequent words; printing the whole Counter floods the notebook.
print(medical_terms_counts.most_common(20))

Read SQL into pandas dataframe

In [None]:
# Read the diagnosis free text together with specialty, admission/discharge dates and
# AMU triage from the BedView database (Windows trusted authentication).
sql_conn = pyodbc.connect('DRIVER={SQL Server};'
                            'SERVER=L_AAGwebapptest;'
                            'DATABASE=BedView;'
                            'Trusted_Connection=yes')
query = "set transaction isolation level read uncommitted select cn.Diagnosis,ps.SpecialtyDesc,ps.AdmissionDate,ps.DischargeDate,pn.AmuTriage from tblClinicalNote cn inner join tblPatientSpell ps on ps.pkPatientSpellID=cn.fkPatientSpellID inner join tblPatientNote pn on pn.fkPatientSpellID=ps.pkPatientSpellID"
try:
    df = pd.read_sql(query, sql_conn)
finally:
    # Always release the database connection, even when the query fails.
    sql_conn.close()
# Rows with any missing column are discarded.
df = df.dropna()
df

In [None]:
#df = df[df.index < 10]
# Case-insensitive, literal (non-regex) search for one spelled-out diagnosis.
diagnosis_lower = df['Diagnosis'].str.lower()
mentions_acs = diagnosis_lower.str.contains("acute coronary syndrome", regex=False)
df_19 = df.loc[mentions_acs]
print(df_19['Diagnosis'].values)

Find the abbreviations that doctors use

In [None]:
# Harvest "spelled-out term -> acronym" pairs, first from SNOMED CT and then from the
# diagnoses themselves. Three notations are recognised; a pair is accepted only when
# the acronym has exactly one letter per word and every letter is that word's initial.
ACRONYM_AFTER_TERM = r"((?:\b[A-Za-z]+\b\s)+\([A-Z][A-Z]+\))"  # "acute coronary syndrome (ACS)"
ACRONYM_BEFORE_BRACKETS = r"[A-Za-z][A-Za-z]+\s\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)"  # "ACS (acute coronary syndrome)"
ACRONYM_BEFORE_DASH = r"[A-Za-z][A-Za-z]+\s\-\s(?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+"  # "ACS - acute coronary syndrome"


def _find_phrases(text_series, pattern):
    """Yield every substring matching `pattern` in every entry of `text_series`.

    Entries for which pandas' `str.findall` yields NaN (missing / non-string
    values) are skipped.
    """
    for found in text_series.str.findall(pattern).dropna():
        yield from found


def _is_acronym_of(acronym, term_words):
    """True when `acronym` has one letter per word and each letter is that word's initial."""
    return len(acronym) == len(term_words) and all(
        word[0].lower() == letter.lower() for word, letter in zip(term_words, acronym)
    )


def _register(mapping, term_words, acronym, skip_known_acronyms=True):
    """Add `" ".join(term_words) -> acronym` to `mapping` unless it is invalid or already known.

    The key is rebuilt from the individual words so it never carries leading,
    trailing or irregular whitespace. The first pair seen for a term wins. With
    `skip_known_acronyms` a pair is also ignored when the acronym is already the
    value of another term.
    """
    if not _is_acronym_of(acronym, term_words):
        return
    term = " ".join(term_words)
    if term in mapping:
        return
    if skip_known_acronyms and acronym in mapping.values():
        return
    mapping[term] = acronym


def _harvest_acronyms(text_series, mapping, dash_skips_known_acronyms=True):
    """Scan `text_series` for the three notations, in a fixed order, and fill `mapping` in place."""
    for phrase in _find_phrases(text_series, ACRONYM_AFTER_TERM):
        # The last whitespace-separated token is "(ABC)"; everything before it is the term.
        *term_words, bracketed = phrase.split()
        _register(mapping, term_words, bracketed.strip("()"))

    for phrase in _find_phrases(text_series, ACRONYM_BEFORE_BRACKETS):
        acronym, _, bracketed = phrase.partition("(")
        _register(mapping, bracketed.rstrip(")").split(), acronym.strip())

    for phrase in _find_phrases(text_series, ACRONYM_BEFORE_DASH):
        # Partitioning on the dash itself works whatever whitespace surrounds it.
        acronym, _, expansion = phrase.partition("-")
        _register(mapping, expansion.split(), acronym.strip(),
                  skip_known_acronyms=dash_skips_known_acronyms)


term_to_abbreviation_dict = {}
# NOTE(review): the original SNOMED dash-notation loop was the only one that did not
# check whether the acronym was already in use; that is preserved here via the flag.
# Confirm whether the omission was intentional.
_harvest_acronyms(snomedct['term'], term_to_abbreviation_dict, dash_skips_known_acronyms=False)
_harvest_acronyms(df['Diagnosis'], term_to_abbreviation_dict)
        
# Hand-written substitutions added on top of the harvested pairs. The dictionary is
# later passed to Series.replace(..., regex=True), so every KEY is a regular
# expression and every VALUE is a replacement template.
# NOTE(review): most keys are not anchored to word boundaries, so they also match
# inside longer words; only the clearly harmful ones are anchored below.
term_to_abbreviation_dict[r'\btrop\b'] = 'troponin'  # \b: do not rewrite "hypertrophy"/"atrophy"/"troponin"
term_to_abbreviation_dict['inf ex'] = 'infective ex'
term_to_abbreviation_dict['inf asthma'] = 'infective asthma'
term_to_abbreviation_dict[' exa '] = ' exacerbation '
term_to_abbreviation_dict['exac[^a-z]'] = 'exacerbation '
# Leading \b on the "ex ..." keys: "complex of" / "index of" must not become "...acerbation of".
term_to_abbreviation_dict[r'\bex copd'] = 'exacerbation copd'
term_to_abbreviation_dict['copd ex '] = 'copd exacerbation '
term_to_abbreviation_dict[r'\bex of'] = 'exacerbation of'
term_to_abbreviation_dict[r'\bex asthma'] = 'exacerbation asthma'
term_to_abbreviation_dict[r'\bex due'] = 'exacerbation due'
term_to_abbreviation_dict[r'\bex chronic'] = 'exacerbation chronic'
term_to_abbreviation_dict['infective exacerbation'] = 'ie'
term_to_abbreviation_dict['infected exacerbation'] = 'ie'
term_to_abbreviation_dict['ie of copd'] = 'iecopd'
term_to_abbreviation_dict['ie copd'] = 'iecopd'
term_to_abbreviation_dict['ie- copd'] = 'iecopd'
term_to_abbreviation_dict['ie-copd'] = 'iecopd'
term_to_abbreviation_dict['pulmonary embolism'] = 'pe'
term_to_abbreviation_dict['nebuliser'] = 'neb'
term_to_abbreviation_dict['nebulizer'] = 'neb'
term_to_abbreviation_dict['nebulisers'] = 'nebs'
term_to_abbreviation_dict['nebulizers'] = 'nebs'
term_to_abbreviation_dict['influenza'] = 'flu'
term_to_abbreviation_dict['over dose'] = 'overdose'
term_to_abbreviation_dict['[^a-z]od[^a-z]'] = ' overdose '
term_to_abbreviation_dict['msk'] = 'musculoskeletal'
term_to_abbreviation_dict['o2 sat'] = 'oxygen saturation'
term_to_abbreviation_dict['o sat'] = 'oxygen saturation'
term_to_abbreviation_dict['sat of'] = 'saturation of'
term_to_abbreviation_dict['sat %'] = 'saturation %'
# The "?" must be escaped: unescaped, 'sat ?' means "sat" plus an optional space and
# therefore rewrote every "sat" (including the "saturation" inserted just above).
# The value previously ended in "%", apparently copied from the line above.
term_to_abbreviation_dict[r'sat \?'] = 'saturday ?'
term_to_abbreviation_dict['sat 0'] = 'saturday 0'
# Raw strings are required here: in a normal string "\1" and "\2" are the control
# characters chr(1)/chr(2), not backreferences. The last two patterns also started
# with a stray double quote, which meant they could never match.
term_to_abbreviation_dict[r'(\s)sat(\s[0-9][0-9]%)'] = r"\1saturated\2"
term_to_abbreviation_dict[r'(\s)sat(\s[0-9]\.)'] = r"\1saturday\2"
term_to_abbreviation_dict[r'(\s)sat(.{3,})sun'] = r"\1saturday\2sunday"
term_to_abbreviation_dict[r'(\s)sat(\s[0-9]{1,2}th)'] = r"\1saturday\2"

# NOTE(review): the diagnoses are lower-cased before this dictionary is applied, so
# harvested keys containing capitals cannot match unless the line below is re-enabled.
#term_to_abbreviation_dict = {key.lower():val.lower() for (key,val) in term_to_abbreviation_dict.items()}
# Print one LaTeX table row ("key & value\\") per entry.
for key,val in term_to_abbreviation_dict.items():
    print(key+" & "+val+'\\\\')

In [None]:
# In diagnoses that are not written entirely in capitals, find an upper-case token
# (two or more letters) sitting between two lower-case words.
mixed_case_diagnoses = df.loc[~df['Diagnosis'].str.isupper(), 'Diagnosis']
consecutive_caps_series = mixed_case_diagnoses.str.findall(r"\b[a-z]{2,}\b\s\b[A-Z]{2,}\b\s\b[a-z]{2,}\b")
has_match = consecutive_caps_series.map(len) > 0
consecutive_caps_series = consecutive_caps_series[has_match]
# Flatten the per-diagnosis match lists into one list of three-word phrases.
slist = [phrase for found in consecutive_caps_series for phrase in found]

# function to get unique values
def unique(list1):
    """Return the distinct second words of the phrases in `list1`, lower-cased.

    Each element of `list1` must be a string with at least two
    whitespace-separated words (an IndexError is raised otherwise). The result
    keeps first-seen order. De-duplication is done on the lower-cased word, so
    "ACS" and "Acs" count as the same entry; the previous version compared
    before lower-casing and could return the same value twice.
    """
    second_words = (phrase.split()[1].lower() for phrase in list1)
    # dict.fromkeys keeps insertion order and drops repeats in linear time.
    return list(dict.fromkeys(second_words))
        
# Lower-cased values of the term/abbreviation dictionary, used as the whitelist.
term_to_abbreviation_dict_lower = [value.lower() for value in term_to_abbreviation_dict.values()]

# Keep the middle (capitalised) word of each phrase when it is a known abbreviation.
abbreviations = []
for phrase in slist:
    middle_word = phrase.split()[1].lower()
    if middle_word in term_to_abbreviation_dict_lower:
        abbreviations.append(middle_word)

abbreviation_counts = Counter(abbreviations)
print(abbreviation_counts)

Remove entries where AmuTriage is -1

In [None]:
# Work on an independent copy of the rows whose AmuTriage is not the string '-1'.
has_amu_triage = df['AmuTriage'] != '-1'
df_AMU = df.loc[has_amu_triage].copy()
df_AMU

In [None]:
# Keep only spells admitted after the cut-off timestamp.
# NOTE(review): the reason for this particular cut-off is not recorded here.
admitted_after_cutoff = df_AMU['AdmissionDate'] > '2018-11-07 09:13:59'
df_AMU = df_AMU.loc[admitted_after_cutoff]
df_AMU

In [None]:
#df_nonAMU = df[df['AmuTriage']=='-1'].copy()
#df_nonAMU

Define spelling correction tool

In [None]:
def words(text):
    """Lower-case `text` and return its runs of word characters, in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Vocabulary for the spelling corrector: general English (big.txt) + SNOMED words +
# the abbreviations observed in the diagnoses.
WORDS = Counter()
# Read the corpus inside a `with` block so the file handle is closed again.
with open('big.txt') as corpus_file:
    english_WORDS = Counter(words(corpus_file.read()))
# dict.update OVERWRITES the count of a word that is already present, whereas
# Counter.update would add the counts together.
# NOTE(review): confirm that overwriting (later sources win) is the intended merge.
dict.update(WORDS,english_WORDS)
dict.update(WORDS,medical_terms_counts)
dict.update(WORDS,abbreviation_counts)
'''floors = ['A','B','C','D','E','F','G']
for floor in floors:
    for i in range(9):
        WORDS[floor+str(i+1)] = 1'''
# Show the vocabulary size and the top entries instead of dumping every word.
print(len(WORDS), WORDS.most_common(20))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    The default for `N` is evaluated once, when this function is defined, so WORDS
    must be fully populated before this cell is run.
    """
    count = WORDS[word]
    return count / N

def correction(word):
    """Return the candidate spelling of `word` that has the highest probability `P`."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible corrections for `word`, trying the closest matches first.

    Order of preference: the word itself if known, known words one edit away,
    known words two edits away, and finally the unchanged word in a list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away
    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away
    return [word]

def known(words):
    """Return, as a set, the members of `words` that are present in WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is a deletion, a transposition of two adjacent characters, a
    replacement by a lower-case ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # insertion at the cut point (also valid at the very end of the word)
        for letter in alphabet:
            results.add(head + letter + tail)
        if not tail:
            continue
        # deletion of the character right after the cut
        results.add(head + tail[1:])
        # replacement of that character
        for letter in alphabet:
            results.add(head + letter + tail[1:])
        # transposition of the two characters after the cut
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Return a generator over every string two edits away from `word` (repeats included)."""
    first_pass = edits1(word)
    return (second for first in first_pass for second in edits1(first))

Add words that you don't want in the word clouds, e.g. `union(['useless','word'])`.

In [None]:
# scikit-learn's English stop words plus any project-specific additions listed here.
extra_stop_words = []
my_stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)

Define American->British correction

In [None]:
def american_to_british(tokens):
    """Lazily yield each token after applying American -> British spelling rewrites.

    The regular-expression rules are applied to every token in the order listed.
    They are purely pattern based, so words that merely look American are
    rewritten as well (e.g. "catheter" becomes "cathetre").
    """
    rewrites = (
        (r"(...)or$", r"\1our"),
        (r"([bt])er$", r"\1re"),
        (r"([iy])z(e[drs]|e$|ing|ation)", r"\1s\2"),
        # convert back words starting with s like size, seize
        (r"^(s.?[iy])s(e[drs]|e$|ing|ation)", r"\1z\2"),
        (r"og$", "ogue"),
    )
    for token in tokens:
        for pattern, replacement in rewrites:
            token = re.sub(pattern, replacement, token)
        yield token
        
class CustomVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokens are passed through `american_to_british`."""

    def build_tokenizer(self):
        """Wrap the tokenizer built by the parent class so its output is re-spelled."""
        base_tokenizer = super().build_tokenizer()

        def tokenize_british(doc):
            return list(american_to_british(base_tokenizer(doc)))

        return tokenize_british

Add words that come out wrong after lemmatization, like {'dos':'dose'}

In [None]:
# Words that come out wrong from the text pipeline, mapped to the form wanted instead.
corrected_lemma_dict = {
    'cathetre': 'catheter',  # undo American->British in catheter
    'ac': 'acs',             # undo removal of plural ACS
}

In [None]:
# Lower-case the diagnoses, then apply both substitution dictionaries in turn.
# NOTE(review): with regex=True every key is searched as a regular expression anywhere
# in the text, so the short key 'ac' in corrected_lemma_dict also matches inside
# longer words such as "acute" - confirm this is acceptable before changing it.
df_AMU['Diagnosis'] = (
    df_AMU['Diagnosis']
    .str.lower()
    .replace(term_to_abbreviation_dict, regex=True)
    .replace(corrected_lemma_dict, regex=True)
)

In [None]:
# Show the diagnoses that contain this exact phrase.
# NOTE(review): 'corononary' may be a typo for 'coronary' - confirm which was intended.
df_AMU.loc[df_AMU['Diagnosis'].str.contains('acute corononary syndrome')]

Define counters to check amount of spelling correction needed

In [None]:
# Token tallies (English / medical / neither, before and after spelling correction),
# all starting at zero.
(english_words_before_correction,
 non_english_words_before_correction,
 english_words_after_correction,
 non_english_words_after_correction,
 medical_words_before_correction,
 medical_words_after_correction) = (0,) * 6

Define tokenizer

In [None]:
class LemmaTokenizer(object):
    """Callable tokenizer: tokenize, drop stop words, lemmatize, then spell-correct."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the cleaned tokens of `doc`.

        Only purely alphabetic tokens that are not in `my_stop_words` are kept.
        Each one is lemmatized and then passed through `correction`; results of
        a single character are dropped at the end.
        """
        corrected = []
        for raw_token in word_tokenize(doc):
            if not raw_token.isalpha() or raw_token in my_stop_words:
                continue
            lemma = self.wnl.lemmatize(raw_token)
            corrected.append(correction(lemma))

        # Disabled bookkeeping, kept as an inert string literal.
        '''
        global english_words_before_correction, non_english_words_before_correction, english_words_after_correction, non_english_words_after_correction, medical_words_before_correction, medical_words_after_correction
        for t in corrected_lemma:
            if len(t)>1:
                if t in english_WORDS: english_words_before_correction+=1
                elif t in medical_terms_counts: medical_words_before_correction+=1
                else: non_english_words_before_correction+=1
        for t in corrected:
            if len(t)>1:
                if t in english_WORDS: english_words_after_correction+=1
                elif t in medical_terms_counts: medical_words_after_correction+=1
                else: non_english_words_after_correction+=1
        '''
        return [token for token in corrected if len(token)>1]

In [None]:
# Build the TF-IDF vectorizer. LemmaTokenizer is passed in as the tokenizer, and
# CustomVectorizer.build_tokenizer wraps the tokenizer it gets from its parent class
# with american_to_british.
tfidf = CustomVectorizer(tokenizer=LemmaTokenizer())

In [None]:
# Fit the vectorizer on the cleaned diagnosis texts and keep the resulting
# document-term matrix in csr_mat.
csr_mat = tfidf.fit_transform(df_AMU['Diagnosis'])

In [None]:
# Get the words: words
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; prefer
# get_feature_names_out() and fall back only on old installations. The result is
# converted to a plain list so downstream code sees the same type either way.
# NOTE: this rebinds the name `words`, which until now referred to the helper
# function used to build the spelling vocabulary.
if hasattr(tfidf, 'get_feature_names_out'):
    words = list(tfidf.get_feature_names_out())
else:
    words = tfidf.get_feature_names()

In [None]:
# Print the vocabulary stored in `words` by the previous cell.
print(words)

In [None]:
'''
total_words_before_correction = english_words_before_correction + medical_words_before_correction + non_english_words_before_correction
english_percentage_before_correction = 100*english_words_before_correction/total_words_before_correction
non_english_percentage_before_correction = 100*(non_english_words_before_correction+medical_words_before_correction)/total_words_before_correction
plt.barh(1,english_percentage_before_correction,color='g')
plt.barh(1,non_english_percentage_before_correction,left=english_percentage_before_correction,color='r')
plt.text(english_percentage_before_correction/2,1,"{0:.0f}%".format(english_percentage_before_correction),ha='center')
plt.text(english_percentage_before_correction+non_english_percentage_before_correction/2,1,"{0:.0f}%".format(non_english_percentage_before_correction),ha='center')
plt.text(english_percentage_before_correction/2,0.4,"in\nenglish\ndictionary",ha='center')
plt.text(english_percentage_before_correction+non_english_percentage_before_correction/2,0.4,"not\nin\nenglish\ndictionary",ha='center')
plt.title('before spelling correction')
plt.gca().axis('off')
'''

In [None]:
'''
total_words_after_correction = english_words_after_correction + medical_words_after_correction + non_english_words_after_correction
english_percentage_after_correction = 100*english_words_after_correction/total_words_after_correction
non_english_percentage_after_correction = 100*(non_english_words_after_correction+medical_words_after_correction)/total_words_after_correction
plt.barh(1,english_percentage_after_correction,color='g')
plt.barh(1,non_english_percentage_after_correction,left=english_percentage_after_correction,color='r')
plt.text(english_percentage_after_correction/2,1,"{0:.0f}%".format(english_percentage_after_correction),ha='center')
plt.text(english_percentage_after_correction+non_english_percentage_after_correction/2,1,"{0:.0f}%".format(non_english_percentage_after_correction),ha='center')
plt.text(english_percentage_after_correction/2,0.4,"in\nenglish\ndictionary",ha='center')
plt.text(english_percentage_after_correction+non_english_percentage_after_correction/2,0.4,"not\nin\nenglish\ndictionary",ha='center')
plt.title('after spelling correction')
plt.gca().axis('off')
'''

In [None]:
'''
total_words_before_correction = english_words_before_correction + medical_words_before_correction + non_english_words_before_correction
english_percentage_before_correction = 100*english_words_before_correction/total_words_before_correction
medical_percentage_before_correction = 100*medical_words_before_correction/total_words_before_correction
non_english_percentage_before_correction = 100*(non_english_words_before_correction)/total_words_before_correction
plt.barh(1,english_percentage_before_correction,color='g')
plt.barh(1,medical_percentage_before_correction,left=english_percentage_before_correction,color='orange')
plt.barh(1,non_english_percentage_before_correction,left=english_percentage_before_correction+medical_percentage_before_correction,color='r')
plt.text(english_percentage_before_correction/2,1,"{0:.0f}% ".format(english_percentage_before_correction),ha='center')
plt.text(english_percentage_before_correction+medical_percentage_before_correction/2,1,"{0:.0f}%".format(medical_percentage_before_correction),ha='center')
plt.text(english_percentage_before_correction+medical_percentage_before_correction+non_english_percentage_before_correction/2,1,"{0:.0f}%".format(non_english_percentage_before_correction),ha='center')
plt.text(english_percentage_before_correction/2,0.4,"in\nenglish\ndictionary",ha='center')
plt.text(english_percentage_before_correction+medical_percentage_before_correction/2,0.4,"in\nmedical\ndictionary",ha='center')
plt.text(english_percentage_before_correction+medical_percentage_before_correction+non_english_percentage_before_correction/2,0.4,"in\nneither\ndictionary",ha='center')
plt.title('before spelling correction')
plt.gca().axis('off')
'''

In [None]:
'''
total_words_after_correction = english_words_after_correction + medical_words_after_correction + non_english_words_after_correction
english_percentage_after_correction = 100*english_words_after_correction/total_words_after_correction
medical_percentage_after_correction = 100*medical_words_after_correction/total_words_after_correction
non_english_percentage_after_correction = 100*(non_english_words_after_correction)/total_words_after_correction
plt.barh(1,english_percentage_after_correction,color='g')
plt.barh(1,medical_percentage_after_correction,left=english_percentage_after_correction,color='orange')
plt.barh(1,non_english_percentage_after_correction,left=english_percentage_after_correction+medical_percentage_after_correction,color='r')
plt.text(english_percentage_after_correction/2,1,"{0:.0f}% ".format(english_percentage_after_correction),ha='center')
plt.text(english_percentage_after_correction+medical_percentage_after_correction/2,1,"{0:.0f}%".format(medical_percentage_after_correction),ha='center')
#plt.text(english_percentage_after_correction+medical_percentage_after_correction+non_english_percentage_after_correction/2,1,"{0:.0f}%".format(non_english_percentage_after_correction),ha='center')
plt.text(english_percentage_after_correction/2,0.4,"in\nenglish\ndictionary",ha='center')
plt.text(english_percentage_after_correction+medical_percentage_after_correction/2,0.4,"in\nmedical\ndictionary",ha='center')
#plt.text(english_percentage_after_correction+medical_percentage_after_correction+non_english_percentage_after_correction/2,0.4,"in\nneither\ndictionary",ha='center')
plt.title('after spelling correction')
plt.gca().axis('off')
'''

In [None]:
wordcloud_words = []
def print_wordclouds_and_pies():
    """Draw, for every theme, a word cloud of its top terms next to a donut chart.

    Reads the module-level `theme_counts_series`, `theme_counts_max` and
    `components_df`, and appends the index of each theme's top terms to
    `wordcloud_words`. In the donut, only the segment of the current theme is
    coloured and labelled ("count/total").
    """
    total_count = theme_counts_series.values.sum()
    n_themes = len(theme_counts_series)
    for donut_seg, (i, counts) in enumerate(theme_counts_series.items()):
        # Initialize the word cloud
        width = 1024 #int(1024*counts/theme_counts_max)
        height = 720 #int(720*counts/theme_counts_max)
        wc = WordCloud(
            background_color="white",
            width = width,
            height = height
        )

        # Top terms of this component. nlargest() is computed once; the previous
        # code assigned the string index to a throw-away nlargest() result, which
        # had no effect, and recomputed nlargest() for every use.
        top_terms = components_df.iloc[i].nlargest()
        top_terms.index = top_terms.index.map(str)

        # Generate the cloud
        wc.generate_from_frequencies(top_terms.to_dict())
        wordcloud_words.append(top_terms.index)

        # Display the generated image:
        figure, (wc_fig, counts_fig) = plt.subplots(nrows=1,ncols=2, figsize=(width/50,height/100))
        wc_fig.imshow(wc, interpolation='bilinear')
        wc_fig.axis("off");

        counts_fig.axis('equal')
        colors = ['w'] * n_themes
        colors[donut_seg] = 'b'
        labels = [''] * n_themes
        labels[donut_seg] = str(counts)+"/"+str(total_count)
        mypie, texts = counts_fig.pie(theme_counts_series.values/theme_counts_max, colors=colors, labels=labels, startangle=90, counterclock=False)
        for label_text in texts: label_text.set_fontsize(20)
        # A wedge width below the radius turns the pie into a donut.
        plt.setp( mypie, width=0.4, edgecolor='black')
        plt.tight_layout()
        #plt.savefig('bedview_'+str(k)+'_'+str(i)+'_wordcloud.png')
        plt.show()

        print('-------------------------------------------------------------------------------------------------------------------')

In [None]:
# Number of admissions per AMU triage category (value_counts sorts by frequency).
triage_counts = df_AMU['AmuTriage'].value_counts()
triage_counts = triage_counts.rename({'-1':'not via AMU'})

# Marker area per category, and the y position of each square's centre: the
# squares are stacked by advancing half a side length before and after each one.
sizes = triage_counts*2
y = []
cumulative_y = 0
for size in sizes:
    half_side = np.sqrt(size)/2
    cumulative_y+=half_side
    y.append(cumulative_y)
    cumulative_y+=half_side

cm = plt.get_cmap('RdYlGn_r')
colors = cm(sizes/max(sizes))
# Category -> colour; reused by triage_specialty_plots further down.
triage_color_dict = {}

plt.figure(figsize=(17,14*y[-1]/400))
plt.ylim(max(y),np.sqrt(min(y)))
for i,triage in enumerate(triage_counts.index):
    triage_color_dict[triage] = colors[i]
    # .iloc makes the lookup explicitly positional; plain sizes[i] relied on a
    # deprecated fallback for integer keys on a non-integer index.
    marker_area = sizes.iloc[i]
    plt.scatter(x=0,y=y[i],s=marker_area,marker='s',color=triage_color_dict[triage])
    # Label a square only when it is far enough from the previous one.
    if i==0 or y[i]-y[i-1]>4: plt.text((y[-1]/500)*np.sqrt(marker_area)/10000,y[i],triage,va='center')
plt.gca().axis('off');
plt.title('AMU Triage')

In [None]:
# Number of admissions per discharge specialty (value_counts sorts by frequency).
specialty_counts = df_AMU['SpecialtyDesc'].value_counts()

# Marker area per specialty, and the y position of each circle's centre: the
# circles are stacked by advancing one radius before and after each one.
sizes = specialty_counts*4
y = []
cumulative_y = 0
for size in sizes:
    radius = np.sqrt(size/np.pi)
    cumulative_y+=radius
    y.append(cumulative_y)
    cumulative_y+=radius

cm = plt.get_cmap('RdYlGn_r')
colors = cm(sizes/max(sizes))
# Specialty -> colour; reused by triage_specialty_plots further down.
specialty_color_dict = {}

plt.figure(figsize=(17,14*y[-1]/600))
plt.ylim(max(y),np.sqrt(min(y)))
for i,specialty in enumerate(specialty_counts.index):
    specialty_color_dict[specialty] = colors[i]
    # .iloc makes the lookup explicitly positional; plain sizes[i] relied on a
    # deprecated fallback for integer keys on a non-integer index.
    marker_area = sizes.iloc[i]
    plt.scatter(x=0,y=y[i],s=marker_area,marker='o',color=specialty_color_dict[specialty])
    # Label a circle only when it is far enough from the previous one.
    if i==0 or y[i]-y[i-1]>5: plt.text((y[-1]/500)*np.sqrt(marker_area)/10000,y[i],specialty,va='center')
plt.gca().axis('off');
plt.title('Specialty at discharge')

In [None]:
def triage_specialty_plots(dataframe):
    """Plot AMU triage and discharge specialty counts of `dataframe` side by side.

    `dataframe` needs the columns 'AmuTriage' and 'SpecialtyDesc'. Colours are
    looked up in the module-level `triage_color_dict` and `specialty_color_dict`,
    so every category in `dataframe` must already have an entry there (KeyError
    otherwise). Nothing is drawn for an empty frame.
    """
    if dataframe.empty:
        # The layout below indexes the last stacked position, which does not exist.
        print('No rows to plot.')
        return

    triage_counts = dataframe['AmuTriage'].value_counts()

    # Squares: advance half a side length before and after each centre.
    triage_sizes = triage_counts*2
    triage_y = []
    cumulative_triage_y = 0
    for size in triage_sizes:
        half_side = np.sqrt(size)/2
        cumulative_triage_y+=half_side
        triage_y.append(cumulative_triage_y)
        cumulative_triage_y+=half_side


    specialty_counts = dataframe['SpecialtyDesc'].value_counts()

    # Circles: advance one radius before and after each centre.
    specialty_sizes = specialty_counts*4
    specialty_y = []
    cumulative_specialty_y = 0
    for size in specialty_sizes:
        radius = np.sqrt(size/np.pi)
        cumulative_specialty_y+=radius
        specialty_y.append(cumulative_specialty_y)
        cumulative_specialty_y+=radius

    # Display the generated image:
    figure, (triage_fig, specialty_fig) = plt.subplots(nrows=1,ncols=2, figsize=(8,14*triage_y[-1]/400))

    triage_fig.set_ylim(max(triage_y),np.sqrt(min(triage_y)))
    for i,triage in enumerate(triage_counts.index):
        # .iloc: explicit positional lookup (integer keys on a non-integer index
        # are a deprecated fallback of Series.__getitem__).
        marker_area = triage_sizes.iloc[i]
        triage_fig.scatter(x=0,y=triage_y[i],s=marker_area,marker='s',color=triage_color_dict[triage])
        if i==0 or triage_y[i]-triage_y[i-1]>4: triage_fig.text((triage_y[-1]/400)*np.sqrt(marker_area)/500,triage_y[i],triage,va='center')
    triage_fig.axis('off');
    triage_fig.set_title('AMU Triage')

    specialty_fig.set_ylim(max(specialty_y),np.sqrt(min(specialty_y)))
    for i,specialty in enumerate(specialty_counts.index):
        marker_area = specialty_sizes.iloc[i]
        specialty_fig.scatter(x=0,y=specialty_y[i],s=marker_area,marker='o',color=specialty_color_dict[specialty])
        # The label offset is scaled by triage_y (which also sets the figure height).
        if i==0 or specialty_y[i]-specialty_y[i-1]>5: specialty_fig.text((triage_y[-1]/400)*np.sqrt(marker_area)/400,specialty_y[i],specialty,va='center')
    specialty_fig.axis('off');
    specialty_fig.set_title('Specialty at discharge')

    plt.tight_layout()
    plt.show()

In [None]:
def print_wordclouds_and_plots():
    """Draw, for every theme, its word cloud, its donut chart and its triage/specialty plots.

    Reads the module-level `theme_counts_series`, `theme_counts_max`,
    `components_df` and `df_AMU` (which must have a 'max_feature' column), and
    appends the index of each theme's top terms to `wordcloud_words`.
    """
    total_count = theme_counts_series.values.sum()
    n_themes = len(theme_counts_series)
    for donut_seg, (i, counts) in enumerate(theme_counts_series.items()):
        # Initialize the word cloud
        width = 1024 #int(1024*counts/theme_counts_max)
        height = 720 #int(720*counts/theme_counts_max)
        wc = WordCloud(
            background_color="white",
            width = width,
            height = height
        )

        # Top terms of this component. nlargest() is computed once; the previous
        # code assigned the string index to a throw-away nlargest() result, which
        # had no effect, and recomputed nlargest() for every use.
        top_terms = components_df.iloc[i].nlargest()
        top_terms.index = top_terms.index.map(str)

        # Generate the cloud
        wc.generate_from_frequencies(top_terms.to_dict())
        wordcloud_words.append(top_terms.index)

        # Display the generated image:
        figure, (wc_fig, counts_fig) = plt.subplots(nrows=1,ncols=2, figsize=(width/50,height/100))
        wc_fig.imshow(wc, interpolation='bilinear')
        wc_fig.axis("off");

        counts_fig.axis('equal')
        colors = ['w'] * n_themes
        colors[donut_seg] = 'b'
        labels = [''] * n_themes
        labels[donut_seg] = str(counts)+"/"+str(total_count)
        mypie, texts = counts_fig.pie(theme_counts_series.values/theme_counts_max, colors=colors, labels=labels, startangle=90, counterclock=False)
        for label_text in texts: label_text.set_fontsize(20)
        # A wedge width below the radius turns the pie into a donut.
        plt.setp( mypie, width=0.4, edgecolor='black')
        plt.tight_layout()
        plt.show()

        # 'max_feature' holds the theme number as a string, hence str(i).
        df_max_feature = df_AMU[df_AMU['max_feature']==str(i)].copy()

        triage_specialty_plots(df_max_feature)

        print('-------------------------------------------------------------------------------------------------------------------')

Up to 17 components are tried because there are 17 possible AMU triage categories.

In [None]:
# Fixed seed so that re-running the notebook reproduces the same themes; NMF's
# initialisation and solver may otherwise use random numbers.
NMF_RANDOM_STATE = 0

ks = list(range(2,18))
for k in ks:
    # Create an NMF instance: model
    model = NMF(n_components=k, random_state=NMF_RANDOM_STATE)

    # Fit the model to the TF-IDF matrix of the diagnoses
    model.fit(csr_mat)

    # Per-diagnosis weight of every theme: nmf_features
    nmf_features = model.transform(csr_mat)

    # One row per diagnosis, one column per theme
    df_nmf = pd.DataFrame(nmf_features,index=df_AMU['Diagnosis'])

    # One row per theme, one column per vocabulary word
    components_df = pd.DataFrame(model.components_,columns=words)

    # Label each diagnosis with its strongest theme (column name as a string).
    df_nmf.columns = df_nmf.columns.astype(str)
    df_nmf['max_feature'] = df_nmf.idxmax(axis=1)

    # Written back by position, so df_AMU must still have the row order used to build csr_mat.
    df_AMU['max_feature'] = df_nmf['max_feature'].values
    theme_counts_series = df_AMU['max_feature'].value_counts()
    theme_counts_series.index = theme_counts_series.index.astype(int)
    theme_counts_max = theme_counts_series.values.max()

    print("number themes: "+str(k))
    print_wordclouds_and_plots()
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")