# Investigation into medication patient safety events

Proof of concept to see whether Natural Language Processing (NLP) can find the themes within medication patient safety events.

This notebook is one of the main examples of using NLP in this project.

Start with loading all necessary libraries.

In [None]:
import pandas as pd # for manipulating data in dataframes
import pyodbc # for reading sql into pandas
import numpy as np # for numerical calculations
from collections import Counter # for counting the number of words in dictionaries
import re # for finding regular expressions in text

# Import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk import word_tokenize # to break sentences into words
from nltk.stem import WordNetLemmatizer # to find the lemmas of words

# Import NMF
from sklearn.decomposition import NMF

from wordcloud import WordCloud # to visualise wordclous
import matplotlib.pyplot as plt # other visualisations
from datetime import date
import math

# Change the start and end dates for your report here!

In [None]:
# Reporting window: first and last calendar day of the period to report on.
start_date = date(year=2019, month=9, day=1)
end_date = date(year=2019, month=11, day=30)

In [None]:
# Convert both report dates to text (str() of a datetime.date is 'YYYY-MM-DD').
# Re-running this cell is harmless because str() of a str is unchanged.
start_date, end_date = str(start_date), str(end_date)

Define medical dictionary

In [None]:
# Build a word-frequency table from the 'term' column of the SNOMED CT description
# snapshot. The table is used later as part of the spelling-correction vocabulary.
snomedct = pd.read_csv('sct2_Description_Snapshot-en_INT_20190731.txt',sep="\t",usecols=['term'])
medical_terms_series = snomedct['term'].str.lower().str.split().dropna()

# Strip surrounding brackets BEFORE the alphabetic filter. Filtering first (as was done
# previously) discards every bracketed word such as "(disorder)", so the strip never
# had anything to act on.
medical_terms_list = [
    stripped_word
    for term_words in medical_terms_series
    for stripped_word in (word.strip("()") for word in term_words)
    if stripped_word.isalpha()
]
medical_terms_counts = Counter(medical_terms_list)

# Show only the most frequent words; printing the whole Counter floods the output.
print(medical_terms_counts.most_common(20))

Read SQL into pandas dataframe

In [None]:
# Read the medication incidents (location description, incident date, severity,
# medication administered, notes, action taken) into a dataframe.
sql_conn = pyodbc.connect('DRIVER={SQL Server};'
                            'SERVER=L_AAGname;'
                            'DATABASE=database_name;'
                            'Trusted_Connection=yes')
query = "set transaction isolation level read uncommitted select b.description,a.inc_dincident,a.inc_severity,c.imed_name_admin,a.inc_notes,a.inc_actiontaken from DatixCRM.dbo.code_locactual b join DatixCRM.dbo.incidents_main a on a.inc_locactual=b.code join DatixCRM.dbo.inc_medications c on a.recordid=c.inc_id"
try:
    df = pd.read_sql(query, sql_conn)
finally:
    # Always release the database connection, even if the query fails.
    sql_conn.close()

# Drop incomplete rows, then collapse the per-medication rows of one incident into a
# single row whose 'imed_name_admin' is the space-joined list of its medications.
df = (
    df.dropna()
      .groupby(['description','inc_dincident','inc_severity','inc_notes','inc_actiontaken'])['imed_name_admin']
      .apply(' '.join)
      .reset_index()
      .rename(columns={'description': 'location'})  # first column holds the location text
)
df

Find the abbreviations that doctors use

In [None]:
# SNOMED CT, pattern "word word ... (ABBR)": a run of words directly followed by a
# bracketed token of two or more capital letters.
consecutive_caps_after = snomedct['term'].str.findall(r"((?:\b[A-Za-z]+\b\s)+\([A-Z][A-Z]+\))")
has_matches = consecutive_caps_after.astype(str) != '[]'
consecutive_caps_after = consecutive_caps_after[has_matches].dropna()
slist = [match for matches in consecutive_caps_after for match in matches]

term_to_abbreviation_dict = {}
for l in slist:
    inside_brackets = re.findall(r"\(([A-Za-z]+)\)", l)[0]
    len_inside_brackets = len(inside_brackets)
    num_words = len(l.split()) - 1
    # Only consider matches with exactly one preceding word per abbreviation letter.
    if len_inside_brackets != num_words:
        continue
    space_separated = l.split(" ")
    words_before_brackets = [
        space_separated[-offset - 2] for offset in reversed(range(len_inside_brackets))
    ]
    string_before_brackets = " ".join(words_before_brackets)
    # The abbreviation must spell out the initials of the words, ignoring case.
    initials_match = all(
        word.lower()[0] == letter.lower()
        for word, letter in zip(words_before_brackets, inside_brackets)
    )
    if not initials_match:
        continue
    # First definition wins: never overwrite a known term or reuse a known abbreviation.
    already_known = (
        string_before_brackets in term_to_abbreviation_dict
        or inside_brackets in term_to_abbreviation_dict.values()
    )
    if not already_known:
        term_to_abbreviation_dict[string_before_brackets] = inside_brackets
        

# SNOMED CT, pattern "ABBR (word word ...)": a token followed by two or more bracketed words.
consecutive_caps_before = snomedct['term'].str.findall(r"[A-Za-z][A-Za-z]+\s\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)")
has_matches = consecutive_caps_before.astype(str) != '[]'
consecutive_caps_before = consecutive_caps_before[has_matches].dropna()
slist = [match for matches in consecutive_caps_before for match in matches]
for l in slist:
    bracketed_part = re.findall(r"\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)", l)[0]
    inside_brackets = bracketed_part.strip('()')
    words_inside_brackets = inside_brackets.split()
    num_words_inside_brackets = len(words_inside_brackets)
    word_before_brackets = l.split()[0]
    # One bracketed word per letter of the candidate abbreviation.
    if num_words_inside_brackets != len(word_before_brackets):
        continue
    # Each letter must be the initial of the corresponding word, ignoring case.
    if not all(
        word.lower()[0] == letter.lower()
        for word, letter in zip(words_inside_brackets, word_before_brackets)
    ):
        continue
    # First definition wins for both the term and the abbreviation.
    if inside_brackets in term_to_abbreviation_dict or word_before_brackets in term_to_abbreviation_dict.values():
        continue
    term_to_abbreviation_dict[inside_brackets] = word_before_brackets
            

# SNOMED CT, pattern "ABBR - word word ...": a token, a dash, then two or more words.
consecutive_caps_dash = snomedct['term'].str.findall(r"[A-Za-z][A-Za-z]+\s\-\s(?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+")
consecutive_caps_dash = consecutive_caps_dash[consecutive_caps_dash.astype(str)!='[]']
consecutive_caps_dash = consecutive_caps_dash.dropna()
slist = []
for x in consecutive_caps_dash:
    slist.extend(x)
for l in slist:
    # Split on the same "<whitespace>-<whitespace>" separator the regex matched, so a
    # non-space whitespace character cannot cause an IndexError.
    word_before_dash, after_dash = re.split(r"\s-\s", l, maxsplit=1)
    # The regex may swallow one trailing whitespace character; strip it so the stored
    # term has no trailing space (the incident-text version of this pattern strips too).
    after_dash = after_dash.strip()
    words_after_dash = after_dash.split()
    num_words_after_dash = len(words_after_dash)
    # One word per abbreviation letter, each letter being that word's initial (case-insensitive).
    if num_words_after_dash==len(word_before_dash) and all(words_after_dash[i].lower()[0]==word_before_dash[i].lower() for i in range(num_words_after_dash)):
        # NOTE(review): unlike the other patterns, this one does not check whether the
        # abbreviation is already present among the values - confirm that is intended.
        if after_dash not in term_to_abbreviation_dict.keys():
            term_to_abbreviation_dict[after_dash] = word_before_dash



# Incident text, pattern "word word ... (abbr)": same idea as for SNOMED CT, but searched
# in the location, notes and action-taken columns. Location words may also contain '&'.
words_then_bracket = r"((?:\b[A-Za-z]+\b\s)+\([A-Za-z][A-Za-z]+\))"
consecutive_caps_series_location = df['location'].str.findall(r"((?:\b[A-Za-z&]+\b\s)+\([A-Za-z][A-Za-z]+\))")
consecutive_caps_series_notes = df['inc_notes'].str.findall(words_then_bracket)
consecutive_caps_series_action = df['inc_actiontaken'].str.findall(words_then_bracket)
consecutive_caps_after = pd.concat([consecutive_caps_series_location,consecutive_caps_series_notes,consecutive_caps_series_action])
has_matches = consecutive_caps_after.astype(str) != '[]'
consecutive_caps_after = consecutive_caps_after[has_matches].dropna()
slist = [match for matches in consecutive_caps_after for match in matches]

for l in slist:
    inside_brackets = re.findall(r"\(([A-Za-z]+)\)", l)[0]
    len_inside_brackets = len(inside_brackets)
    tokens = l.split()
    num_words = len(tokens) - 1
    # Only consider matches with exactly one preceding word per abbreviation letter.
    if len_inside_brackets != num_words:
        continue
    # Every token except the final bracketed one.
    words_before_brackets = tokens[:-1]
    string_before_brackets = " ".join(words_before_brackets)
    # The abbreviation must spell out the initials of the words, ignoring case.
    if not all(
        word.lower()[0] == letter.lower()
        for word, letter in zip(words_before_brackets, inside_brackets)
    ):
        continue
    # First definition wins for both the term and the abbreviation.
    if string_before_brackets in term_to_abbreviation_dict or inside_brackets in term_to_abbreviation_dict.values():
        continue
    term_to_abbreviation_dict[string_before_brackets] = inside_brackets
        

# Incident text, pattern "abbr (word word ...)", searched in location, notes and action taken.
abbr_then_bracketed_words = r"[A-Za-z][A-Za-z]+\s\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)"
consecutive_caps_series_location = df['location'].str.findall(abbr_then_bracketed_words)
consecutive_caps_series_notes = df['inc_notes'].str.findall(abbr_then_bracketed_words)
consecutive_caps_series_action = df['inc_actiontaken'].str.findall(abbr_then_bracketed_words)
consecutive_caps_before = pd.concat([consecutive_caps_series_location,consecutive_caps_series_notes,consecutive_caps_series_action])
has_matches = consecutive_caps_before.astype(str) != '[]'
consecutive_caps_before = consecutive_caps_before[has_matches].dropna()
slist = [match for matches in consecutive_caps_before for match in matches]
for l in slist:
    bracketed_part = re.findall(r"\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)", l)[0]
    inside_brackets = bracketed_part.strip('()')
    words_inside_brackets = inside_brackets.split()
    num_words_inside_brackets = len(words_inside_brackets)
    word_before_brackets = l.split()[0]
    # One bracketed word per letter of the candidate abbreviation.
    if num_words_inside_brackets != len(word_before_brackets):
        continue
    # Each letter must be the initial of the corresponding word, ignoring case.
    if not all(
        word.lower()[0] == letter.lower()
        for word, letter in zip(words_inside_brackets, word_before_brackets)
    ):
        continue
    # First definition wins for both the term and the abbreviation.
    if inside_brackets in term_to_abbreviation_dict or word_before_brackets in term_to_abbreviation_dict.values():
        continue
    term_to_abbreviation_dict[inside_brackets] = word_before_brackets
            

# Incident text, pattern "abbr - word word ...", searched in location, notes and action taken.
abbr_dash_words = r"[A-Za-z][A-Za-z]+\s\-\s(?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+"
consecutive_caps_series_location = df['location'].str.findall(abbr_dash_words)
consecutive_caps_series_notes = df['inc_notes'].str.findall(abbr_dash_words)
consecutive_caps_series_action = df['inc_actiontaken'].str.findall(abbr_dash_words)
consecutive_caps_dash = pd.concat([consecutive_caps_series_location,consecutive_caps_series_notes,consecutive_caps_series_action])
has_matches = consecutive_caps_dash.astype(str) != '[]'
consecutive_caps_dash = consecutive_caps_dash[has_matches].dropna()
slist = [match for matches in consecutive_caps_dash for match in matches]
for l in slist:
    after_dash = l.split('-')[1]
    words_after_dash = after_dash.split()
    num_words_after_dash = len(words_after_dash)
    word_before_dash = l.split(' - ')[0]
    # One word after the dash per letter of the candidate abbreviation.
    if num_words_after_dash != len(word_before_dash):
        continue
    # Each letter must be the initial of the corresponding word, ignoring case.
    if not all(
        word.lower()[0] == letter.lower()
        for word, letter in zip(words_after_dash, word_before_dash)
    ):
        continue
    # Remove the whitespace left around the term by the split and the regex.
    after_dash = after_dash.strip()
    # First definition wins for both the term and the abbreviation.
    if after_dash in term_to_abbreviation_dict or word_before_dash in term_to_abbreviation_dict.values():
        continue
    term_to_abbreviation_dict[after_dash] = word_before_dash
            
# Remove automatically detected pairs whose "abbreviation" is a known false positive.
# The values are listed once each (the previous elif chain tested 'insulatard' twice,
# the second test being unreachable).
exact_values_to_drop = {
    'OD', 'PIVOTAL', 'fresh', 'West', 'methylprednisolone', 'cetraben', 'Levemir',
    'Desmopressin', 'insulatard', 'missing', 'SS', 'Tramadol', 'Tuesday', 'cloudy',
    'stable', 'Solent', 'carer',
}
# These are compared after lower-casing the detected value.
lowercase_values_to_drop = {'oxynorm', 'oramorph', 'eprex'}

# Iterate over a copy so entries can be deleted from the original while looping.
for key,val in dict(term_to_abbreviation_dict).items():
    lowered_val = val.lower()
    if (
        val in exact_values_to_drop
        or lowered_val in lowercase_values_to_drop
        or lowered_val.startswith('pri')
    ):
        del term_to_abbreviation_dict[key]
# Hand-written entries, added (or overriding detected ones) in this order. Keys are later
# used as regular-expression patterns and values as their replacement text; the last
# entries expand oxygen-related shorthand rather than abbreviate it.
term_to_abbreviation_dict.update({
    'Intravenous Antibiotics': 'IV',
    'Intravenous': 'IV',
    'Morphine sulphate MR': 'MS',
    'Morphine sulphate': 'MS',
    'milligram': 'mg',
    'Department of Critical Care': 'DCCQ',
    'mau': 'amu',
    'controlled drug': 'cd',
    'patient(.{1,3})own drug': "pod",
    'twice a day': "bd",
    'twice daily': "bd",
    'to take out': "tto",
    'mino2': "min o2",
    'lo2': "l o2",
    ' o2': " oxygen",
    ' po2': " partial pressure of oxygen",
    'spo2': "peripheral capillary oxygen saturation",
    'sao2': "oxygen saturation",
    'fio2': "fraction of inspired oxygen",
})

#term_to_abbreviation_dict = {key.lower():val.lower() for (key,val) in term_to_abbreviation_dict.items()}
# Print every pair as a LaTeX table row: "<term> & <abbreviation>\\".
for term, abbreviation in term_to_abbreviation_dict.items():
    print(" & ".join((term, abbreviation)) + '\\\\')

In [None]:
# Count how often each lower-cased replacement value occurs in the dictionary.
abbreviations = list(map(str.lower, term_to_abbreviation_dict.values()))
abbreviation_counts = Counter(abbreviations)
print(abbreviation_counts)

Remove entries with no recorded severity, and entries with an incident date before 1st April 2016

In [None]:
# Keep only incidents that have a severity and happened on or after 1st April 2016.
# Comparing with '>= 2016-04-01' (rather than '> 2016-03-31') also excludes 31st March
# incidents that carry a time of day later than midnight.
has_severity = df['inc_severity'] != ''
on_or_after_cutoff = df['inc_dincident'] >= '2016-04-01'
# .copy() gives an independent frame so that adding columns later does not raise
# SettingWithCopyWarning.
df = df[has_severity & on_or_after_cutoff].copy()
df

Define spelling correction tool

In [None]:
def words(text):
    "All runs of word characters in `text`, lower-cased, in order of appearance."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Vocabulary for the spelling corrector: general English, medical terms and abbreviations.
WORDS = Counter()
# Read the English corpus inside a context manager so the file handle is closed.
with open('big.txt') as corpus_file:
    english_WORDS = Counter(words(corpus_file.read()))
# dict.update OVERWRITES the count of a word that is already present, whereas
# Counter.update would add the counts together; later sources therefore take precedence.
dict.update(WORDS,english_WORDS)
dict.update(WORDS,medical_terms_counts)
dict.update(WORDS,abbreviation_counts)
# Register the codes A1..G9 (floor letter followed by a digit 1-9) as known words.
floors = ['A','B','C','D','E','F','G']
for floor in floors:
    for i in range(9):
        WORDS[floor+str(i+1)] = 1
# Extra terms that should never be "corrected".
WORDS['nomad'] = 1
WORDS['gik'] = 1
print(WORDS)

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    `N` defaults to the total of all counts in WORDS at the moment this function is
    defined; words absent from WORDS get a frequency of 0.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    "Return the candidate spelling of `word` that has the highest probability P."
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Possible spellings for `word`, preferring the closest known ones.

    Returns the first non-empty option among: the word itself if known, known words one
    edit away, known words two edits away; otherwise the unknown word in a list.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away
    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away
    return [word]

def known(words):
    "Set of the entries of `words` that are present in the WORDS vocabulary."
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Set of all strings one edit away from `word`.

    An edit is deleting one character, swapping two adjacent characters, replacing one
    character with a lower-case letter, or inserting a lower-case letter.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    edits = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insert a letter at the cut position (valid even when the tail is empty).
        for letter in alphabet:
            edits.add(head + letter + tail)
        if not tail:
            continue
        # Delete the first character of the tail.
        edits.add(head + tail[1:])
        # Replace the first character of the tail.
        for letter in alphabet:
            edits.add(head + letter + tail[1:])
        # Swap the first two characters of the tail.
        if len(tail) > 1:
            edits.add(head + tail[1] + tail[0] + tail[2:])
    return edits

def edits2(word):
    "Generator over all strings two edits away from `word` (duplicates are possible)."
    first_edits = edits1(word)
    return (second for first in first_edits for second in edits1(first))

Add any words that you don't want to appear in the wordclouds to the list passed to `union`, e.g. `union(['useless','word'])`

In [None]:
# Project-specific words to ignore, in addition to scikit-learn's English stop words.
extra_stop_words = ["patient","patients","pt","pharmacy","medicine","kd","mso","event","reported","recoded","coding","did","insulin"]
my_stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)

Define American->British correction

In [None]:
def american_to_british(tokens):
    """Yield each token after applying suffix-based American-to-British rewrites.

    The rules are purely pattern based and applied in order to every token, so they can
    also alter words that are not American spellings (e.g. "doctor" becomes "doctour").
    """
    rewrite_rules = (
        (r"(...)or$", r"\1our"),
        (r"([bt])er$", r"\1re"),
        (r"([iy])z(e[drs]|e$|ing|ation)", r"\1s\2"),
        # convert back words starting with s like size, seize
        (r"^(s.?[iy])s(e[drs]|e$|ing|ation)", r"\1z\2"),
        (r"og$", "ogue"),
    )
    for token in tokens:
        for pattern, replacement in rewrite_rules:
            token = re.sub(pattern, replacement, token)
        yield token
        
class CustomVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokens are passed through `american_to_british`."""

    def build_tokenizer(self):
        """Return a tokenizer that applies the parent's tokenizer, then the spelling rewrite."""
        base_tokenize = super().build_tokenizer()

        def tokenize_british(doc):
            return list(american_to_british(base_tokenize(doc)))

        return tokenize_british

In [None]:
# Map ward names to ward codes using locations written as "<letter A-G><digit> <name>".
ward_num_series = df['location'].str.findall(r"[A-G][0-9]\s.+")
ward_num_series = ward_num_series[ward_num_series.map(len) > 0]
slist = [match for matches in ward_num_series for match in matches]
ward_name_to_num_dict = {}
for l in slist:
    # First token is the ward code; everything after it is the ward name.
    ward_code, *name_parts = l.split()
    name = " ".join(name_parts).strip("- ")
    ward_name_to_num_dict[name.lower()] = ward_code.lower()
# Manual entry that the pattern above cannot detect.
ward_name_to_num_dict['dccq'] = 'e5'

Add words that come out wrong after lemmatization, like {'dos':'dose'}

In [None]:
# Replacements applied to the note text before tokenisation (used as regex patterns).
pre_correction_dict = {
    ' dos ': ' dose ',
    ' doses ': ' dose ',
    ' ttos ': ' tto ',
    ' cds ': ' cd ',
    ' discharged ': ' discharge ',
    'non clinical': '',
}
# Fixes applied to the final vocabulary for tokens that come out of the pipeline wrong.
corrected_lemma_dict = {
    'stat': 'stated',
    'errour': 'error',
    'doctour': 'doctor',
    'floour': 'floor',
}

Define tokenizer

In [None]:
class LemmaTokenizer:
    """Callable tokenizer: alphabetic tokens, minus stop words, lemmatized and spell-corrected."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the processed tokens of `doc`, dropping results of one character or less."""
        processed = []
        for token in word_tokenize(doc):
            # Skip anything that is not purely alphabetic, and stop words.
            if not token.isalpha() or token in my_stop_words:
                continue
            corrected = correction(self.wnl.lemmatize(token))
            if len(corrected) > 1:
                processed.append(corrected)
        return processed

In [None]:
# Create the TF-IDF vectorizer: documents are tokenized by LemmaTokenizer, and
# CustomVectorizer then applies the American->British rewrite to every token.
tfidf = CustomVectorizer(tokenizer=LemmaTokenizer())

In [None]:
# Add the incident month as "YYYY-MM" text. assign() returns a new frame, which avoids
# the SettingWithCopyWarning raised when a column is set on a boolean-filtered frame.
df = df.assign(month_year=df['inc_dincident'].map(lambda x: x.strftime('%Y-%m')))
df

In [None]:
# Select the insulin-related incidents: the notes must mention an insulin keyword AND
# the administered medication must match one of the listed identifiers.
insulin_keywords = 'insulin|dka|gik|glucose| bm|dextrose|hypog|hyperg'
# presumably medication codes for insulin products - TODO confirm against the source system
insulin_medication_pattern = '32|33|34|1197|1201|1202|1203|1204|1205|1206|1207|1208|1209|1210|1211|1218|1219|1220|1952|2343|2414|2575'

# The notes are not lower-cased yet at this point, so match case-insensitively;
# otherwise "Insulin", "DKA" or "GIK" written with capitals would be missed.
mentions_insulin = df['inc_notes'].str.contains(insulin_keywords, case=False)
df_insulin = df[mentions_insulin].copy()

# NOTE(review): this is a substring match, so '32' also matches e.g. '1320' - confirm
# whether whole-identifier matching is wanted.
has_insulin_medication = df_insulin['imed_name_admin'].str.contains(insulin_medication_pattern)
# .copy() so that later column assignments act on an independent frame.
df_insulin = df_insulin[has_insulin_medication].copy()

In [None]:
# Normalise the note text: lower-case it, then apply the three replacement dictionaries
# in order (keys are treated as regular expressions).
# NOTE(review): the text is lower-cased first, so dictionary keys containing capital
# letters (e.g. 'Intravenous') cannot match - confirm this is intended.
normalised_notes = df_insulin['inc_notes'].str.lower()
for replacements in (term_to_abbreviation_dict, pre_correction_dict, ward_name_to_num_dict):
    normalised_notes = normalised_notes.replace(replacements, regex=True)
df_insulin['inc_notes'] = normalised_notes

In [None]:
# Number of insulin incidents per month, in chronological order.
month_year_value_counts = df_insulin['month_year'].value_counts().sort_index()
# Severity as a categorical listed from least to most severe, so sorting follows that order.
severity_levels = ["NMISS", "NONE", "LOW", "MODRTE", "SEVERE", "DEATH"]
df_insulin['inc_severity'] = pd.Categorical(df_insulin['inc_severity'], categories=severity_levels)
df_insulin = df_insulin.sort_values(by='inc_severity')

In [None]:
# Bar/wedge colour for each severity level.
color_dict = {'NMISS':'b','NONE':'g','LOW':'y','MODRTE':'orange','SEVERE':'r','DEATH':'k'}

def severity_over_time_plot(dataframe):
    """Show a stacked bar chart of incidents per month, one stack layer per severity.

    `dataframe` needs the columns 'inc_severity' and 'month_year'. The months on the
    x-axis come from the global `month_year_value_counts`.
    """
    severities_present = dataframe['inc_severity'].unique()
    severity_counts = dataframe['inc_severity'].value_counts()[severities_present]
    month_years = list(month_year_value_counts.index)
    # Histogram edges: every month plus a far-future sentinel closing the last bin.
    bins = month_years + ['9999-12']
    plt.figure(figsize=(17,14))
    cumulative_bottom = 0
    for severity in severity_counts.index:
        months_of_severity = dataframe[dataframe['inc_severity']==severity]['month_year'].values
        height_severity_total,_ = np.histogram(months_of_severity, bins=bins)
        plt.bar(month_years, height_severity_total, bottom=cumulative_bottom, label=severity, color=color_dict[severity]);
        # Stack the next severity on top of the bars drawn so far.
        cumulative_bottom += height_severity_total
    plt.xticks(rotation='vertical');
    plt.legend()
    plt.title('Insulin Safety Learning Events by month')
    plt.show()

severity_over_time_plot(df_insulin)

In [None]:
# Learn the vocabulary from the normalised insulin notes and turn every note into a
# TF-IDF vector; the result is stored in csr_mat.
csr_mat = tfidf.fit_transform(df_insulin['inc_notes'])

In [None]:
# Get the vocabulary learnt by the vectorizer, in feature-column order.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when it exists and fall back to the old method on older versions.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
# Repair tokens that the pipeline produces wrongly (e.g. 'doctour' -> 'doctor').
# NOTE: from here on the name `words` refers to this list, no longer to the words()
# helper defined in the spelling-correction cell.
words = [corrected_lemma_dict.get(str(t), str(t)) for t in feature_names]

In [None]:
# Print the full vocabulary (after the manual corrections) for inspection.
print(words)

In [None]:
def print_wordclouds_and_plots():
    """Show, for every theme, a word cloud of its top words next to a donut chart.

    Reads the globals `theme_counts_series`, `theme_counts_max` and `components_df`.
    Appends each theme's top words to the global `wordcloud_words` (in the order the
    themes are iterated) and adds words not seen before to `unique_wordcloud_words`.
    """
    total_incidents = theme_counts_series.values.sum()
    for donut_seg, (i, counts) in enumerate(theme_counts_series.items()):
        # Initialize the word cloud
        width = 1024 #int(1024*counts/theme_counts_max)
        height = 720 #int(720*counts/theme_counts_max)
        wc = WordCloud(
            background_color="white",
            width = width,
            height = height
        )

        # Select row : component
        component = components_df.iloc[i]

        # Top-weighted words of this theme, computed once. The index is converted to str
        # on the object that is actually used (previously it was set on a temporary).
        top_words = component.nlargest()
        top_words.index = top_words.index.map(str)

        # Generate the cloud
        wc.generate_from_frequencies(top_words)
        for word in top_words.index[:5]:
            if word not in unique_wordcloud_words:
                unique_wordcloud_words.append(word)
        wordcloud_words.append(list(top_words.index))

        # Display the generated image:
        figure, (wc_fig, counts_fig) = plt.subplots(nrows=1,ncols=2, figsize=(width/50,height/100))
        wc_fig.imshow(wc, interpolation='bilinear')
        wc_fig.axis("off");

        # Donut chart: only the segment of the current theme is coloured and labelled.
        counts_fig.axis('equal')
        colors = ['w' for j in theme_counts_series.index]
        colors[donut_seg] = 'b'
        labels = ['' for val in theme_counts_series.values]
        labels[donut_seg] = str(counts)+"/"+str(total_incidents)
        mypie, texts = counts_fig.pie(theme_counts_series.values/theme_counts_max, colors=colors, labels=labels, startangle=90, counterclock=False)
        for label_text in texts: label_text.set_fontsize(20)
        plt.setp( mypie, width=0.4, edgecolor='black')
        plt.tight_layout()
        plt.show()

        # Disabled per-theme plots, kept for reference:
        # df_max_feature = df_insulin[df_insulin['max_feature']==str(i)].copy()
        #
        # severity_over_time_plot(df_max_feature)
        #
        # df_max_feature_date = df_max_feature[df_max_feature['inc_dincident'] >= start_date].copy()
        # df_max_feature_date = df_max_feature_date[df_max_feature_date['inc_dincident'] < end_date]
        # severity_pie(df_max_feature_date)

        print('-------------------------------------------------------------------------------------------------------------------')

In [None]:
# Reset the accumulators filled by print_wordclouds_and_plots().
unique_wordcloud_words = []
wordcloud_words = []
oxygen_keyword = False
k = 10
# The loop body runs exactly once: oxygen_keyword is set to True at the end of the
# first pass (k is still incremented afterwards).
while not oxygen_keyword:
    # Create an NMF instance: model
    # A fixed random_state makes the factorisation, and therefore the themes,
    # reproducible between runs.
    model = NMF(n_components=k, random_state=0)

    # Fit the model to articles
    model.fit(csr_mat)

    # Transform the articles: nmf_features
    nmf_features = model.transform(csr_mat)

    # Create a pandas DataFrame: one row per note, one column per theme
    df_nmf = pd.DataFrame(nmf_features,index=df_insulin['inc_notes'])

    # Create a DataFrame: one row per theme, one column per vocabulary word
    components_df = pd.DataFrame(model.components_,columns=words)

    # Assign every note to the theme with the largest weight.
    df_nmf.columns = df_nmf.columns.astype(str)
    df_nmf['max_feature'] = df_nmf.idxmax(axis=1)

    df_insulin['max_feature'] = df_nmf['max_feature'].values
    theme_counts_series = df_insulin['max_feature'].value_counts()
    theme_counts_series.index = theme_counts_series.index.astype(int)
    theme_counts_max = theme_counts_series.values.max()

    print("number themes: "+str(k))
    print_wordclouds_and_plots()

    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")

    oxygen_keyword=True
    k += 1

In [None]:
# Restrict to the report period, INCLUDING the whole of end_date: the upper bound is
# the day after end_date, compared exclusively. (Comparing with '< end_date' dropped
# every incident that happened on the last day of the period.)
end_date_exclusive = str((pd.Timestamp(end_date) + pd.Timedelta(days=1)).date())
in_report_period = (
    (df_insulin['inc_dincident'] >= start_date)
    & (df_insulin['inc_dincident'] < end_date_exclusive)
)
df_date = df_insulin[in_report_period].copy()

In [None]:
def severity_pie(dataframe):
    """Show a pie chart of the incident severities in `dataframe`.

    `dataframe` needs the columns 'inc_severity' and 'inc_dincident'; the title states
    the month and two-digit year of the earliest and latest incident.
    """
    severities_present = dataframe['inc_severity'].unique()
    severity_counts = dataframe['inc_severity'].value_counts()[severities_present]
    wedge_colours = [color_dict[severity] for severity in severity_counts.index]
    plt.pie(severity_counts, colors=wedge_colours, startangle=90, counterclock=False);
    plt.legend(list(severity_counts.index), loc=(1,0.5))
    first_incident = dataframe['inc_dincident'].min()
    last_incident = dataframe['inc_dincident'].max()
    period_text = first_incident.strftime('%b %y') + ' - ' + last_incident.strftime('%b %y')
    plt.title('Severity of Insulin SLE\n' + period_text)
    plt.show()

severity_pie(df_date)

In [None]:
def category_plot(dataframe):
    """Show a horizontal bar chart of the number of incidents per theme.

    `dataframe` needs the columns 'max_feature' (theme number as text) and
    'inc_dincident'. Each bar is labelled with the top words of its theme, read from
    the global `components_df`.
    """
    number_in_categories_within_date = dataframe['max_feature'].value_counts()
    ax = number_in_categories_within_date.plot.barh()
    ax.invert_yaxis()
    # Look the top words up by theme number. The list `wordcloud_words` is filled in
    # descending-count order, so indexing it by theme number labelled bars with the
    # words of a different theme.
    yticklabels = [
        ", ".join(map(str, components_df.iloc[int(theme)].nlargest().index))
        for theme in number_in_categories_within_date.index
    ]
    ax.set_yticklabels(yticklabels);
    first_incident = dataframe['inc_dincident'].min()
    last_incident = dataframe['inc_dincident'].max()
    period_text = first_incident.strftime('%b %y') + ' - ' + last_incident.strftime('%b %y')
    plt.title('Category of Insulin SLE\n' + period_text)

category_plot(df_date)