In [None]:
# Start with loading all necessary libraries
import numpy as np
import pandas as pd
import pyodbc

import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches
from matplotlib.container import ErrorbarContainer
from matplotlib.lines import Line2D
from matplotlib.collections import LineCollection
from matplotlib.ticker import MaxNLocator
from mpl_toolkits.axes_grid1 import make_axes_locatable

# Import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Perform the necessary imports
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Import NMF
from sklearn.decomposition import NMF
from sklearn.metrics import explained_variance_score

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

import nltk
#nltk.set_proxy('https://username:password@proxy:8080') # uncomment this with your username, password, proxy
#nltk.download('wordnet')
from nltk import word_tokenize,sent_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus.reader import wordnet

from lmfit.models import PolynomialModel, Model

# Import Dictionary
from gensim.corpora.dictionary import Dictionary

import matplotlib.image as mpimg
from PIL import Image
import pytesseract

import math
import scipy.stats as ss
import seaborn as sns
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Import figure from bokeh.plotting
from bokeh.plotting import figure as bokeh_figure

# Import output_file and show from bokeh.io
from bokeh.io import show, output_notebook

from bokeh.models import ColumnDataSource, Whisker, Band, Legend, LegendItem, Span, HoverTool

import re
from collections import Counter

In [None]:
# Build a word-frequency table from the SNOMED CT description snapshot.
snomedct = pd.read_csv('sct2_Description_Snapshot-en_INT_20190731.txt',sep="\t",usecols=['term'])
# Lower-case every term and split it on whitespace; rows with a missing term are dropped.
medical_terms_series = snomedct['term'].str.lower().str.split().dropna()
medical_terms_list = [token for tokens in medical_terms_series for token in tokens]
# Strip surrounding brackets BEFORE the alphabetic filter. Filtering first
# discards every bracketed token (e.g. "(disorder)"), which turns the strip
# into a no-op and loses those words.
medical_terms_list = [medical_term.strip("()") for medical_term in medical_terms_list]
medical_terms_list = [medical_term for medical_term in medical_terms_list if medical_term.isalpha()]
medical_terms_counts = Counter(medical_terms_list)
print(medical_terms_counts)

In [None]:
# Load the medication incidents, with unit, location and specialty descriptions.
incidents_connection_string = ';'.join([
    'DRIVER={SQL Server}',
    'SERVER=L_AAGname',
    'DATABASE=database_name',
    'Trusted_Connection=yes',
])
sql_conn = pyodbc.connect(incidents_connection_string)
query = "set transaction isolation level read uncommitted select a.recordid, b.description, c.description, d.description, a.inc_dincident,a.inc_time,a.inc_dreported,a.inc_submittedtime,a.inc_loctype,a.inc_result,a.inc_severity,a.show_other_contacts,a.show_employee,a.show_witness,a.show_document,a.inc_reportedby,a.inc_rep_email,a.inc_notes,a.inc_actiontaken from DatixCRM.dbo.code_unit b join DatixCRM.dbo.incidents_main a on a.inc_unit = b.code join DatixCRM.dbo.code_locactual c on a.inc_locactual = c.code join DatixCRM.dbo.code_specialty d on a.inc_specialty = d.code where a.inc_type='PAT' and a.inc_category='MEDIC' and a.inc_organisation='QA' and c.cod_parent2='QA'"
df = pd.read_sql(query, sql_conn, index_col='recordid')
# Keep only the part of the reporter's e-mail address before the "@".
df['inc_rep_email'] = df['inc_rep_email'].str.split("@",n=1,expand=True)[0]
# The first three columns are all called "description" in the query result
# (code_unit, code_locactual, code_specialty); give them distinct names.
column_list = ['division-care_group', 'location', 'specialty'] + list(df.columns)[3:]
df.columns = column_list
df.head()

In [None]:
# Load the contacts attached to incidents flagged show_employee='Y'.
contacts_connection_string = ';'.join([
    'DRIVER={SQL Server}',
    'SERVER=L_AAGDATIX',
    'DATABASE=DatixCRM',
    'Trusted_Connection=yes',
])
sql_conn = pyodbc.connect(contacts_connection_string)
query = "set transaction isolation level read uncommitted select a.recordid,c.description,b.con_email from contacts_main b join incidents_main a on a.recordid=b.recordid join code_con_type c on b.con_type=c.code where a.inc_type='PAT' and a.inc_category='MEDIC' and a.inc_organisation='QA' and a.show_employee='Y'"
df_con = pd.read_sql(query, sql_conn, index_col='recordid')
df_con.columns = ['employee_involved','email']
# Keep only the part of the e-mail address before the "@".
df_con['email'] = df_con['email'].str.split("@",n=1,expand=True)[0]
# Only contact types whose description mentions "Employee" are kept.
df_con = df_con[df_con['employee_involved'].str.contains('Employee')]

employee_role = df_con['employee_involved']
# NOTE(review): str.strip treats its argument as a SET of characters removed
# from both ends, not as a prefix/suffix - confirm the cleaned labels look as
# intended for every contact type.
for edge_characters in ('Employee (', 'Registered ', '\\)'):
    employee_role = employee_role.str.strip(edge_characters)
# Tidy the remaining label text (applied in this order).
label_substitutions = [
    ('administrative', 'administrative)'),
    (' / ', '/'),
    (' and ', ' & '),
    (' staff', ''),
]
for old_text, new_text in label_substitutions:
    employee_role = employee_role.str.replace(old_text, new_text)
df_con = df_con.assign(employee_involved=employee_role)
df_con

In [None]:
# Attach the employee contact columns (employee_involved, email) to each incident,
# matching on recordid. This is a left join, so incidents without a matching
# contact row are kept.
# NOTE(review): if an incident has several employee contacts the incident row is
# presumably repeated once per contact - confirm this is intended downstream.
df = df.join(df_con,on='recordid')
df

In [None]:
#df = df[df.index < 1000]
# index < 3491 is the highest number that doesn't give memory error during n_components loop

In [None]:
# SNOMED CT, pattern "expanded words (ABBR)": the bracket holds at least two capitals.
snomed_after_pattern = r"((?:\b[A-Za-z]+\b\s)+\([A-Z][A-Z]+\))"
snomed_after_matches = snomedct['term'].str.findall(snomed_after_pattern)
# Drop rows with no match (empty list) and rows whose term was missing.
consecutive_caps_after = snomed_after_matches[snomed_after_matches.astype(str)!='[]'].dropna()
slist = [match for matches in consecutive_caps_after for match in matches]

term_to_abbreviation_dict = {}
for phrase in slist:
    abbreviation = re.findall(r"\(([A-Za-z]+)\)", phrase)[0]
    letter_count = len(abbreviation)
    # Require one preceding word per abbreviation letter (the last token is "(ABBR)").
    if letter_count != len(phrase.split()) - 1:
        continue
    space_parts = phrase.split(" ")
    expansion_words = [space_parts[-offset-2] for offset in reversed(range(letter_count))]
    expansion = " ".join(expansion_words)
    # Each word must start with the corresponding abbreviation letter (case-insensitive).
    initials_agree = all(
        word.lower()[0]==letter.lower()
        for word, letter in zip(expansion_words, abbreviation)
    )
    if not initials_agree:
        continue
    # First occurrence wins, for both the expansion and the abbreviation.
    if expansion in term_to_abbreviation_dict.keys():
        continue
    if abbreviation in term_to_abbreviation_dict.values():
        continue
    term_to_abbreviation_dict[expansion] = abbreviation
        

# SNOMED CT, pattern "ABBR (expanded words)".
snomed_before_pattern = r"[A-Za-z][A-Za-z]+\s\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)"
snomed_before_matches = snomedct['term'].str.findall(snomed_before_pattern)
# Drop rows with no match (empty list) and rows whose term was missing.
consecutive_caps_before = snomed_before_matches[snomed_before_matches.astype(str)!='[]'].dropna()
slist = [match for matches in consecutive_caps_before for match in matches]
for phrase in slist:
    bracketed = re.findall(r"\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)", phrase)[0]
    expansion = bracketed.strip('()')
    expansion_words = expansion.split()
    abbreviation = phrase.split()[0]
    # One bracketed word per letter of the leading token.
    if len(expansion_words) != len(abbreviation):
        continue
    # Each word must start with the corresponding letter (case-insensitive).
    initials_agree = all(
        word.lower()[0]==letter.lower()
        for word, letter in zip(expansion_words, abbreviation)
    )
    if not initials_agree:
        continue
    # First occurrence wins, for both the expansion and the abbreviation.
    if expansion not in term_to_abbreviation_dict.keys() and abbreviation not in term_to_abbreviation_dict.values():
        term_to_abbreviation_dict[expansion] = abbreviation
            

# SNOMED CT, pattern "ABBR - expanded words".
snomed_dash_pattern = r"[A-Za-z][A-Za-z]+\s\-\s(?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+"
snomed_dash_matches = snomedct['term'].str.findall(snomed_dash_pattern)
# Drop rows with no match (empty list) and rows whose term was missing.
consecutive_caps_dash = snomed_dash_matches[snomed_dash_matches.astype(str)!='[]'].dropna()
slist = [match for matches in consecutive_caps_dash for match in matches]
for phrase in slist:
    dash_parts = phrase.split(' - ')
    expansion = dash_parts[1]
    abbreviation = dash_parts[0]
    expansion_words = expansion.split()
    # One word after the dash per letter of the token before it.
    if len(expansion_words) != len(abbreviation):
        continue
    # Each word must start with the corresponding letter (case-insensitive).
    initials_agree = all(
        word.lower()[0]==letter.lower()
        for word, letter in zip(expansion_words, abbreviation)
    )
    if not initials_agree:
        continue
    # NOTE(review): unlike the other patterns, this one does not check whether the
    # abbreviation is already present, and the key is not stripped of trailing
    # whitespace - confirm that is intended.
    if expansion not in term_to_abbreviation_dict.keys():
        term_to_abbreviation_dict[expansion] = abbreviation



# Incident text, pattern "expanded words (abbr)". The location pattern also
# accepts "&" inside a word; the two free-text columns share one pattern.
consecutive_caps_series_location = df['location'].str.findall(r"((?:\b[A-Za-z&]+\b\s)+\([A-Za-z][A-Za-z]+\))")
free_text_after_pattern = r"((?:\b[A-Za-z]+\b\s)+\([A-Za-z][A-Za-z]+\))"
consecutive_caps_series_notes, consecutive_caps_series_action = (
    df[column].str.findall(free_text_after_pattern)
    for column in ('inc_notes', 'inc_actiontaken')
)
incident_after_matches = pd.concat([consecutive_caps_series_location,consecutive_caps_series_notes,consecutive_caps_series_action])
# Drop rows with no match (empty list) and rows whose text was missing.
consecutive_caps_after = incident_after_matches[incident_after_matches.astype(str)!='[]'].dropna()
slist = [match for matches in consecutive_caps_after for match in matches]

for phrase in slist:
    abbreviation = re.findall(r"\(([A-Za-z]+)\)", phrase)[0]
    tokens = phrase.split()
    # Require one preceding word per abbreviation letter (the last token is "(abbr)").
    if len(abbreviation) != len(tokens) - 1:
        continue
    expansion_words = tokens[:-1]
    expansion = " ".join(expansion_words)
    # Each word must start with the corresponding abbreviation letter (case-insensitive).
    initials_agree = all(
        word.lower()[0]==letter.lower()
        for word, letter in zip(expansion_words, abbreviation)
    )
    if not initials_agree:
        continue
    # First occurrence wins, for both the expansion and the abbreviation.
    if expansion not in term_to_abbreviation_dict.keys() and abbreviation not in term_to_abbreviation_dict.values():
        term_to_abbreviation_dict[expansion] = abbreviation
        

# Incident text, pattern "abbr (expanded words)"; the same pattern is used for all three columns.
incident_before_pattern = r"[A-Za-z][A-Za-z]+\s\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)"
consecutive_caps_series_location, consecutive_caps_series_notes, consecutive_caps_series_action = (
    df[column].str.findall(incident_before_pattern)
    for column in ('location', 'inc_notes', 'inc_actiontaken')
)
incident_before_matches = pd.concat([consecutive_caps_series_location,consecutive_caps_series_notes,consecutive_caps_series_action])
# Drop rows with no match (empty list) and rows whose text was missing.
consecutive_caps_before = incident_before_matches[incident_before_matches.astype(str)!='[]'].dropna()
slist = [match for matches in consecutive_caps_before for match in matches]
for phrase in slist:
    bracketed = re.findall(r"\((?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+\)", phrase)[0]
    expansion = bracketed.strip('()')
    expansion_words = expansion.split()
    abbreviation = phrase.split()[0]
    # One bracketed word per letter of the leading token.
    if len(expansion_words) != len(abbreviation):
        continue
    # Each word must start with the corresponding letter (case-insensitive).
    initials_agree = all(
        word.lower()[0]==letter.lower()
        for word, letter in zip(expansion_words, abbreviation)
    )
    if not initials_agree:
        continue
    # First occurrence wins, for both the expansion and the abbreviation.
    if expansion not in term_to_abbreviation_dict.keys() and abbreviation not in term_to_abbreviation_dict.values():
        term_to_abbreviation_dict[expansion] = abbreviation
            

# Incident text, pattern "abbr - expanded words"; the same pattern is used for all three columns.
incident_dash_pattern = r"[A-Za-z][A-Za-z]+\s\-\s(?:\b[A-Za-z]+\b\s)(?:\b[A-Za-z]+\b\s?)+"
consecutive_caps_series_location, consecutive_caps_series_notes, consecutive_caps_series_action = (
    df[column].str.findall(incident_dash_pattern)
    for column in ('location', 'inc_notes', 'inc_actiontaken')
)
incident_dash_matches = pd.concat([consecutive_caps_series_location,consecutive_caps_series_notes,consecutive_caps_series_action])
# Drop rows with no match (empty list) and rows whose text was missing.
consecutive_caps_dash = incident_dash_matches[incident_dash_matches.astype(str)!='[]'].dropna()
slist = [match for matches in consecutive_caps_dash for match in matches]
for phrase in slist:
    raw_expansion = phrase.split('-')[1]
    expansion_words = raw_expansion.split()
    abbreviation = phrase.split(' - ')[0]
    # One word after the dash per letter of the token before it.
    if len(expansion_words) != len(abbreviation):
        continue
    # Each word must start with the corresponding letter (case-insensitive).
    initials_agree = all(
        word.lower()[0]==letter.lower()
        for word, letter in zip(expansion_words, abbreviation)
    )
    if not initials_agree:
        continue
    # The text after the dash keeps its surrounding whitespace until here.
    expansion = raw_expansion.strip()
    # First occurrence wins, for both the expansion and the abbreviation.
    if expansion not in term_to_abbreviation_dict.keys() and abbreviation not in term_to_abbreviation_dict.values():
        term_to_abbreviation_dict[expansion] = abbreviation
            
# Remove mined pairs whose "abbreviation" is on the hand-maintained reject list.
rejected_exact = {
    'OD', 'PIVOTAL', 'fresh', 'West', 'methylprednisolone', 'cetraben',
    'Levemir', 'Desmopressin', 'insulatard', 'missing', 'SS', 'Tramadol',
    'Tuesday', 'cloudy', 'stable', 'Solent', 'carer',
}
# These are rejected whatever their capitalisation.
rejected_any_case = {'oxynorm', 'oramorph', 'eprex'}
# Iterate over a snapshot so entries can be deleted from the live dictionary.
for term, abbreviation in dict(term_to_abbreviation_dict).items():
    lowered = abbreviation.lower()
    is_rejected = (
        abbreviation in rejected_exact
        or lowered in rejected_any_case
        or lowered.startswith('pri')
    )
    if is_rejected:
        del term_to_abbreviation_dict[term]
# Hand-written additions. Insertion order is kept, and an existing key keeps
# its position while taking the new value.
term_to_abbreviation_dict.update({
    'Intravenous Antibiotics': 'IV',
    'Intravenous': 'IV',
    'Morphine sulphate MR': 'MS',
    'Morphine sulphate': 'MS',
    'milligram': 'mg',
    'Department of Critical Care': 'DCCQ',
    'mau': 'amu',
    'controlled drug': 'cd',
    'patient(.{1,3})own drug': "pod",
    'twice a day': "bd",
    'twice daily': "bd",
    'to take out': "tto",
})

#term_to_abbreviation_dict = {key.lower():val.lower() for (key,val) in term_to_abbreviation_dict.items()}
# Print one LaTeX table row ("term & abbreviation\\") per entry.
for term, abbreviation in term_to_abbreviation_dict.items():
    print(" & ".join((term, abbreviation)) + '\\\\')

In [None]:
# Count how many terms map to each (lower-cased) abbreviation.
abbreviations = list(map(str.lower, term_to_abbreviation_dict.values()))
abbreviation_counts = Counter(abbreviations)
print(abbreviation_counts)

In [None]:
def words(text):
    """Return every run of word characters in `text`, lower-cased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Vocabulary used by the spelling corrector: word -> count.
WORDS = Counter()
# Read the English corpus inside a context manager so the file handle is closed.
with open('big.txt') as corpus_file:
    english_WORDS = Counter(words(corpus_file.read()))
# dict.update (rather than Counter.update) REPLACES the count of a word already
# present instead of adding to it, so later sources take precedence.
# NOTE(review): presumably deliberate - confirm counts should not be summed.
dict.update(WORDS,english_WORDS)
dict.update(WORDS,medical_terms_counts)
dict.update(WORDS,abbreviation_counts)
# Register the codes A1..G9 (floor letter followed by a digit 1-9).
floors = ['A','B','C','D','E','F','G']
for floor in floors:
    for i in range(9):
        WORDS[floor+str(i+1)] = 1
# Extra tokens added by hand so the corrector treats them as known words.
WORDS['moprs'] = 1
WORDS['locker'] = 1
WORDS['nomad'] = 1
WORDS['gik'] = 1
print(WORDS)

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    The default `N` is the total count in WORDS at the moment this function is
    defined; words added to WORDS afterwards do not change it.
    """
    frequency = WORDS[word]
    return frequency / N

def correction(word):
    """Return the candidate spelling of `word` with the highest probability `P`."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible spelling corrections for `word`.

    Tries, in order: the word itself, known words one edit away, known words
    two edits away. The first non-empty set is returned; if none is found the
    result is `[word]`.
    """
    generators = (
        lambda: [word],
        lambda: edits1(word),
        lambda: edits2(word),
    )
    for generate in generators:
        recognised = known(generate())
        if recognised:
            return recognised
    return [word]

def known(words):
    """Return the set of entries of `words` that are keys of WORDS."""
    return {candidate for candidate in words if candidate in WORDS}

def edits1(word):
    """Return the set of strings one edit away from `word`.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a lowercase letter, or inserting a lowercase
    letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut point, including the very end.
        for letter in alphabet:
            results.add(head + letter + tail)
        if not tail:
            continue
        # Deletion of the character just after the cut.
        results.add(head + tail[1:])
        # Replacement of that character.
        for letter in alphabet:
            results.add(head + letter + tail[1:])
        # Transposition needs two characters after the cut.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    """Return an iterator over the strings two edits away from `word`.

    Duplicates are not removed; the values are produced lazily.
    """
    def _two_step():
        for first_edit in edits1(word):
            for second_edit in edits1(first_edit):
                yield second_edit
    return _two_step()

In [None]:
# Hard-coded lists of frequent words, before and after spelling correction.
# NOTE: the first pre_correction_frequent_words assignment is immediately
# overwritten by the second, so only the second list is ever visible.
pre_correction_frequent_words = ['dose', 'given', 'chart', 'drug', 'prescribed', 'cd', 'cupboard', 'book', 'missing', 'tablet', 'tto', 'discharge', 'ward', 'home', 'pharmacist', 'iv', 'insulin', 'round', 'written', 'missed', 'signed', 'infusion', 'unit', 'nurse', 'fluid', 'warfarin', 'paracetamol', 'blood', 'humulin', 'glucose', 'morning', 'prescription', 'bm', 'bottle', 'bag', 'box', 'pod', 'locker', 'dispensed', 'evening', 'antibiotic', 'day', 'administered', 'bd', 'labelled', 'gentamicin', 'inr', 'clinic', 'referral', 'staff', 'medication', 'locked', 'tpn', 'fridge', 'room', 'belonging', 'bed']
pre_correction_frequent_words = ['dose', 'given', 'chart', 'drug', 'prescribed', 'cd', 'cupboard', 'book', 'missing', 'tablet', 'tto', 'discharge', 'ward', 'home', 'pharmacist', 'iv', 'insulin', 'round', 'written', 'missed', 'signed', 'infusion', 'unit', 'nurse', 'fluid', 'warfarin', 'paracetamol', 'blood', 'humulin', 'glucose', 'morning', 'prescription', 'bm', 'bottle', 'bag', 'box', 'pod', 'locker', 'dispensed', 'evening', 'antibiotic', 'day', 'administered', 'bd', 'gentamicin', 'inr', 'clinic', 'referral', 'staff', 'labelled', 'medication', 'locked', 'hour', 'night', 'belonging']
post_corrected_frequent_words = ['dose', 'given', 'chart', 'drug', 'prescribed', 'cd', 'cupboard', 'book', 'missing', 'tablet', 'tto', 'discharge', 'ward', 'home', 'pharmacist', 'iv', 'insulin', 'round', 'written', 'missed', 'signed', 'infusion', 'unit', 'nurse', 'fluid', 'warfarin', 'paracetamol', 'blood', 'humulin', 'glucose', 'morning', 'prescription', 'bag', 'bm', 'bottle', 'box', 'pod', 'locker', 'dispensed', 'evening', 'hour', 'antibiotic', 'day', 'administered', 'bd', 'gentamicin', 'inr', 'clinic', 'referral', 'staff', 'labelled', 'medication', 'locked', 'night', 'belonging']

In [None]:
# Extra stop words added on top of scikit-learn's built-in English list.
extra_stop_words = [
    "patient", "patients", "pt", "pharmacy", "medicine", "kd",
    "mso", "event", "reported", "recoded", "coding", "did",
]
my_stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)

In [None]:
def american_to_british(tokens):
    """Yield each token after applying a fixed sequence of spelling rewrites.

    The rules are regular-expression substitutions applied in order to every
    token: "-or" -> "-our" (when preceded by at least three characters),
    "-ber"/"-ter" -> "-bre"/"-tre", "-iz-"/"-yz-" -> "-is-"/"-ys-" before the
    listed endings, a reversal of that rule for words starting with "s" such
    as "size" and "seize", and "-og" -> "-ogue".
    """
    rewrite_rules = (
        (r"(...)or$", r"\1our"),
        (r"([bt])er$", r"\1re"),
        (r"([iy])z(e[drs]|e$|ing|ation)", r"\1s\2"),
        # convert back words starting with s like size, seize
        (r"^(s.?[iy])s(e[drs]|e$|ing|ation)", r"\1z\2"),
        (r"og$", "ogue"),
    )
    for token in tokens:
        for pattern, replacement in rewrite_rules:
            token = re.sub(pattern, replacement, token)
        yield token
        
class CustomVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose tokenizer output is passed through american_to_british."""

    def build_tokenizer(self):
        """Wrap the parent's tokenizer so its tokens are respelled and returned as a list."""
        base_tokenizer = super().build_tokenizer()

        def british_tokenizer(doc):
            return list(american_to_british(base_tokenizer(doc)))

        return british_tokenizer

In [None]:
# Map a ward's name to its code (a letter A-G plus a digit), taken from the location text.
ward_matches = df['location'].str.findall(r"[A-G][0-9]\s.+")
# Keep only the locations where the pattern was found.
ward_num_series = ward_matches[ward_matches.map(len) > 0]
slist = [match for matches in ward_num_series for match in matches]
ward_name_to_num_dict = {}
for entry in slist:
    ward_code, *name_parts = entry.split()
    # Everything after the code is the name; drop surrounding dashes and spaces.
    ward_name = " ".join(name_parts).strip("- ")
    ward_name_to_num_dict[ward_name.lower()] = ward_code.lower()
# Hand-written entry.
ward_name_to_num_dict['dccq'] = 'e5'

In [None]:
# Substitutions applied to the notes before tokenising (keys are used as regular expressions).
pre_correction_dict = {
    ' dos ': ' dose ',
    ' doses ': ' dose ',
    ' ttos ': ' tto ',
    ' cds ': ' cd ',
    ' discharged ': ' discharge ',
    'non clinical': '',
}
# Fixes applied to the extracted feature names, e.g. to undo "-or" -> "-our"
# rewrites that produced non-words.
corrected_lemma_dict = {
    'stat': 'stated',
    'errour': 'error',
    'doctour': 'doctor',
    'floour': 'floor',
}

In [None]:
# Normalise the incident notes: lower-case them, then apply the three
# substitution dictionaries in order, treating every key as a regular expression.
# NOTE(review): the notes are lower-cased first, so any key containing a capital
# letter (e.g. 'Intravenous') cannot match - confirm whether the keys of
# term_to_abbreviation_dict should be lower-cased as well.
normalised_notes = df['inc_notes'].str.lower()
for substitutions in (term_to_abbreviation_dict, pre_correction_dict, ward_name_to_num_dict):
    normalised_notes = normalised_notes.replace(substitutions, regex=True)
df['inc_notes'] = normalised_notes

In [None]:
class LemmaTokenizer(object):
    """Callable tokenizer: tokenise, drop stop words, lemmatise and spell-correct."""

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # Created here but not used by __call__.
        self.porter_stemmer = PorterStemmer()

    def __call__(self, doc):
        """Return the processed tokens of `doc`.

        Keeps alphabetic tokens that are not in my_stop_words, lemmatises each
        with WordNet, passes the lemma through `correction`, and drops results
        of length one or less.
        """
        processed = []
        for token in word_tokenize(doc):
            if not token.isalpha() or token in my_stop_words:
                continue
            corrected = correction(self.wnl.lemmatize(token))
            if len(corrected) > 1:
                processed.append(corrected)
        return processed

In [None]:
# Create the tf-idf vectorizer. Tokenising is delegated to LemmaTokenizer, and
# CustomVectorizer passes the resulting tokens through american_to_british.
tfidf = CustomVectorizer(tokenizer=LemmaTokenizer())
# add argument ngram_range=(1,2) to get word pairs like "drug chart"

In [None]:
# Inspect the column dtypes before vectorising.
df.dtypes

In [None]:
# Learn the vocabulary from the normalised notes and build the tf-idf matrix
# (one row per incident row of df).
csr_mat = tfidf.fit_transform(df['inc_notes'])

# Uncomment to print the dense matrix (can be very large):
#print(csr_mat.toarray())

In [None]:
# Get the vocabulary learnt by the vectorizer, one entry per tf-idf column.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old method on older releases.
feature_names_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
# Undo known bad rewrites (e.g. 'doctour' -> 'doctor') in the feature names.
# NOTE: this rebinds the name `words`, replacing the words() helper function.
words = [corrected_lemma_dict.get(t,t) for t in feature_names_getter()]

In [None]:
# Print the full list of feature names.
print(words)

In [None]:
# Fit an NMF model for each number of themes k = 1..6. Every iteration rebinds
# model, nmf_features, df_nmf and components_df, so after the loop they hold
# the results for the last k only.
perfs_train = []
ks = list(range(1,7))
#ks = list(range(1,23))
# score decreases for k=23 with 3491 training events
for k in ks:
    # Fit on the tf-idf matrix, then project the same matrix onto the k themes.
    model = NMF(n_components=k).fit(csr_mat)
    nmf_features = model.transform(csr_mat)
    #print(nmf_features)

    # One row per note (indexed by the note text), one column per theme.
    df_nmf = pd.DataFrame(nmf_features,index=df['inc_notes'])

    # One row per theme, one column per vocabulary word.
    components_df = pd.DataFrame(model.components_,columns=words)
    print(components_df.shape)

    #perfs_train.append(get_score(model, csr_mat.toarray()))

In [None]:
# Use string theme labels, then record for each note the theme with the largest weight.
df_nmf.columns = df_nmf.columns.map(str)
df_nmf['max_feature'] = df_nmf.idxmax(axis=1)

In [None]:
# Copy the dominant theme onto the incident table by position (.values ignores the index).
df['max_feature'] = df_nmf['max_feature'].values
# Number of incidents per theme, ordered by theme label.
theme_counts = df['max_feature'].value_counts().sort_index().values
theme_counts_max = theme_counts.max()
print(theme_counts)

In [None]:
# Top words of every theme drawn so far; appended to by print_wordclouds_and_pies.
wordcloud_words = []
def print_wordclouds_and_pies():
    """Draw one word cloud and one donut chart for every theme.

    Reads the module-level `theme_counts_series` (incident count per theme),
    `theme_counts_max` and `components_df` (one row of word weights per theme).
    For each theme the largest-weight words (pandas `nlargest()` default) feed
    a word cloud, and a donut chart highlights that theme's share of the
    incidents. The index of those top words is appended to `wordcloud_words`.
    """
    total_events = theme_counts_series.values.sum()
    donut_fractions = theme_counts_series.values/theme_counts_max
    for donut_seg, (i, counts) in enumerate(theme_counts_series.items()):
        # Initialize the word cloud
        width = 1024 #int(1024*counts/theme_counts_max)
        height = 720 #int(720*counts/theme_counts_max))
        wc = WordCloud(
            background_color="white",
            width = width,
            height = height
        )

        # Top words of this theme, computed once. The original assigned the
        # string-converted index to a temporary, which had no effect; here it
        # is applied to the Series that is actually used.
        top_terms = components_df.iloc[i].nlargest()
        top_terms.index = top_terms.index.map(str)

        # Generate the cloud
        wc.generate_from_frequencies(top_terms)
        wordcloud_words.append(top_terms.index)

        # Display the generated image next to the donut chart:
        figure, (wc_fig, counts_fig) = plt.subplots(nrows=1,ncols=2, figsize=(width/50,height/100))
        wc_fig.imshow(wc, interpolation='bilinear')
        wc_fig.axis("off")

        counts_fig.axis('equal')
        # Every segment is white and unlabelled except the current theme's.
        colors = ['w' for j in theme_counts_series.index]
        colors[donut_seg] = 'b'
        labels = ['' for val in theme_counts_series.values]
        labels[donut_seg] = str(counts)+"/"+str(total_events)
        mypie, texts = counts_fig.pie(donut_fractions, colors=colors, labels=labels, startangle=90, counterclock=False)
        # Named label_text so the loop does not shadow the sklearn `text` module name.
        for label_text in texts: label_text.set_fontsize(20)
        # A wedge width below 1 turns the pie into a donut.
        plt.setp( mypie, width=0.4, edgecolor='black')
        plt.tight_layout()
        #plt.savefig(str(k)+'_'+str(i)+'_wordcloud_donut.png')
        #plt.savefig(str(k)+'_'+str(i)+'_wordcloud_donut.pdf')
        plt.show()

        print('-------------------------------------------------------------------------------------------------------------------')

In [None]:
# For each number of themes k = 3..12: fit NMF, assign every incident its
# dominant theme, and draw the word clouds and donut charts.
ks = list(range(3,13))
for k in ks:
    # Fit on the tf-idf matrix, then project the same matrix onto the k themes.
    model = NMF(n_components=k).fit(csr_mat)
    nmf_features = model.transform(csr_mat)

    # One row per note (indexed by the note text), one string-labelled column per theme.
    theme_labels = [str(theme) for theme in range(nmf_features.shape[1])]
    df_nmf = pd.DataFrame(nmf_features,index=df['inc_notes'],columns=theme_labels)

    # One row per theme, one column per vocabulary word.
    components_df = pd.DataFrame(model.components_,columns=words)

    #print(list(df_nmf.nlargest(10,'0').index))
    #print(list(df_nmf.nlargest(10,'1').index))
    #print(list(df_nmf.nlargest(10,'2').index))
    # Dominant theme per note, copied onto the incident table by position.
    df_nmf['max_feature'] = df_nmf.idxmax(axis=1)
    df['max_feature'] = df_nmf['max_feature'].values

    # Incident count per theme, with integer theme labels so they can index components_df.
    theme_counts_series = df['max_feature'].value_counts()
    theme_counts_series.index = theme_counts_series.index.astype(int)
    theme_counts_max = theme_counts_series.values.max()

    print("number themes: "+str(k))
    print_wordclouds_and_pies()
    print("\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n")

In [None]:
# Collect, in first-seen order, the distinct words among the first five of every word cloud.
unique_wordcloud_words = []
for top5 in wordcloud_words:
    for position in range(5):
        cloud_word = top5[position]
        if cloud_word in unique_wordcloud_words:
            continue
        unique_wordcloud_words.append(cloud_word)
print(unique_wordcloud_words)

In [None]:
# Final model with three themes.
model = NMF(n_components=3).fit(csr_mat)

# Project the tf-idf matrix onto the three themes.
nmf_features = model.transform(csr_mat)

# One row per note (indexed by the note text), one string-labelled column per theme.
theme_labels = [str(theme) for theme in range(nmf_features.shape[1])]
df_nmf = pd.DataFrame(nmf_features,index=df['inc_notes'],columns=theme_labels)

# One row per theme, one column per vocabulary word.
components_df = pd.DataFrame(model.components_,columns=words)

# Dominant theme per note.
df_nmf['max_feature'] = df_nmf.idxmax(axis=1)
# Truncate long note text in the printed table, then emit the first 11 rows as LaTeX.
pd.options.display.max_colwidth = 80
print(df_nmf.head(n=11).to_latex(columns=['0','1','2']))

# Copy the dominant theme onto the incident table by position and count incidents per theme.
df['max_feature'] = df_nmf['max_feature'].values
theme_counts_series = df['max_feature'].value_counts()
theme_counts_series.index = theme_counts_series.index.astype(int)
theme_counts_max = theme_counts_series.values.max()

In [None]:
# Print the six notes with the largest weight in theme '1' as LaTeX table rows,
# with selected words wrapped in \textcolor.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
tto_notes = list(df_nmf.nlargest(6,'1').index)
# Applied in this order to every capitalised sentence; the order matters
# (e.g. 'Tto' is lower-cased before 'tto' is coloured).
sentence_substitutions = [
    ('gp', 'GP'),
    ('. d', '. D'),
    ('discharge', '\\textcolor[RGB]{60,80,139}{discharge}'),
    ('Discharge', '\\textcolor[RGB]{60,80,139}{Discharge}'),
    ('home', '\\textcolor[RGB]{68,59,132}{home}'),
    ('Tto', 'tto'),
    ('tto', '\\textcolor[RGB]{72,40,120}{tto}'),
    ('ward', '\\textcolor[RGB]{44,115,142}{ward}'),
    ('medication', '\\textcolor[RGB]{61,78,138}{medication}'),
    ('Medication', '\\textcolor[RGB]{61,78,138}{Medication}'),
]
for i,note in enumerate(tto_notes):
    sentences = []
    for sent in sent_tokenizer.tokenize(note):
        sent = sent.capitalize()
        for old_text, new_text in sentence_substitutions:
            sent = sent.replace(old_text, new_text)
        sentences.append(sent)
    # Row number, then the sentences, then the LaTeX row terminator.
    print(str(i+1)+' & ')
    for sent in sentences:
        print(sent+' ')
    print('\\\\')
    #print(note+'\\\\')

In [None]:
# Hour of the incident and hour of the report: the first two characters of the time strings.
df['hour'] = df['inc_time'].str[:2]
df = df.astype({'hour':'int'})
df['reported_hour'] = df['inc_submittedtime'].str[:2]
df = df.astype({'reported_hour':'int'})
df['weekday'] = df['inc_dincident'].dt.weekday
# Restrict to the analysis window. Both bounds must hold at once: the original
# applied the second filter to `df` again and overwrote the first, which
# silently dropped the lower bound.
in_analysis_window = (df.inc_dincident > '2016-03-31') & (df.inc_dincident < '2019-12-01')
df_Apr16_Nov19 = df[in_analysis_window]
# Month of the incident, left missing for incidents outside the window.
df['month'] = df['inc_dincident'].dt.month.where(in_analysis_window)

In [None]:
# Span covered by the data: days between the earliest and latest incident date,
# and the same span expressed in weeks (a float).
total_days = (max(df.inc_dincident)-min(df.inc_dincident)).days
total_weeks = total_days/7

In [None]:
# Index 0 is January. month_occurences is how many times each calendar month
# appears in the Apr 16 - Nov 19 window; days_in_month uses 28 days for February.
month_occurences = np.array([3]*3 + [4]*8 + [3])
days_in_month = np.array([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31], dtype=float)
# Total number of days contributed by each calendar month across the window.
division_factor = month_occurences*days_in_month

In [None]:
# Labels 'Apr 16' ... 'Nov 19', one per month of the analysis window.
month_abbreviations = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# month_number counts months since January 2016 (3 = April 2016, 46 = November 2019).
months_labels = [
    "{} {}".format(month_abbreviations[month_number % 12], 16 + month_number // 12)
    for month_number in range(3, 47)
]

In [None]:
# Number of days in each month of months_labels (index 0 is 'Apr 16'):
# 31 by default, then the 30-day months and the three Februaries are overwritten.
days_in_month_year = np.full(len(months_labels), 31.0)
thirty_day_positions = [0, 2, 5, 7, 12, 14, 17, 19, 24, 26, 29, 31, 36, 38, 41, 43]
february_positions = [10, 22, 34]
days_in_month_year[thirty_day_positions] = 30
days_in_month_year[february_positions] = 28

In [None]:
def reportedby_plot():
    """Horizontal bar chart of the number of events reported by each staff role.

    Reads the module-level `df_max_feature`. Roles whose count does not exceed
    that of the existing 'Other' role are pooled into a single 'Other' bar.
    The image file "nhsportsmouth.png" is drawn in an inset axes.

    Raises:
        IndexError: if no role named 'Other' is present.
    """
    reportedby_counts = df_max_feature['inc_reportedby'].value_counts()
    # Discard the blank role; tolerate it being absent.
    reportedby_counts = reportedby_counts.drop('',axis=0,errors='ignore')
    # Any role with a count below this threshold is pooled into 'Other'.
    thresh = reportedby_counts[reportedby_counts.index == 'Other'].values[0] + 1
    pooled_other = pd.Series(reportedby_counts[reportedby_counts.values < thresh].sum(),index=['Other'])
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent.
    reportedby_counts = pd.concat([reportedby_counts, pooled_other])
    reportedby_counts = reportedby_counts[reportedby_counts.values >= thresh]
    plt.figure(figsize=(17,13))
    cm = plt.get_cmap('RdYlGn')
    cm_subsection = np.linspace(0, 1, len(reportedby_counts.values))
    x_range = range(0,len(reportedby_counts.values))
    colors = [cm(x) for x in cm_subsection]
    reportedby_list = list(reportedby_counts.index)
    # Bars are added from the last entry to the first; each role name is
    # written at the end of its bar.
    for i in x_range:
        plt.barh(reportedby_list[-(i+1)],reportedby_counts.values[-(i+1)],color=colors[-(i+1)])
        plt.text(reportedby_counts.values[-(i+1)],i,reportedby_list[-(i+1)])
    plt.yticks([])
    plt.title("number of medicine patient safety events reported by different staff roles")
    img = plt.imread("nhsportsmouth.png")
    ax = plt.axes([0.4,0, 0.5, 0.5], frameon=True)  # Change the numbers in this array to position your image [left, bottom, width, height])
    ax.imshow(img)
    ax.axis('off')  # get rid of the ticks and ticklabels
    plt.show()

In [None]:
def reporter_email():
    """Bar chart of how concentrated event reporting is among reporters.

    Reads the module-level `df_max_feature`. Reporters are identified by
    `inc_rep_email`. Each bar corresponds to a distinct per-reporter report
    count; its label is the number of reporters with at least that many
    reports and its green part is the percentage of events those reporters
    account for.
    """
    # Reports per reporter, fewest first.
    email_counts = df_max_feature['inc_rep_email'].value_counts(ascending=True)
    # Number of reporters for each distinct report count, ordered by ascending report count.
    people_per_counts = email_counts.value_counts()[email_counts.unique()]
    cumulative_num_people = []
    cumulative_percentages = []
    num_people = people_per_counts.values.sum()
    percentage = 100
    for i in range(len(people_per_counts)):
        # Record the current totals, then remove the reporters with this report count.
        cumulative_num_people.append(str(num_people))
        num_people -= people_per_counts.values[i]
        cumulative_percentages.append(percentage)
        # Subtract the share of events reported by people with exactly this report count.
        percentage -= 100*people_per_counts.index[i]*people_per_counts.values[i]/email_counts.values.sum()
    # Remainder of each bar, drawn in red.
    red_percentages = [100-percentage for percentage in cumulative_percentages]

    plt.figure(figsize=(14,14))
    plt.barh(cumulative_num_people,cumulative_percentages,color='g')
    plt.barh(cumulative_num_people,red_percentages,left=cumulative_percentages,color='r')
    plt.ylabel('number of people reporting the events')
    for i, percentage in enumerate(cumulative_percentages):
        plt.text(percentage,i-0.2,"{0:.1f}% ".format(percentage))
        # The sentence annotation is skipped for the first three bars and for the single-person bar.
        if int(cumulative_num_people[i])!=1 and i>2: 
            plt.text(69,i-0.2,"the same "+cumulative_num_people[i]+" people reported "+"{0:.1f}% ".format(percentage)+" of events")
    # `percentage` still holds the value from the last loop iteration above.
    plt.text(69,len(cumulative_percentages)-1-0.2,"the same 1 person reported "+"{0:.1f}% ".format(percentage)+" of events");
    plt.show()

In [None]:
def staff_involved_role():
    """Horizontal bar chart of how often each ``employee_involved`` value occurs.

    Reads the module-level ``df_max_feature``. Bars are added from the least
    to the most frequent value, coloured along the 'RdYlGn' colormap, and each
    bar is annotated with its own label instead of a y tick.

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    role_counts = df_max_feature['employee_involved'].value_counts().dropna()
    n_roles = len(role_counts.values)

    plt.figure(figsize=(17,13))
    palette = plt.get_cmap('RdYlGn')
    shades = [palette(fraction) for fraction in np.linspace(0, 1, n_roles)]
    roles = list(role_counts.index)

    # value_counts() is sorted descending, so walking it backwards adds the
    # rarest value first (bottom of the chart).
    bars = zip(reversed(roles), role_counts.values[::-1], reversed(shades))
    for row, (role, count, shade) in enumerate(bars):
        plt.barh(role, count, color=shade)
        plt.text(count, row, role)

    plt.yticks([])
    # right margin grows with the length of the most frequent label's text
    plt.xlim([0, max(role_counts)+len(roles[0])])
    plt.title("employees involved in medicine patient safety events")
    plt.show()

In [None]:
def time_trend_plot():
    """Show events per day for each month, coloured by season, with a linear fit.

    Reads the module-level ``df_max_feature`` (``inc_dincident`` datetimes),
    ``months_labels`` (x-axis label for each month) and ``days_in_month_year``
    (divisor applied to each monthly count; presumably the number of days in
    that month - confirm where it is defined). Only incidents dated after
    2016-03-31 and before 2019-12-01 are used.

    The straight-line fit is weighted by ``1/num_per_day_err`` (Poisson error
    on the monthly count), like the fits in ``weekday_plot``, ``month_plot``
    and ``day_plot``, so the ``redchi`` shown in the legend is a reduced
    chi-square rather than an unweighted residual variance.

    Returns nothing; the Bokeh figure is displayed in the notebook.
    """
    incident_dates = df_max_feature.inc_dincident
    in_window = (incident_dates > '2016-03-31') & (incident_dates < '2019-12-01')
    df_month_year = df_max_feature[in_window]
    month_year = df_month_year.groupby(
        [df_month_year.inc_dincident.dt.year, df_month_year.inc_dincident.dt.month]).agg('count')
    monthly_totals = month_year.inc_dincident.values
    num_per_day = monthly_totals/days_in_month_year
    num_per_day_err = np.sqrt(monthly_totals)/days_in_month_year

    # weighted straight-line fit through the monthly rates
    data_x = np.arange(len(months_labels))
    polynomial_mod = PolynomialModel(1)
    pars = polynomial_mod.guess(num_per_day, x=data_x, c0=num_per_day[0], c1=0)
    out = polynomial_mod.fit(num_per_day, pars, x=data_x, weights=1/num_per_day_err)

    # A month label is classified by the first token it contains; labels that
    # match none of them (e.g. Mar, Apr, May) fall back to Spring.
    season_tokens = (('Ju', 'Summer'), ('Au', 'Summer'),
                     ('Se', 'Autumn'), ('Oc', 'Autumn'), ('No', 'Autumn'),
                     ('De', 'Winter'), ('Ja', 'Winter'), ('Fe', 'Winter'))
    season_labels = np.array([
        next((season for token, season in season_tokens if token in label), 'Spring')
        for label in months_labels
    ])

    # Each season gets a full-length series; months outside the season are
    # parked at an off-scale value so they fall outside the visible y-range.
    off_scale = 100.0
    season_colours = (('Spring', 'green'), ('Summer', 'red'),
                      ('Autumn', 'yellow'), ('Winter', 'blue'))
    season_values = {}
    season_sources = {}
    for season, _ in season_colours:
        in_season = season_labels == season
        values = np.full(len(num_per_day), off_scale)
        values[in_season] = num_per_day[in_season]
        season_values[season] = values
        season_sources[season] = ColumnDataSource(data=dict(
            groups=months_labels, counts=values,
            upper=values + num_per_day_err, lower=values - num_per_day_err))

    y_max = math.ceil(max(num_per_day) / 2.) * 2  # next even integer at or above the data
    p = bokeh_figure(x_range=months_labels, title="Daily number of medicine patient safety events at QA", y_range=(0,y_max),
              x_axis_label='Month Year',y_axis_label='events per day',plot_width=900, plot_height=450,
               tools='pan,box_select,box_zoom,wheel_zoom,save,reset,help')

    for season, colour in season_colours:
        p.circle(months_labels, season_values[season], color=colour, size=10, alpha=0.5,
                 nonselection_alpha=0, hover_fill_color='black')
    for season, colour in season_colours:
        p.add_layout(
            Whisker(source=season_sources[season], base="groups", upper="upper", lower="lower",
                    level="overlay", line_color=colour)
        )

    r_linear = p.line(months_labels, out.best_fit, line_color='purple')

    # legend entries use dummy circles so each season appears exactly once
    legend_items = [
        LegendItem(label=season.lower(), renderers=[p.circle(0, 0, color=colour)])
        for season, colour in season_colours
    ]
    legend_items.append(
        LegendItem(label='fit χ²/Nᵈᶠ = '+str(round(out.redchi,1)), renderers=[r_linear]))
    p.add_layout(Legend(items=legend_items))

    p.xgrid.grid_line_color = None
    p.xaxis.major_label_orientation = math.pi/2
    #p.legend.orientation = "horizontal"
    p.legend.location = "bottom_right"

    # hover highlighting only, no tooltip box
    hover = HoverTool(tooltips=None, mode='hline')
    p.add_tools(hover)

    # logo in the top-left corner, one sixth of the y-range tall
    height = y_max/6
    width = 13.314516129032258  # NOTE(review): presumably tuned to the logo's aspect ratio - confirm
    p.image_url(url=['nhsportsmouth.png'], x=0.2, y=y_max-0.1, w=width, h=height)

    output_notebook()

    show(p)

In [None]:
def hour_plot():
    """Draw five polar "24-hour clock" bar charts of events per hour of day.

    Reads the module-level ``df_max_feature`` (columns ``hour`` and
    ``inc_reportedby``) and ``total_days`` (divisor turning hourly totals into
    a rate; presumably the number of days covered by the data - confirm where
    it is defined). Returns nothing; every figure is shown with ``plt.show()``.

    Figures, in order:
      1. hourly rate split into night (0-7), daytime (8-15) and evening (16-23),
         with the busiest hour shaded;
      2. hourly rate with nursing shift boundaries annotated;
      3. hourly rate stacked by ``inc_reportedby``: every role at least as
         frequent as the 9th most frequent one gets its own layer and all
         remaining non-empty roles are pooled into "Other";
      4. hourly rate coloured by nursing shift, busiest hour shaded, shift
         boundaries annotated;
      5. as figure 4 but with drug-round hours shaded instead of the busiest
         hour and three separate legends.

    The hatched band in every figure spans the bar top +/- one Poisson error
    (``sqrt(count)/total_days``).

    A disabled copy of figures 1-3 for the ``reported_hour`` column used to
    sit at the end of this function inside a string literal; it was dead code
    and has been removed (recover it from version control if needed).
    """
    n_hours = 24
    bins = [edge - 0.5 for edge in range(n_hours + 1)]
    hour_ticks = list(range(n_hours))
    hours = np.arange(n_hours)

    hour_totals, _ = np.histogram(df_max_feature['hour'].values, bins=bins)
    radii = hour_totals/total_days
    radii_err = np.sqrt(hour_totals)/total_days

    width = 2 * np.pi/n_hours
    # bar i is centred on theta[i]; its tick label sits on the bar's leading edge
    theta = np.linspace(0.0 + width, 2 * np.pi + width, n_hours, endpoint=False)
    theta_deg = np.linspace(0.0 + 7.5, 360 + 7.5, n_hours, endpoint=False)
    busiest = theta[np.argmax(radii)]

    def new_polar_axes(figsize=None):
        """Open a new figure and return its single polar axes."""
        plt.figure(figsize=figsize)
        return plt.subplot(111, projection='polar')

    def masked(selected):
        """Hourly rates with every hour outside ``selected`` set to zero."""
        return np.where(selected, radii, 0.0)

    def draw_uncertainty(ax, top, label=''):
        """Hatched band covering ``top`` +/- one standard error."""
        return ax.bar(theta, 2*radii_err, bottom=top-radii_err, width=width, alpha=0.5,
                      color='none', hatch="////", label=label)

    def finish_axes(ax):
        """Clock orientation (0h at the top, clockwise), sparse r labels, hour ticks."""
        ax.set_theta_offset(np.pi/2 + width/2)
        ax.set_theta_direction(-1)
        # blank every other radial tick label to reduce clutter
        ax.set_yticklabels(['' if idx % 2 == 0 else round(tick, 1)
                            for idx, tick in enumerate(ax.get_yticks())])
        ax.set_thetagrids(theta_deg, hour_ticks)

    def mark_shifts(ax, early, late, night, first_lift, lift):
        """Draw the six shift-boundary lines and their arrowed captions.

        ``first_lift``/``lift`` place the caption text at that multiple of the
        outer radius (the first caption is pushed further out than the rest).
        """
        half = width/2
        boundaries = (
            ('Early shift start', theta[7]-half,  early, 'center', first_lift),
            ('Early shift end',   theta[15]-half, early, 'right',  lift),
            ('Late shift start',  theta[12],      late,  'center', lift),
            ('Late shift end',    theta[20],      late,  'right',  lift),
            ('Night shift start', theta[20]-half, night, 'right',  lift),
            ('Night shift end',   theta[8]-half,  night, 'left',   lift),
        )
        for _, angle, colour, _, _ in boundaries:
            ax.axvline(angle, color=colour)
        r_max = ax.get_ylim()[1]
        for caption, angle, colour, align, factor in boundaries:
            ax.annotate(caption, xy=(angle, r_max), xytext=(angle, r_max*factor), color=colour,
                        ha=align, arrowprops=dict(facecolor=colour, alpha=0.1))

    def draw_shift_bars(ax):
        """Bars coloured by nursing shift (hours 7, 13-14 and 20 are crossovers)."""
        shift_bars = (
            ('night',                 'green',  (hours <= 6) | (hours >= 21)),
            ('night-early crossover', 'yellow', hours == 7),
            ('early',                 'orange', (hours >= 8) & (hours <= 12)),
            ('early-late crossover',  'red',    (hours == 13) | (hours == 14)),
            ('late',                  'purple', (hours >= 15) & (hours <= 19)),
            ('late-night crossover',  'blue',   hours == 20),
        )
        for shift, colour, selected in shift_bars:
            ax.bar(theta, masked(selected), width=width, label=shift, color=colour)

    # ---- figure 1: night / daytime / evening ------------------------------
    ax = new_polar_axes()
    ax.bar(theta, masked(hours <= 7), width=width, label='night')
    ax.bar(theta, masked((hours >= 8) & (hours <= 15)), width=width, label='daytime')
    ax.bar(theta, masked(hours >= 16), width=width, label='evening')
    draw_uncertainty(ax, radii, label='Uncertainty')
    finish_axes(ax)
    plt.xlabel('hour occured')
    plt.title('Hourly number of patient medicine safety events',y=1.1)
    plt.legend(loc=(1.04,0))
    ax.axvspan(busiest-width/2, busiest+width/2, color='red', alpha=0.3)
    plt.show()

    # ---- figure 2: plain bars with shift boundaries -----------------------
    ax = new_polar_axes()
    ax.bar(theta, radii, width=width, label='Data')
    draw_uncertainty(ax, radii, label='Uncertainty')
    finish_axes(ax)
    plt.title('hour occured',y=1.1)
    mark_shifts(ax, early='orange', late='green', night='red', first_lift=11/7, lift=9/7)
    plt.show()

    # ---- figure 3: stacked by reporter role -------------------------------
    ax = new_polar_axes()
    reporters = df_max_feature['inc_reportedby']
    reporter_counts = reporters.value_counts().drop(labels=[''], errors='ignore')
    layers = []
    if len(reporter_counts):
        thresh = reporter_counts.nlargest(9).iloc[-1]
        # a role literally called 'Other' is pooled with the minor roles
        major = reporter_counts[reporter_counts >= thresh].drop(labels=['Other'], errors='ignore')
        layers = [(role, reporters == role) for role in major.index]
        # "Other" selects the rows of every remaining named role, so the stack
        # adds up to all events with a non-empty reporter
        is_other = reporters.notna() & (reporters != '') & ~reporters.isin(major.index)
        if is_other.any():
            layers.append(('Other', is_other))
    cumulative_bottom = np.zeros(n_hours)
    for role, rows in layers:
        role_totals, _ = np.histogram(df_max_feature.loc[rows, 'hour'].values, bins=bins)
        radii_reported = role_totals/total_days
        ax.bar(theta, radii_reported, bottom=cumulative_bottom, width=width, label=role)
        cumulative_bottom = cumulative_bottom + radii_reported
    draw_uncertainty(ax, cumulative_bottom, label='Total Uncertainty')
    finish_axes(ax)
    plt.xlabel('hour occured')
    plt.title('Hourly number of patient medicine safety events at QA',y=1.1)
    plt.legend(loc=(1.1,0),title='Reported by:')
    plt.show()

    # ---- figure 4: nursing shifts, busiest hour shaded --------------------
    ax = new_polar_axes(figsize=(11,11))
    draw_shift_bars(ax)
    draw_uncertainty(ax, radii, label='uncertainty')
    finish_axes(ax)
    #plt.xlabel('hour occured')
    plt.title('Hourly number of medicine patient safety events',y=1)
    plt.legend(loc=(0.9,0.81),title='Nursing shifts')
    ax.axvspan(busiest-width/2, busiest+width/2, color='red', alpha=0.3)
    mark_shifts(ax, early='orange', late='purple', night='green', first_lift=1.2, lift=1.1)
    plt.show()

    # ---- figure 5: nursing shifts with drug rounds shaded -----------------
    ax = new_polar_axes(figsize=(11,11))
    draw_shift_bars(ax)
    uncertainty = draw_uncertainty(ax, radii)  # unlabelled: it gets its own legend below
    finish_axes(ax)  # r tick labels follow the data instead of a fixed 0.2/0.4/0.6 list
    #plt.xlabel('hour occured')
    plt.title('Hourly number of medicine patient safety events',y=1)
    shift_legend = plt.legend(loc=(0.9,0.81),title='Nursing shifts')
    drug_rounds = (
        ('morning',   range(8,10),  'orange'),
        ('lunch',     range(12,14), 'red'),
        ('afternoon', range(17,19), 'purple'),
        ('night',     range(21,23), 'green'),
    )
    round_handles = []
    for _, round_hours, colour in drug_rounds:
        for hour in round_hours:
            shaded = ax.axvspan(theta[hour]-width/2, theta[hour]+width/2, color=colour, alpha=0.1)
        round_handles.append(shaded)  # one handle per round is enough for the legend
    round_legend = plt.legend(round_handles, [name for name, _, _ in drug_rounds],
                              title='Drug rounds', loc=(1.0,0.66))
    plt.legend([uncertainty],['uncertainty'],loc=(0.73,0.97))
    # plt.legend replaces the previous legend, so re-attach the earlier two
    ax.add_artist(shift_legend)
    ax.add_artist(round_legend)
    mark_shifts(ax, early='orange', late='purple', night='green', first_lift=1.2, lift=1.1)
    plt.show()


In [None]:
def weekday_plot():
    """Show events per day for each weekday with a two-level step fit.

    Reads the module-level ``df_max_feature`` (``weekday`` column holding
    0..6, plotted as Mon..Sun) and ``total_weeks`` (divisor turning weekday
    totals into a rate; presumably the number of weeks covered - confirm where
    it is defined). The error on each rate is the Poisson error
    ``sqrt(count)/total_weeks`` and is drawn as a band.

    The fitted model has one level for x < 4.5 (Mon-Fri) and another for
    x >= 4.5 (Sat-Sun); a red vertical line marks the Fri/Sat boundary.

    Returns nothing; the Bokeh figure is displayed in the notebook.
    """
    groups = ['Mon','Tue','Wed','Thu','Fri','Sat','Sun']
    bins = [i-0.5 for i in range(8)]
    data_x = list(range(7))
    weekday_totals, _ = np.histogram(df_max_feature['weekday'].values, bins=bins)
    data_y = weekday_totals/total_weeks
    data_y_err = np.sqrt(weekday_totals)/total_weeks

    def raised_backward_step(x, A, raised):
        """Step function: ``A`` where x < 4.5, ``raised`` elsewhere."""
        return np.where(np.asarray(x) < 4.5, A, raised)

    # weighted fit, starting from the Monday and Sunday rates
    step_model = Model(raised_backward_step)
    out = step_model.fit(data_y, x=data_x, weights=1/data_y_err, A=data_y[0], raised=data_y[-1])

    upper = data_y + data_y_err
    lower = data_y - data_y_err
    source = ColumnDataSource(data=dict(groups=groups, counts=data_y, upper=upper, lower=lower))

    p = bokeh_figure(x_range=groups, plot_height=450, title="Daily medicine patient safety events at QA", y_range=(0,math.ceil(max(upper))),
              x_axis_label='Weekday',y_axis_label='events per day',
              tools='pan,box_zoom,wheel_zoom,save,reset,help')

    r_fit = p.line(groups, out.best_fit, line_color='purple')

    band = Band(base='groups', lower='lower', upper='upper', source=source, level='underlay',
                fill_alpha=1.0, line_width=1, line_color='black')
    p.add_layout(band)

    p.xgrid.grid_line_color = None

    # the band is an annotation, so a zero-size bar stands in for it in the legend
    legend = Legend(items=[
        LegendItem(label="uncertainty", renderers=[p.vbar(x=0, top=0, width=0.9, color='yellow', line_width=1, line_color='black')]),
        LegendItem(label='fit χ²/Nᵈᶠ = '+str(round(out.redchi,1)), renderers=[r_fit]),
    ])
    p.add_layout(legend)

    # Categories sit at 0.5, 1.5, ... on the axis, so 5 is the Fri/Sat edge.
    # Span is an annotation: attach it with add_layout like Band and Legend.
    vline = Span(location=5, dimension='height', line_color='red', line_width=3)
    p.add_layout(vline)

    output_notebook()

    show(p)

In [None]:
def month_plot():
    """Show events per day for each calendar month with a constant fit.

    Reads the module-level ``df_max_feature`` (``month`` column holding 1..12)
    and ``division_factor`` (divisor turning monthly totals into a rate; its
    definition is outside this cell - confirm what it normalises by). Bars are
    coloured by season, a grey bar marks rate +/- the Poisson error
    ``sqrt(count)/division_factor`` and a weighted constant is fitted.

    The y-range is taken from the top of the error bars, as in
    ``weekday_plot``, so the uncertainty bar of the tallest month is never
    cut off by the axis.

    Returns nothing; the Bokeh figure is displayed in the notebook.
    """
    groups = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    # winter = Dec-Feb, spring = Mar-May, summer = Jun-Aug, autumn = Sep-Nov
    color = ['blue','blue','green','green','green','red','red','red','yellow','yellow','yellow','blue']

    bins = [i+0.5 for i in range(13)]
    data_x = [i+1 for i in range(12)]
    month_totals, _ = np.histogram(df_max_feature['month'].values, bins=bins)
    data_y = month_totals/division_factor
    data_y_err = np.sqrt(month_totals)/division_factor

    # weighted fit of a constant rate
    polynomial_mod = PolynomialModel(0)
    pars = polynomial_mod.guess(data_y, x=data_x, c0=data_y[0])
    out = polynomial_mod.fit(data_y, pars, x=data_x, weights=1/data_y_err)

    upper = data_y + data_y_err
    lower = data_y - data_y_err
    source = ColumnDataSource(data=dict(groups=groups, counts=data_y, upper=upper, lower=lower, color=color))

    p = bokeh_figure(x_range=groups, plot_height=450, title="Daily medicine patient safety events at QA", y_range=(0,math.ceil(max(upper))),
              x_axis_label='Month',y_axis_label='events per day',
              tools='pan,box_select,box_zoom,wheel_zoom,save,reset,help')

    p.vbar(x='groups', top='counts', width=0.9, source=source,
           line_color='white', color='color', nonselection_alpha=0,
           hover_fill_color='black')
    r_err = p.vbar(x='groups', bottom='lower', top='upper', width=0.9, source=source, fill_color='grey', fill_alpha=0.9,
          line_color='white')
    r_linear = p.line(groups, out.best_fit, line_color='purple')

    p.xgrid.grid_line_color = None

    # zero-size bars give one legend swatch per season
    season_swatches = (('winter', 'blue'), ('spring', 'green'), ('summer', 'red'), ('autumn', 'yellow'))
    legend_items = [
        LegendItem(label=season, renderers=[p.vbar(x=0, top=0, width=0.9, color=swatch)])
        for season, swatch in season_swatches
    ]
    legend_items.append(LegendItem(label="uncertainty", renderers=[r_err]))
    legend_items.append(LegendItem(label='fit χ²/Nᵈᶠ = '+str(round(out.redchi,1)), renderers=[r_linear]))
    p.add_layout(Legend(items=legend_items))
    p.legend.location = "bottom_right"

    # hover highlighting only, no tooltip box
    hover = HoverTool(tooltips=None, mode='hline')
    p.add_tools(hover)

    output_notebook()

    show(p)


In [None]:
def day_plot():
    """Box plot of daily event counts grouped by day of month, with a constant fit.

    Reads the module-level ``df_max_feature`` (``inc_dincident`` datetimes)
    and keeps incidents dated after 2016-03-31 and before 2019-12-01. For each
    day of the month (1..31) the box shows the distribution of the number of
    events on individual dates; the red line is a weighted constant fitted to
    the mean events per occurrence of that day of month.

    The pandas ``boxplot`` call creates the figure itself, so no figure is
    opened beforehand (doing so leaves an extra empty figure in the output).

    Returns nothing; the figure is shown with ``plt.show()``.
    """
    incident_dates = df_max_feature.inc_dincident
    in_window = (incident_dates > '2016-03-31') & (incident_dates < '2019-12-01')
    df_day = df_max_feature[in_window].copy()
    df_day['day'] = df_day['inc_dincident'].dt.day

    bins = [i+0.5 for i in range(32)]
    data_x = [i+1 for i in range(31)]
    df_day_grouped_by_day = df_day.groupby(['inc_dincident']).size().reset_index(name='counts')
    df_day_grouped_by_day['day'] = df_day_grouped_by_day['inc_dincident'].dt.day

    # How many times each day of the month occurs between 2016-04-01 and
    # 2019-11-30: 44 for days 1-28, 41 for days 29 and 30, 25 for day 31.
    window_days = pd.date_range('2016-04-01', '2019-11-30', freq='D')
    day_occurences = np.bincount(np.asarray(window_days.day), minlength=32)[1:]

    day_totals, _ = np.histogram(df_day['day'].values, bins=bins)
    data_y = day_totals/day_occurences
    data_y_err = np.sqrt(day_totals)/day_occurences

    # weighted fit of a constant rate
    polynomial_mod = PolynomialModel(0)
    pars = polynomial_mod.guess(data_y, x=data_x, c0=data_y[0])
    out = polynomial_mod.fit(data_y, pars, x=data_x, weights=1/data_y_err)

    df_day_grouped_by_day.boxplot(by='day', column=['counts'], figsize=(17,13))
    plt.plot(data_x, out.best_fit, '-r')
    plt.ylim(bottom=0)
    plt.ylabel('# events per day')
    plt.title('Daily number of patient medicine safety events at QA')
    plt.suptitle('')  # drop the automatic "grouped by" super-title
    ax = plt.gca()
    ax.yaxis.set_major_locator(MaxNLocator(integer=True))

    # hand-built legend handles, one per box-plot element plus the fit line
    line = Line2D([], [], ls="none", markeredgecolor='k')
    barline = LineCollection(np.zeros((2,2,2)))  # placeholder segments, used only as a legend handle
    custom_lines = [Line2D([0], [0], color='green'),
                    mpatches.Patch(facecolor='w', edgecolor='b'),
                    ErrorbarContainer((line, [line], [barline]), has_yerr=True),
                    Line2D([0], [0], marker='o', color='w', markerfacecolor='w', markeredgecolor='k', markersize=10),
                    Line2D([0], [0], color='red')]
    plt.legend(custom_lines, ['median','quartiles','extremes','outliers','Fit '+r'$\chi^2/N_{df}$ = '+str(round(out.redchi,1))])

    # logo inset; the list is [left, bottom, width, height] in figure coordinates
    img = plt.imread("nhsportsmouth.png")
    ax = plt.axes([0.2,0.7, 0.3, 0.3], frameon=True)
    ax.imshow(img)
    ax.axis('off')  # get rid of the ticks and ticklabels
    plt.show()

In [None]:
def _ordered_counts(frame, column):
    """Count the values of ``column``, ordered by first appearance in ``frame``."""
    values = frame[column]
    return values.value_counts()[values.unique()]


def _fold_small_categories(frame, column, threshold, inclusive=False):
    """Relabel rare '<care group> - <name>' values as '<care group> - Other'.

    Percentages are computed once, before any relabelling. ``frame`` is
    modified in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding ``column``.
    column : str
        Column whose values look like '<care group> - <name>'.
    threshold : float
        Percentage of all rows at or below which a value is folded.
    inclusive : bool
        If True fold values with ``percentage <= threshold``; otherwise only
        those with ``percentage < threshold``.

    Returns
    -------
    pandas.Series
        Value counts of ``column`` after folding, in order of first appearance.
    """
    counts = _ordered_counts(frame, column)
    percentages = 100*counts/sum(counts.values)
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    for label, percentage in percentages.items():
        is_small = percentage <= threshold if inclusive else percentage < threshold
        if is_small:
            frame.loc[frame[column] == label, column] = label.split(" - ")[0] + ' - Other'
    return _ordered_counts(frame, column)


def _prefixed_labels_by_count(frame, care_groups, column):
    """Build '<care group> - <value>' labels, most frequent value first within each care group."""
    labels = []
    for care_group in care_groups:
        counts = frame.loc[frame.care_group == care_group, column].value_counts()
        labels += [care_group + ' - ' + str(name) for name in counts.index]
    return labels


def _wrap_labels(labels, overrides=None):
    """Put every word of each label on its own line, then apply hand-made overrides.

    ``overrides`` maps a fully wrapped label to the text to show instead.
    Overrides for labels that are not present are ignored, so the figures
    still render when a care group is missing or was folded into 'Other'.
    """
    overrides = overrides or {}
    wrapped = [label.replace(' ', '\n') for label in labels]
    return [overrides.get(label, label) for label in wrapped]


def _alternating_colours(n_labels, even, odd):
    """Return ``n_labels`` colours alternating between ``even`` and ``odd``."""
    return [even if position % 2 == 0 else odd for position in range(n_labels)]


def _care_group_ring_colours(frame, divisions, colourmaps):
    """Return one colour per care group: shades 0.8, 0.7, ... of its division's colourmap."""
    colours = []
    for colourmap, division in zip(colourmaps, divisions):
        n_care_groups = len(frame.loc[frame.division == division, 'care_group'].value_counts())
        colours.extend(colourmap(shade / 10) for shade in range(8, 8 - n_care_groups, -1))
    return colours


def _outer_ring_colours(frame, divisions, colourmaps, column):
    """Return one colour per distinct value of ``column``, grouped by division and care group.

    The j-th care group of a division starts at shade ``0.5 - j/10`` of the
    division's colourmap and each further slice is 0.01 lighter.
    """
    colours = []
    for colourmap, division in zip(colourmaps, divisions):
        df_division = frame[frame.division == division]
        care_groups = _ordered_counts(df_division, 'care_group').index
        for j, care_group in enumerate(care_groups):
            # Match on '<care group> - ' rather than the bare name so a care
            # group whose name is a prefix of another is not counted twice.
            in_care_group = df_division[df_division[column].str.startswith(care_group + ' - ')]
            n_slices = len(_ordered_counts(in_care_group, column))
            shade = 0.5 - j/10
            for _ in range(n_slices):
                colours.append(colourmap(shade))
                shade -= 0.01
    return colours


def _draw_ring(ax, counts, labels, colours, radius, width, label_colours=None, **pie_kwargs):
    """Draw one donut ring on ``ax``, clockwise starting at 12 o'clock.

    ``label_colours``, when given, recolours the slice labels one by one;
    extra keyword arguments are passed straight to ``ax.pie``.
    """
    wedges, texts = ax.pie(counts, radius=radius, labels=labels, colors=colours,
                           startangle=90, counterclock=False, **pie_kwargs)
    plt.setp(wedges, width=width, edgecolor='white')
    if label_colours is not None:
        for label_text, colour in zip(texts, label_colours):
            label_text.set_color(colour)
    return wedges, texts


def _shorten_location_names(frame):
    """Abbreviate the ``location`` column of ``frame`` in place.

    The order matters: the second batch of exact-match renames targets names
    as they look *after* the generic words (' QA', ' Department', ...) have
    been stripped.
    """
    renames_before_stripping = {
        'Department of Critical Care (E5) (DCCQ)': 'E5',
        'Acute Medical Unit (AMU) (MAU)': 'AMU',
        'Haematology & Oncology Day Unit (HODU)': 'HODU',
        'Childrens Assesment Unit (CAU)': 'CAU',
        'Hepatology Clinic (Nurse led) (on C5)': 'Hepatology',
        'Cardiac Day Unit (CDU)': 'CDU',
        'Surgical Assessment Unit (SAU)': 'SAU',
    }
    renames_after_stripping = {
        'Theatre Admissions Suite (TAS)': 'TAS',
        'Pre-Operative Assessment (POA)': 'POA',
        'Theatre - D1': 'D1',
        'Research & Development': 'R&D',
        'Reception (Atrium)': 'Reception',
        'G5 Endoscopy Suites': 'G5',
        'F3 Stroke Rehabilitation': 'F3',
        'F4 Acute Stroke Ward': 'F4',
        'F1  Neuro Rehab': 'F1',
        'Emergency (ED)': 'ED',
        'B6  Antenatal Ward': 'B6',
        'B7 Postnatal Ward': 'B7',
        'B9 NICU': 'B9',
        'A7 Starfish': 'A7',
        'A8 Shipwreck': 'A8',
        'B5  - Mary Rose Ward': 'B5',
        'B8 Labour Ward': 'B8',
        'Radiotherapy  Dept within CHOC': 'Radiotherapy',
        'Surgical High Care (SHCU)': 'SHCU',
    }

    for long_name, short_name in renames_before_stripping.items():
        frame.loc[frame['location'] == long_name, 'location'] = short_name
    for filler in (' QA', ' Department', ' Unit', ' Centre', ' Clinic'):
        frame['location'] = frame['location'].str.replace(filler, '', regex=False)
    frame['location'] = frame['location'].str.replace(' and ', ' & ', regex=False)
    for long_name, short_name in renames_after_stripping.items():
        frame.loc[frame['location'] == long_name, 'location'] = short_name


def division_care_group_loc_plot():
    """Show four nested donut charts of events by division, care group, specialty and location.

    Works on a copy of the notebook-level DataFrame ``df_max_feature`` and
    uses its 'division-care_group', 'specialty' and 'location' columns.
    Figures, in order:

    1. division (inner ring) and care group (outer ring);
    2. division, care group and specialty;
    3. division, care group and location;
    4. division, care group, specialty and location, with lower folding
       thresholds so more slices are shown.

    Slices whose share of all events falls under a hard-coded percentage are
    merged into an 'Other' slice.
    NOTE(review): the thresholds look hand-tuned to one particular extract of
    the data - confirm before reusing on a different dataset.

    Side effects: displays four matplotlib figures and sets the global
    ``plt.rcParams['font.size']`` to 8 before the last one (it is not restored).

    Returns
    -------
    None
    """
    # ---- division / care group -------------------------------------------
    df_plot = df_max_feature.copy()
    df_plot['division-care_group'] = df_plot['division-care_group'].str.replace(' and ', ' & ', regex=False)
    divider = df_plot['division-care_group'].str.split(" Division - ", n=1, expand=True)
    df_plot['division'] = divider[0]
    df_plot['care_group'] = divider[1]
    df_plot.loc[df_plot.division == 'Corporate Functions', 'care_group'] = 'Corporate Functions'
    df_plot = df_plot[df_plot['division-care_group'] != 'Corporate Functions']
    # .copy() so the column assignments below act on an independent frame.
    df_plot = df_plot[df_plot.care_group != 'Pathology'].copy()

    # Divisions ordered from most to least events; this order drives every ring.
    division_counts = df_plot['division'].value_counts()
    division_labels = [str(name) for name in division_counts.index]

    df_plot['division_sorted'] = pd.Categorical(df_plot['division'], division_labels)
    df_plot = df_plot.sort_values("division_sorted")

    # Care groups ordered by size within their division.
    care_group_order = []
    for division in division_counts.index:
        per_division_counts = df_plot.loc[df_plot.division == division, 'care_group'].value_counts()
        care_group_order += [str(name) for name in per_division_counts.index]

    df_plot['care_group_sorted'] = pd.Categorical(df_plot['care_group'], care_group_order)
    df_plot = df_plot.sort_values(['division_sorted', 'care_group_sorted'])

    # Fold small care groups into a single 'Other' care group.
    care_group_counts = _ordered_counts(df_plot, 'care_group')
    care_group_percentages = 100*care_group_counts/sum(care_group_counts.values)
    for care_group, percentage in care_group_percentages.items():
        if percentage <= 2.210928:
            df_plot.loc[df_plot['care_group'] == care_group, 'care_group'] = 'Other'
    care_group_counts = _ordered_counts(df_plot, 'care_group')
    care_group_labels = care_group_counts.index

    # ---- colours shared by all figures ------------------------------------
    # 'division' and 'care_group' are not modified after this point, so the
    # colours of the two innermost rings can be computed once.
    division_colour_dict = {'Medicine & Urgent Care':plt.cm.Reds, 'Networked Services':plt.cm.Greens, 'Surgical & Outpatients':plt.cm.Blues, 'Clinical Delivery':plt.cm.Purples}
    inner_ring_colours = [division_colour_dict[division] for division in division_labels]
    division_colours = [colourmap(0.9) for colourmap in inner_ring_colours]
    care_group_colours = _care_group_ring_colours(df_plot, division_counts.index, inner_ring_colours)
    care_group_label_colours = _alternating_colours(len(care_group_counts), 'k', 'w')
    division_labels_newlines = _wrap_labels(division_labels)
    renal_override = {'Renal\n&\nTransplantation': 'Renal &\nTransplantation'}

    # ---- figure 1: division + care group ----------------------------------
    fig, ax = plt.subplots()
    ax.axis('equal')
    _draw_ring(ax, division_counts, division_labels_newlines, division_colours,
               radius=1.9, width=1.9, labeldistance=0.5, textprops={'color': 'w'})
    _draw_ring(ax, care_group_counts, care_group_labels, care_group_colours,
               radius=2.2, width=0.4, label_colours=care_group_colours)
    plt.margins(0, 0)
    plt.title('Medicine patient safety events at QA - division (inner ring) and care group (outer ring)', y=1.6)
    plt.show()

    # ---- specialty ---------------------------------------------------------
    df_plot['specialty'] = df_plot['specialty'].str.replace(' Rehabilitation', '\nRehab', regex=False)
    df_plot['specialty'] = df_plot['specialty'].str.replace('Ear Nose and Throat', 'ENT', regex=False)
    df_plot['specialty'] = df_plot['specialty'].str.replace('Acute ', 'Acute\n', regex=False)

    specialty_order = _prefixed_labels_by_count(df_plot, care_group_counts.index, 'specialty')
    # Drop the repeated word in labels such as '<x> Surgery - <y> Surgery'.
    specialty_order = [re.sub(r"Surgery\s-\s(.+)\sSurgery",r"Surgery - \1",s) for s in specialty_order]
    df_plot['specialty'] = df_plot['specialty'].str.replace(' Surgery', '', regex=False)
    df_plot['care_group-specialty'] = df_plot['care_group'] + ' - ' + df_plot['specialty']
    # NOTE(review): labels whose care group does not end in 'Surgery' keep the
    # word in `specialty_order` but lose it in the column, so those rows get a
    # missing category and sort last - confirm this is intended.
    df_plot['specialty_sorted'] = pd.Categorical(df_plot['care_group-specialty'], specialty_order)
    df_plot = df_plot.sort_values(['division_sorted', 'care_group_sorted', 'specialty_sorted'])

    specialty_counts = _fold_small_categories(df_plot, 'care_group-specialty', 1.407625, inclusive=True)
    specialty_labels = [parts[1] for parts in specialty_counts.index.str.split(" - ", n=1)]

    # ---- figure 2: division + care group + specialty ----------------------
    care_group_labels_newlines = _wrap_labels(care_group_labels, renal_override)
    specialty_colours = _outer_ring_colours(df_plot, division_counts.index, inner_ring_colours, 'care_group-specialty')

    fig, ax = plt.subplots()
    ax.axis('equal')
    _draw_ring(ax, division_counts, division_labels_newlines, division_colours,
               radius=1.9, width=1.9, labeldistance=0.5, textprops={'color': 'w'})
    _draw_ring(ax, care_group_counts, care_group_labels_newlines, care_group_colours,
               radius=3.6, width=1.7, label_colours=care_group_label_colours,
               labeldistance=0.7, textprops={'color': 'k'})
    plt.margins(0, 0)
    _draw_ring(ax, specialty_counts, specialty_labels, specialty_colours,
               radius=3.9, width=0.3, label_colours=specialty_colours)
    plt.margins(0, 0)
    plt.title('Medicine patient safety events at QA - division (inner ring), care group (middle ring), specialty (outer ring)', y=2.3)
    plt.show()

    # ---- location ----------------------------------------------------------
    _shorten_location_names(df_plot)

    location_order = _prefixed_labels_by_count(df_plot, care_group_counts.index, 'location')
    df_plot['care_group-location'] = df_plot['care_group'] + ' - ' + df_plot['location']
    df_plot['location_sorted'] = pd.Categorical(df_plot['care_group-location'], location_order)
    df_plot = df_plot.sort_values(['division_sorted', 'care_group_sorted', 'location_sorted'])

    location_counts = _fold_small_categories(df_plot, 'care_group-location', 1.407625)
    location_labels = [parts[1] for parts in location_counts.index.str.split(" - ", n=1)]

    # ---- figure 3: division + care group + location -----------------------
    location_colours = _outer_ring_colours(df_plot, division_counts.index, inner_ring_colours, 'care_group-location')

    fig, ax = plt.subplots()
    ax.axis('equal')
    _draw_ring(ax, division_counts, division_labels_newlines, division_colours,
               radius=1.9, width=1.9, labeldistance=0.5, textprops={'color': 'w'})
    _draw_ring(ax, care_group_counts, care_group_labels_newlines, care_group_colours,
               radius=3.6, width=1.7, label_colours=care_group_label_colours,
               labeldistance=0.7, textprops={'color': 'k'})
    plt.margins(0, 0)
    _draw_ring(ax, location_counts, location_labels, location_colours,
               radius=3.9, width=0.3, label_colours=location_colours)
    plt.margins(0, 0)
    plt.title('Medicine patient safety events at QA - division (inner ring), care group (middle ring), location (outer ring)', y=2.3)
    plt.show()

    # ---- finer-grained folding for the four-ring figure --------------------
    # Both combined columns are rebuilt from the unfolded values first;
    # otherwise the lower thresholds below could never un-fold anything that
    # was already merged into 'Other' at the higher 1.407625 threshold.
    df_plot['care_group-specialty'] = df_plot['care_group'] + ' - ' + df_plot['specialty']
    specialty_counts = _fold_small_categories(df_plot, 'care_group-specialty', 0.860021)
    specialty_labels = [parts[1] for parts in specialty_counts.index.str.split(" - ", n=1)]

    df_plot['care_group-location'] = df_plot['care_group'] + ' - ' + df_plot['location']
    location_counts = _fold_small_categories(df_plot, 'care_group-location', 0.701596 + 0.000001)
    location_labels = [parts[1] for parts in location_counts.index.str.split(" - ", n=1)]

    # NOTE(review): this changes the font size globally, i.e. also for every
    # figure drawn after this function returns - confirm that is wanted.
    plt.rcParams.update({'font.size': 8})

    # ---- figure 4: division + care group + specialty + location -----------
    division_labels_newlines = _wrap_labels(
        division_labels, {'Surgical\n&\nOutpatients': 'Surgical &\nOutpatients'})
    care_group_labels_newlines = _wrap_labels(care_group_labels, {
        'Renal\n&\nTransplantation': 'Renal &\nTransplantation',
        'Critical\nCare,\nTheatres,\nAnaesthetics\n&\nHSDU': 'Critical Care, Theatres,\nAnaesthetics\n& HSDU',
    })
    specialty_labels_newlines = _wrap_labels(specialty_labels)
    specialty_colours = _outer_ring_colours(df_plot, division_counts.index, inner_ring_colours, 'care_group-specialty')
    specialty_label_colours = _alternating_colours(len(specialty_counts), 'k', 'gray')
    location_colours = _outer_ring_colours(df_plot, division_counts.index, inner_ring_colours, 'care_group-location')

    fig, ax = plt.subplots()
    ax.axis('equal')
    _draw_ring(ax, division_counts, division_labels_newlines, division_colours,
               radius=0.8, width=0.8, labeldistance=0.4, textprops={'color': 'w'})
    _draw_ring(ax, care_group_counts, care_group_labels_newlines, care_group_colours,
               radius=2.7, width=1.9, label_colours=care_group_label_colours,
               labeldistance=0.6, textprops={'color': 'k'})
    plt.margins(0, 0)
    _draw_ring(ax, specialty_counts, specialty_labels_newlines, specialty_colours,
               radius=3.6, width=0.9, label_colours=specialty_label_colours,
               labeldistance=0.8)
    plt.margins(0, 0)
    _draw_ring(ax, location_counts, location_labels, location_colours,
               radius=3.9, width=0.3, label_colours=location_colours)
    plt.margins(0, 0)
    plt.show()

In [None]:
def heatmap():
    """
    Draw per-location event counts as bubbles on top of the floor-plan images.

    The function works in four stages:

    1. Copy the notebook-level ``df_max_feature`` frame and collapse its free-text
       ``location`` values into the labels used on the floor plans (ward codes,
       unit names), dropping locations that are deliberately not plotted.
    2. Count events per normalised location and tag each location with a floor letter.
    3. Run Tesseract OCR on ``level_maps_a.png`` ... ``level_maps_g.png`` to obtain
       the bounding box of every printed word, then patch known OCR misreads and
       nudge a few label positions by hand.
    4. Join the counts to the OCR words on the label text and scatter one bubble
       per match over the corresponding floor image (bubble area and colour both
       encode the event count, with a colour scale shared by all floors).

    Relies on names defined outside this function: ``df_max_feature`` and the
    imported ``pytesseract``, ``Image``, ``mpimg``, ``plt``, ``pd`` and ``re``.
    Returns ``None``; one matplotlib figure per floor is created as a side effect.
    """
    df_heatmap = df_max_feature.copy()

    def find_ward(location):
        """Reduce a location to its first ward code (letter A-G followed by a digit)."""
        ward = re.findall(r"[A-G][0-9]", location)
        return ward[0] if ward else location

    def find_ward1to4(location):
        """Collapse ward codes D1-G4 onto a single '<letter>3' label per floor."""
        ward1to4 = re.findall(r"[D-G][1-4]", location)
        return ward1to4[0][0] + '3' if ward1to4 else location

    def find_wardBCF5to7(location):
        """Collapse ward codes B/C/F 5-7 onto a single '<letter>6' label per floor."""
        # The capture group makes findall return only the floor letter.
        ward5to7 = re.findall(r"(B|C|F)[5-7]", location)
        return ward5to7[0][0] + '6' if ward5to7 else location

    def find_wardEG6to8(location):
        """Collapse ward codes E/G 6-8 onto a single '<letter>7' label per floor."""
        ward6to8 = re.findall(r"(E|G)[6-8]", location)
        return ward6to8[0][0] + '7' if ward6to8 else location

    def find_haematology_or_oncology(location):
        """Map any Haematology / Oncology location onto the combined centre label."""
        if 'Haematology' in location or 'Oncology' in location:
            return 'Haematology & Oncology Centre'
        return location

    def find_ED(location):
        """Map any location mentioning 'Emergency Department' onto that single label."""
        return 'Emergency Department' if 'Emergency Department' in location else location

    def find_Ophthalmology(location):
        """Map any Ophthalmology location onto the 'Eye' label."""
        return 'Eye' if 'Ophthalmology' in location else location

    def find_surgical(location):
        """Map any location mentioning 'Surgical' onto ward label 'E3'."""
        return 'E3' if 'Surgical' in location else location

    # The normalisers are order-dependent: each one sees the output of the previous one.
    location_normalisers = (
        find_ward,
        find_ward1to4,
        find_wardBCF5to7,
        find_wardEG6to8,
        find_haematology_or_oncology,
        find_ED,
        find_Ophthalmology,
        find_surgical,
    )
    for normalise in location_normalisers:
        df_heatmap['location'] = df_heatmap['location'].map(normalise)

    # Exact-match renames onto the floor-plan labels. No target is also a source,
    # so a single dictionary replace is equivalent to applying them one by one.
    # NOTE(review): 'Ophthalmology Eye Emergency Department' can never match here
    # because find_ED has already turned it into 'Emergency Department' - confirm
    # whether eye emergencies were meant to be counted under 'Eye'.
    location_renames = {
        'A7': 'Paediatric Unit',
        'A8': 'Paediatric Unit',
        'Administration Offices Renal & Transplant': 'Renal Unit',
        'Renal Day Unit  QA': 'Renal Unit',
        'Anti Coagulation Clinic QA': 'Haematology & Oncology Centre',
        'Cardiac Catheter Laboratory': 'Cardiology',
        'Cardiac Day Unit (CDU)': 'Cardiology',
        'Cardiology Outpatients QA': 'C Level Outpatients',
        'Childrens Assesment Unit (CAU)': 'Paediatric Unit',
        'Colorectal Outpatients QA': 'C Level Outpatients',
        'Diabetes and Endocrinology Centre QA': 'Diabetes and Endocrinology',
        'ENT Outpatients QA': 'Ear, Nose & Throat',
        'Early Pregnancy Assessment Unit': 'A5',
        'Endoscopy Unit D Level QA': 'Endoscopy Unit',
        'Eye Day Case Unit QA': 'Eye Unit',
        'Gastroenterology  Outpatients QA': 'D Level Outpatients',
        'Gynae Outpatients QA': 'Gynaecology Outpatients',
        'Haemodialysis Unit': 'Renal Haemodialysis',
        'Maxillofacial Outpatients QA': 'D Level Outpatients',
        'Ophthalmology Eye Emergency Department': 'Eye Unit',
        'Pathology Department': 'Pathology',
        'Peritoneal Dialysis': 'Renal Unit',
        'Radiotherapy  Dept within CHOC': 'F6',
        'Receipt and Delivery': 'Reception',
        'Reception (Atrium)': 'Reception',
    }
    df_heatmap['location'] = df_heatmap['location'].replace(location_renames)
    # Strip the ' Unit' suffix everywhere, then restore it for the renal unit only.
    df_heatmap['location'] = df_heatmap['location'].str.replace(' Unit', '', regex=False)
    df_heatmap.loc[df_heatmap['location'] == 'Renal', 'location'] = 'Renal Unit'

    # Locations that are deliberately left off the floor plans.
    excluded_locations = [
        "Administration Offices Women and Children's",
        'Anaesthetics Department',
        'Antenatal Clinic',
        'Diagnostic Imaging',
        'ICT  Centre QA',
        'Observation Ward',
        'Occupational Health',
        'Orthopaedic Pre Operation Clinic QA',
        'Pre-Operative Assessment (POA)',
        'Research & Development Department',
        'Respiratory Day',
        'TIA Clinic',
        'Theatres QA',
        'Transplant and Retrieval  QA',
    ]
    excluded_substrings = ['Maternity', 'QA@ Home ']
    df_heatmap = df_heatmap[~df_heatmap['location'].isin(excluded_locations)]
    for fragment in excluded_substrings:
        df_heatmap = df_heatmap[~df_heatmap['location'].str.contains(fragment, regex=False)]

    location_counts = df_heatmap['location'].value_counts().sort_index(ascending=True)

    df_heatmap_counts = location_counts.rename_axis('location').reset_index(name='events')

    # Default floor is the first character of the label (correct for ward codes);
    # named units get an explicit floor instead.
    # NOTE(review): 'floor' is not used by the merge below, which matches on label
    # text only - confirm whether the merge was meant to be restricted per floor.
    df_heatmap_counts['floor'] = df_heatmap_counts['location'].str[0]
    floor_overrides = {
        'Acute Medical (AMU) (MAU)': 'C',
        'Diabetes and Endocrinology': 'C',
        'Discharge': 'A',
        'Ear, Nose & Throat': 'D',
        'Emergency Department': 'C',
        'Endoscopy': 'D',
        'Eye': 'B',
        'Gynaecology Outpatients': 'D',
        'Haematology & Oncology Centre': 'B',
        'Medical Physics Department': 'F',
        'Paediatric': 'A',
        'Pathology': 'E',
        'Pharmacy QA': 'C',
        'Reception': 'A',
        'Rheumatology Outpatients QA': 'C',
        'Theatre Admissions Suite (TAS)': 'E',
    }
    for place, level in floor_overrides.items():
        df_heatmap_counts.loc[df_heatmap_counts['location'] == place, 'floor'] = level
    # Every renal location is assigned floor G, overriding the rules above.
    is_renal = df_heatmap_counts['location'].str.contains('Renal', regex=False)
    df_heatmap_counts.loc[is_renal, 'floor'] = 'G'

    df_heatmap_counts['location'] = df_heatmap_counts['location'].str.replace(' QA', '', regex=False)

    # NOTE(review): machine-specific Windows path to the Tesseract executable.
    pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
    whitespace_tokens = [' ' * width for width in range(1, 6)]
    text_dict = {}
    floors = ['A','B','C','D','E','F','G']
    for floor in floors:
        # Close the image file as soon as OCR is done instead of leaking the handle.
        with Image.open('level_maps_'+floor.lower()+'.png') as floor_image:
            words = pytesseract.image_to_data(floor_image, output_type='data.frame')
        words = words.dropna()
        # Drop tokens that are only 1-5 spaces; copy so the .loc patches below
        # write to an independent frame rather than a slice of the OCR output.
        text_dict[floor] = words[~words.text.isin(whitespace_tokens)].copy()

    # Manual OCR corrections as (floor, recognised text, column to set, new value).
    # Order matters: some entries match text written by an earlier entry.
    ocr_fixes = [
        ('A', 'AS,', 'text', 'A5'),
        ('A', 'Dischage', 'text', 'Discharge'),
        ('B', 'Centre', 'text', 'Haematology & Oncology Centre'),
        ('B', '(B8)', 'text', 'B8'),
        ('B', 'Neonatal', 'text', 'B9'),
        ('B', 'Eye', 'left', 1100),
        ('C', 'Day', 'text', 'Acute Medical (AMU) (MAU)'),
        ('C', 'Acute Medical (AMU) (MAU)', 'top', 354),
        ('C', 'Ambulance', 'left', 700),
        ('C', 'Ambulance', 'text', 'C Level Outpatients'),
        ('C', 'C Level Outpatients', 'left', 50),
        ('C', 'Diabetes', 'text', 'Diabetes and Endocrinology'),
        ('C', 'Blood', 'text', 'Emergency Department'),
        ('C', 'Emergency Department', 'left', 1100),
        ('C', 'Rheumatology', 'text', 'Rheumatology Outpatients'),
        ('D', 'Day', 'text', 'Day Surgery'),
        ('D', 'Nose', 'text', 'Ear, Nose & Throat'),
        ('D', 'Level', 'text', 'D Level Outpatients'),
        ('D', 'Gynaecology', 'text', 'Gynaecology Outpatients'),
        ('E', 'Theatre', 'text', 'Theatre Admissions Suite (TAS)'),
        ('E', 'E5)', 'text', 'E5'),
        ('F', 'Medical', 'text', 'Medical Physics Department'),
        ('G', 'Unit', 'text', 'Renal Unit'),
        ('G', 'G8,', 'text', 'G5'),
        ('G', 'G5', 'top', 750),
        ('G', 'G5', 'left', 700),
        ('G', 'G2,', 'text', 'G3'),
        ('G', 'Haemodialysis', 'text', 'Renal Haemodialysis'),
        ('G', 'Outpatients', 'text', 'Renal Outpatients'),
    ]
    for floor, recognised, column, value in ocr_fixes:
        floor_words = text_dict[floor]
        floor_words.loc[floor_words.text == recognised, column] = value

    # Attach event counts to every OCR word whose text equals a location label.
    merged_dict = {}
    for floor in floors:
        merged_dict[floor] = pd.merge(left=df_heatmap_counts,right=text_dict[floor],left_on='location',right_on='text')

    # plt.get_cmap is the stable pyplot accessor; plt.cm.get_cmap is deprecated in newer Matplotlib.
    cm = plt.get_cmap('RdYlGn_r')
    # Shared colour limits so bubbles are comparable between floors.
    vmin = df_heatmap_counts['events'].min()
    vmax = df_heatmap_counts['events'].max()
    area_multiplication = 6

    for floor in floors:
        img = mpimg.imread('level_maps_'+floor.lower()+'.png')
        # One figure per floor (the extra empty plt.figure() call has been dropped).
        fig, ax = plt.subplots(figsize=(60,6))
        ax.imshow(img)
        placed = merged_dict[floor]
        # Centre each bubble on the OCR bounding box of its label.
        x = placed['left'].values + placed['width'].values/2
        y = placed['top'].values + placed['height'].values/2
        size = placed['events'].values*area_multiplication
        color = placed['events'].values
        bubbles = ax.scatter(x,y,s=size,c=color,vmin=vmin,vmax=vmax,cmap=cm,alpha=0.5)
        fig.colorbar(bubbles, ax=ax, label='# of medicine patient safety events')
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)

In [None]:
def correlations():
    df_correlations = df_max_feature.drop(['inc_notes','inc_actiontaken','max_feature', 'hour', 'weekday', 'month'],axis=1).reset_index()
    df_correlations = df_correlations.drop(['recordid'],axis=1)
    df_correlations['inc_time'] = df_correlations['inc_time'].str.replace(':','')
    df_correlations['inc_submittedtime'] = df_correlations['inc_submittedtime'].str.replace(':','')

    df_correlations['inc_year'] = df_correlations.inc_dincident.dt.year
    df_correlations['inc_month'] = df_correlations.inc_dincident.dt.month
    df_correlations['inc_day'] = df_correlations.inc_dincident.dt.day
    df_correlations['inc_week'] = df_correlations.inc_dincident.dt.week
    df_correlations['inc_weekday'] = df_correlations.inc_dincident.dt.weekday
    df_correlations['inc_quarter'] = df_correlations.inc_dincident.dt.quarter
    df_correlations['inc_hour'] = df_correlations['inc_time'].str[:2]
    df_correlations['inc_minute'] = df_correlations['inc_time'].str[2:4]
    df_correlations = df_correlations.astype({'inc_hour':'int'})
    df_correlations = df_correlations.astype({'inc_minute':'int'})
    df_correlations['inc_datetime'] = df_correlations.apply(lambda x: x.loc['inc_dincident'] + dt.timedelta(hours=x.loc['inc_hour'],minutes=x.loc['inc_minute']), axis=1)
    df_correlations['reported_year'] = df_correlations.inc_dreported.dt.year
    df_correlations['reported_month'] = df_correlations.inc_dreported.dt.month
    df_correlations['reported_day'] = df_correlations.inc_dreported.dt.day
    df_correlations['reported_week'] = df_correlations.inc_dreported.dt.week
    df_correlations['reported_weekday'] = df_correlations.inc_dreported.dt.weekday
    df_correlations['reported_quarter'] = df_correlations.inc_dreported.dt.quarter
    df_correlations['reported_hour'] = df_correlations['inc_submittedtime'].str[:2]
    df_correlations['reported_minute'] = df_correlations['inc_submittedtime'].str[2:4]
    df_correlations = df_correlations.astype({'reported_hour':'int'})
    df_correlations = df_correlations.astype({'reported_minute':'int'})
    df_correlations['reported_datetime'] = df_correlations.apply(lambda x: x.loc['inc_dreported'] + dt.timedelta(hours=x.loc['reported_hour'],minutes=x.loc['reported_minute']), axis=1)
    df_correlations['diff'] = df_correlations['reported_datetime'] - df_correlations['inc_datetime']
    df_correlations['reported-incident'] = df_correlations['diff'] / np.timedelta64(1, 'D')
    df_correlations.head()
    df_correlations = df_correlations.drop(['inc_dincident','inc_time','inc_dreported','inc_submittedtime','inc_datetime','reported_datetime','diff'],axis=1)

    def cramers_v(x, y):
        """
        Calculates Cramer's V statistic for categorical-categorical association.
        Uses correction from Bergsma and Wicher, Journal of the Korean Statistical Society 42 (2013): 323-328.
        This is a symmetric coefficient: V(x,y) = V(y,x)
        Original function taken from: https://stackoverflow.com/a/46498792/5863503
        Wikipedia: https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V

        Degenerate inputs - fewer than two observations, a variable with a single
        category, or so many categories that the bias-corrected dimension is not
        positive - carry no measurable association and return 0.0 instead of
        dividing by zero and propagating NaN.

        **Returns:** float in the range of [0,1]
        Parameters
        ----------
        x : Pandas Series (or anything pd.crosstab accepts as a single factor)
            A sequence of categorical measurements
        y : Pandas Series (or anything pd.crosstab accepts as a single factor)
            A sequence of categorical measurements
        """
        contingency = pd.crosstab(x, y)
        n_obs = contingency.to_numpy().sum()
        n_rows, n_cols = contingency.shape
        # The correction terms divide by (n - 1), and chi2 needs a non-empty table.
        if n_obs <= 1 or min(n_rows, n_cols) < 2:
            return 0.0
        chi2 = ss.chi2_contingency(contingency)[0]
        phi2 = chi2 / n_obs
        # Bias-corrected phi^2 and table dimensions (Bergsma & Wicher).
        phi2corr = max(0.0, phi2 - ((n_cols - 1) * (n_rows - 1)) / (n_obs - 1))
        rcorr = n_rows - ((n_rows - 1) ** 2) / (n_obs - 1)
        kcorr = n_cols - ((n_cols - 1) ** 2) / (n_obs - 1)
        denominator = min(kcorr - 1, rcorr - 1)
        # Reaches zero when a variable has as many categories as observations.
        if denominator <= 0:
            return 0.0
        return np.sqrt(phi2corr / denominator)


    def theils_u(x, y):
        """
        Theil's U (uncertainty coefficient) for categorical-categorical association.
        Measures how much of the uncertainty in x is removed by knowing y: 0 means
        y tells nothing about x, 1 means y determines x completely.
        The coefficient is asymmetric: U(x,y) != U(y,x)
        Wikipedia: https://en.wikipedia.org/wiki/Uncertainty_coefficient
        **Returns:** float in the range of [0,1]
        Parameters
        ----------
        x : list / NumPy ndarray / Pandas Series
            A sequence of categorical measurements
        y : list / NumPy ndarray / Pandas Series
            A sequence of categorical measurements
        """
        # conditional_entropy is expected to be defined elsewhere in the notebook.
        entropy_x_given_y = conditional_entropy(x, y)
        category_counts = Counter(x)
        n_observations = sum(category_counts.values())
        probabilities = [count / n_observations for count in category_counts.values()]
        entropy_x = ss.entropy(probabilities)
        # A constant x has zero entropy and is reported as fully determined.
        if entropy_x == 0:
            return 1
        return (entropy_x - entropy_x_given_y) / entropy_x


    def correlation_ratio(categories, measurements):
        """
        Calculates the Correlation Ratio (sometimes marked by the greek letter Eta) for categorical-continuous association.
        Answers the question - given a continuous value of a measurement, is it possible to know which category is it
        associated with?
        Value is in the range [0,1], where 0 means a category cannot be determined by a continuous measurement, and 1 means
        a category can be determined with absolute certainty.
        Wikipedia: https://en.wikipedia.org/wiki/Correlation_ratio

        Inputs are matched by position. ``measurements`` is converted to a NumPy
        array first, so plain lists work and a Series with a non-default index is
        not accidentally looked up by label. Empty input returns 0.0.

        **Returns:** float in the range of [0,1]
        Parameters
        ----------
        categories : list / NumPy ndarray / Pandas Series
            A sequence of categorical measurements
        measurements : list / NumPy ndarray / Pandas Series
            A sequence of continuous measurements
        """
        measurements = np.asarray(measurements)
        if not isinstance(categories, (pd.Series, pd.Index, np.ndarray)):
            categories = np.asarray(categories)
        if measurements.size == 0:
            return 0.0
        # Integer code per observation; missing categories are coded -1 and
        # therefore fall outside every per-category group below.
        fcat, _ = pd.factorize(categories)
        cat_num = np.max(fcat)+1
        y_avg_array = np.zeros(cat_num)
        n_array = np.zeros(cat_num)
        for i in range(0,cat_num):
            cat_measures = measurements[fcat == i]
            n_array[i] = len(cat_measures)
            y_avg_array[i] = np.average(cat_measures)
        y_total_avg = np.sum(np.multiply(y_avg_array,n_array))/np.sum(n_array)
        # Between-category sum of squares over total sum of squares.
        numerator = np.sum(np.multiply(n_array,np.power(np.subtract(y_avg_array,y_total_avg),2)))
        denominator = np.sum(np.power(np.subtract(measurements,y_total_avg),2))
        # A zero numerator also covers the zero-variance case (denominator == 0).
        if numerator == 0:
            eta = 0.0
        else:
            eta = np.sqrt(numerator/denominator)
        return eta


    def associations(dataset, nominal_columns=None, mark_columns=False, theil_u=False, plot=True,
                              return_results = False, **kwargs):
        """
        Calculate the correlation/strength-of-association of features in data-set with both categorical (eda_tools) and
        continuous features using:
         * Pearson's R for continuous-continuous cases
         * Correlation Ratio for categorical-continuous cases
         * Cramer's V or Theil's U for categorical-categorical cases
        **Returns:** a DataFrame of the correlation/strength-of-association between all features
        when ``return_results`` is True, otherwise None. In the returned frame the diagonal and
        everything below it are set to 0.0, so each pair appears once (row earlier than column).
        **Example:** see `associations_example` under `dython.examples`
        Parameters
        ----------
        dataset : Pandas DataFrame
            The data-set for which the features' correlation is computed
        nominal_columns : string / list / NumPy ndarray
            Names of columns of the data-set which hold categorical values. Can also be the string 'all' to state that all
            columns are categorical, or None (default) to state none are categorical. Any other single string is
            treated as one column name.
        mark_columns : Boolean, default = False
            if True, output's columns' names will have a suffix of '(nom)' or '(con)' based on there type (eda_tools or
            continuous), as provided by nominal_columns
        theil_u : Boolean, default = False
            In the case of categorical-categorical feaures, use Theil's U instead of Cramer's V
        plot : Boolean, default = True
            If True, plot a heat-map of the correlation matrix
        return_results : Boolean, default = False
            If True, the function will return a Pandas DataFrame of the computed associations
        kwargs : any key-value pairs
            Only ``figsize`` is read; it is forwarded to ``plt.figure``
        """
        columns = dataset.columns
        n_features = len(columns)

        # Normalise the three documented spellings of "which columns are categorical".
        if nominal_columns is None:
            nominal = set()
        elif isinstance(nominal_columns, str):
            nominal = set(columns) if nominal_columns == 'all' else {nominal_columns}
        else:
            nominal = set(nominal_columns)

        # Fill a float matrix by position: values[row, col]. This avoids chained
        # DataFrame assignment and the object dtype of an empty DataFrame.
        values = np.full((n_features, n_features), np.nan)
        for i in range(n_features):
            values[i, i] = 0.0
            for j in range(i + 1, n_features):
                feature_i = dataset[columns[i]]
                feature_j = dataset[columns[j]]
                i_is_nominal = columns[i] in nominal
                j_is_nominal = columns[j] in nominal
                if i_is_nominal and j_is_nominal:
                    if theil_u:
                        # Asymmetric: row = variable being explained, column = explaining variable.
                        values[i, j] = theils_u(feature_i, feature_j)
                        values[j, i] = theils_u(feature_j, feature_i)
                        continue
                    cell = cramers_v(feature_i, feature_j)
                elif i_is_nominal:
                    cell = correlation_ratio(feature_i, feature_j)
                elif j_is_nominal:
                    cell = correlation_ratio(feature_j, feature_i)
                else:
                    cell, _ = ss.pearsonr(feature_i, feature_j)
                values[i, j] = cell
                values[j, i] = cell

        if mark_columns:
            axis_labels = ['{} (nom)'.format(col) if col in nominal else '{} (con)'.format(col) for col in columns]
        else:
            axis_labels = columns
        corr = pd.DataFrame(values, index=axis_labels, columns=axis_labels)

        if plot:
            plt.figure(figsize=kwargs.get('figsize',None))
            labels = np.round(corr.to_numpy(),decimals=1).astype(str)
            # Annotate only the cells below the diagonal; blank the rest.
            labels[np.triu_indices(n_features)] = ''
            ax = sns.heatmap(corr, annot=labels, fmt='')
            cbar = ax.collections[0].colorbar
            cbar.set_ticks([-0.2, 0, 0.5, 0.9])
            cbar.set_ticklabels(['-0.2 anti-associated', '0 no association', '0.5 associated', '0.9 closely associated'])
            plt.title('Amount of association between variables. 1:fully associated, 0:no association, -1:fully anti-associated')
            plt.show()
        if return_results:
            # Zero the diagonal and lower triangle by position, so this also works
            # after mark_columns has relabelled the axes.
            diagonal_and_below = np.tril(np.ones((n_features, n_features), dtype=bool))
            return corr.mask(diagonal_and_below, 0.0)

    columns_list = df_correlations.columns.values
    columns_list = [el.replace('inc_','') for el in columns_list]
    columns_list = [el.replace('location','ward/dept/unit') for el in columns_list]
    columns_list = [el.replace('loctype','location_type') for el in columns_list]
    columns_list = [el.replace('show_other_contacts','any_other_patient_involved') for el in columns_list]
    columns_list = [el.replace('show_employee','any_other_employee_involved') for el in columns_list]
    columns_list = [el.replace('show_witness','any_witness') for el in columns_list]
    columns_list = [el.replace('show_document','any_document_attached') for el in columns_list]
    columns_list = [el.replace('rep_email','email') for el in columns_list]
    columns_list = [el.replace('reportedby','role') for el in columns_list]
    df_correlations.columns=columns_list
    nominal_columns = df_correlations.select_dtypes('O').columns.values
    #associations(df_correlations,nominal_columns,figsize=(11,11))

    df_correlations_new = df_correlations.drop(['reported_year','reported_month','reported_day','reported_week','reported_weekday','reported_quarter','reported_hour','reported_minute','day','week','quarter','minute','any_other_patient_involved','any_other_employee_involved','any_witness','any_document_attached','employee_involved','email'],axis=1)
    nominal_columns = df_correlations_new.select_dtypes('O').columns.values
    returned_results = associations(df_correlations_new,nominal_columns,figsize=(11,11),return_results=True,plot=False)

    df_association = pd.DataFrame(columns=['row','col','association','description','interest'])
    while returned_results.values.max() > 0.2:
        for row in list(returned_results.index):
            for col in returned_results.columns:
                if returned_results.at[row,col] == returned_results.values.max():
                    df_association = df_association.append({'row':row,'col':col,'association':returned_results.values.max()},ignore_index=True)
                    returned_results.at[row,col] = 0
                    break

    df_association = df_association.astype({'description':'object','interest':'object'})

    df_association.loc[(df_association['row']=='result') & (df_association['col']=='severity'),'description'] = "both contain options for 'near miss' and 'no harm'"
    df_association.loc[(df_association['row']=='result') & (df_association['col']=='severity'),'interest'] = "expected"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='ward/dept/unit'),'description'] = "wards/depts/units are within specific divsions & care groups"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='ward/dept/unit'),'interest'] = "expected"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='specialty'),'description'] = "the fact this isn't 1.0 shows that there are outliers outside of their care group"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='specialty'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='year'),'description'] = "recently, some staff have been reporting more"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='year'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='year'),'description'] = "recently, some roles have been reporting more"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='year'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='hour'),'description'] = "some staff are more likely to see incidents at certain hours"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='hour'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='email'),'description'] = "some staff are in multiple roles"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='email'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='reported-incident'),'description'] = "some staff take longer to report after the incident"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='reported-incident'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='weekday'),'description'] = "some staff are more likely to see incidents on certain weekdays"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='weekday'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='month'),'description'] = "some staff are more likely to see incidents on certain months"
    df_association.loc[(df_association['row']=='email') & (df_association['col']=='month'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='specialty'),'description'] = "the fact this isn't 1.0 shows that there are outliers outside of their ward/dept/unit"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='specialty'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='email'),'description'] = "some divisions & care groups have more staff reporting"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='email'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='email'),'description'] = "some location types have more staff reporting"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='email'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='location_type'),'description'] = "some location types are more likely to be in specific wards/depts/units"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='location_type'),'interest'] = "expected"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='email'),'description'] = "some specialties have more staff reporting"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='email'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='location_type'),'description'] = "some location types are more likely to be in specific divisions & care groups"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='location_type'),'interest'] = "expected"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='email'),'description'] = "some wards/depts/units have more staff reporting"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='email'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='reported-incident'),'description'] = "some wards/depts/units take longer to report after the incident"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='reported-incident'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='location_type'),'description'] = "some location types are more likely to be in specific specialties"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='location_type'),'interest'] = "expected"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='reported-incident'),'description'] = "some location types take longer to report after the incident"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='reported-incident'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='reported-incident'),'description'] = "some divisions & care groups take longer to report after the incident"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='reported-incident'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='severity') & (df_association['col']=='email'),'description'] = "some staff see more severe incidents"
    df_association.loc[(df_association['row']=='severity') & (df_association['col']=='email'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='year'),'description'] = "some staff see incidents with specific results"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='year'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='result') & (df_association['col']=='email'),'description'] = "recently, some wards/depts/units have had more incidents"
    df_association.loc[(df_association['row']=='result') & (df_association['col']=='email'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='role'),'description'] = "some roles see incidents in specific divisions & care groups"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='role'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='role'),'description'] = "some roles see incidents in specific wards/depts/units"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='role'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='role'),'description'] = "some roles see incidents in specific specialties"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='role'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='role'),'description'] = "some roles see incidents in specific location types"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='role'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='year'),'description'] = "recently, some specialties have had more incidents"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='year'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='result'),'description'] = "some wards/depts/units see incidents with specific results"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='result'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='weekday'),'description'] = "some wards/depts/units see incidents on specific weekdays"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='weekday'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='month'),'description'] = "some wards/depts/units see incidents in specific months"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='month'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='hour'),'description'] = "some wards/depts/units see incidents during specific hours"
    df_association.loc[(df_association['row']=='ward/dept/unit') & (df_association['col']=='hour'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='result') & (df_association['col']=='role'),'description'] = "some roles see incidents with specific results"
    df_association.loc[(df_association['row']=='result') & (df_association['col']=='role'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='result'),'description'] = "some specialties see incidents with specific results"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='result'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='result'),'description'] = "some divisions & care groups see incidents with specific results"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='result'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='reported-incident'),'description'] = "some specialties take longer to report after the incident"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='reported-incident'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='result'),'description'] = "some specialties see incidents in specific months"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='result'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='reported-incident'),'description'] = "some roles take longer to report after the incident"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='reported-incident'),'interest'] = "interesting"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='month'),'description'] = "some locationt types see incidents with specific results"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='month'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='weekday'),'description'] = "some specialties see incidents on specific weekdays"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='weekday'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='year'),'description'] = "recently, some location types have had more incidents"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='year'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='hour'),'description'] = "some specialties see incidents during specific hours"
    df_association.loc[(df_association['row']=='specialty') & (df_association['col']=='hour'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='weekday'),'description'] = "some roles see incidents on specific weekdays"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='weekday'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='weekday'),'description'] = "some location types see incidents on specific weekdays"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='weekday'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='hour'),'description'] = "some location types see incidents during specific hours"
    df_association.loc[(df_association['row']=='location_type') & (df_association['col']=='hour'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='month'),'description'] = "some roles see incidents in specific months"
    df_association.loc[(df_association['row']=='role') & (df_association['col']=='month'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='year'),'description'] = "recently, some divisions & care groups have had more incidents"
    df_association.loc[(df_association['row']=='division-care_group') & (df_association['col']=='year'),'interest'] = "interesting?"
    df_association.loc[(df_association['row']=='severity') & (df_association['col']=='role'),'description'] = "some roles see more severe incidents"
    df_association.loc[(df_association['row']=='severity') & (df_association['col']=='role'),'interest'] = "interesting"

    cm = sns.cm.rocket
    color = df_association['association']
    size = np.ones(len(df_association))*200
    plt.figure(figsize=(17,df_association.index.max()/3))
    plt.ylim(df_association.index.max()+1,-1)
    plt.xlim(0.2,2.2)
    plt.scatter(np.ones(len(df_association)),df_association.index,marker='s',c=color,cmap=cm,s=size)
    plt.scatter(np.ones(len(df_association))*0.6,df_association.index,marker='*')
    plt.gca().get_xaxis().set_visible(False)
    plt.gca().get_yaxis().set_visible(False)
    for i in df_association.index:
        number_color = 'w' if df_association.at[i,'association']<0.66 else 'k'
        plt.text(1,i,round(df_association.at[i,'association'],1),ha='center',va='center',color=number_color)
        plt.text(0.3,i,df_association.at[i,'row'],ha='left',va='center')
        plt.text(0.7,i,df_association.at[i,'col'],ha='left',va='center')
        plt.text(1.1,i,df_association.at[i,'description'],ha='left',va='center')
        interest_color = 'r' if df_association.at[i,'interest']=='expected' else 'g'
        plt.text(2.0,i,df_association.at[i,'interest'],ha='left',va='center',color=interest_color)
        plt.axhline(i+0.5,color='k',linewidth=1)
    plt.show()
    
    del df_correlations,df_correlations_new

In [None]:
def naive_bayes():
    """Predict each categorical incident field from the free-text notes with naive Bayes.

    Reads the module-level ``df_max_feature`` frame, drops the date/time and
    e-mail columns, splits ``division-care_group`` into ``division`` and
    ``care group`` and gives the remaining columns readable labels.

    Two outputs are drawn:

    1. A dot chart with one row per category column and 50 dots per row; the
       number of green dots is the test accuracy (in %) halved and rounded,
       using ``CustomVectorizer`` + ``LemmaTokenizer`` features.
    2. One confusion matrix per column (excluding 'location' and 'specialty'),
       using a plain English-stop-word ``TfidfVectorizer``.

    Returns nothing; the figures are shown with ``plt.show()``.
    """
    df_nb = df_max_feature.drop(['inc_dincident','inc_time','inc_dreported','inc_submittedtime','max_feature','hour','weekday','month','inc_rep_email','reported_hour'],axis=1).copy()
    df_nb['division-care_group'] = df_nb['division-care_group'].str.replace(' and ',' & ')
    division_parts = df_nb['division-care_group'].str.split(" Division - ",n=1,expand=True)
    df_nb['division'] = division_parts[0]
    df_nb['care group'] = division_parts[1]
    df_nb = df_nb.drop('division-care_group',axis=1)

    # Substring replacements, applied to every column name in this order.
    column_labels = [
        ('inc_organisation','site'),
        ('inc_locactual','ward/dept/unit'),
        ('inc_specialty','specialty'),
        ('inc_loctype','location type'),
        ('inc_result','result'),
        ('inc_severity','severity'),
        ('inc_reportedby','reported by'),
        ('show_other_contacts','other patients involved?'),
        ('show_employee','other employees involved?'),
        ('show_witness','any witnesses?'),
        ('show_document','any documents attached?'),
    ]
    for old_label, new_label in column_labels:
        df_nb.columns = [column.replace(old_label,new_label) for column in df_nb.columns]

    # Every column except the free text itself is a prediction target.
    category_columns = [column for column in df_nb.columns if column != 'inc_notes']

    # --- Part 1: accuracy per target column --------------------------------
    tfidf_accuracies = []
    for column in category_columns:
        df_nb_dropped = df_nb.dropna(subset=[column])
        y_test, tfidf_pred = _nb_fit_predict(CustomVectorizer(tokenizer=LemmaTokenizer()),
                                             df_nb_dropped['inc_notes'],
                                             df_nb_dropped[column])
        tfidf_accuracies.append(100*metrics.accuracy_score(y_test,tfidf_pred))

    _nb_plot_accuracy_dots(category_columns, tfidf_accuracies)

    # --- Part 2: confusion matrix per (non-crammed) target column ----------
    # NOTE(review): none of the renames above produce a column called
    # 'location' (they produce 'location type' and 'ward/dept/unit'); confirm
    # which column was meant. Filtering instead of list.remove() avoids a
    # ValueError when a name is absent.
    # 'reported by' was previously considered for exclusion as well.
    crammed_columns = {'location', 'specialty'}
    non_crammed_columns = [column for column in category_columns if column not in crammed_columns]

    for column in non_crammed_columns:
        df_dropped = df_nb.dropna(subset=[column])
        df_dropped = df_dropped[df_dropped[column]!='']

        # Classes ordered by frequency; encode the actual values (not their
        # str() form) so non-string classes are encoded too, and use map() so
        # the target is a proper integer Series.
        class_values = list(df_dropped[column].value_counts().index)
        classes = [str(value) for value in class_values]
        mapping = {value: code for code, value in enumerate(class_values)}
        y = df_dropped[column].map(mapping)

        y_test, tfidf_pred = _nb_fit_predict(TfidfVectorizer(stop_words='english'),
                                             df_dropped['inc_notes'], y)

        conf_matrix = metrics.confusion_matrix(y_test, tfidf_pred, labels=list(mapping.values()))
        _nb_plot_confusion_matrix(conf_matrix, classes, column)


def _nb_fit_predict(vectorizer, notes, labels):
    """Fit MultinomialNB on a 67/33 split of (notes, labels); return (y_test, predictions)."""
    X_train, X_test, y_train, y_test = train_test_split(
                                    notes, labels,
                                    test_size=0.33,
                                    random_state=53)
    # Vocabulary is learnt on the training notes only, then reused for the test notes.
    features_train = vectorizer.fit_transform(X_train.values)
    features_test = vectorizer.transform(X_test.values)

    classifier = MultinomialNB()
    classifier.fit(features_train, y_train)
    return y_test, classifier.predict(features_test)


def _nb_plot_accuracy_dots(category_columns, accuracies):
    """Draw 50 dots per category column, green up to accuracy/2 (rounded) and red beyond."""
    green_dots = [int(round(accuracy/2)) for accuracy in accuracies]
    n_category_columns = len(category_columns)

    # Keep a handle to the figure actually drawn on instead of assuming it is figure 1.
    fig_acc, ax_acc = plt.subplots()
    for i in range(50):
        dot_colors = ['green' if green_dots[j]>i else 'red' for j in range(n_category_columns)]
        ax_acc.scatter(x=np.ones(n_category_columns)*(i+1),y=category_columns,color=dot_colors)
    ax_acc.set_xlim((0,51))

    # Multi-coloured title assembled from separate text fragments.
    fig_acc.text(0, 0.95, "Correct", ha="center", va="bottom", size="large", color="green")
    fig_acc.text(0.06, 0.95, "/", ha="center", va="bottom", size="large")
    fig_acc.text(0.13,0.95,"Incorrect", ha="center", va="bottom", size="large", color="red")
    fig_acc.text(0.19, 0.95, " prediction from freetext entry for medicine patient safety events", va="bottom", size="large")


def _nb_plot_confusion_matrix(conf_matrix, classes, column):
    """Show an annotated confusion-matrix heatmap for one target column."""
    # Layout follows
    # https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
    fig, ax = plt.subplots(figsize=(11,11))
    im = ax.imshow(conf_matrix, cmap=plt.cm.Blues)

    # Colour bar on the right: 5% of the axes width, 0.05 inch padding.
    colorbar_ax = make_axes_locatable(ax).append_axes("right", size="5%", pad=0.05)
    fig.colorbar(im, cax=colorbar_ax)

    # Show all ticks and label them with the class names.
    ax.set(xticks=np.arange(conf_matrix.shape[1]),
           yticks=np.arange(conf_matrix.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title='True vs Predicted "'+str(column)+'" from freetext of medicine patient safety events',
           ylabel='True label',
           xlabel='Predicted label')
    ax.xaxis.set_label_coords(1.09, -0.02)

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; white text on the dark half of the colour scale.
    thresh = conf_matrix.max() / 2.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            ax.text(j, i, format(conf_matrix[i, j], 'd'),
                    ha="center", va="center",
                    color="white" if conf_matrix[i, j] > thresh else "black")
    fig.tight_layout()
    plt.show()
    # Release the figure so a loop over many columns does not accumulate open figures.
    plt.close(fig)

In [None]:

# One report section per theme: a word cloud of the theme's top words next to a
# donut chart highlighting that theme's share of incidents, followed by the
# per-theme breakdown plots. `donut_seg` is the position of the theme in
# `theme_counts_series` and selects which donut wedge is highlighted.
for donut_seg, (i, counts) in enumerate(theme_counts_series.items()):
    # Initialize the word cloud; canvas size scales with the theme's count
    # relative to the largest theme.
    width = int(1024*counts/theme_counts_max)
    height = int(720*counts/theme_counts_max)
    wc = WordCloud(
        background_color="white",
        width = width,
        height = height
    )

    # Select row : component
    component = components_df.iloc[i]

    # Top words of this component, with the words forced to str.
    # nlargest() returns a new Series, so the conversion has to be applied to a
    # named copy (assigning to `component.nlargest().index` was a no-op).
    top_words = component.nlargest()
    top_words.index = top_words.index.map(str)

    # Generate the cloud
    wc.generate_from_frequencies(top_words)
    wordcloud_words.append(top_words.index)

    # Display the generated image:
    figure, (wc_fig, counts_fig) = plt.subplots(nrows=1,ncols=2, figsize=(width/50,height/100))
    wc_fig.imshow(wc, interpolation='bilinear')
    wc_fig.axis("off");

    # Donut: every wedge white except the current theme, which is blue and
    # labelled "<count>/<total>".
    counts_fig.axis('equal')
    colors = ['w' for j in theme_counts_series.index]
    colors[donut_seg] = 'b'
    labels = ['' for val in theme_counts_series.values]
    labels[donut_seg] = str(counts)+"/"+str(theme_counts_series.values.sum())
    mypie, _ = counts_fig.pie(theme_counts_series.values/theme_counts_max, colors=colors, labels=labels, startangle=90, counterclock=False)
    plt.setp( mypie, width=0.4, edgecolor='black')
    plt.tight_layout()
    plt.show()

    # The plotting functions below read this module-level frame.
    df_max_feature = df[df['max_feature']==str(i)].copy()

    reportedby_plot()

    reporter_email()

    staff_involved_role()

    time_trend_plot()

    month_plot()

    day_plot()

    weekday_plot()

    hour_plot()
    # NOTE(review): savefig writes the current pyplot figure; if hour_plot()
    # already calls plt.show() the saved PDF may be blank - confirm.
    plt.savefig('hour_'+str(i)+'.pdf')

    division_care_group_loc_plot()

    heatmap()

    correlations()

    #naive_bayes()

    plt.show()

    print('-------------------------------------------------------------------------------------------------------------------')

In [None]:
# Inspect the notes containing the standalone token ' wa '
# (presumably a lemmatisation artefact of "was" - TODO confirm).
# regex=False: the pattern is a plain literal; na=False: rows with missing
# notes are excluded instead of producing NaN in the mask, which would make
# the boolean indexing raise.
df_19 = df[df['inc_notes'].str.contains(' wa ', regex=False, na=False)]
print(df_19['inc_notes'].values)