# Annotating documents using publicly-available SNOMED models on LTH data

###     
    author: naa
    created: 2023-04-03
    version: 0.1.0

This notebook annotates LTH data with the publicly-available (untrained) SNOMED model.

In [None]:
from common import *

In [None]:
# Root directory of the project; the model and data paths used in this notebook are built from it
dir_root = "/home/jovyan/nhsx_nlp"

# Load models and prepare MedCAT

In [None]:
# Vocabulary and concept database (CDB) files of the public SNOMED model pack
vocab_path = f"{dir_root}/models/mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5/vocab.dat"
cdb_path = f"{dir_root}/models/mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5/cdb.dat"

## Baseline model

In [None]:
# Directory of the public model pack, kept as a Path so it can be globbed later
MEDCAT_MODEL_PATH = Path(dir_root) / "models" / "mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5"
logging.debug(f"Loading MedCAT models from {MEDCAT_MODEL_PATH}")

In [None]:
# Load the concept database (CDB) of the public model pack.
# CDB.load returns the loaded object, so no empty CDB() needs to be created first;
# cdb_path (defined above) points at the same cdb.dat file.
cdb = CDB.load(cdb_path)

# Load the vocabulary that ships with the same model pack (again, no empty Vocab() needed)
vocab = Vocab.load(vocab_path)

## Setup config

In [None]:
# Configure the CDB through the project helper, which hands back the CDB together with a second value
# NOTE(review): common_setupConfig is presumably provided by `from common import *`; len_cdb looks like a size of the CDB - confirm
cdb, len_cdb = common_setupConfig(cdb,MEDCAT_MODEL_PATH)
# Display the second returned value
len_cdb

## Initialise meta annotator

In [None]:
# Collect every "meta_*" directory inside the model pack and load a MetaCAT from each one
meta_paths = []
meta_cats = []
for candidate in MEDCAT_MODEL_PATH.glob("meta_*"):
    if not candidate.is_dir():
        continue
    meta_paths.append(candidate)
    meta_cats.append(MetaCAT.load(save_dir_path=candidate))

# Initialise CAT (main class from medcat used for concept annotation)

In [None]:
# Build the CAT annotator from the CDB, vocabulary and meta-annotators via the project helper.
# The helper returns a second value, ts, which is not used in the rest of this notebook.
cat,ts = common_initialiseCAT(cdb, vocab, meta_cats)

## Add semantic tag field to SNOMED-CT CDB

In [None]:
# Lookup from type_id to its name, read from the CDB's additional info.
# It is used further down to attach a semantic tag to every CUI.
typeid2name = cat.cdb.addl_info['type_id2name']

# load data

In [None]:
# Read the raw neurology letters and preview the first rows
df = pd.read_csv(f"{dir_root}/data/raw/neurology_letters_2023_03_18.csv")
df.head()

# Check CAT class

In [None]:

# Display the CAT object to confirm it was created
cat

# test NER+L on one single document

In [None]:
# Annotate one short sentence by calling cat directly, then print the entities found on the returned document
text  = "He was diagnosed with a neurology issue"
doc = cat(text)
print(doc.ents)

In [None]:
# Check the type of the object returned by calling cat on a text
type(doc)

In [None]:
# Render the entities of the test document inline in the notebook with displaCy
displacy.render(doc, style='ent',jupyter=True)

In [None]:
# Same test sentence as above, this time through get_entities instead of calling cat directly
cat.get_entities("He was diagnosed with a neurology issue")

# Slice data

In [None]:
# Select the rows to annotate (NER+L). df[0:] keeps every row from the first one to the end;
# change the slice bounds to work on a subset.
data= df[0:]

In [None]:
# Number of rows selected for annotation
len(data)

# Test multiprocessing annotator function on one document

In [None]:
# Show the text of the first selected document
data.iloc[0]['doctext']

In [None]:
# Put one test document into the format expected by multiprocessing:
# a list of two-element tuples (docid, text)
in_data =[(1,"He was a neurology patient")]

In [None]:
# Run the multiprocessing annotator on the single test document and inspect the output
results = cat.multiprocessing(in_data,nproc=2) # first argument is the input data, nproc is the number of processes
results

In [None]:
# Check named entities with displaCy.
# in_data[0] is a (docid, text) tuple, so only its text element is passed to cat.
displacy.render(cat(in_data[0][1]), style='ent',jupyter=True)

# Create data iterator for multiprocessing

###  data has to be in the form of a list of tuples containing two elements each: docid and doctext

In [None]:
#uncomment the following line to filter the data by text length

#data = data[data.doctext.apply(lambda x: len(str(x))>10)] #keeps documents longer than 10 characters (len(str(x)) counts characters, not words)

In [None]:
# Display the data that will be annotated (filtered only if the line above was uncommented)
data

In [None]:
# Build the multiprocessing input as [(docid, text), (docid, text), ...],
# pairing each row's index label (used as docid) with its doctext value
in_data = list(zip(data.index, data['doctext']))

# Multiprocess based on number of documents (alternative way to multiprocessing based on number of characters)

In [None]:
# NOTE (original author): this function does not always work
# Batch size for multiprocessing_pipe, counted in documents
batch_size = 100 # Batch size (BS) in number of documents

# Run model
if __name__ == '__main__':
    import torch
    # Force the 'spawn' start method before the worker processes are created
    torch.multiprocessing.set_start_method('spawn', force=True)
    # Only the first 1000 (docid, text) pairs are sent through this pipeline
    results_pipe = cat.multiprocessing_pipe(in_data[:1000], # Formatted data
                                       batch_size = batch_size,
                                       nproc=2) # Increase it when having more cores available

# Multiprocess based on number of characters in documents, select 8 processors

In [None]:
# Batch size for cat.multiprocessing, counted in characters
batch_size_chars = 500000

# Annotate the whole corpus with 8 processes; this rebinds `results`, replacing the single-document test output from above
results = cat.multiprocessing(in_data, batch_size_chars=batch_size_chars, nproc=8) # try with small document number first if preferred

## Check result of multiprocessing with baseline model on one document

In [None]:
# The index of the input data corresponds to the key of the annotated results,
# allowing inspection of the input text alongside the extracted entities
data.iloc[550]['doctext']

In [None]:
# Annotation output stored under key 550
results[550]

The returned list of `entities` contains the following:

`acc` - Confidence score for this detection

`cui` - ID of the detected entity in the CDB (in our case SNOMED-CT)

`pretty_name` - The pretty name for this entity linked with the CUI

`detected_name` - What exact source value was detected

`type_ids` - The category code

`types` - Description label of the type_ids

`start` - The start character for the entity in the original string

`end` - End character for the entity in the original string

`id` - Internal ID, each entity inside a document has an unique ID

`meta_anns` - Each key is a customised meta-annotation task.


__Optional parameters which can also be set:__

The following can also be set to be returned during the creation of the MedCAT CDB within the model pack

`icd10` - If we are using a medical CDB, we'll also get ICD10 codes

`umls` - If the CDB was something other than UMLS, we would get the potential link to UMLS.

`snomed` - If we are using a medical CDB this would link to the equivalent SNOMED concept

In [None]:
# For each entity found in document 550, print the full entity record,
# the value of its 'Status' meta-annotation, and a blank separator line
for found_entity in results[550]['entities'].values():
    status_value = found_entity['meta_anns']['Status']['value']
    print(found_entity, status_value, "", sep="\n")

### save structured corpus of data annotated from baseline model: mc_modelpack_snomed_int_16_mar_2022_25be3857ba34bdd5

In [None]:
# Folder for the annotation outputs of the public model; the trailing slash lets file names be appended directly
DATA_DIR = dir_root + '/data/interim/Annotated_Public/'

In [None]:
# Make sure the output folder exists, then persist the annotated results.
# The context manager closes the file (and flushes the data) as soon as the dump finishes.
Path(DATA_DIR).mkdir(parents=True, exist_ok=True)
with open(DATA_DIR+'2023_05_15_public_annotated_results.dat','wb') as results_file:
    pickle.dump(results, results_file)

# Flatten nested dictionary of annotated results for easy inspection

## load untrained structured annotation corpus if required

In [None]:
# Reload the annotated results saved above; the context manager closes the file handle after loading.
# NOTE: pickle can execute arbitrary code while loading - only load files written by this notebook.
with open(DATA_DIR + '2023_05_15_public_annotated_results.dat','rb') as results_file:
    results = pickle.load(results_file)

In [None]:
# Check that the annotated corpus has as many entries as the original corpus (compare with len(data))
len(results)

In [None]:
# One empty list per column to be extracted from the multiprocessed results:
#   docid              - top-level key in the annotated results
#   cui                - concept unique identifier
#   pretty_name        - name of the concept the entity is linked to
#   source_value       - detected entity
#   detected_name      - detected entity
#   type               - semantic type (may not be one-to-one; a concept can map to more than one type)
#   context_similarity
#   text               - pulled from doctext in the raw data
#   meta_status
flat_data = {
    column: []
    for column in (
        'docid',
        'cui',
        'pretty_name',
        'source_value',
        'detected_name',
        'type',
        'context_similarity',
        'text',
        'meta_status',
    )
}

In [None]:
# Flatten the nested results: one row per detected entity, holding the document id, CUI,
# pretty name, source value, detected name, the full list of types, context similarity,
# the document text and the 'Status' meta-annotation value.
for doc, doc_result in results.items():
    for entity in doc_result['entities'].values():
        row = {
            'docid': doc,
            'cui': entity['cui'],
            'pretty_name': entity['pretty_name'],
            'source_value': entity['source_value'],
            'detected_name': entity['detected_name'],
            # whole list kept here so the one-to-one vs one-to-many CUI/type mapping can be checked
            'type': entity['types'],
            'context_similarity': entity['context_similarity'],
            'text': data.iloc[doc]['doctext'],
            'meta_status': entity['meta_anns']['Status']['value'],
        }
        for column, value in row.items():
            flat_data[column].append(value)

### check if CUIs map to more than 1 semantic type

In [None]:
# Create a list with the number of semantic types attached to each detected entity.
# The values are read from flat_data['type'] (filled in the previous cell) because the
# df_flat dataframe is only created further down in the notebook.
listtype = [len(typelist) for typelist in flat_data['type'] if typelist != 'Nil']

        

In [None]:
# Make a set of listtype. If the set contains only the number one, every concept maps to exactly one type
set_type = set(listtype)

In [None]:
# If the set displayed below is {1}, CUIs have a one-to-one relationship with semantic types.
# In that case the flattening code can be re-used in the following cells,
# taking element 0 of the entity['types'] list, knowing that the list holds a single element.
set_type 

In [None]:
# Start again from empty columns before flattening the results a second time
# (docid is the top-level key in the annotated results; text is pulled from doctext in the raw data)
flat_data = {
    column: []
    for column in (
        'docid',
        'cui',
        'pretty_name',
        'source_value',
        'detected_name',
        'type',
        'context_similarity',
        'text',
        'meta_status',
    )
}

In [None]:
# Flatten the nested results: one row per detected entity, holding the document id, CUI,
# pretty name, source value, detected name, semantic type, context similarity,
# the document text and the 'Status' meta-annotation value.
for doc, doc_result in results.items():
    for entity in doc_result['entities'].values():
        row = {
            'docid': doc,
            'cui': entity['cui'],
            'pretty_name': entity['pretty_name'],
            'source_value': entity['source_value'],
            'detected_name': entity['detected_name'],
            # only the first element of the types list is kept in this pass
            'type': entity['types'][0],
            'context_similarity': entity['context_similarity'],
            'text': data.iloc[doc]['doctext'],
            'meta_status': entity['meta_anns']['Status']['value'],
        }
        for column, value in row.items():
            flat_data[column].append(value)

In [None]:
# Print the length of every column list; they must all match before building a dataframe
for column_values in flat_data.values():
    print(len(column_values))

In [None]:
# Turn the flattened dictionary into a dataframe: one column per key, one row per detected entity
df_flat = pd.DataFrame.from_dict(flat_data)

### save flattened annotated results

In [None]:
# Destination of the flattened results; its parent folder is created first if it is missing
filepath = Path(dir_root) / 'data' / 'interim' / 'Annotated_Public' / '2023_05_15_public_annotated_flattened_results.csv'
filepath.parent.mkdir(exist_ok=True, parents=True)
df_flat.to_csv(filepath, index=False)

# Inspect flattened annotation results

#### load data saved above for inspection if required

In [None]:
# Read the flattened annotation results back from the CSV written above
df_flat = pd.read_csv(f'{dir_root}/data/interim/Annotated_Public/2023_05_15_public_annotated_flattened_results.csv')

In [None]:
# Inspect the first 12 rows of the flattened results
df_flat.head(12)

In [None]:
# Summary of the columns of the flattened results
df_flat.info()

In [None]:
# Number of distinct values per column (the cui and type counts are compared with the dictionaries built below)
df_flat.nunique()

# Create dict of CUIs and their location

In [None]:
# Map each CUI to the documents in which it appears: {"cui": [<doc_id>, <doc_id>, ...], ..}
# A document is recorded once per CUI, however many times the concept is mentioned in it,
# so len(cui_location[cui]) is a document count (it is used as `ndocs` and for the
# percentage of documents further down).
cui_location = {}
for doc, doc_result in results.items():
    # dict.fromkeys keeps one entry per CUI while preserving first-seen order
    cuis_in_doc = dict.fromkeys(
        annotation['cui'] for annotation in doc_result['entities'].values()
    )
    for cui in cuis_in_doc:
        cui_location.setdefault(cui, []).append(doc)

In [None]:
# Check the number of keys; it should equal the number of unique CUIs in the flattened dataframe above
len(cui_location)

# Create dict of type ids and their location

In [None]:
# Map each type_id to the documents of every CUI that belongs to it.
# A CUI may map to more than one type_id (one-to-many), but this was checked before
# flattening the annotation results, so only the first type_id of each CUI is used.
# Several CUIs share the same type_id, so their document lists are accumulated rather
# than overwritten; a document is listed once for each CUI of that type found in it.
type_ids_location = {}
for cui, docs in cui_location.items():
    type_id = list(cat.cdb.cui2type_ids[cui])[0]
    type_ids_location.setdefault(type_id, []).extend(docs)

In [None]:
# Number of distinct type_ids found
len(type_ids_location) # this should correspond to unique values of type in flattened annotation results

## Create dict of CUIS and their context similarity

In [None]:
# Map each CUI to the context similarities of all its mentions:
# {"cui": [<context similarity>, <context similarity>, ...], ..}
cui_similarity = {}

for doc_result in results.values():
    for annotation in doc_result['entities'].values():
        similarities = cui_similarity.setdefault(annotation['cui'], [])
        similarities.append(annotation['context_similarity'])

In [None]:
# Number of CUIs with at least one recorded context similarity
len(cui_similarity) #this should be the same as unique numbers of CUIs

### save untrained CUI location, type ID location and CUI_context similarity

In [None]:
# Save the data so that we don't have to do the annotation again.
# Each file is written inside a context manager so its handle is closed (and the data flushed) right away.
for filename, payload in (
    ("cui_location.dat", cui_location),
    ("type_ids_location.dat", type_ids_location),
    ("cui_similarity.dat", cui_similarity),
):
    with open(DATA_DIR + filename, 'wb') as out_file:
        pickle.dump(payload, out_file)

# Load untrained annotated results from  public SNOMED model on Data, CUI location, type id location, context similarity and meta task of status

In [None]:
# Load the annotated results of the untrained public model; the context manager closes the file after loading.
# NOTE: pickle can execute arbitrary code while loading - only load files written by this notebook.
with open(DATA_DIR + '2023_05_15_public_annotated_results.dat','rb') as results_file:
    untrained_results = pickle.load(results_file)

In [None]:
# Display the loaded results to check they were read correctly
untrained_results

In [None]:
# Reload the CUI -> documents map; the context manager closes the file after loading
with open(DATA_DIR + 'cui_location.dat','rb') as in_file:
    cui_location = pickle.load(in_file)

In [None]:
# Reload the type_id -> documents map; the context manager closes the file after loading
with open(DATA_DIR + 'type_ids_location.dat','rb') as in_file:
    type_ids_location = pickle.load(in_file)

In [None]:
# Reload the CUI -> context similarities map; the context manager closes the file after loading
with open(DATA_DIR + 'cui_similarity.dat','rb') as in_file:
    cui_similarity = pickle.load(in_file)

# Visualise annotation frequency for untrained public model

## Create df (df_cui_ndocs) of CUI, locations, type ids, cui similarity for ease of visualisation

In [None]:
# First tuple is the header row; every following tuple pairs a CUI with the length of its location list
cui_ndocs = [('cui', 'ndocs')]
cui_ndocs.extend((cui, len(locations)) for cui, locations in cui_location.items())

In [None]:
# Build the dataframe from the data tuples, using the first tuple ('cui', 'ndocs') as column names
df_cui_ndocs = pd.DataFrame(cui_ndocs[1:], columns=cui_ndocs[0])

In [None]:
# Add the type ids of each CUI as a new column, looked up in the CDB.
# CUIs missing from cui2type_ids get the placeholder 'unk'.
# The column is built in one pass instead of writing cell by cell through positional indices.
df_cui_ndocs['type_ids'] = [
    cat.cdb.cui2type_ids.get(cui, 'unk') for cui in df_cui_ndocs['cui']
]

In [None]:
# Look up the semantic tag (type name) of each CUI from the first of its type ids
semantic_tag = []
for type_ids in df_cui_ndocs['type_ids']:
    if type_ids == 'unk':
        # Placeholder written when the CUI had no type ids in the CDB: indexing into the
        # string would yield the character 'u' and fail the lookup, so keep it as 'unk'
        semantic_tag.append('unk')
        continue
    key = tuple(type_ids)[0]
    semantic_tag.append(typeid2name[key])

In [None]:
# Attach the semantic tags collected above as a new column (same row order as df_cui_ndocs)
df_cui_ndocs['Semantic_tags'] = semantic_tag

In [None]:
# Add the preferred name of each CUI as a new column, looked up in the CDB.
# CUIs missing from cui2preferred_name get the placeholder 'unk'.
# The column is built in one pass instead of writing cell by cell through positional indices.
df_cui_ndocs['name'] = [
    cat.cdb.cui2preferred_name.get(cui, 'unk') for cui in df_cui_ndocs['cui']
]

In [None]:
# Add the percentage column: ndocs expressed as a percentage of the number of rows in `data`
total_docs = len(data)
df_cui_ndocs['perc_docs'] = (df_cui_ndocs['ndocs'] / total_docs) * 100

In [None]:
# Mean context similarity per CUI, computed in the row order of df_cui_ndocs
cons_similarity = [
    statistics.mean(cui_similarity[cui]) for cui in df_cui_ndocs['cui']
]

In [None]:
# Attach the mean context similarities computed above as a new column
df_cui_ndocs['mean_similarity'] = cons_similarity

In [None]:
# Sort the dataframe by ndocs in descending order, so the most frequent CUIs come first
df_cui_ndocs = df_cui_ndocs.sort_values('ndocs', ascending=False)

In [None]:
# Inspect the first 15 rows of the sorted dataframe
df_cui_ndocs.head(15)

## Save dataframe of CUIs, number of docs mentioning them, related semantic tags, concept name, and mean context similarity 

In [None]:
# Destination CSV for the per-CUI summary; this rebinds `filepath`, which earlier pointed at the flattened-results CSV
filepath = Path(dir_root + '/data/interim/Annotated_Public/2023_05_15_cui_docs.csv')  


In [None]:
# Create the parent folder if it is missing, then write the summary without the dataframe index
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_cui_ndocs.to_csv(filepath, index=False )

## Load cui and docs data again if required

In [None]:
# Reload the per-CUI summary from the CSV written above (requires `filepath` from the save cell).
# NOTE(review): after a CSV round-trip the type_ids column presumably holds text rather than the original containers - confirm before indexing into it
df_cui_ndocs = pd.read_csv(filepath)

In [None]:
# Inspect the first rows of the reloaded dataframe
df_cui_ndocs.head()

### plot count of unfiltered concepts extracted

In [None]:
# Horizontal bar chart of the first 30 rows of df_cui_ndocs
# (the frame was sorted by ndocs, descending, before it was saved)
sns.reset_defaults()
sns.set(
    style="whitegrid",
    palette='pastel',
    rc={'figure.figsize': (10, 15)},
)
f, ax = plt.subplots()
_data = df_cui_ndocs.head(30)
sns.barplot(data=_data, x="ndocs", y="name", label="Concept", color="b")
_ = ax.set(
    xlim=(0, 6000),
    ylabel="SNOMED-CT concept",
    xlabel="Count of documents with mention of concept following annotation \n of clinic letters with public SNOMED model",
)
plt.show()

###  plot count of top 30 SNOMED-CT concepts for type 9090192: Disorder

In [None]:
# Plot the top 30 concepts of type 9090192
sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(6,15)}, 
    style="whitegrid",
    palette='pastel'
)

f, ax = plt.subplots()
# Keep only the rows whose type_ids mention 9090192, then take the first 30 of them
# (.iloc[:30] selects exactly 30 rows, matching the unfiltered plot above)
_data = df_cui_ndocs[df_cui_ndocs['type_ids'].apply(lambda x: '9090192' in x)].iloc[:30]

sns.barplot(x="ndocs", y="name", data=_data, label="Concept", color="b")
_ = ax.set(xlim=(0, 2000), ylabel="SNOMED-CT concept", xlabel="Count of documents with mention of concept")
plt.show()

###  plot percentage of top 30 SNOMED-CT concepts for type 9090192: Disorder

In [None]:
# Bar plot of the percentage of documents mentioning each of the top 30 concepts of type 9090192
sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(6,15)}, 
    style="whitegrid",
    palette='pastel',
    )

f, ax = plt.subplots()
# Keep only the rows whose type_ids mention 9090192, then take the first 30 of them
# (.iloc[:30] selects exactly 30 rows, matching the "top 30" heading)
_data = df_cui_ndocs[df_cui_ndocs['type_ids'].apply(lambda x: '9090192' in x)].iloc[:30]

sns.barplot(x="perc_docs", y="name", data=_data, label="Concept Name", color="b")
_ = ax.set(xlim=(0, 1.5), ylabel="Concept Name", xlabel="Percentage of documents with mention of concept \n for Public model annotation ")
plt.show()

In [None]:
# Inspect the rows that were used for the last plot
_data