# Annotating documents using King's 1.2 finetuned model on LTH data

###     
    author: naa
    created: 2023-05-11
    version: 0.1.0

    
The steps in this notebook are similar to 01_aa_annotation.ipynb, with some changes due to differences in format (type-ids), additional information (concept filters applied, meta-annotation task) and missing information (semantic types) in the model.

In [None]:
from common import *

In [None]:
# Project root; every model and data path in this notebook is built from it.
# NOTE(review): absolute, environment-specific path - adjust when running elsewhere.
dir_root = "/home/jovyan/nhsx_nlp"

# Load models and prepare MedCAT

In [None]:
# Locations of the vocabulary and concept database inside the model pack.
# dir_root has no trailing slash, so each suffix must start with "/"
# (otherwise the result is ".../nhsx_nlpmodels/...", which does not exist).
vocab_path = dir_root + "/models/kcl_private_modelpack/vocab.dat"
cdb_path = dir_root + "/models/kcl_private_modelpack/cdb.dat"

## Baseline model (King's 1.2)

In [None]:
# Directory of the King's 1.2 model pack, kept as a Path so it can be globbed later.
MEDCAT_MODEL_PATH = Path(dir_root + "/models/kcl_private_modelpack")

# Only visible when the logging level is DEBUG.
logging.debug(f"Loading MedCAT models from {MEDCAT_MODEL_PATH}")

In [None]:
# Load the concept database (CDB) from disk.
# CDB.load is called on the class and its return value is what we keep, so
# there is no need to construct an empty CDB() first only to discard it.
cdb = CDB.load(cdb_path)

# Load the vocabulary in the same way.
vocab = Vocab.load(vocab_path)

## Setup config

In [None]:
# Apply the shared project configuration to the CDB through the helper from `common`.
# NOTE(review): len_cdb is presumably the number of concepts in the CDB - confirm in common.py.
cdb, len_cdb = common_setupConfig(cdb,MEDCAT_MODEL_PATH)
len_cdb

## Initialise meta annotator

In [None]:
# Collect every "meta_*" sub-directory of the model pack.
meta_paths = []
for candidate in MEDCAT_MODEL_PATH.glob("meta_*"):
    if candidate.is_dir():
        meta_paths.append(candidate)

# Load one MetaCAT model per directory, in the same order as meta_paths.
meta_cats = []
for meta_path in meta_paths:
    meta_cats.append(MetaCAT.load(save_dir_path=meta_path))

In [None]:
#check meta-annotation tasks
meta_cats

# Initialise CAT (main class from medcat used for concept annotation)

In [None]:
# Build the annotator from the CDB, the vocabulary and the meta-annotation models.
# NOTE(review): what `ts` holds is not visible from this notebook - check common_initialiseCAT.
cat,ts = common_initialiseCAT(cdb, vocab, meta_cats)

# Create typeid2name dictionary with key:value of typeid:semantic type name

In [None]:
# add SNOMED-CT semantic tag to understand type_ids
#this is a dictionary that contains the type ids and their names
typeid2name = cat.cdb.addl_info['type_id2name']

In [None]:
typeid2name.items()

# load data

In [None]:
# Read the raw neurology letters into a DataFrame and preview the first rows.
letters_csv = dir_root + "/data/raw/neurology_letters_2023_03_18.csv"
df = pd.read_csv(letters_csv)
df.head()

# test NER

In [None]:
text  = "He was diagnosed with a neurology issue"
doc = cat(text)
print(doc.ents)

In [None]:
type(doc)

In [None]:
displacy.render(doc, style='ent',jupyter=True)

In [None]:
cat.get_entities("He was diagnosed with a neurology issue")

# Define subset of data to be analysed if required

In [None]:
data= df[0:] # take all of data to start with

In [None]:
len(data)

# test multiprocessing annotator function

In [None]:
# Smoke-test the multiprocessing annotator with a single (docid, text) tuple
# before running it on the full data set.
in_data =[(1,"He was a neurology patient")]
results = cat.multiprocessing(in_data,nproc=2)
results

In [None]:

# in_data[0] is a (docid, text) tuple; the annotator must be given the text only.
displacy.render(cat(in_data[0][1]), style='ent',jupyter=True)

# create data iterator for multiprocessing

###  data has to be in the form of a list of tuples containing two elements each: docid and doctext

In [None]:
# Optional: drop very short documents before annotation.
# Note that len(str(x)) counts characters, so this keeps documents longer than 10 characters (not words).
#data = data[data.doctext.apply(lambda x: len(str(x))>10)]

In [None]:
# Pair each document id (the DataFrame index label) with its text, giving the
# [(docid, doctext), ...] list that the multiprocessing annotator consumes.
in_data = list(zip(data.index, data['doctext']))

# multiprocess based on number of characters in documents

In [None]:
# Batches are sized by total number of characters rather than by document count.
batch_size_chars = 500000 # define batch size before multiprocessing

# Annotate every (docid, text) tuple in in_data using 8 worker processes.
results = cat.multiprocessing(in_data, batch_size_chars=batch_size_chars, nproc=8) # try with small document number first if required

# Alternative: multiprocess based on number of documents (skip this section if the character-based step above has already been run)

In [None]:
# Set the batch size to the number of documents
batch_size = 100 # Batch size (BS) in number of documents

# Run model
# The output goes to `results_pipe`, so `results` from the character-based run
# above is left untouched by this cell.
if __name__ == '__main__':
    import torch
    # Force the 'spawn' start method for torch worker processes (force=True overrides any earlier setting).
    torch.multiprocessing.set_start_method('spawn', force=True)
    # Only the first 1000 documents are annotated here.
    results_pipe = cat.multiprocessing_pipe(in_data[:1000], # Formatted data
                                       batch_size = batch_size,
                                       nproc=2) # Increase it when having more cores available

## check result of multiprocessing with King's model on one document

In [None]:
# The keys of the annotated results are the docids passed in in_data, i.e. the
# index *labels* of `data`. Look the text up by label (.loc) rather than by
# position (.iloc) so that input text and extracted entities stay aligned even
# when `data` has been filtered and its index is no longer 0..n-1.
data.loc[3, 'doctext']

In [None]:
results[3].values()

In [None]:
len(results[3].values())

In [None]:
# Print the semantic type of every entity extracted from document 3.
for annotation in results[3]['entities'].values():
    # 'types' is a list and can be empty for CUIs without a semantic type;
    # indexing [0] unconditionally would raise IndexError, so print 'Nil' instead.
    entity_types = annotation['types']
    print(entity_types[0] if entity_types else 'Nil')
    print()

### save structured corpus of data annotated from  model: nhsx_nlp/models/kcl_private_modelpack

In [None]:
DATA_DIR = dir_root + '/data/interim/Annotated_Finetuned_Kings/'

In [None]:
# Persist the annotated results; the context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
with open(DATA_DIR+'2023_05_12_Kings_untrained_annotated_results.dat','wb') as results_file:
    pickle.dump(results, results_file)

# flatten nested dictionary of annotated results

## load untrained structured annotation corpus if required

In [None]:
# Reload the annotated results saved above, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
with open(DATA_DIR + '2023_05_12_Kings_untrained_annotated_results.dat','rb') as results_file:
    results = pickle.load(results_file)

In [None]:
len(results)

In [None]:
# Column-oriented accumulator for the flattened results: every column starts
# as its own empty list and gets one value appended per extracted entity.
flat_data = {
    column: []
    for column in (
        'docid',               # most superficial key in annotated results
        'cui',
        'pretty_name',
        'source_value',
        'detected_name',
        'type',
        'context_similarity',
        'text',                # pulled from doctext in raw data
        'Presence',            # Meta annotation of presence
        'Subject',             # Meta annotation of subject experiencing
        'Time',                # Meta annotation of temporality of entity
    )
}

In [None]:
# Flatten the nested annotation results into flat_data: one row per extracted
# entity with its CUI, names, context similarity, meta-annotations and source text.
# `results` is keyed by document id; each value holds an 'entities' dict keyed by entity id.
for doc, annotated_doc in results.items():

    for entity in annotated_doc['entities'].values():
        meta_anns = entity['meta_anns']

        flat_data['docid'].append(doc)
        flat_data['cui'].append(entity['cui'])
        flat_data['pretty_name'].append(entity['pretty_name'])
        flat_data['source_value'].append(entity['source_value'])
        flat_data['detected_name'].append(entity['detected_name'])

        flat_data['context_similarity'].append(entity['context_similarity'])
        flat_data['Presence'].append(meta_anns['Presence']['value'])
        flat_data['Subject'].append(meta_anns['Subject']['value'])
        flat_data['Time'].append(meta_anns['Time']['value'])

        # Document ids are the index labels of `data` (they came from iterrows
        # when in_data was built), so the text is looked up by label (.loc);
        # a positional .iloc lookup would pick the wrong row, or fail, once
        # `data` has been filtered.
        flat_data['text'].append(data.loc[doc, 'doctext'])

        # 'types' is a list and is empty for CUIs without a semantic type.
        # The whole list is kept in this pass so that CUIs mapping to more than
        # one semantic type can be detected; 'Nil' marks the empty case.
        flat_data['type'].append(entity['types'] if entity['types'] else 'Nil')

In [None]:
# Every column must hold the same number of values before the dict can be
# turned into a DataFrame, so print the length of each one for comparison.
for column_values in flat_data.values():
    print(len(column_values))

In [None]:
df_flat = pd.DataFrame.from_dict(flat_data)

In [None]:
df_flat.head()

In [None]:
df_flat.describe()

In [None]:
df_flat.info()

### check if CUIs map to more than 1 semantic type

In [None]:
# Number of semantic types attached to each entity, skipping the 'Nil' placeholders.
listtype = [len(typelist) for typelist in df_flat.type if typelist != 'Nil']

        

In [None]:
len(listtype)

In [None]:
set_type = set(listtype)

In [None]:
set_type # --> CUIs have a one-one relationship with semantic types. Therefore we can re-use the below code, and access the 0'th element in entity['types'] 

In [None]:
# Fresh column-oriented accumulator for the second flattening pass: every
# column starts as its own empty list.
flat_data = {
    column: []
    for column in (
        'docid',               # most superficial key in annotated results
        'cui',
        'pretty_name',
        'source_value',
        'detected_name',
        'type',
        'context_similarity',
        'text',                # pulled from doctext in raw data
        'Presence',            # Meta annotation of presence
        'Subject',             # Meta annotation of subject experiencing
        'Time',                # Meta annotation of temporality of entity
    )
}

In [None]:
# Flatten the nested annotation results into flat_data: one row per extracted
# entity with its CUI, names, single semantic type, context similarity,
# meta-annotations and source text.
# `results` is keyed by document id; each value holds an 'entities' dict keyed by entity id.
for doc, annotated_doc in results.items():

    for entity in annotated_doc['entities'].values():
        meta_anns = entity['meta_anns']

        flat_data['docid'].append(doc)
        flat_data['cui'].append(entity['cui'])
        flat_data['pretty_name'].append(entity['pretty_name'])
        flat_data['source_value'].append(entity['source_value'])
        flat_data['detected_name'].append(entity['detected_name'])

        flat_data['context_similarity'].append(entity['context_similarity'])
        flat_data['Presence'].append(meta_anns['Presence']['value'])
        flat_data['Subject'].append(meta_anns['Subject']['value'])
        flat_data['Time'].append(meta_anns['Time']['value'])

        # Document ids are the index labels of `data`, so the text is looked up
        # by label (.loc); a positional .iloc lookup would pick the wrong row,
        # or fail, once `data` has been filtered.
        flat_data['text'].append(data.loc[doc, 'doctext'])

        # 'types' is a list that is empty for CUIs without a semantic type, so
        # [0] is only taken when there is something to take; 'Nil' marks the rest.
        flat_data['type'].append(entity['types'][0] if entity['types'] else 'Nil')

In [None]:
# Every column must hold the same number of values before the dict can be
# turned into a DataFrame, so print the length of each one for comparison.
for column_values in flat_data.values():
    print(len(column_values))

In [None]:
df_flat = pd.DataFrame.from_dict(flat_data)

In [None]:
df_flat.head()

### Save flattened annotated results dataframe

In [None]:
# Destination of the flattened annotation table.
filepath = Path(dir_root + '/data/interim/Annotated_Finetuned_Kings/2023_05_12_Kings_untrained_flattened_annotated_results.csv')

# Make sure the target directory exists before writing.
output_dir = filepath.parent
output_dir.mkdir(parents=True, exist_ok=True)

# index=False avoids an extra unnamed column when the file is read back with pd.read_csv.
df_flat.to_csv(filepath, index=False)

#### Reload data saved above for inspection if required

In [None]:
flat_data_path = dir_root + '/data/interim/Annotated_Finetuned_Kings/2023_05_12_Kings_untrained_flattened_annotated_results.csv'

In [None]:
# CUIs are identifiers, not numbers: without an explicit dtype pandas infers
# int64 for the all-digit 'cui' column, and the reloaded values no longer match
# the string CUIs used as keys elsewhere in the notebook.
df_flat = pd.read_csv(flat_data_path, dtype={'cui': str})

In [None]:
df_flat.head()

In [None]:
df_flat.nunique()

In [None]:
df_flat['type'].unique() # so this confirms that there are CUIs that do not have semantic types, and we have coded that as 'Nil'

In [None]:
df_flat.info()

In [None]:
df_flat.describe()

In [None]:
# which CUIs don't have type ids?
no_type = df_flat.query("type == 'Nil'")

In [None]:
no_type

In [None]:
no_type.nunique() # there are 7 CUIS that do not have semantic types

In [None]:
no_type.cui.unique()

In [None]:
no_type.pretty_name.unique()

## create dict of CUIs and their location

In [None]:
# Map each CUI to the list of documents in which it appears:
# {"cui": [<doc_id>, <doc_id>, ...], ...}
# Each document is recorded at most once per CUI. Appending once per *mention*
# would make len(cui_location[cui]) a mention count rather than a document count.
cui_location = {}
for doc, annotated_doc in results.items():
    # dict.fromkeys removes repeated CUIs within the document while keeping first-seen order.
    cuis_in_doc = dict.fromkeys(
        annotation['cui'] for annotation in annotated_doc['entities'].values()
    )
    for cui in cuis_in_doc:
        cui_location.setdefault(cui, []).append(doc)

In [None]:
#check that number length of dictionary corresponds to total number of concepts extracted
len(cui_location)

## Create dict of type ids and their location

In [None]:
#test CUI conversion to type ID
cat.cdb.cui2type_ids['3457005']

In [None]:
# Map each type id to the documents that contain at least one CUI of that type.
# Many CUIs share a type id, so the document lists are accumulated; plain
# assignment would keep only the documents of the last CUI seen for each type.
type_ids_location = {}

for cui, docs in cui_location.items():
    cui_type_ids = cat.cdb.cui2type_ids.get(cui)
    # CUIs with no (or an empty) type-id set are grouped under 'No type'.
    type_id = list(cui_type_ids)[0] if cui_type_ids else 'No type'
    type_ids_location.setdefault(type_id, []).extend(docs)

# A document can contain several CUIs of the same type; keep each document
# once per type, preserving first-seen order.
type_ids_location = {
    type_id: list(dict.fromkeys(docs))
    for type_id, docs in type_ids_location.items()
}

In [None]:
#check length of this type id location dictionary is the same as unique values of types
len(type_ids_location)

## Create dict of CUIS and their context similarity

In [None]:
# Map each CUI to the context similarities of all of its mentions:
# {"cui": [<context similarity>, <context similarity>, ...], ...}
cui_similarity = {}

for annotated_doc in results.values():
    for annotation in annotated_doc['entities'].values():
        similarities = cui_similarity.setdefault(annotation['cui'], [])
        similarities.append(annotation['context_similarity'])

In [None]:
# cui_similarity has one key per CUI, so its length should equal the number of unique CUIs in the flat dataframe
len(cui_similarity)

### Save untrained CUI location, type ID location and CUI_context similarity

In [None]:
# Save the derived lookups so that the annotation does not have to be repeated.
# Each file is opened in a context manager so it is flushed and closed promptly.
for filename, payload in (
    ("cui_location.dat", cui_location),
    ("type_ids_location.dat", type_ids_location),
    ("cui_similarity.dat", cui_similarity),
):
    with open(DATA_DIR + filename, 'wb') as out_file:
        pickle.dump(payload, out_file)

# Load untrained annotated results from  King's finetuned SNOMED model on Data, CUI location, type id location and context similarity

In [None]:
DATA_DIR = dir_root + '/data/interim/Annotated_Finetuned_Kings/'


In [None]:
# Load the structured annotation dataset, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
with open(DATA_DIR + '2023_05_12_Kings_untrained_annotated_results.dat','rb') as results_file:
    untrained_results = pickle.load(results_file)

In [None]:
# Check the loaded data by its size rather than echoing the whole corpus:
# the full structure is very large and embeds clinical text in the cell output.
len(untrained_results)

In [None]:
# Reload the CUI -> documents map; the context manager closes the file handle.
with open(DATA_DIR + 'cui_location.dat','rb') as in_file:
    cui_location = pickle.load(in_file)

In [None]:
# Reload the type id -> documents map; the context manager closes the file handle.
with open(DATA_DIR + 'type_ids_location.dat','rb') as in_file:
    type_ids_location = pickle.load(in_file)

In [None]:
# Reload the CUI -> context similarities map; the context manager closes the file handle.
with open(DATA_DIR + 'cui_similarity.dat','rb') as in_file:
    cui_similarity = pickle.load(in_file)

## Visualise annotation frequency for finetuned KCH model we have not trained for LTH's data

### create df of CUI, number of documents containing it, type ids, cui similarity

In [None]:
# Header row followed by one (cui, number of entries in its location list) row per CUI.
cui_ndocs = [('cui', 'ndocs')] + [
    (cui, len(doc_ids)) for cui, doc_ids in cui_location.items()
]

In [None]:
df_cui_ndocs = pd.DataFrame(cui_ndocs[1:], columns=cui_ndocs[0])

In [None]:
df_cui_ndocs.head() # ndocs == number of documents containing that CUI

In [None]:
# test the cui2type_ids lookup with a CUI that has no type ids
cat.cdb.cui2type_ids['34281000175105'] # --> the output is 'set()': an empty set, i.e. the CUI is present but has no type ids (not an error)

In [None]:
cat.cdb.cui2type_ids.get('34281000175105', 'unk')

In [None]:
#test cat.cdb.cui2type_ids.get with CUIs that do have type ids
cat.cdb.cui2type_ids['3457005'] # --> output is a set

In [None]:
# Add one type id per CUI as a new 'type_ids' column.
# CUIs with an empty type-id set, and CUIs missing from cui2type_ids
# altogether, both get the placeholder '00000'. A truthy string default such as
# 'unk' must not be used in the lookup: tuple('unk')[0] would silently produce
# the bogus type id 'u'.
resolved_type_ids = []
for cui in df_cui_ndocs['cui']:
    cui_type_ids = cat.cdb.cui2type_ids.get(cui)
    if cui_type_ids:
        # The type ids are stored as a set; take a single id as a plain string
        # so the column is easy to filter on.
        resolved_type_ids.append(tuple(cui_type_ids)[0])
    else:
        resolved_type_ids.append('00000')

df_cui_ndocs['type_ids'] = resolved_type_ids

In [None]:

# Translate each type id into its semantic tag name.
# The placeholder '00000' (no type id) becomes 'Nil'. The loop variable is
# deliberately not called `type`: at notebook scope that would shadow the
# builtin and break any later call such as type(doc).
semantic_tag = [
    'Nil' if type_id == '00000' else typeid2name[type_id]
    for type_id in df_cui_ndocs['type_ids']
]


In [None]:
df_cui_ndocs['Semantic_tags'] = semantic_tag #make the semantic tag list into a column

In [None]:
# inspect the rows whose semantic tag is 'disorder'
df_cui_ndocs.loc[df_cui_ndocs['Semantic_tags'] == 'disorder']

In [None]:
# Add the preferred name of each CUI as a 'name' column; CUIs that have no
# entry in cui2preferred_name get 'unk'.
df_cui_ndocs['name'] = [
    cat.cdb.cui2preferred_name.get(cui, 'unk') for cui in df_cui_ndocs['cui']
]

In [None]:
# Express ndocs as a percentage of all documents in `data`.
total_docs = len(data)
df_cui_ndocs['perc_docs'] = df_cui_ndocs['ndocs'].div(total_docs).mul(100)

In [None]:
# Mean context similarity over all mentions of each CUI, in the row order of df_cui_ndocs.
cons_similarity = [
    statistics.mean(cui_similarity[cui]) for cui in df_cui_ndocs['cui']
]

In [None]:
df_cui_ndocs['mean_similarity'] = cons_similarity

In [None]:
df_cui_ndocs.head()

In [None]:
df_cui_ndocs = df_cui_ndocs.sort_values('ndocs', ascending=False)

In [None]:
df_cui_ndocs.head(15)

### Save dataframe of CUIs, number of docs mentioning them, related semantic tags, concept name, and mean context similarity 

In [None]:
filepath = Path(dir_root + '/data/interim/Annotated_Finetuned_Kings/2023_05_15_cui_docs.csv')  


In [None]:
filepath.parent.mkdir(parents=True, exist_ok=True)  
df_cui_ndocs.to_csv(filepath, index=False)

### Plot unfiltered concepts extracted

In [None]:
# Load the file again if required, as per filepath above.
# 'cui' and 'type_ids' are identifiers: read them as strings so that all-digit
# values are not converted to integers (which would drop the leading zeros of
# '00000' and break the string membership tests used for plotting below).
df_cui_ndocs = pd.read_csv(filepath, dtype={'cui': str, 'type_ids': str})

In [None]:
df_cui_ndocs

In [None]:
# Plot the top 30 concepts
# Takes the first 30 rows of df_cui_ndocs as they are ordered at this point.
sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(5,15)}, 
    style="whitegrid",
    palette='pastel'
)
f, ax = plt.subplots()
_data = df_cui_ndocs.iloc[0:30]
sns.barplot(x="ndocs", y="name", data=_data, label="Concept", color="b")
# NOTE(review): the x-axis limit is hard-coded to 6000 - adjust if the corpus size changes.
_ = ax.set(xlim=(0, 6000), ylabel="SNOMED-CT concept", xlabel="Count of documents with mention of concept")
plt.show()

###  Plot top 30 SNOMED-CT concepts for type T-11: Disorder

In [None]:
# Plot the top 30 concepts of type T-11 (Disorder)
sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(5,15)},
    style="whitegrid",
    palette='pastel'
)

f, ax = plt.subplots()
# Subset the data to T-11 (Disorder) and keep the first 30 rows in the current order.
# .iloc[:30] yields exactly 30 rows; the previous [:31] slice plotted 31.
_data = df_cui_ndocs[df_cui_ndocs['type_ids'].apply(lambda x: 'T-11' in x)].iloc[:30]

# Draw on the axes created above explicitly rather than relying on the current-axes state.
sns.barplot(x="ndocs", y="name", data=_data, label="Concept", color="b", ax=ax)
_ = ax.set(xlim=(0, 1200), ylabel="SNOMED-CT concept", xlabel="Count of documents with mention of concept")
plt.show()

In [None]:
# Bar plot of the percentage of documents mentioning each of the top 30 T-11 (Disorder) concepts
sns.reset_defaults()
sns.set(
    rc={'figure.figsize':(6,15)},
    style="whitegrid",
    palette='pastel',
    )

f, ax = plt.subplots()
# Subset the data to T-11 (Disorder) and keep the first 30 rows in the current order.
# .iloc[:30] yields exactly 30 rows; the previous [:31] slice plotted 31.
_data = df_cui_ndocs[df_cui_ndocs['type_ids'].apply(lambda x: 'T-11' in x)].iloc[:30]

# Draw on the axes created above explicitly rather than relying on the current-axes state.
sns.barplot(x="perc_docs", y="name", data=_data, label="Concept Name", color="b", ax=ax)
_ = ax.set(xlim=(0, 1.5), ylabel="Concept Name", xlabel="Percentage of documents with mention of concept \n for King's 1.2 model annotation ")
plt.show()

In [None]:
_data