In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import math
import os
from numpy.linalg import norm

drive = os.listdir()
drive

['.config', 'gdrive', 'sample_data']

In [None]:
df=pd.read_csv('/content/gdrive/MyDrive/ImageCLEF2023/Imageclef2023_all_in_one.csv', sep='\t', encoding='latin')

In [None]:
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

Unnamed: 0,ID,cuis,caption
0,ImageCLEFmedical_Caption_2023_train_000001,C0040405,Head CT demonstrating left parotiditis.
1,ImageCLEFmedical_Caption_2023_train_000002,C0041618,Acquired renal cysts in end-stage renal failur...
2,ImageCLEFmedical_Caption_2023_train_000003,C0040405;C0817096;C0205271,Computed tomography of the chest showing the r...
3,ImageCLEFmedical_Caption_2023_train_000004,C0002978;C0036033;C0262950;C0225317,Lateral view of the sacrum showing the low con...
4,ImageCLEFmedical_Caption_2023_train_000005,C0040405;C0817096;C0497156,Thoracic CT scan showing perihilar pulmonary l...


In [None]:
concepts_mapper = pd.read_csv('/content/gdrive/MyDrive/imageclef23/ImageCLEFmedical_Caption_2023_cui_mapping.csv', sep="\t", header=None, names=['cui', 'concept'])

# Build a mapper
_concepts_dict = {}
for row in concepts_mapper['concept']:
    mapper = concepts_mapper.loc[concepts_mapper['concept'] == row].values.flatten().tolist()
    _concepts_dict[mapper[0]] = mapper[1]

In [None]:
# Create new column and fill it with nan values
df['concepts'] = np.nan

# Iterate through the dataframe and fill the column with either the real-world medical concept (if available), or the CUI.
for i, cuis in enumerate(df['cuis']):
    tags = []
    for tag in cuis.split(';'):
        if tag in _concepts_dict.keys():
            tags.append(_concepts_dict[tag])
        else:
            tags.append(tag)
    df['concepts'][i] = ';'.join(tags)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['concepts'][i] = ';'.join(tags)


In [None]:
tags_dict = dict()
for concepts in df['concepts']:
  tags = concepts.split(';')
  for t in tags:
    if t not in tags_dict.keys():
      tags_dict[t] = 0

In [None]:
import pickle

# Load fasttext embeddings
fasttext_embed = np.load("/content/gdrive/My Drive/fasttext.npy")
fasttext_word_to_index = pickle.load(open("/content/gdrive/My Drive/fasttext_voc.pkl", 'rb'))


In [None]:
# save concepts and captions into list
df_concepts, df_captions = list(df['concepts']), list(df['caption'])
for i, item in enumerate(df_concepts):
  df_concepts[i] = item.split(';')
df_concepts, df_captions

([['X-Ray Computed Tomography'],
  ['Ultrasonography'],
  ['X-Ray Computed Tomography', 'Chest', 'Irregular'],
  ['angiogram', 'Sacral Region', 'Skeletal bone', 'soft tissue'],
  ['X-Ray Computed Tomography', 'Chest', 'Lymphadenopathy'],
  ['X-Ray Computed Tomography'],
  ['X-Ray Computed Tomography', 'Pelvis', 'Acute abscess'],
  ['X-Ray Computed Tomography',
   'Ventriculoperitoneal catheter',
   'RIght lateral ventricle structure'],
  ['X-Ray Computed Tomography',
   'Left lateral ventricle structure',
   'Structure of parenchyma of lung',
   'Hemorrhage',
   'Right frontal lobe structure',
   'Ventricular hemorrhage'],
  ['X-Ray Computed Tomography'],
  ['X-Ray Computed Tomography', 'Skeletal bone'],
  ['Magnetic Resonance Imaging', 'Cerebellar hemisphere structure'],
  ['Magnetic Resonance Imaging', 'Nodule', 'Spinal Cord', 'Cauda Equina'],
  ['Magnetic Resonance Imaging', 'Fluid behavior'],
  ['X-Ray Computed Tomography'],
  ['X-Ray Computed Tomography', 'Bronchi'],
  ['Ultrasono

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm',disable=["tagger", "parser","ner"])
nlp.add_pipe('sentencizer')

# Define function that tokenizes the given structure using the SpaCy library
def tokenize_samples(samples):

  tokenized_samples = []
  for i in range(len(samples)):
    doc = nlp(samples[i])  # Tokenize the sample into sentences
    tokens = []
    for sent in doc.sents:
      for tok in sent:  # Iterate through the words of the sentence
        if '\n' in tok.text or "\t" in tok.text or "--" in tok.text or "*" in tok.text or tok.text.lower() in STOP_WORDS:
          continue
        if tok.text.strip():
          tokens.append(tok.text.replace('"',"'").strip())
    tokenized_samples.append(tokens)

  return tokenized_samples

# Tokenize the captions
df_captions_tokenized = tokenize_samples(df_captions)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

MAX_WORDS = 50000
MAX_SEQUENCE_LENGTH = 150
EMBEDDING_DIM = fasttext_embed.shape[1]

# Init tokenizer
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='__UNK__')
# num_words: the maximum number of words to keep, based on word frequency.
# oov_token: will be used to replace OOV WORDS

# Fit tokenizer (Updates internal vocabulary based on a list of texts.)
tokenizer.fit_on_texts([" ".join(x) for x in df_captions_tokenized])

# Converts text to sequences of IDs
train_seqs = tokenizer.texts_to_sequences([" ".join(x) for x in df_captions_tokenized])

# Pad the training sequences
train_data = pad_sequences(train_seqs, maxlen=MAX_SEQUENCE_LENGTH, padding='post')


#--------------------------------------------------------------------------------------------

# Init tokenizer
tokenizer2 = Tokenizer(num_words=len(df_concepts), oov_token='__UNK__')
# num_words: the maximum number of words to keep, based on word frequency.
# oov_token: will be used to replace OOV WORDS

# Fit tokenizer (Updates internal vocabulary based on a list of texts.)
tokenizer2.fit_on_texts([" ".join(x) for x in df_concepts])

# Converts text to sequences of IDs
train_seqs2 = tokenizer2.texts_to_sequences([" ".join(x) for x in df_concepts])

# Pad the training sequences
train_data2 = pad_sequences(train_seqs2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')


In [None]:
# Save the word index from TensorFlow's tokenizer
word_index = tokenizer.word_index
word_index2 = tokenizer2.word_index

print('Found {} unique tokens.\n'.format(len(word_index)))
print('Found {} unique tokens2.\n'.format(len(word_index2)))

word_index.update(word_index2)
print('Found {} unique tokens2.\n'.format(len(word_index)))
print(word_index)

import pickle

# the hist_train pkl file will essentially contain the train histograms for each tag
# it is calculated as: for each caption that comprises the tag, retrieve the max cosine similarity between the tag and the caption's words!
# save dictionary to pickle file
with open('word_index.pkl', 'wb') as file:
    pickle.dump(word_index, file, protocol=pickle.HIGHEST_PROTOCOL)

Found 35425 unique tokens.

Found 1607 unique tokens2.

Found 35480 unique tokens2.

{'__UNK__': 1, 'showing': 2, 'right': 18, 'left': 15, 'ct': 23, 'arrow': 6, 'image': 7, 'scan': 141, 'chest': 10, 'tomography': 6, 'computed': 7, 'shows': 12, 'view': 13, 'mass': 392, 'contrast': 19, 'x': 2, 'axial': 649, 'mri': 155, 'artery': 20, 'ray': 3, 'radiograph': 722, 'arrows': 22, 'patient': 1373, 'lesion': 354, 'demonstrating': 25, 'white': 229, 'abdomen': 22, 'anterior': 13, 'abdominal': 94, 'coronal': 559, '2': 548, '1': 32, 'bilateral': 179, 'lateral': 191, 'lung': 27, 'imaging': 9, 'posterior': 17, 'red': 38, 'year': 39, 'old': 40, 'showed': 41, 'large': 382, 'weighted': 469, 'pulmonary': 37, 'sagittal': 39, 'cm': 46, 'wall': 93, 'lower': 28, 'enhanced': 1000, 'resonance': 12, 'lobe': 29, 'magnetic': 11, 'ultrasound': 172, 't2': 54, '3': 1245, 'upper': 44, 'small': 128, 'line': 965, 'seen': 59, 'bone': 16, 'pelvis': 25, 'vein': 55, 'multiple': 452, 'level': 299, 'ventricle': 71, 'tumor': 

In [None]:
# Previously, we saved all fasttext embeddings to a NumPy array for quick access
# Now, we save the fasttext embeddings for the words that we only need, based on the tokenizer

embedding_matrix = np.zeros((len(word_index)+2, EMBEDDING_DIM))  # +2 (pad, unkown)

for word, i in word_index.items():
    if i > len(word_index):
            continue
    try:
        embedding_vector = fasttext_embed[fasttext_word_to_index[word],:]
        embedding_matrix[i] = embedding_vector
    except:
        embedding_matrix[i] = np.ones(300)
print('Size of Embedding Matrix:', len(embedding_matrix))
print('Embedding Matrix:', embedding_matrix)

np.save('embedding_matrix.npy', embedding_matrix)

Size of Embedding Matrix: 35482
Embedding Matrix: [[ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.0789 -0.1382  0.3439 ...  0.0467 -0.3636 -0.1238]
 ...
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]]


In [None]:
# Define function that calculates the given text's word embeddings centroid.
def text_centroid(text, model, word_index):
    """ Calculate centroid function """
    text_vec =[]
    counter = 0
    text = text.split(" ")
    for word in text:
      """if word in STOP_WORDS:
        continue"""
      try:
        if (counter == 0):
          text_vec = model[word_index[word.lower()]]
        else:
          text_vec = np.add(text_vec, model[word_index[word.lower()]])
        counter+=1
      except:
        pass

    return np.asarray(text_vec) / counter

In [None]:
# Define function that calculates the word embeddings of each item in the given list
def get_concept_word_embeddings(_concepts:list, dims):

  concepts_embeddings = list()
  if dims == 2:
    for i, clist in enumerate(_concepts):
      concepts_embeddings.append([])
      for c in clist:
        c = c.replace('-', ' ')
        c = c.replace('.', ' ')
        c = c.replace(':', ' ')
        c = c.replace('[', ' ')
        c = c.replace(']', ' ')
        c = c.replace('(', ' ')
        c = c.replace(')', ' ')
        c = c.replace('=', ' ')
        c = c.replace('/', ' ')


        if ((len(c.split(' ')) == 1)):
          # if tag is only one word --> word_embedding(tag)
          if c.lower() in word_index:
            print(embedding_matrix[word_index[c.lower()]])
            concepts_embeddings[i].append(embedding_matrix[word_index[c.lower()]])
          else:
            concepts_embeddings[i].append(np.ones(300))
        else:
          # else if tag is more than one word --> centroid of words embeddings of each tag subword
          concepts_embeddings[i].append(text_centroid(c, embedding_matrix, word_index))
  elif dims == 1:

     for i, c in enumerate(_concepts):

        c = c.replace('-', ' ')
        c = c.replace('.', ' ')
        c = c.replace(':', ' ')
        c = c.replace('[', ' ')
        c = c.replace(']', ' ')
        c = c.replace('(', ' ')
        c = c.replace(')', ' ')
        c = c.replace('=', ' ')
        c = c.replace('/', ' ')

        if ((len(c.split(' ')) == 1)):
          # if tag is only one word --> word_embedding(tag)
          if c.lower() in word_index:
            #print(embedding_matrix[word_index[c.lower()]])
            concepts_embeddings.append(embedding_matrix[word_index[c.lower()]])
          else:
            concepts_embeddings.append(np.zeros(300))
        else:
          # else if tag is more than one word --> centroid of words embeddings of each tag subword
          concepts_embeddings.append(text_centroid(c, embedding_matrix, word_index))

  return concepts_embeddings

In [None]:
def fine_captions(_captions:list):
  to_delete = ['(', ')', '.', ',', ]
  for i, caption in enumerate(_captions):
    for word in caption:
      if word in to_delete:
        _captions[i].remove(word)

  return _captions

In [None]:
def get_captions_word_embeddings(_captions:list, dims):

  captions_embeddings = list()

  if dims == 2:
    for i, clist in enumerate(_captions):
      captions_embeddings.append([])
      for c in clist.split(' '):
        c = c.replace('-', ' ')
        c = c.replace('.', ' ')
        c = c.replace(':', ' ')
        c = c.replace('[', ' ')
        c = c.replace(']', ' ')
        c = c.replace('(', ' ')
        c = c.replace(')', ' ')
        c = c.replace('=', ' ')
        c = c.replace('/', ' ')


        if ((len(c.split(' ')) == 1)):
          if c.lower() in word_index:
            captions_embeddings[i].append(embedding_matrix[word_index[c.lower()]])
          else:
            captions_embeddings[i].append(np.ones(300))
        elif ((len(c.split()) > 1) and (len(text_centroid(c, embedding_matrix, word_index)) > 0)):
          captions_embeddings[i].append(text_centroid(c, embedding_matrix, word_index))
        else:
          captions_embeddings[i].append(np.ones(300))

  elif dims == 1:
    for i, c in enumerate(_captions):
      c = c.replace('-', ' ')
      c = c.replace('.', ' ')
      c = c.replace(':', ' ')
      c = c.replace('[', ' ')
      c = c.replace(']', ' ')
      c = c.replace('(', ' ')
      c = c.replace(')', ' ')
      c = c.replace('=', ' ')
      c = c.replace('/', ' ')

      if ((len(c.split(' ')) == 1)):
        captions_embeddings.append(embedding_matrix[word_index[c.lower()]])
      elif ((len(c.split()) > 1) and (len(text_centroid(c, embedding_matrix, word_index)) > 0)):
        captions_embeddings.append(text_centroid(c, embedding_matrix, word_index))
      else:
        captions_embeddings[i].append(np.ones(300))


  return captions_embeddings

In [None]:
# compute cosine similarity
def cosine_sim(A, B):
  cosine = np.dot(A,B)/(norm(A)*norm(B))
  return cosine

In [None]:
# function that computes the cosine similarity betwen each tag and each caption word.
def compute_similarities(concepts_embeds:list, captions_embeds:list):

  similarities = list()
  for i, tags_i in enumerate(concepts_embeds):
    similarities.append([])
    for j in range(len(tags_i)):
      similarities[i].append([])
      for k in range(len(captions_embeds[i])):
        similarities[i][j].append(cosine_sim(concepts_embeds[i][j], captions_embeds[i][k]))
    return similarities

In [None]:
# ALT - function that computes the cosine similarity betwen each tag and each caption word.
def compute_sims(concepts_embeds:list, captions_embeds:list):

  similarities = list()
  for i, tags_i in enumerate(concepts_embeds):
    similarities.append([])
    for k in range(len(captions_embeds)):
      similarities[i].append(cosine_sim(concepts_embeds[i], captions_embeds[k]))
  return similarities

In [None]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [None]:
# Initial approach:
#    1. Iterate through all the captions.
#    2. For each caption, retrieve the tags.
#    3. Compute the cosine similarity between each tag and each word caption.
#    4. Add the max cosine similarity between the tag and the caption's words in the resepctiv dictionary position (the tag is the key).

# First, initialize the dictionary.

targets = set(flatten(df_captions))
print(targets)
print('Targets length:', len(targets)) # must be 2125 tags

max_cosines_dict = dict()
for target in targets:
  max_cosines_dict[target] = 0 # initialize with zero


{'Í', '\x99', '\x88', 's', 'b', 'D', '\x9a', '¥', '±', '1', '\xa0', 'L', '³', '°', '®', '\xad', '+', 'S', '\x97', '^', 'C', '#', 'H', '\x98', '.', 'l', 'º', '\x86', '\\', 'x', '6', 'ï', 'i', 'Å', 'I', '`', 'o', 'v', 'G', 'Ä', '´', '»', 'z', '½', ',', '\x8a', '0', '¤', '¸', 'm', '¢', 'h', 'ª', 'F', 'Ï', '\x94', 'U', '8', 'Ì', '\x9e', '\x8b', '|', '\x91', 'M', '\x92', '\x95', '!', '"', '}', '/', '\x9f', 'y', '\x9b', 'B', '\x8c', '@', '{', 'N', 'k', '¶', '¿', 'A', '\x90', '\x8e', '²', '%', 'j', '*', '\x83', '~', '\x9c', '¡', '\x84', 'T', '\x96', 'Î', ']', 'X', '£', 'p', 'P', '&', 'Z', '3', 'Y', 'q', '[', 'Q', ')', '¨', 'V', 't', 'd', '©', ':', '\x85', 'Â', '2', '(', 'E', '?', 'R', '5', 'â', '-', '>', '$', 'O', ';', '4', 'n', 'W', '¦', '¬', '7', '\x81', "'", 'µ', '\x82', '·', '¯', 'g', '\x89', '\x93', 'c', 'K', 'Ã', '9', 'e', '¼', 'r', '<', 'a', '=', 'Ð', '_', 'u', 'f', 'w', 'J', 'á', '¹', '«', '§', ' ', '\x87', '¾', '\x80', '\x9d'}
Targets length: 169


In [None]:
# Start iterating through all the captions.
respective_tags = list()
concepts_embeddings = list()
captions_embeddings = get_captions_word_embeddings(df_captions, dims=2)
for i, caption in enumerate(df_captions): #df_captions is a list of length 71355, where each list item is one caption.
  tags = df_concepts[i]
  tags_embeddings = get_concept_word_embeddings(tags, dims=1)
  concepts_embeddings.append(tags_embeddings)
  respective_tags.append(tags)


In [None]:
tags_dict = dict()
for concepts in df['concepts']:
  tags = concepts.split(';')
  for t in tags:
    if t not in tags_dict.keys():
      tags_dict[t] = list()

print(tags_dict)

{'X-Ray Computed Tomography': [], 'Ultrasonography': [], 'Chest': [], 'Irregular': [], 'angiogram': [], 'Sacral Region': [], 'Skeletal bone': [], 'soft tissue': [], 'Lymphadenopathy': [], 'Pelvis': [], 'Acute abscess': [], 'Ventriculoperitoneal catheter': [], 'RIght lateral ventricle structure': [], 'Left lateral ventricle structure': [], 'Structure of parenchyma of lung': [], 'Hemorrhage': [], 'Right frontal lobe structure': [], 'Ventricular hemorrhage': [], 'Magnetic Resonance Imaging': [], 'Cerebellar hemisphere structure': [], 'Nodule': [], 'Spinal Cord': [], 'Cauda Equina': [], 'Fluid behavior': [], 'Bronchi': [], 'Bone structure of cranium': [], 'Atrial Septal Defects': [], 'Ectopic kidney': [], 'Kidney': [], 'Calculi': [], 'Obstructed': [], 'Right kidney': [], 'Left kidney': [], 'Both kidneys': [], 'Heart': [], 'Arteriovenous fistula': [], 'Anterior descending branch of left coronary artery': [], 'Coronary artery': [], 'Pulmonary artery structure': [], 'Entire right sinus of Val

In [None]:
import math
from tqdm import tqdm

# iterate through the dataset captions
for i in tqdm(range(len(captions_embeddings))):
  # for each caption compute the cosine similarity between each tag and each caption word
  # ie. if #tags = 2 and len(caption)=10, then a matrix of size (2, 10) is returned
  sims = compute_sims(concepts_embeddings[i], captions_embeddings[i])

  # iterate through the sims vector
  for k, rt in enumerate(sims):
    sims[k] = [x for x in sims[k] if (math.isnan(x))==False]
    if len(sims[k]) > 0:
      tags_dict[respective_tags[i][k]].append(np.max(sims[k]))
    else:
      print('Empty sims list!')

100%|██████████| 71355/71355 [00:59<00:00, 1206.88it/s]


In [None]:
import pickle

# the hist_train pkl file will essentially contain the train histograms for each tag
# it is calculated as: for each caption that comprises the tag, retrieve the max cosine similarity between the tag and the caption's words!
# save dictionary to pickle file
with open('hist_train.pkl', 'wb') as file:
    pickle.dump(tags_dict, file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
to_plot = list()
for i in range(1000):
  to_plot.append( tags_dict[list(tags_dict.keys())[i]] )

In [None]:
import statistics
tags_dict2 = dict()
for k in tags_dict.keys():
  counter = 0
  if len(tags_dict[k]) > 0:
    tags_dict2[k] = [statistics.median(tags_dict[k]), statistics.stdev(tags_dict[k]), len(tags_dict[k])]
  else:
    counter += 1

print('Finished loop!', counter, 'lists with zero-size.')

Finished loop! 0 lists with zero-size.


In [None]:
sorted_dict2 = {k: v for k, v in sorted(tags_dict2.items(), key=lambda item: item[1][0], reverse=False)}

In [None]:
import pickle

# save dictionary to pickle file
with open('median_max_cos_c.pkl', 'wb') as file:
    pickle.dump(sorted_dict2, file, protocol=pickle.HIGHEST_PROTOCOL)