In [34]:
# Use a pipeline as a high-level helper
import torch
from transformers import pipeline

# NOTE(review): this pipeline loads the "...-uncased-abstract" checkpoint, while
# the tokenizer/model loaded directly in the next cell use
# "...-uncased-abstract-fulltext". `pipe` is not referenced again in the visible
# cells — confirm it is still needed.
pipe = pipeline("fill-mask", model="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

# The tokenizer and the masked-LM weights are fetched from the same checkpoint,
# so the identifier is spelled out once.
checkpoint = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import json
# Load the caption data from the JSON file; later cells use `pubmed` as a dict
# and look captions up by id.
# The encoding is given explicitly so the read does not depend on the platform's
# locale default (which is not UTF-8 on every system, e.g. Windows).
with open('./pubmed_captions.json', 'r', encoding='utf-8') as f:
    pubmed = json.load(f)

In [None]:
# Embed the free-text query: run it through the model and keep, from the final
# hidden layer, the vector at sequence position 0 of each batch row
# (presumably the [CLS] slot for this BERT tokenizer).
text = "basophilic"
inputs = tokenizer(text, return_tensors="pt")

# Inference only: run the forward pass without autograd tracking so no graph is
# kept alive through `outputs` / `query_embedding`.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
query_embedding = outputs.hidden_states[-1][:, 0, :]

In [19]:
# Test basophilic: embed a caption that mentions the query term.
basophilic_caption = pubmed.get("29e5289d-121b-4c14-bf9f-2d9ab26a2eba")
print(basophilic_caption)

inputs = tokenizer(basophilic_caption, return_tensors="pt")
# Inference only, so the forward pass runs without autograd tracking.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
basophilic_embedding = outputs.hidden_states[-1][:, 0, :]

# Non basophilic: embed a second caption to compare against.
pneumonia_caption = pubmed.get("cc262f9a-d835-4660-befb-248ee3e2106a")
inputs = tokenizer(pneumonia_caption, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
pneumonia_embeddings = outputs.hidden_states[-1][:, 0, :]

Acinar cell carcinoma. Partially intact tissue fragment with neoplastic cells has moderately hyperchromatic, uniform nuclei and delicate basophilic cytoplasm. A lack of single dispersed cells may help to differentiate this from a pancreatic neuroendocrine tumor (Diff-Quik stain).


In [3]:
import torch
import torch.nn.functional as F
from torch.nn import CosineSimilarity
from transformers import CLIPTokenizer, CLIPModel, CLIPTextModel
# import tensorflow as tf
import numpy as np
import faiss # For dot product and L2 similarity
# Cosine-similarity module reducing over dim 0, with eps=1e-6.
# NOTE(review): not referenced in the visible cells — `cos_sim` below calls
# F.cosine_similarity instead; confirm whether this is still needed.
cossim = CosineSimilarity(dim=0, eps=1e-6)

In [4]:
def embed_caption(key):
    """Return the model embedding of the caption stored under ``key``.

    The caption is looked up in the module-level ``pubmed`` mapping, tokenized
    with ``tokenizer`` and run through ``model``. The result is the final hidden
    layer's vector at sequence position 0 of each batch row.

    Args:
        key: Caption id to look up in ``pubmed``.

    Returns:
        The ``[:, 0, :]`` slice of the last hidden state, computed without
        autograd tracking.

    Raises:
        KeyError: If ``pubmed`` has no caption for ``key``.
    """
    caption = pubmed.get(key)
    if caption is None:
        # Name the offending key instead of letting the tokenizer fail on None.
        raise KeyError(f"no caption found for key {key!r}")

    # Cap the sequence at the model's position-embedding size; a longer caption
    # would otherwise index past the position table and crash the forward pass.
    inputs = tokenizer(
        caption,
        return_tensors="pt",
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )

    # Pure inference: no autograd graph is needed for the embedding.
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    return outputs.hidden_states[-1][:, 0, :]

In [5]:
def cos_sim(v1, v2):
    """Cosine similarity between two embeddings.

    Each argument may be a ``torch.Tensor`` or anything ``torch.as_tensor``
    accepts (e.g. the nested lists loaded back from JSON). The comparison uses
    ``F.cosine_similarity`` with its default arguments.

    Args:
        v1: First embedding (tensor or array-like).
        v2: Second embedding (tensor or array-like).

    Returns:
        The tensor returned by ``F.cosine_similarity``; it carries no gradient.
    """
    def _to_tensor(v):
        # torch.tensor(existing_tensor) makes a copy and emits a UserWarning on
        # every call; detach() gives the same gradient-free view without either.
        if isinstance(v, torch.Tensor):
            return v.detach()
        return torch.as_tensor(v)

    return F.cosine_similarity(_to_tensor(v1), _to_tensor(v2))

In [23]:
# Cosine similarity of the query against the basophilic caption first, then
# against the non-basophilic one.
baso_eval, other_eval = (
    cos_sim(query_embedding, candidate)
    for candidate in (basophilic_embedding, pneumonia_embeddings)
)

print(baso_eval, other_eval)

<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
tensor([0.8483]) tensor([0.8477])


  result = F.cosine_similarity(torch.tensor(v1), torch.tensor(v2))


In [20]:
# Caption ids in the order the captions dict yields them; later cells slice
# this list.
keys = list(pubmed.keys())
caption_embeddings = {}
print(len(keys))

# Only the first 200 captions are embedded in this cell. Each embedding is
# stored as a plain nested list so the dict can be written out as JSON.
for k in keys[:200]:
    embedding = embed_caption(k)
    caption_embeddings[k] = embedding.tolist()

with open('bert_embeddings.json', 'w') as f:
    f.write(json.dumps(caption_embeddings, indent=4))

3309


In [53]:

# Read the cached caption embeddings back from disk.
with open('./bert_embeddings.json', 'r') as f:
    bert_embeddings = json.loads(f.read())

# Number of cached entries (a dict's length is its key count).
print(len(bert_embeddings))

3309


In [52]:
# Embed the captions for keys from index 3200 to the end of `keys`, collecting
# them in a separate dict as plain nested lists.
new_embeddings = {}
for k in keys[3200:]:
    embedding = embed_caption(k)
    new_embeddings[k] = embedding.tolist()

# Merge the new slice into the running cache, then rewrite the whole cache file.
caption_embeddings.update(new_embeddings)
with open('bert_embeddings.json', 'w') as f:
    json.dump(caption_embeddings, f, indent=4)


In [None]:
# Embed the captions at key indices 400-599 and add them to the cache.
for k in keys[400:600]:
    embedding = embed_caption(k)
    # Store a plain nested list, as the cells that write bert_embeddings.json
    # do, so caption_embeddings stays uniformly typed and JSON-serializable.
    caption_embeddings[k] = embedding.tolist()

In [None]:
# Embed the captions at key indices 600-799 and add them to the cache.
for k in keys[600:800]:
    embedding = embed_caption(k)
    # Store a plain nested list, as the cells that write bert_embeddings.json
    # do, so caption_embeddings stays uniformly typed and JSON-serializable.
    caption_embeddings[k] = embedding.tolist()

In [None]:
# Embed the captions at key indices 0-199 and add them to the cache.
# NOTE(review): this is the same slice the cell that first creates
# caption_embeddings already embeds — confirm the repeat is intended.
for k in keys[:200]:
    embedding = embed_caption(k)
    # Store a plain nested list, as the cells that write bert_embeddings.json
    # do, so caption_embeddings stays uniformly typed and JSON-serializable.
    caption_embeddings[k] = embedding.tolist()

In [30]:
# Given a query and the image data, find the n top related pathology images
def nTopImages(query, embeddings, n=5):
    """Rank the entries of ``embeddings`` by cosine similarity to ``query``.

    Args:
        query: Query embedding; passed to ``cos_sim`` as the second argument.
        embeddings: Mapping of key -> embedding to rank.
        n: Maximum number of keys to return (default 5).

    Returns:
        A ``(top_keys, scores)`` tuple. ``top_keys`` holds at most ``n`` keys,
        most similar first; keys with equal scores keep the mapping's iteration
        order. It is shorter than ``n`` when ``embeddings`` has fewer entries,
        and never contains a placeholder key. ``scores`` maps every key of
        ``embeddings`` to the value ``cos_sim`` returned for it.
    """
    scores = {k: cos_sim(emb, query) for k, emb in embeddings.items()}

    # sorted() is stable even with reverse=True, so ties keep insertion order.
    # float() lets single-element tensors and plain numbers be compared alike.
    ranked = sorted(scores, key=lambda k: float(scores[k]), reverse=True)

    # Clamp n so a negative value yields no keys instead of slicing from the end.
    return ranked[:max(n, 0)], scores

In [31]:
# Rank the cached caption embeddings against the query embedding; n is left at
# nTopImages' default of 5.
top_keys, eval_results = nTopImages(query_embedding, caption_embeddings)

<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'torch.Tensor'>
<class 'torch.Tensor'> <class 'tor

  result = F.cosine_similarity(torch.tensor(v1), torch.tensor(v2))


In [33]:
# Print each top-ranked caption next to its similarity score.
for key in top_keys:
    print(pubmed.get(key), eval_results.get(key))

Numerous small to medium-sized with thick and hyalinized walls tensor([0.9094])
numerous cells with sebaceous differentiation tensor([0.9004])
Plasma cells. tensor([0.8950])
Granulomatous inflammation presents with epithelioid histiocytes, giant cells, and necrosis. tensor([0.8948])
tumour cells exhibiting diffuse positivity with CD34 tensor([0.8945])
