In [36]:
from google.colab import drive
import json
import clip
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel


In [3]:
# Mount Google Drive at /content/drive; the notebook reads its input JSON from,
# and writes its embeddings to, paths under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:

# Load the per-disease descriptions stored on Drive.
file_path = '/content/drive/MyDrive/disease_descriptions.json'

# Decode explicitly as UTF-8: JSON text is UTF-8, whereas open()'s default
# encoding follows the runtime locale and can differ between machines.
with open(file_path, 'r', encoding='utf-8') as f:
    disease_descriptions = json.load(f)

print(disease_descriptions.keys())

# Collapse each disease's descriptions into one space-separated string,
# keyed by disease name (insertion order of the JSON object is preserved).
# Assumes every value is a list of strings -- TODO confirm against the JSON file.
disease_full_texts = {
    disease: " ".join(descriptions)
    for disease, descriptions in disease_descriptions.items()
}



dict_keys(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Pleural Effusion', 'Pneumonia', 'Pneumothorax'])


In [23]:
# for disease, text in disease_full_texts.items():
#     print(f"{disease}:\n{text}\n")

In [37]:
# Parallel lists in the dict's insertion order: texts[k] is the combined
# description of the k-th disease and labels[k] == k is its integer class id.
texts = list(disease_full_texts.values())
labels = list(range(len(texts)))


In [33]:
# texts

In [39]:
# Choose the compute device explicitly: no visible cell defines `device`, so
# without this line the model load below fails with NameError on a fresh kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Bio_ClinicalBERT tokenizer and encoder, fetched from the Hugging Face Hub.
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
# Switch to inference mode (disables dropout). Left as the final expression so
# the notebook still displays the module summary.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [42]:
# Encode each disease's combined description with the BERT encoder and keep
# one vector per disease, stored on the CPU.
disease_embeddings = {}

# The encoder's position embedding table has 512 entries, so inputs are capped there.
MAX_TOKENS = 512

# Read the device from the model itself instead of a separately defined global,
# so the inputs always land on the same device as the weights.
target_device = model.device

with torch.no_grad():  # inference only; no autograd graph needed
    for disease, text in disease_full_texts.items():
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=MAX_TOKENS,
        ).to(target_device)

        # Truncation is otherwise silent: surface it so a reader knows the
        # embedding may not reflect the full set of descriptions.
        if inputs["input_ids"].shape[1] >= MAX_TOKENS:
            print(
                f"Warning: '{disease}' reached the {MAX_TOKENS}-token limit; "
                "its text was likely truncated."
            )

        outputs = model(**inputs)

        # pooler_output is the [CLS] hidden state after BERT's pooling head
        # (dense layer + tanh), not the raw last-layer [CLS] vector.
        cls_embedding = outputs.pooler_output  # shape: (1, hidden_size)

        # Drop the batch dimension and move to CPU so the tensor can be saved
        # and reloaded without a GPU.
        disease_embeddings[disease] = cls_embedding.squeeze(0).cpu()


In [45]:
# disease_embeddings

In [44]:
# Persist the {disease name: embedding tensor} dict to Drive so it can be
# reloaded later with torch.load without re-running the encoder.
torch.save(disease_embeddings, '/content/drive/MyDrive/disease_text_embeddings.pt')