In [2]:
#!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from tqdm import tqdm

In [None]:
df = pd.read_csv("icd11_data_vectorization.csv")  

#texts are displayed as lists
texts = df['vectorization_text'].fillna("").tolist()

In [5]:
model_name = "dmis-lab/biobert-base-cased-v1.1"  # You can change this to any BioBERT variant
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Set model to evaluation mode (no dropout, no training)
model.eval()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

This is the function to perform the embedding

In [6]:
def embed_texts_batched(texts, tokenizer, model, device, batch_size=16):
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size), desc="Embedding batches"):
        batch = texts[i:i+batch_size]

        # Tokenize batch
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, max_length=512,
                           padding='max_length')  # uniform length across batch
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            hidden_states = outputs.last_hidden_state  # [batch, seq_len, hidden_dim]
            mask = inputs['attention_mask']

            # Mean pooling (ignore padding)
            mask_expanded = mask.unsqueeze(-1).expand(hidden_states.size()).float()
            sum_embeddings = torch.sum(hidden_states * mask_expanded, 1)
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            mean_pooled = (sum_embeddings / sum_mask).cpu().numpy()

        all_embeddings.extend(mean_pooled)

    return np.array(all_embeddings)


In [7]:
df.head()

Unnamed: 0,id,code,title,browser_url,class_kind,definition,parent,inclusions,foundation_children,foundation_child_references,...,full_text,children,postcoordination_scales,index_term_references,exclusions,exclusion_references,fully_specified_name,id_parent1,parent_description1,vectorization_text
0,1937339080,1C22,Infections due to Chlamydia psittaci,https://icd.who.int/browse/2023-01/mms/en#1937...,category,Any condition caused by an infection with the ...,1127435854,Psittacosis; Ornithosis; Parrot fever,Pneumonia in chlamydia psittaci infection,Pneumonia in chlamydia psittaci infection: htt...,...,Infections due to Chlamydia psittaci Any condi...,,,,,,,1127436000.0,,Title: Infections due to Chlamydia psittaci\nD...
1,1671640403,1F51.0,Gambiense trypanosomiasis,https://icd.who.int/browse/2023-01/mms/en#1671...,category,A disease caused by an infection with the prot...,875488052,West African sleeping sickness; Infection due ...,,,...,Gambiense trypanosomiasis A disease caused by ...,1842725899; other; unspecified,"{'axis_name': 'hasManifestation', 'required': ...",,,,,875488100.0,A disease caused by an infection with the prot...,Title: Gambiense trypanosomiasis\nDefinition: ...
2,1528414070,1A07,Typhoid fever,https://icd.who.int/browse/2023-01/mms/en#1528...,category,A condition caused by an infection with the gr...,135352227,,,,...,Typhoid fever A condition caused by an infecti...,364534567; other; unspecified,"{'axis_name': 'hasManifestation', 'required': ...",,,,,135352200.0,"Any condition of the intestines, caused by an ...",Title: Typhoid fever\nDefinition: A condition ...
3,328097188,1A36.12,Cutaneous amoebiasis,https://icd.who.int/browse/2023-01/mms/en#3280...,category,,1777228366,,,,...,Cutaneous amoebiasis Cutaneous amoebiasis; Amo...,,,,,,,1777228000.0,,Title: Cutaneous amoebiasis\nRelated entities:...
4,1483190070,1D03,Infectious abscess of the central nervous system,https://icd.who.int/browse/2023-01/mms/en#1483...,category,A focal suppurative process of the brain paren...,1585949804,,,,...,Infectious abscess of the central nervous syst...,443087096; 613341872; 1147230459; 1128677700; ...,"{'axis_name': 'specificAnatomy', 'required': '...",,,,,1585950000.0,"Any condition of the nervous system, caused by...",Title: Infectious abscess of the central nervo...


Creation of the vectors 

In [8]:

vectors = embed_texts_batched(texts, tokenizer, model, device, batch_size=16)

# Save results
np.save("icd_biobert_vectors.npy", vectors)
df['biobert_vector'] = list(vectors)
df.to_pickle("icd_vectorization_with_biobert.pkl")

Embedding batches:   0%|          | 0/1756 [00:00<?, ?it/s]

Embedding batches: 100%|██████████| 1756/1756 [27:53<00:00,  1.05it/s]


In [12]:
len(df['biobert_vector'][0])


768

## Proviamo a fare un prompt e ritrovare il codice corrispondente con cosine similarity

In [32]:
# Structured prompt text
structured_text = "[Title]: brain tumor\n[Definition]: Tumours are usually located in the brain hemispheres."



# Call your embed function (assuming it's defined like you said)
embedding = embed_texts_batched(
    texts=[structured_text],
    tokenizer=tokenizer,
    model=model,
    device=device,
    batch_size=16
)

# Result is a NumPy array (likely shape: (1, 768))
print("Embedded vector shape:", embedding.shape)
print("First 5 dims:", embedding[0][:5])


Embedding batches: 100%|██████████| 1/1 [00:00<00:00,  1.24it/s]

Embedded vector shape: (1, 768)
First 5 dims: [-0.10943115  0.143661   -0.2256604   0.00810312  0.26296365]





In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
bio_vectors_stacked = np.vstack(df['biobert_vector'].values).astype('float32')

# Ensure embedding is 2D (1, 768)
query_embedding = np.array(embedding, dtype='float32').reshape(1, -1)

# Now cosine similarity will work
sim = cosine_similarity(bio_vectors_stacked, query_embedding)  # shape: (N, 1)

# Top-k matches
top_k = sim[:, 0].argsort()[-5:][::-1]


In [35]:
top_matches = df.iloc[top_k][['code']]

In [36]:
print(top_matches)

         code
800    2E80.1
4272     9A23
17446  XA54B9
9845   MB71.0
8036     KB2K


In [37]:
row = df.loc[df['code'] == '2A00.0']

In [42]:
# Ensure the full text is displayed
print(row['vectorization_text'].values[0])

Title: Gliomas of brain
Full text: Gliomas of brain
Children: 650534447; other; unspecified


In [44]:
df['inclusions'].sample(10)

18471        NaN
12279        NaN
11627        NaN
20892    AIN III
15880        NaN
27019        NaN
4751         NaN
13504        NaN
12769        NaN
4415         NaN
Name: inclusions, dtype: object

Horrible result 