# ICD-11 BERT Embeddings Generator - Google Colab Notebook
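This notebook generates mean-pooled text embeddings for ICD-11 codes, encyclopedia definitions, and non-medical prompts using several BERT-family models, and writes one CSV of vectors per model and dataset.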

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
import os
import re
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

def _mean_pool(token_embeddings, attention_mask):
    """Average token vectors along dimension 1, ignoring masked-out positions.

    Args:
        token_embeddings: Tensor of per-token vectors; the attention mask is
            broadcast to this tensor's size.
        attention_mask: Tensor with one entry per token; positions with 0
            contribute nothing to the average.

    Returns:
        Tensor with dimension 1 reduced away (one pooled vector per input row).
    """
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = torch.sum(token_embeddings * mask, 1)
    # Clamp so a row whose mask is all zeros divides by a tiny number, not by 0.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts


def bert_embeddings(file_in, file_out, model_name='bert-base-uncased', batch_size=16, max_length=512, device=None):
    """Embed the texts of a CSV file with a pretrained model and save them to CSV.

    Reads ``file_in``, takes the ``vectorization_text`` column (missing values
    become empty strings) and the ``code`` column, computes one mean-pooled
    vector per row from the model's last hidden state, and writes a CSV with
    the columns ``ICD11_code`` and ``Vector`` to ``file_out``. Each vector is
    stored as a bracketed, comma-separated string. The row count and the shape
    of the first vector are printed.

    Args:
        file_in: Path of the input CSV; must contain the columns
            ``vectorization_text`` and ``code``.
        file_out: Path of the CSV to write.
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        batch_size: Number of texts encoded per forward pass; must be >= 1.
        max_length: Maximum token length; longer inputs are truncated.
        device: Device to run on (string or ``torch.device``). When ``None``,
            ``'cuda'`` is used if available, otherwise ``'cpu'``.

    Returns:
        None. The result is written to ``file_out``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a required column is missing from ``file_in``.
    """
    if batch_size < 1:
        # A negative step would make the batch range silently empty.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    df = pd.read_csv(file_in)
    texts = df['vectorization_text'].fillna('').astype(str).tolist()
    codes = df['code'].tolist()

    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Normalise so 'cuda', 'cuda:0' and torch.device('cuda') are all recognised.
    on_cuda = torch.device(device).type == 'cuda'

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    model.eval()

    all_embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc=f"{model_name} embeddings"):
        batch_texts = texts[i:i+batch_size]
        encoded = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        encoded = {k: v.to(device) for k, v in encoded.items()}
        with torch.no_grad():
            outputs = model(**encoded)
            embeddings = _mean_pool(
                outputs.last_hidden_state, encoded['attention_mask']
            ).cpu().numpy()
        # Extending with a 2-D array appends one 1-D vector per input text.
        all_embeddings.extend(embeddings)
        if on_cuda:
            torch.cuda.empty_cache()

    # Computed once after the loop; None when the input file has no rows.
    shape = all_embeddings[0].shape if all_embeddings else None

    df_embeddings = pd.DataFrame({
        "ICD11_code": codes,
        "Vector": [
            # threshold > vec.size disables numpy's "..." summarisation, which
            # would otherwise drop values for vectors longer than the default
            # print threshold. Line wrapping is left at numpy's default so the
            # format of existing output files is unchanged.
            np.array2string(
                vec,
                separator=',',
                precision=6,
                suppress_small=True,
                threshold=vec.size + 1,
            )
            for vec in all_embeddings
        ]
    })

    print(f"Number of rows in resulting DataFrame: {len(df_embeddings)}")
    print(f"Shape of the first vector: {shape}")

    df_embeddings.to_csv(file_out, index=False)





In [None]:
# Registry of the pretrained checkpoints to embed with.
# Key: short label used as the output-file prefix; value: the model identifier
# handed to bert_embeddings(model_name=...).
MODELS = dict(
    bert='bert-base-uncased',
    biobert='dmis-lab/biobert-base-cased-v1.1',
    bioclinicalbert='emilyalsentzer/Bio_ClinicalBERT',
    pubmedbert='microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext',
)

In [None]:
# Embed the ICD-11 codes with every model in MODELS; one output CSV per model,
# named after the model's short label.
for model_name in MODELS:
    model_path = MODELS[model_name]
    out_file = f"{model_name}_ICD11_embeddings.csv"
    print(f"\nProcessing {model_name}...")
    bert_embeddings(
        file_in="ICD11_preprocessed.csv",
        file_out=out_file,
        model_name=model_path,
    )

In [None]:
# Embed the Merriam-Webster definitions (read from
# definitions2vec/encyclopedia_sd_preprocessed.csv) with every model in MODELS;
# one output CSV per model, named after the model's short label.
for model_name in MODELS:
    model_path = MODELS[model_name]
    out_file = f"{model_name}_encyclopedia_embeddings.csv"
    print(f"\nProcessing {model_name}...")
    bert_embeddings(
        file_in="definitions2vec/encyclopedia_sd_preprocessed.csv",
        file_out=out_file,
        model_name=model_path,
    )


Processing bert...


bert-base-uncased embeddings: 100%|██████████| 24/24 [00:35<00:00,  1.49s/it]


Number of rows in resulting DataFrame: 378
Shape of the first vector: (768,)

Processing biobert...


dmis-lab/biobert-base-cased-v1.1 embeddings: 100%|██████████| 24/24 [00:40<00:00,  1.67s/it]


Number of rows in resulting DataFrame: 378
Shape of the first vector: (768,)

Processing bioclinicalbert...


emilyalsentzer/Bio_ClinicalBERT embeddings: 100%|██████████| 24/24 [00:34<00:00,  1.43s/it]


Number of rows in resulting DataFrame: 378
Shape of the first vector: (768,)

Processing pubmedbert...


microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext embeddings: 100%|██████████| 24/24 [00:34<00:00,  1.42s/it]


Number of rows in resulting DataFrame: 378
Shape of the first vector: (768,)


In [None]:
# Embed the non-medical prompts with every model in MODELS; one output CSV per
# model, named after the model's short label.
for model_name in MODELS:
    model_path = MODELS[model_name]
    out_file = f"{model_name}_non_medical_prompts_embeddings.csv"
    print(f"\nProcessing {model_name}...")
    bert_embeddings(
        file_in="Non_medical_prompts_preprocessed.csv",
        file_out=out_file,
        model_name=model_path,
    )


Processing bert...


bert-base-uncased embeddings: 100%|██████████| 1/1 [00:01<00:00,  1.71s/it]


Number of rows in resulting DataFrame: 10
Shape of the first vector: (768,)

Processing biobert...


dmis-lab/biobert-base-cased-v1.1 embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.81it/s]


Number of rows in resulting DataFrame: 10
Shape of the first vector: (768,)

Processing bioclinicalbert...


emilyalsentzer/Bio_ClinicalBERT embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s]


Number of rows in resulting DataFrame: 10
Shape of the first vector: (768,)

Processing pubmedbert...


microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.54it/s]

Number of rows in resulting DataFrame: 10
Shape of the first vector: (768,)



