# ICD-11 Vectorization

# 1. Setup and Installation

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
import json
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Set seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oldys\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oldys\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<torch._C.Generator at 0x1345161a010>

In [1]:
# check the working directory
import os
print(os.getcwd())

C:\Users\oldys\AppData\Local\Programs\Microsoft VS Code


# 2. Loading and Exploring the ICD-11 Data

In [3]:
# Load the ICD-11 data
icd11_df = pd.read_csv("D:/Desktop/2_NLP/_Project_/vector-database-ICD/icd11_data_raw.csv")

# === Load Comorbidity Data with ICD-11 Mappings (USED FOR EVALUATION AND PARTICULAR SAMPLING) ===
comorbidity_df = pd.read_csv(r"D:\Desktop\2_NLP\_Project_\Gleb_stuff\comorbidity_with_icd11.csv")

# Basic data exploration
print(f"Total number of ICD-11 codes: {len(icd11_df)}")
print(f"Number of columns: {len(icd11_df.columns)}")

# Display the first few rows
icd11_df.head()
     

Total number of ICD-11 codes: 28087
Number of columns: 19


Unnamed: 0,id,code,title,browser_url,class_kind,definition,parent,inclusions,foundation_children,foundation_child_references,index_terms,related_entities,full_text,children,postcoordination_scales,index_term_references,exclusions,exclusion_references,fully_specified_name
0,1937339080,1C22,Infections due to Chlamydia psittaci,https://icd.who.int/browse/2023-01/mms/en#1937...,category,Any condition caused by an infection with the ...,1127435854,Psittacosis; Ornithosis; Parrot fever,Pneumonia in chlamydia psittaci infection,Pneumonia in chlamydia psittaci infection: htt...,Infections due to Chlamydia psittaci; Psittaco...,1935107489,Infections due to Chlamydia psittaci Any condi...,,,,,,
1,1671640403,1F51.0,Gambiense trypanosomiasis,https://icd.who.int/browse/2023-01/mms/en#1671...,category,A disease caused by an infection with the prot...,875488052,West African sleeping sickness; Infection due ...,,,,1945127438,Gambiense trypanosomiasis A disease caused by ...,1842725899; other; unspecified,"{'axis_name': 'hasManifestation', 'required': ...",,,,
2,1528414070,1A07,Typhoid fever,https://icd.who.int/browse/2023-01/mms/en#1528...,category,A condition caused by an infection with the gr...,135352227,,,,,911707612,Typhoid fever A condition caused by an infecti...,364534567; other; unspecified,"{'axis_name': 'hasManifestation', 'required': ...",,,,
3,328097188,1A36.12,Cutaneous amoebiasis,https://icd.who.int/browse/2023-01/mms/en#3280...,category,,1777228366,,,,Cutaneous amoebiasis; Amoebiasis of skin; Amoe...,911707612,Cutaneous amoebiasis Cutaneous amoebiasis; Amo...,,,,,,
4,1483190070,1D03,Infectious abscess of the central nervous system,https://icd.who.int/browse/2023-01/mms/en#1483...,category,A focal suppurative process of the brain paren...,1585949804,,,,,911707612,Infectious abscess of the central nervous syst...,443087096; 613341872; 1147230459; 1128677700; ...,"{'axis_name': 'specificAnatomy', 'required': '...",,,,


# (Temporary) Codes overlap investigation

In [33]:
# Get unique ICD-11 codes from comorbidity file
used_icd11_codes = pd.Series(
    pd.concat([comorbidity_df['ICD11_1'], comorbidity_df['ICD11_2']])
).dropna().unique()

used_icd11_codes.shape
     

(222,)

AttributeError: 'numpy.ndarray' object has no attribute 'str'

In [42]:
beginning = '6A0'

# Convert numpy array to pandas Series for string operations
used_codes_series = pd.Series(used_icd11_codes)

print(
    icd11_df[icd11_df['code'].str.startswith(beginning, na=False)]['code'].head(), "\n\n",
    used_codes_series[used_codes_series.str.startswith(beginning, na=False)].head(), "\n\n"
    )

2452    6A05.1
2482    6A00.1
2511    6A01.2
2583    6A03.1
2585    6A05.0
Name: code, dtype: object 

 80       6A0Z
140    6A00.0
185    6A05.Z
dtype: object 




In [None]:
# Step 3: Filter icd11_df for only those ICD-11 codes
if 'code' not in icd11_df.columns:
    raise ValueError("❌ 'code' column (ICD-11 code identifier) not found in icd11_data_vectorization.csv")


filtered_icd11_df = icd11_df[icd11_df['code'].isin(used_icd11_codes)].copy()
print(f"✅ Selected {len(filtered_icd11_df)} ICD-11 codes from comorbidity data.")

✅ Selected 59 ICD-11 codes from comorbidity data.


In [21]:
icd11_df[icd11_df['code'].isin(used_icd11_codes)]['code']

17        1B70.1
75          1G40
135         1E30
238       1F03.0
276         1C41
295       1E50.0
431       1C62.1
519       1F02.2
680       1D80.0
729         1G80
2015        5B59
2181        5A11
2330        5A14
2428        5A10
2558        6D83
2594      6D72.0
2795        6B41
2980      6A00.0
3106        6A22
3276        8B20
3341      8B11.3
3670        8E47
3718      8B01.2
4152      9A78.8
4734        BC64
4776        BA01
4923     CA23.32
5054        CA00
5363      DA08.0
5431      DB97.2
5522      DB99.7
6613      FA25.0
6645      FB50.1
6727        FA05
6740      FA70.2
6744      FA70.1
7353      JB46.2
9203      ME10.1
9243      ME60.2
9260        MG25
9264        MD12
9276        MD93
9293      MD81.4
9309      MF50.3
9310        MD95
9337        MC85
9389      MC81.3
9673      MD90.1
9714      MF50.1
9912        MG26
10033     MB53.0
10690       NE11
10990       NF05
11407       NE60
11417       NE61
11423       ND30
11624       PC94
12067       PB5A
25652      XN9

In [28]:
missing_codes = set(used_icd11_codes) - set(filtered_icd11_df['code'])
print(sorted(list(missing_codes))[:100])  # Show first 20 missing codes


['1A62.Z', '1B70.Z', '1C17.Z', '1C4Z', '1C62.Z', '1D9Z', '1E50.Z', '1E51.Z', '1E90.Z', '1F23.Z', '1F76.Z', '1G04.Z', '1H0Z', '3A00.Z', '5A02.Z', '5A4Z', '5B5A.Z', '5B7Z', '5B81.Z', '5C70.Z', '5C7Z', '5C8Z', '5D0Y', '6A05.Z', '6A0Z', '6A20.Z', '6A21.Z', '6A23.Z', '6A24.Z', '6A61.Z', '6A70.Z', '6A71.Z', '6A8Z', '6B0Z', '6B20.Z', '6B8Z', '6C20.Z', '6C40.Z', '6C41.Z', '6C43.Z', '6C44.Z', '6C45.Z', '6C49.Z', '6C4B.Z', '6C4C.Z', '6C4D.Z', '6C91.Z', '6D10.Z', '6D3Z', '6D70.Z', '6D8Z', '6E6Z', '7B2Z', '8A00.2Z', '8A02.Z', '8A6Z', '8A84.Z', '8A8Z', '8B00.Z', '8B22.Y', '8B25.Z', '8C03.Z', '8D64.Z', '9A60.Z', '9B10.2Y', '9C61.Z', '9C84.Z', '9C8Y', '9D7Z', 'BA00.Z', 'BA04.Z', 'BA2Z', 'BA40.Z', 'BA41.Z', 'BA5Z', 'BB00.Z', 'BC0Z', 'BC63.4Z', 'BC63.Z', 'BC81.Z', 'BD1Z', 'BD70.Z', 'CA20.1Z', 'CA22.Z', 'CA27.Z', 'CA40.Z', 'CA42.Z', 'CB40.Y', 'DA26.0Z', 'DA42.Z', 'DA60.Z', 'DA63.Z', 'DB30.Z', 'DB31.Z', 'DB33.4Z', 'DB93.Y', 'DB94.Z', 'DB99.Y', 'DD70.Z', 'DD71.Z']


In [31]:
icd11_df[icd11_df['code'].str.startswith('6A0', na=False)]['code']

2452     6A05.1
2482     6A00.1
2511     6A01.2
2583     6A03.1
2585     6A05.0
2622       6A06
2639     6A03.2
2645     6A03.3
2677     6A06.0
2722     6A02.3
2768     6A03.0
2799       6A04
2805       6A03
2838       6A01
2858    6A01.22
2893     6A01.1
2899     6A05.2
2901       6A05
2904     6A00.3
2921     6A00.2
2949     6A06.1
2966       6A00
2969    6A01.23
2980     6A00.0
2982    6A01.20
2985     6A02.0
3015     6A02.2
3022       6A02
3080     6A00.4
3081    6A01.21
3108     6A02.5
3112     6A01.0
3124     6A02.1
Name: code, dtype: object

In [43]:
def normalize_icd_code(code):
    return str(code).strip().upper().replace('.', '')

# Normalize comorbidity codes
normalized_used_codes = set(normalize_icd_code(code) for code in used_icd11_codes)

# Normalize icd11_df['code'] and map back to original
icd11_df['normalized_code'] = icd11_df['code'].apply(normalize_icd_code)

# Match using normalized codes
filtered_icd11_df = icd11_df[icd11_df['normalized_code'].isin(normalized_used_codes)].copy()
print(f"✅ Matched {len(filtered_icd11_df)} ICD-11 codes after normalization.")


✅ Matched 59 ICD-11 codes after normalization.


# 3. Text Preprocessing

In [44]:
# Minimal preprocessing for Bio+ClinicalBERT
def preprocess_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    text = re.sub('<.*?>', '', text)
    text = re.sub('[^a-zA-Z0-9\s]', ' ', text)
    return text.strip()

In [None]:
# # Combine relevant columns into one text field
# icd11_df['processed_text'] = icd11_df.apply(
#     lambda row: preprocess_text(' '.join([
#         str(row.get('title', '')),
#         str(row.get('definition', '')),
#         str(row.get('inclusions', '')),
#         str(row.get('index_terms', ''))
#     ])), axis=1
# )


In [45]:
# Combine fields for input
filtered_icd11_df['processed_text'] = filtered_icd11_df.apply(
    lambda row: preprocess_text(' '.join([
        str(row.get('title', '')),
        str(row.get('definition', '')),
        str(row.get('inclusions', '')),
        str(row.get('index_terms', ''))
    ])), axis=1
)

# 4. Bio+clinical BERT Model Implementation

In [46]:
# Load Bio+ClinicalBERT
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Set model to evaluation mode
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [47]:
# Embedding function using CLS token
def get_bert_embedding(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token


# 5. Generating Embeddings for ICD-11 Codes

In [48]:
# Generate embeddings
print(f"🔍 Generating embeddings for {len(filtered_icd11_df)} matched ICD-11 codes...")
embeddings = []
for text in tqdm(filtered_icd11_df['processed_text'].tolist()):
    embeddings.append(get_bert_embedding(text, model, tokenizer))



🔍 Generating embeddings for 59 matched ICD-11 codes...


100%|██████████| 59/59 [00:17<00:00,  3.30it/s]


In [None]:
# # Sample 1000 entries
# sample_size = 3000
# sample_indices = np.random.choice(range(len(icd11_df)), sample_size, replace=False)
# sample_df = icd11_df.iloc[sample_indices].copy().reset_index(drop=True)

# # Generate embeddings in batches
# print(f"Generating Bio+ClinicalBERT embeddings for {sample_size} ICD codes...")


# batch_size = 100
# embeddings = []

# for i in range(0, len(sample_df), batch_size):
#     end_idx = min(i + batch_size, len(sample_df))
#     print(f"Processing batch {i//batch_size + 1}/{(len(sample_df)-1)//batch_size + 1} ({i} to {end_idx-1})")
#     batch_embeddings = sample_df.loc[i:end_idx-1, 'processed_text'].apply(
#         lambda text: get_bert_embedding(text, model, tokenizer)
#     )
#     embeddings.extend(batch_embeddings)

Generating Bio+ClinicalBERT embeddings for 3000 ICD codes...
Processing batch 1/30 (0 to 99)
Processing batch 2/30 (100 to 199)
Processing batch 3/30 (200 to 299)
Processing batch 4/30 (300 to 399)
Processing batch 5/30 (400 to 499)
Processing batch 6/30 (500 to 599)
Processing batch 7/30 (600 to 699)
Processing batch 8/30 (700 to 799)
Processing batch 9/30 (800 to 899)
Processing batch 10/30 (900 to 999)
Processing batch 11/30 (1000 to 1099)
Processing batch 12/30 (1100 to 1199)
Processing batch 13/30 (1200 to 1299)
Processing batch 14/30 (1300 to 1399)
Processing batch 15/30 (1400 to 1499)
Processing batch 16/30 (1500 to 1599)
Processing batch 17/30 (1600 to 1699)
Processing batch 18/30 (1700 to 1799)
Processing batch 19/30 (1800 to 1899)
Processing batch 20/30 (1900 to 1999)
Processing batch 21/30 (2000 to 2099)
Processing batch 22/30 (2100 to 2199)
Processing batch 23/30 (2200 to 2299)
Processing batch 24/30 (2300 to 2399)
Processing batch 25/30 (2400 to 2499)
Processing batch 26/3

# 6. Dimensionality Reduction

In [50]:
from sklearn.decomposition import PCA

# Dimensionality Reduction to 300D
print("Reducing to 300 dimensions using PCA...")
embedding_df = pd.DataFrame(embeddings)
pca = PCA(n_components=59)
reduced_embeddings = pca.fit_transform(embedding_df)

Reducing to 300 dimensions using PCA...


In [105]:
# PCA to 300D
print("⚙️ Reducing to 300 dimensions using PCA...")
pca = PCA(n_components=300)
reduced_embeddings = pca.fit_transform(embeddings)

⚙️ Reducing to 300 dimensions using PCA...


ValueError: n_components=300 must be between 0 and min(n_samples, n_features)=59 with svd_solver='full'

In [52]:
# Prepare output
final_output = pd.DataFrame({
    'ICD11_Code': filtered_icd11_df['code'],
    'Vector': [list(vec) for vec in reduced_embeddings]
})

In [60]:
final_output.head()

Unnamed: 0,ICD11_Code,Vector
17,1B70.1,"[2.083128, -0.72213846, -3.3888264, 0.36609048..."
75,1G40,"[-0.52065057, -1.2162358, 2.3709693, 2.9747782..."
135,1E30,"[-2.756355, 0.7908362, -0.3574265, 0.8769771, ..."
238,1F03.0,"[1.7091948, -2.1360068, 0.44202286, -0.1934314..."
276,1C41,"[-3.9060686, 0.028361302, 1.2480628, 1.4057598..."


In [53]:
# Save output
final_output.to_csv("bio_clinical_bert_300d_sampled.csv", index=False)
print("✅ File saved: bio_clinical_bert_300d_sampled.csv")

✅ File saved: bio_clinical_bert_300d_sampled.csv


# 7. Correlation Evaluation

In [54]:
# === Load Comorbidity Data with ICD-11 Mappings ===
comorbidity_df = pd.read_csv(r"D:\Desktop\2_NLP\_Project_\Gleb_stuff\comorbidity_with_icd11.csv")

# Drop rows with missing ICD-11 codes
comorbidity_df = comorbidity_df.dropna(subset=['ICD11_1', 'ICD11_2'])

In [55]:
# === Load Vector Data (Bio+ClinicalBERT 300D) ===
vector_df = pd.read_csv("bio_clinical_bert_300d_sampled.csv", usecols=["ICD11_Code", "Vector"])

In [56]:
# --- Step 1: Parse the vector string to NumPy arrays ---
def safe_parse_vector(v):
    if isinstance(v, bytes):
        v = v.decode('utf-8')
    numbers = re.findall(r'-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?', str(v))
    return np.array([float(n) for n in numbers], dtype=np.float32)

vector_df['Vector'] = vector_df['Vector'].apply(safe_parse_vector)

In [57]:
# --- Step 2: Build ICD11 -> Vector mapping ---
vector_map = dict(zip(vector_df['ICD11_Code'], vector_df['Vector']))

comorbidity_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 800 entries, 0 to 801
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ICD1        800 non-null    object 
 1   ICD2        800 non-null    object 
 2   Log2_Ratio  800 non-null    float64
 3   ICD11_1     800 non-null    object 
 4   ICD11_2     800 non-null    object 
dtypes: float64(1), object(4)
memory usage: 37.5+ KB


## Very important step of checking the overlap

Since we are embedding only a sample of code, we might very well run into problems if the embedding doesn't contain the icd score included in the evaluation

In [58]:
set_icd11_comorb = set(comorbidity_df['ICD11_1'].dropna()) | set(comorbidity_df['ICD11_2'].dropna())
set_icd11_vectors = set(vector_df['ICD11_Code'])

overlap = set_icd11_comorb & set_icd11_vectors
print(f"✅ Overlapping ICD-11 codes: {len(overlap)}")


✅ Overlapping ICD-11 codes: 59


## Cumputing the metric

In [59]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

# --- Step 3: Compute similarities & Pearson correlation ---
def validate_icd11_vectors(vector_map, comorbidity_df):
    valid_df = comorbidity_df[
        comorbidity_df['ICD11_1'].isin(vector_map) &
        comorbidity_df['ICD11_2'].isin(vector_map)
    ].copy()

    similarities = [
        cosine_similarity([vector_map[row['ICD11_1']]], [vector_map[row['ICD11_2']]])[0][0]
        for _, row in valid_df.iterrows()
    ]

    valid_df['Vector_Similarity'] = similarities
    pearson_corr, _ = pearsonr(valid_df['Log2_Ratio'].astype(float), valid_df['Vector_Similarity'])

    print(f"✅ Pearson correlation between ICD-11 vector similarity and comorbidity score: {pearson_corr:.3f}")
    return valid_df, pearson_corr

# --- Step 4: Run ---
validated_df, correlation_score = validate_icd11_vectors(vector_map, comorbidity_df)

✅ Pearson correlation between ICD-11 vector similarity and comorbidity score: 0.118
