# Convert ICD-10 codes to ICD-11 codes

In [1]:
import pandas as pd

# Annotate every ranked comorbidity pair with ICD-11 codes and save the result
# as a CSV that the scoring function further down reads back in.

# === Step 1: Load Comorbidity Data ===
# skiprows=1 drops the first spreadsheet row, so the header is taken from the second row.
comorbidity_path = "comorbidity_data.xlsx"
df = pd.read_excel(comorbidity_path, sheet_name='Ranked comorbidity pairs', skiprows=1)
df_filtered = df[['ICD1', 'ICD2', 'ln2(ratio)']].copy()
df_filtered.columns = ['ICD1', 'ICD2', 'Log2_Ratio']

# === Step 2: Load ICD-10 to ICD-11 Mapping ===
map_path = "ICD_10to11_mapping.xlsx"
map_df = pd.read_excel(map_path)
map_filtered = map_df[['icd10Code', 'icd10Title', 'icd11Code', 'icd11Title']].copy()

# === Step 3: Create Mapping Dictionary ===
# If an ICD-10 code occurs more than once in the mapping sheet, the last row wins.
icd10_to_icd11 = dict(zip(map_filtered['icd10Code'], map_filtered['icd11Code']))

# === Step 4: Add ICD-11 Columns ===
# ICD-10 codes without an entry in the mapping become NaN here.
df_filtered['ICD11_1'] = df_filtered['ICD1'].map(icd10_to_icd11)
df_filtered['ICD11_2'] = df_filtered['ICD2'].map(icd10_to_icd11)

# === Step 4b: Strip trailing ".Z" from any ICD-11 codes ===
# Only mapped values are rewritten. Casting the whole column to str would turn
# every missing code into the literal text "nan", which would then be written
# to the CSV as if it were a real ICD-11 code.
for col in ['ICD11_1', 'ICD11_2']:
    mapped = df_filtered[col]
    stripped = mapped.astype(str).str.replace(r'\.Z$', '', regex=True)
    df_filtered[col] = stripped.where(mapped.notna(), mapped)

# === Step 5: Save to CSV ===
# Unmapped codes stay NaN and are therefore written as empty fields.
output_path = "comorbidity_with_icd11.csv"
df_filtered.to_csv(output_path, index=False)

print(f"✅ File saved as: {output_path}")

✅ File saved as: comorbidity_with_icd11.csv


# Scoring function

In [15]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

def compute_vector_correlation(comorbidity_path, vector_path):
    """
    Compute the Pearson correlation between vector similarity and comorbidity score.

    For every comorbidity pair whose two ICD-11 codes both have an embedding,
    the cosine similarity of the two embeddings is computed and correlated
    with the pair's Log2_Ratio.

    Parameters:
        comorbidity_path (str): Path to comorbidity dataset (CSV) with the
            columns ICD11_1, ICD11_2 and Log2_Ratio.
        vector_path (str): Path to vector file (CSV) with the columns
            ICD11_code and Vector (stringified array).

    Returns:
        float: Pearson correlation coefficient, rounded to 4 decimal places.

    Raises:
        ValueError: If the two embeddings of a pair are empty or differ in
            length, or if fewer than two usable pairs remain.
    """

    # --- Step 1: Load data ---
    comorbidity_df = pd.read_csv(comorbidity_path)
    vector_df = pd.read_csv(vector_path, usecols=["ICD11_code", "Vector"])
    # Rows lacking a code or a vector cannot contribute an embedding. A NaN
    # code would additionally pass the isin() filter below and then fail the
    # dictionary lookup.
    vector_df = vector_df.dropna(subset=["ICD11_code", "Vector"])

    # --- Step 2: Parse vector strings into NumPy arrays ---
    # The pattern accepts "1.", ".5" and "1.e-05" as single numbers. NumPy
    # prints arrays in the "1.e-05" form, and a pattern that demands digits
    # after the decimal point would read that token as two numbers (1 and -5).
    number_pattern = re.compile(r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

    def safe_parse_vector(v):
        """Extract every numeric literal found in `v` into a 1-D float array."""
        if isinstance(v, bytes):
            v = v.decode('utf-8')
        return np.array([float(n) for n in number_pattern.findall(str(v))])

    # --- Step 3: Build vector lookup ---
    vector_map = dict(zip(vector_df['ICD11_code'], vector_df['Vector'].apply(safe_parse_vector)))

    # --- Step 4: Filter valid comorbidity pairs ---
    known_codes = list(vector_map)
    valid_df = comorbidity_df[
        comorbidity_df['ICD11_1'].isin(known_codes) &
        comorbidity_df['ICD11_2'].isin(known_codes)
    ].copy()

    # --- Step 5: Compute cosine similarity ---
    def cosine(u, v):
        """Cosine similarity of two equally long, non-empty 1-D vectors."""
        if u.size == 0 or u.shape != v.shape:
            raise ValueError(
                f"Cannot compare embeddings of shapes {u.shape} and {v.shape}"
            )
        # A zero norm is replaced by 1 so that an all-zero vector yields a
        # similarity of 0 instead of a division by zero (the same value
        # scikit-learn's cosine_similarity returns for such vectors).
        denominator = (np.linalg.norm(u) or 1.0) * (np.linalg.norm(v) or 1.0)
        return float(np.dot(u, v) / denominator)

    similarities = np.array(
        [
            cosine(vector_map[first], vector_map[second])
            for first, second in zip(valid_df['ICD11_1'], valid_df['ICD11_2'])
        ],
        dtype=float,
    )
    valid_df['Vector_Similarity'] = similarities

    # --- Step 6: Compute correlation ---
    scores = valid_df['Log2_Ratio'].astype(float).to_numpy()
    # A single NaN or infinite value would turn the whole coefficient into
    # NaN, so such pairs are left out.
    usable = np.isfinite(scores) & np.isfinite(similarities)
    scores, similarities = scores[usable], similarities[usable]
    if len(scores) < 2:
        raise ValueError(
            "At least two comorbidity pairs with embeddings and finite scores "
            f"are required to compute a correlation; found {len(scores)}"
        )

    pearson_corr, _ = pearsonr(scores, similarities)
    pearson_corr_rounded = round(pearson_corr, 4)

    return pearson_corr_rounded

# Results

In [16]:
# Score each embedding model against the comorbidity benchmark. The vectors of
# a model are read from "<model>_ICD11_embeddings.csv".
results = dict()
model_names = (
    "tfidf",
    "fasttext",
    "bert",
    "biobert",
    "bioclinicalbert",
    "pubmedbert",
    "gatortron",
)
for model in model_names:
    path = model + "_ICD11_embeddings.csv"
    score = compute_vector_correlation("comorbidity_with_icd11.csv", path)
    results[model] = score

# One row per model, in the order the models were scored.
results_df = pd.DataFrame(
    {
        'Model': list(results.keys()),
        'Comorbidity benchmark score': list(results.values()),
    }
)
results_df

Unnamed: 0,Model,Comorbidity benchmark score
0,tfidf,0.2801
1,fasttext,0.0966
2,bert,0.1332
3,biobert,0.126
4,bioclinicalbert,0.0972
5,pubmedbert,0.1706
6,gatortron,0.1886
