In [11]:
import pandas as pd
import numpy as np
import ast
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

# === Step 1: Load ICD11 code → title map once ===
# The lookup table is read a single time; the resulting plain dict is what the
# rest of the script uses to turn a code into its human-readable title.
lookup_df = pd.read_csv("icd11-25_data_vectorization.csv")
title_map = {
    code: title
    for code, title in zip(lookup_df["code"], lookup_df["title"])
}

# === Step 2: Generate list of single symptoms ===
def generate_symptom_list(embedding_files, random_state=None):
    """Return the sorted ICD11 codes starting with 'M' shared by every file.

    Args:
        embedding_files: Iterable of CSV paths. Each file must have an
            ``ICD11_code`` column; rows where it is missing are ignored.
        random_state: Optional seed. When given, NumPy's global RNG is seeded
            (kept for backward compatibility). The returned list is fully
            deterministic and does not depend on this value.

    Returns:
        Sorted list of the codes beginning with ``'M'`` that occur in all
        of the given files.

    Raises:
        ValueError: If ``embedding_files`` is empty, or if the files have no
            M-code in common.
    """
    if random_state is not None:
        np.random.seed(random_state)

    # Materialise once so generators work and emptiness can be checked;
    # set.intersection(*[]) would otherwise fail with an opaque TypeError.
    embedding_files = list(embedding_files)
    if not embedding_files:
        raise ValueError("No embedding files were provided.")

    m_sets = []
    for path in embedding_files:
        codes = pd.read_csv(path).dropna(subset=["ICD11_code"])["ICD11_code"]
        m_sets.append(set(codes[codes.str.startswith('M')]))

    common = sorted(set.intersection(*m_sets))
    if not common:
        raise ValueError("No common M-codes found across embedding files.")
    return common

# === Step 3: Single-model best match for each symptom ===
def best_match_for_symptoms(emb_file, symptoms, title_map):
    """Find, for each symptom code, the most similar non-M code in one model.

    Args:
        emb_file: Path of a CSV with an ``ICD11_code`` column and a vector
            column named ``Vector`` (preferred) or ``vector``.
        symptoms: Iterable of codes to look up; each must be one of the
            file's codes starting with ``'M'``.
        title_map: Mapping from code to title, used to label the best match.

    Returns:
        A list with one ``(symptom, code, title, score)`` tuple per symptom,
        in input order. ``code`` is the non-M code with the highest cosine
        similarity to the symptom's vector, ``score`` is that similarity and
        ``title`` is ``'(title not found)'`` when the code is not in
        ``title_map``.

    Raises:
        KeyError: If any symptom is not among the file's M-codes.
        ValueError: If a vector cell cannot be parsed, or the file has no
            non-M codes to compare against.
    """
    df = pd.read_csv(emb_file).dropna(subset=["ICD11_code"])
    vec_col = 'Vector' if 'Vector' in df.columns else 'vector'

    def parse_vector(cell):
        # Accept a Python-literal list, a loosely formatted bracketed string,
        # or an already materialised list/array.
        if isinstance(cell, str):
            try:
                return np.array(ast.literal_eval(cell), dtype=float)
            except (ValueError, SyntaxError, TypeError):
                # Not a valid literal (e.g. space-separated numbers): fall
                # back to splitting on commas/whitespace. Only parsing errors
                # are caught so interrupts and memory errors still propagate.
                parts = cell.strip().strip('[]').replace(',', ' ').split()
                return np.array([float(x) for x in parts], dtype=float)
        elif isinstance(cell, (list, np.ndarray)):
            return np.array(cell, dtype=float)
        else:
            raise ValueError(f"Unknown vector format: {type(cell)}")

    df['vector'] = df[vec_col].apply(parse_vector)

    is_m_code = df['ICD11_code'].str.startswith('M')
    m_df = df[is_m_code]
    non_m_df = df[~is_m_code]
    # With duplicated codes the last row wins, as dict() keeps the last value.
    m2v = dict(zip(m_df['ICD11_code'], m_df['vector']))
    non2v = dict(zip(non_m_df['ICD11_code'], non_m_df['vector']))

    non_codes = list(non2v.keys())
    if not non_codes:
        raise ValueError(f"No non-M codes found in {emb_file!r} to match against.")

    # Materialise so the input can be validated up front and then iterated.
    symptoms = list(symptoms)
    missing = [sym for sym in symptoms if sym not in m2v]
    if missing:
        raise KeyError(f"Symptom codes missing from {emb_file!r}: {missing}")

    disease_stack = np.vstack([non2v[c] for c in non_codes])

    results = []
    for sym in symptoms:
        sims = cosine_similarity([m2v[sym]], disease_stack)[0]
        idx = np.argmax(sims)
        code = non_codes[idx]
        score = sims[idx]
        title = title_map.get(code, '(title not found)')
        results.append((sym, code, title, score))
    return results

# === Step 4: Execute benchmarking and build raw dataframes ===
# One embeddings CSV per model; the model name is the part of the file name
# that precedes the first underscore.
embedding_files = [
    "tfidf_ICD11_embeddings.csv",
    "fasttext_ICD11_embeddings.csv",
    "bert_ICD11_embeddings.csv",
    "biobert_ICD11_embeddings.csv",
    "bioclinicalbert_ICD11_embeddings.csv",
    "pubmedbert_ICD11_embeddings.csv",
    "gatortron_ICD11_embeddings.csv",
]
model_names = [path.partition('_')[0] for path in embedding_files]

# Symptoms are the M-codes present in every model's embedding file.
symptoms = generate_symptom_list(embedding_files, random_state=42)

# For each model: a list of (symptom, code, title, score) tuples, one per
# symptom and in the same order as `symptoms`.
per_model = {
    model: best_match_for_symptoms(emb_path, symptoms, title_map)
    for model, emb_path in zip(model_names, embedding_files)
}

# Best-matching code per symptom (rows) and model (columns).
codes_df = pd.DataFrame(
    {
        model: [match[1] for match in matches]
        for model, matches in per_model.items()
    },
    index=symptoms,
)

# Cosine similarity of that best match, same layout as codes_df.
scores_df = pd.DataFrame(
    {
        model: [match[3] for match in matches]
        for model, matches in per_model.items()
    },
    index=symptoms,
)

# === Step 5: Build and save combined benchmarking table ===
# First column: each symptom code followed by its title in parentheses.
combined_table = pd.DataFrame({
    'Symptom': ["{} ({})".format(sym, title_map.get(sym)) for sym in symptoms]
})

# One column per model: "<code> (<title>) (<score>)". codes_df and scores_df
# are both indexed by `symptoms`, so walking their columns in order lines up
# with the rows of combined_table.
for name in model_names:
    combined_table[f"{name} (CS)"] = [
        f"{code} ({title_map.get(code)}) ({score:.2f})"
        for code, score in zip(codes_df[name], scores_df[name])
    ]

# Consensus count: how many models picked the most frequently chosen code
# for a given symptom.
consensus_counts = codes_df.apply(
    lambda row: row.value_counts().max(),
    axis=1,
)
combined_table['Consensus Count'] = consensus_counts.values

combined_table.to_csv("benchmarking_results.csv", index=False)
print("Saved benchmarking_results.csv with full table.")

# === Step 6: Summary statistics ===
# 1. Mean and standard deviation of the best-match similarity, per model.
print("Mean and Standard Deviation of Cosine Similarity per Model:")
for name in model_names:
    model_scores = scores_df[name]
    mean, std = model_scores.mean(), model_scores.std()
    print(f"- {name}: mean={mean:.4f}, std={std:.4f}")
print()

# 2. How many symptoms reached each consensus level.
print("Distribution of Consensus Count:")
consensus_distribution = consensus_counts.value_counts().sort_index()
print(consensus_distribution.to_string())
print()

# 3. Pairwise agreement: percentage of symptoms for which two models picked
#    exactly the same code, rounded to two decimals.
agree_mat = pd.DataFrame(
    [
        [
            round((codes_df[row_model] == codes_df[col_model]).mean() * 100, 2)
            for col_model in model_names
        ]
        for row_model in model_names
    ],
    index=model_names,
    columns=model_names,
    dtype=float,
)

print("Agreement Matrix (% exact matches):")
# Styler methods return the Styler, so configuring it step by step gives the
# same object as one long chain.
cell_properties = {
    'min-width': '150px',
    'max-width': '150px',
    'text-align': 'center',
}
header_styles = [{'selector': 'th', 'props': [('text-align', 'center')]}]

styled = agree_mat.style.format("{:.2f}")
styled = styled.background_gradient(axis=None, cmap='Greens', low=0.2, high=0.8)
styled = styled.highlight_max(axis=1, color='lightgreen')
styled = styled.set_properties(**cell_properties)
styled = styled.set_table_styles(header_styles, overwrite=False)
display(styled)


Saved benchmarking_results.csv with full table.
Mean and Standard Deviation of Cosine Similarity per Model:
- tfidf: mean=0.6158, std=0.1340
- fasttext: mean=0.9754, std=0.0101
- bert: mean=0.9600, std=0.0137
- biobert: mean=0.9791, std=0.0076
- bioclinicalbert: mean=0.9768, std=0.0068
- pubmedbert: mean=0.9952, std=0.0017
- gatortron: mean=0.9648, std=0.0138

Distribution of Consensus Count:
1     87
2    272
3    239
4    118
5     99
6     69
7     55

Agreement Matrix (% exact matches):


Unnamed: 0,tfidf,fasttext,bert,biobert,bioclinicalbert,pubmedbert,gatortron
tfidf,100.0,21.73,17.25,17.89,16.08,20.55,22.68
fasttext,21.73,100.0,25.45,27.48,21.09,29.29,26.09
bert,17.25,25.45,100.0,34.61,30.67,31.52,30.67
biobert,17.89,27.48,34.61,100.0,31.31,35.46,33.76
bioclinicalbert,16.08,21.09,30.67,31.31,100.0,26.3,26.94
pubmedbert,20.55,29.29,31.52,35.46,26.3,100.0,38.02
gatortron,22.68,26.09,30.67,33.76,26.94,38.02,100.0
