## 1. LLM generated diagnostics

In [1]:
import pandas as pd

stored_data_filename = 'Advanced Research Topics - Task 2.xlsx'

df = pd.read_excel(stored_data_filename, sheet_name='Generated prompts')
df.head(10)

Unnamed: 0,Prompt,Clinical diagnosis,Final Diagnosis,GPT-4o clinical,GPT-4o final,Gemini clinical,Gemini final,DeepSeek clinical,Deepseek final,Grok clinical,Grok final,o4-mini clinical,o4-mini final
0,"Given the clinical case details, identify the ...",Metastatic melanoma,Pulmonary histoplasmosis,Recurrent metastatic melanoma,Disseminated histoplasmosis,Suspected metastatic melanoma,Paracoccidioidomycosis,Immune checkpoint inhibitor pneumonitis,Immune-related sarcoid-like reaction,Recurrent metastatic melanoma with pulmonary i...,"Metastatic melanoma with BRAF V600E mutation, ...",Pulmonary metastatic melanoma,Pembrolizumab‑induced sarcoid‑like granulomato...
1,"Given the clinical case details, identify the ...",Not available,Human immunodeficiency virus type 2 infection ...,Cerebellar metastasis,Cerebral toxoplasmosis,Intracranial Mass Lesions and AIDS,Can't be determined,Cerebellar mass lesion (suspected neoplasm),CNS lymphoma (AIDS-associated),"Cerebellar tumor, possible metastatic disease.","Cerebellar tumor, possible metastatic disease.",Metastatic cerebellar neoplasm (brain metastas...,HIV‑associated diffuse large B‑cell lymphoma
2,"Given the clinical case details, identify the ...",Not available,Borrelia miyamotoi infection,Autoimmune encephalitis or CNS vasculitis,Chronic Toxoplasma gondii meningoencephalitis,Meningoencephalitis,Primary CNS Lymphoma (PCNSL),Meningoencephalitis (autoimmune/infectious),Neurosarcoidosis,"Encephalitis, possible autoimmune or infectious.",Primary central nervous system lymphoma.,Aseptic lymphocytic meningoencephalitis,Primary CNS lymphoma (vitreoretinal large B‑ce...
3,"Given the clinical case details, identify the ...",Not available,Vitamin D deficiency,"Hypocalcemia with seizure, possibly nutritiona...",Nutritional rickets due to vitamin D deficiency,Hypocalcemic Seizure,Can't be determined,Hypocalcemic seizure secondary to suspected vi...,Vitamin D-dependent rickets (type likely confi...,Hypocalcemic seizure,Vitamin D deficiency rickets,Hypocalcemic seizure (suspected rickets),Nutritional vitamin D–deficiency rickets
4,"Given the clinical case details, identify the ...",Not available,Systemic primary amyloidosis,"Transient monocular vision loss, possible gian...",IgG4-related disease with orbital and hepatobi...,Suspected Giant Cell Arteritis with orbital mass,Systemic Sarcoidosis,Giant Cell Arteritis,IgG4-Related Disease,Suspected giant cell arteritis,Metastatic carcinoma,Giant cell (temporal) arteritis,IgG4‑related orbital inflammatory disease
5,"Given the clinical case details, identify the ...",Cutaneous bacterial infection,Erysipelothrix rhusiopathiae infection,Necrotizing fasciitis,Group A Streptococcus necrotizing fasciitis,Cellulitis with lymphangitis,Aeromonas hydrophila infection,Cellulitis with bullous formation and lymphang...,Necrotizing cellulitis due to Streptococcus py...,Cellulitis,Streptococcal cellulitis,Necrotizing fasciitis,Group A Streptococcus necrotizing fasciitis
6,"Given the clinical case details, identify the ...",Not available,Inflammatory bowel disease (Crohn’s disease),Reactive arthritis,Crohn’s disease,Systemic inflammatory response with arthritis ...,Crohn's disease.,Lyme disease,Lyme arthritis with gastrointestinal involvement,Inflammatory Bowel Disease,Crohn’s Disease,Suspected Lyme disease,Crohn’s disease
7,"Given the clinical case details, identify the ...",Not available,Aspiration pneumonia,Community‑acquired pneumonia,Invasive pulmonary aspergillosis,Infectious Bronchiolitis and Pneumonia,Invasive Pulmonary Aspergillosis,Opportunistic fungal pneumonia (suspected),Invasive pulmonary aspergillosis,Pneumonia,Invasive Pulmonary Aspergillosis,Recurrent pneumonia in an immunocompromised host,Subacute invasive pulmonary aspergillosis
8,"Given the clinical case details, identify the ...",Not available,Granulomatosis with polyangiitis,Community‑acquired pneumonia,Anti‑glomerular basement membrane disease,Pulmonary-renal syndrome,Can't be determined,Pulmonary-renal syndrome (suspected),Anti-glomerular basement membrane (anti-GBM) d...,Pneumonia with suspected glomerulonephritis.,Pulmonary-renal syndrome (suspected Goodpastur...,Community‑acquired pneumonia,Anti‑GBM (Goodpasture) syndrome
9,"Given the clinical case details, identify the ...",Sarcoidosis or tuberculosis,Sarcoidosis,Uveitis,Sarcoidosis,Tuberculous uveitis,Can't be determined,Uveitis (suspected tuberculous etiology),Tuberculous Panuveitis,Uveitis,Sarcoidosis,Bilateral granulomatous anterior uveitis,Sarcoidosis (ocular sarcoidosis)


## 2. BLEU, ROUGE, Semantic similarity evaluation

In [8]:
!pip install -q torchmetrics transformers pandas nltk rouge-score

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [20]:
import pandas as pd
import warnings
from torchmetrics.text import BERTScore
from transformers import AutoTokenizer, AutoModel
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

warnings.filterwarnings("ignore")

def calculate_metrics(df, column_dict, bert_base_model_name='bert-base-uncased', clinical_bert_base_model_name = "emilyalsentzer/Bio_ClinicalBERT"):
    # Initialize BERTScore
    model = AutoModel.from_pretrained(bert_base_model_name)
    clinical_model = AutoModel.from_pretrained(clinical_bert_base_model_name)
    bertscore = BERTScore(model_name_or_path=bert_base_model_name, num_layers=9)
    clinicalbertscore = BERTScore(model_name_or_path=clinical_bert_base_model_name, num_layers=9)

    # Initialize ROUGE scorer
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

    # Prepare results dictionary
    results = {metric: {} for metric in ['BLEU', 'ROUGE-L', 'BERTScore', 'ClinicalBERTScore']}

    for cand_col, ref_col in column_dict.items():
        references = df[ref_col].tolist()
        candidates = df[cand_col].tolist()

        # Calculate BLEU scores
        bleu_scores = [sentence_bleu([ref.split()], cand.split()) for ref, cand in zip(references, candidates)]
        results['BLEU'][cand_col] = round(sum(bleu_scores) / len(bleu_scores), 3)

        # Calculate ROUGE-L scores
        rouge_scores = [rouge.score(ref, cand)['rougeL'].fmeasure for ref, cand in zip(references, candidates)]
        results['ROUGE-L'][cand_col] = round(sum(rouge_scores) / len(rouge_scores), 3)

        # Calculate BERTScore
        bert_scores = bertscore(references, candidates)
        results['BERTScore'][cand_col] = round(bert_scores['f1'].mean().item(), 3)

        # Calculate ClinicalBERTScore
        clinical_bert_scores = clinicalbertscore(references, candidates)
        results['ClinicalBERTScore'][cand_col] = round(clinical_bert_scores['f1'].mean().item(), 3)

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    return results_df

column_dict = {
    'GPT-4o final': 'Final Diagnosis',
    'Gemini final': 'Final Diagnosis',
    'Deepseek final': 'Final Diagnosis',
    'Grok final': 'Final Diagnosis',
    'o4-mini final': 'Final Diagnosis',
}

results_df = calculate_metrics(df, column_dict)
results_df

Unnamed: 0,BLEU,ROUGE-L,BERTScore,ClinicalBERTScore
GPT-4o final,0.0,0.313,0.454,0.645
Gemini final,0.0,0.14,0.391,0.619
Deepseek final,0.0,0.033,0.446,0.647
Grok final,0.0,0.271,0.45,0.636
o4-mini final,0.0,0.192,0.41,0.636


## 3. LLM based evaluation
This was performed using three models: Qwen 2.5 Max, Llama 4 Maverick, Claude 3.7 Sonnet

In [3]:
llm_df = pd.read_excel(stored_data_filename, sheet_name='Accuracy Judgement')
llm_df.tail(6)

Unnamed: 0,Column 1,GPT-4o sub prompt,Gemini-Pro-2.5 sub prompt,Deekseek-R1 sub prompt,Grok3 sub prompt,o4-mini sub prompt,Unnamed: 6
12,Qwen 2.5 Max judgement,b b c b c c a c c a (35.0),c b c c c c a c c c (15.0),c b c b c c c c c c (10.0),c c c a c c a c c a (30.0),c b c a c c a c c a (35.0),
13,Claude judgement,b c c b c c a c c a (30.0),c c c c c c a c c c (10.0),c c c b c c c c c c (5.0),c c c b c c a c c a (25.0),c c c b c c a c c a (25.0),
14,Llama judgement,b c c b c c a c c a (30.0),c b c c c c a c c c (15.0),c b c b c c c c b c (15.0),c a b b c c a c b a (45.0),c c c b c c a c c a (25.0),
15,Majority voting by the above 3 models,b c c b c c a c c a (30.0),c b c c c c a c c c (15.0),c b c b c c c c c c (10.0),c c c b c c a c c a (25.0),c c c b c c a c c a (25.0),
16,Majority voting by similar architectures (GPT4...,b b c a c c a c c a (40.0),c b c b c c a c b c (25.0),c b c c c c c c c c (5.0),c c b b c c a b b b (35.0),c c c b c c a c c a (25.0),Microsoft copilot is said to use Prometheus mo...
17,Mean score of the two,35,20,7.5,30,25,


## 4. Chain of thoughts generated

In [2]:
import os

thoughts_dir = 'Chain of Thoughts'

# Number of models
[directory for directory in os.listdir(thoughts_dir) if not directory.startswith('.')]

['GPT-4o', 'o4-mini', 'Gemini 2.5 Pro', 'Deepseek R1', 'Grok 3']

In [3]:
# Cases present
!ls -R Chain\ of\ Thoughts

'Chain of Thoughts':
'Deepseek R1'  'Gemini 2.5 Pro'   GPT-4o  'Grok 3'   o4-mini

'Chain of Thoughts/Deepseek R1':
case10.txt  case2.txt  case4.txt  case6.txt  case8.txt
case1.txt   case3.txt  case5.txt  case7.txt  case9.txt

'Chain of Thoughts/Gemini 2.5 Pro':
case10.txt  case2.txt  case4.txt  case6.txt  case8.txt
case1.txt   case3.txt  case5.txt  case7.txt  case9.txt

'Chain of Thoughts/GPT-4o':
case10.txt  case2.txt  case4.txt  case6.txt  case8.txt
case1.txt   case3.txt  case5.txt  case7.txt  case9.txt

'Chain of Thoughts/Grok 3':
case10.txt  case2.txt  case4.txt  case6.txt  case8.txt
case1.txt   case3.txt  case5.txt  case7.txt  case9.txt

'Chain of Thoughts/o4-mini':
case10.txt  case2.txt  case4.txt  case6.txt  case8.txt
case1.txt   case3.txt  case5.txt  case7.txt  case9.txt
