In [18]:
import json
from pathlib import Path

import pandas as pd
import seaborn

In [19]:
# Input locations for the evaluation; all are relative to the working directory.
benchmark_dataset_path = Path("../evaluation/data/benchmark_dataset.json")
foldfusion_results_path = Path(
    "../evaluation/data/foldfusion_output/Evaluation/evaluation.json"
)
alphafill_folder_path = Path("../evaluation/data/benchmark_dataset_meta/alphafill")

# Sanity check: True only when every input above is present on disk.
all_exist = all(
    map(
        Path.exists,
        (benchmark_dataset_path, foldfusion_results_path, alphafill_folder_path),
    )
)
print("All paths exist:", all_exist)

All paths exist: True


In [None]:
# Load the benchmark definition. The encoding is pinned to UTF-8 so that decoding
# does not depend on the platform's locale default.
with open(benchmark_dataset_path, encoding="utf-8") as f:
    bench_data = json.load(f)

# Keep raw for reference
benchmark_dataset_raw = bench_data
params = bench_data.get("params", {})
ligand_bins = bench_data.get("ligand_bins", [])

# Flatten to one row per benchmark entry, tagging each entry with the name and
# the component IDs of the ligand bin it was listed under.
rows = [
    {
        **entry,
        "ligand_bin": bin_obj.get("ligand_bin"),
        "ligand_comp_ids": bin_obj.get("ligand_comp_ids", []),
    }
    for bin_obj in ligand_bins
    for entry in bin_obj.get("entries", [])
]

ligand_dataset = pd.DataFrame(rows)

# With no rows the frame has no columns, so `ligand_dataset.ligand_bin` would
# raise AttributeError; report zero bins instead.
n_ligand_bins = (
    ligand_dataset["ligand_bin"].nunique() if "ligand_bin" in ligand_dataset else 0
)
print(f"Loaded {len(ligand_dataset)} entries from {n_ligand_bins} bins.")
print("Columns:", list(ligand_dataset.columns))
ligand_dataset.head()


Loaded 713 entries from 8 bins.
Columns: ['uniprot_id', 'protein', 'gene', 'validation_pdb', 'matched_ligands', 'uniprot_json', 'alphafill_json', 'ligand_bin', 'ligand_comp_ids']


Unnamed: 0,uniprot_id,protein,gene,validation_pdb,matched_ligands,uniprot_json,alphafill_json,ligand_bin,ligand_comp_ids
0,P0AEX9,Maltose/maltodextrin-binding periplasmic protein,malE,2R6G,[ATP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,ADP/ATP,"[ATP, ADP, ANP, AGS]"
1,Q46AN5,Type-2 serine--tRNA ligase,serS2,2CJA,[ATP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,ADP/ATP,"[ATP, ADP, ANP, AGS]"
2,P0A3F4,Nitrogen regulatory protein P-II,glnB,2XUL,[ATP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,ADP/ATP,"[ATP, ADP, ANP, AGS]"
3,P12277,Creatine kinase B-type,CKB,3B6R,[ADP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,ADP/ATP,"[ATP, ADP, ANP, AGS]"
4,O14727,Apoptotic protease-activating factor 1,APAF1,1Z6T,[ADP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,ADP/ATP,"[ATP, ADP, ANP, AGS]"


In [None]:
# Uses `bench_data`, the parsed benchmark JSON assigned in an earlier cell.
protein_bins = bench_data.get("protein_bins", [])

# Flatten to one row per benchmark entry, tagging each entry with the protein
# class of the bin it was listed under.
rows = [
    {**entry, "protein_class": bin_obj.get("protein_class")}
    for bin_obj in protein_bins
    for entry in bin_obj.get("entries", [])
]

protein_dataset = pd.DataFrame(rows)

# With no rows the frame has no columns, so `protein_dataset.protein_class` would
# raise AttributeError; report zero bins instead.
n_protein_classes = (
    protein_dataset["protein_class"].nunique()
    if "protein_class" in protein_dataset
    else 0
)
print(f"Loaded {len(protein_dataset)} entries from {n_protein_classes} bins.")
print("Columns:", list(protein_dataset.columns))
protein_dataset.head()

Loaded 443 entries from 8 bins.
Columns: ['uniprot_id', 'protein', 'gene', 'validation_pdb', 'matched_ligands', 'uniprot_json', 'alphafill_json', 'protein_class']


Unnamed: 0,uniprot_id,protein,gene,validation_pdb,matched_ligands,uniprot_json,alphafill_json,protein_class
0,P68139,"Actin, alpha skeletal muscle",ACTA1,7W52,[ATP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,Hydrolase
1,Q56313,Holliday junction branch migration complex sub...,ruvB,1J7K,[ATP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,Hydrolase
2,P07900,Heat shock protein HSP 90-alpha,HSP90AA1,3T2S,[AGS],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,Hydrolase
3,P49902,Cytosolic purine 5'-nucleotidase,NT5C2,6DDZ,[ATP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,Hydrolase
4,Q27974,Auxilin,DNAJC6,2QWN,[ADP],/home/marius/Code/foldfusion/evaluation/data/b...,/home/marius/Code/foldfusion/evaluation/data/b...,Hydrolase
