# Imports 

In [None]:
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from phenoseeker import BioproxyEvaluator

In [None]:
# Folders handed to BioproxyEvaluator as its screens argument, keyed by the
# source name that is later passed as `source=` to compute_enrichment_factors.
# NOTE(review): these are absolute paths, so the notebook only runs on a
# machine where /projects/synsight is available.
screens_folders = dict(
    Curie=Path("/projects/synsight/repos/phenospace/bioproxy/screens_data_curie"),
    ChEMBL=Path("/projects/synsight/repos/phenoseeker/data/ChEMBL/assays_csv"),
)

In [None]:
# First embedding set: both files live in <base_path>/<model>/.
base_path = Path("/projects/synsight/data/jump_embeddings/compounds_embeddings/")
model = "openphenom"
npy_file = Path(base_path, model, "Embeddings_norm.npy")
parquet_metadata = Path(base_path, model, "metadata.parquet")

In [None]:
# Build the evaluator from the first embedding set and the screen folders.
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest
# of the notebook; the later cells all refer to the evaluator by this name.
eval = BioproxyEvaluator(
    parquet_metadata,
    npy_file,
    screens_folders,
    embeddings_name=f"Embeddings_{model}",
    embeddings_entity="compound",
)

In [None]:
# Show the evaluator's repr as the cell output.
eval

In [None]:
# Register the "dinov2_g" embedding set on the evaluator under the name
# "Embeddings_dinov2_g". `model`, `npy_file` and `parquet_metadata` are rebound
# to this model's values.
# NOTE(review): load() is called with `embedding_name` (singular) whereas the
# constructor call uses `embeddings_name` — confirm against BioproxyEvaluator.
model = "dinov2_g"
npy_file = Path(base_path, model, "Embeddings_norm.npy")
parquet_metadata = Path(base_path, model, "metadata.parquet")
eval.load(
    embedding_name=f"Embeddings_{model}",
    embeddings_file=npy_file,
    metadata_file=parquet_metadata,
)

In [None]:
# Register the "resnet50" embedding set on the evaluator under the name
# "Embeddings_resnet50". `model`, `npy_file` and `parquet_metadata` are rebound
# to this model's values.
model = "resnet50"
npy_file = Path(base_path, model, "Embeddings_norm.npy")
parquet_metadata = Path(base_path, model, "metadata.parquet")
eval.load(
    embedding_name=f"Embeddings_{model}",
    embeddings_file=npy_file,
    metadata_file=parquet_metadata,
)

In [None]:
# Register the "chada" embedding set on the evaluator under the name
# "Embeddings_chada". `model`, `npy_file` and `parquet_metadata` are rebound
# to this model's values.
model = "chada"
npy_file = Path(base_path, model, "Embeddings_norm.npy")
parquet_metadata = Path(base_path, model, "metadata.parquet")
eval.load(
    embedding_name=f"Embeddings_{model}",
    embeddings_file=npy_file,
    metadata_file=parquet_metadata,
)

In [None]:
# Show the keys of the evaluator's embeddings container
# (presumably the "Embeddings_<model>" names registered above — confirm).
eval.global_embedding_manager.embeddings.keys()

In [None]:
# Enrichment factors on the "Curie" source, one call per embedding set,
# all with thresholds 1, 3, 5 and 10.
# NOTE(review): the ChEMBL cell that follows assigns the same four names, so
# the plots use the results of whichever of the two cells was run last.
results_of = eval.compute_enrichment_factors(
    source="Curie",
    embeddings_name="Embeddings_openphenom",
    thresholds=[1, 3, 5, 10],
)
results_dino = eval.compute_enrichment_factors(
    source="Curie",
    embeddings_name="Embeddings_dinov2_g",
    thresholds=[1, 3, 5, 10],
)
results_resnet50 = eval.compute_enrichment_factors(
    source="Curie",
    embeddings_name="Embeddings_resnet50",
    thresholds=[1, 3, 5, 10],
)
results_chada = eval.compute_enrichment_factors(
    source="Curie",
    embeddings_name="Embeddings_chada",
    thresholds=[1, 3, 5, 10],
)

In [None]:
# Enrichment factors on the "ChEMBL" source, one call per embedding set,
# all with thresholds 1, 3, 5 and 10.
# NOTE(review): these four names are the same ones the Curie cell assigns, so
# running this cell replaces the Curie results held in the kernel.
results_of = eval.compute_enrichment_factors(
    source="ChEMBL",
    embeddings_name="Embeddings_openphenom",
    thresholds=[1, 3, 5, 10],
)
results_dino = eval.compute_enrichment_factors(
    source="ChEMBL",
    embeddings_name="Embeddings_dinov2_g",
    thresholds=[1, 3, 5, 10],
)
results_resnet50 = eval.compute_enrichment_factors(
    source="ChEMBL",
    embeddings_name="Embeddings_resnet50",
    thresholds=[1, 3, 5, 10],
)
results_chada = eval.compute_enrichment_factors(
    source="ChEMBL",
    embeddings_name="Embeddings_chada",
    thresholds=[1, 3, 5, 10],
)


# Plot one model

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Hit-rate bars for the DINOv2 results, one figure per threshold.
# Only rows whose mean number of hits exceeds 3 are kept.
df = results_dino[results_dino[("N Hits", "mean")] > 3]
# Distinct threshold values, in ascending order.
unique_thresholds = sorted(df[("Threshold",)].unique())

width = 0.25  # bar width
# (column key, legend label) of the three bars drawn per screen, left to right.
bar_specs = [
    (("Hit Rate Random", "mean"), 'Hit Rate Random (%)'),
    (("Hit Rate Selected", "mean"), 'Hit Rate Selected (mean) (%)'),
    (("Hit Rate Selected", "max"), 'Hit Rate Selected (max) (%)'),
]

for thr in unique_thresholds:
    # Rows of the current threshold, ordered by ascending mean random hit rate.
    df_thr = df[df[("Threshold",)] == thr]
    df_thr_sorted = df_thr.sort_values(by=("Hit Rate Random", "mean"))

    # Screen names in plotting order and their integer x positions.
    screens = df_thr_sorted[("Screen",)].tolist()
    x = np.arange(len(screens))

    fig, ax = plt.subplots(figsize=(10, 6))

    # The three bars sit at x - width, x and x + width; values are multiplied
    # by 100 so the axis reads in percent.
    for slot, (column, label) in enumerate(bar_specs):
        heights = df_thr_sorted[column].values * 100
        ax.bar(x + (slot - 1) * width, heights, width, label=label)

    # Labels, ticks, legend and grid.
    ax.set_xlabel("Screen")
    ax.set_ylabel("Hit Rate (%)")
    ax.set_title(f"Hit Rates pour Threshold = {thr}")
    ax.set_xticks(x)
    ax.set_xticklabels(screens, rotation=45)
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Enrichment-factor bars, one figure per threshold.
# `df` is whatever the kernel currently holds under that name; on a
# top-to-bottom run it is the filtered frame built by the hit-rate cell above.

# Distinct threshold values, in ascending order.
unique_thresholds = sorted(df[("Threshold",)].unique())

width = 0.35  # bar width
half = width / 2

for thr in unique_thresholds:
    # Rows of the current threshold. They are ordered by ascending mean random
    # hit rate (not by EF), which is the same ordering the hit-rate plots use.
    df_thr = df[df[("Threshold",)] == thr]
    df_thr_sorted = df_thr.sort_values(by=("Hit Rate Random", "mean"))

    # Screen names in plotting order and their integer x positions.
    screens = df_thr_sorted[("Screen",)].tolist()
    x = np.arange(len(screens))

    fig, ax = plt.subplots(figsize=(10, 6))

    # Two bars per screen: EF mean on the left, EF max on the right.
    for shift, stat in ((-half, "mean"), (half, "max")):
        ax.bar(x + shift, df_thr_sorted[("EF", stat)].values, width,
               label=f'EF ({stat})')

    # Dashed red reference line at EF = 1.
    ax.axhline(y=1, color='red', linestyle='--', label='EF = 1')

    # Labels, ticks, legend and grid.
    ax.set_xlabel("Screen")
    ax.set_ylabel("Enrichment Factor")
    ax.set_title(f"Enrichment Factor pour Threshold = {thr}")
    ax.set_xticks(x)
    ax.set_xticklabels(screens, rotation=45)
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    plt.show()


# Plot all models

In [None]:
# Result frames compared in the multi-model plots, keyed by display name.
# The DINOv2 results (results_dino) are currently left out of this mapping.
models = dict(
    OpenPhenom=results_of,
    Resnet50=results_resnet50,
    ChAda=results_chada,
)

# Reference frame for the random hit-rate values: the ChAda results,
# restricted to rows whose mean number of hits exceeds 3.
ref_df = models["ChAda"]
enough_hits = ref_df[("N Hits", "mean")] > 3
ref_df = ref_df[enough_hits]


In [None]:
# Count, per threshold and per statistic, how many screens each model "wins"
# on ("Hit Rate Selected", <statistic>). Tied models all get the point.
# NOTE(review): `ref_df` only supplies the threshold values here; the screens
# themselves are not restricted to rows kept by the N Hits > 3 filter —
# confirm that this is intended.
unique_thr = sorted(ref_df[("Threshold",)].unique())
measures = ["mean", "median", "max"]

for thr in unique_thr:
    # Rows of each model at this threshold, indexed by screen.
    # The loop variable is named `model_df` rather than `df` so that this cell
    # does not overwrite the `df` used by the single-model plot cells.
    model_data = {}
    for name, model_df in models.items():
        df_thr = model_df[model_df[("Threshold",)] == thr]
        model_data[name] = df_thr.set_index(("Screen",))

    # Only screens present for every model can be compared.
    screen_sets = [set(frame.index) for frame in model_data.values()]
    common_screens = sorted(set.intersection(*screen_sets)) if screen_sets else []

    # Count best screens per measure.
    # In case of tie, every tied model gets the point.
    best_counts = {m: {name: 0 for name in models} for m in measures}
    for screen in common_screens:
        # Fetch each model's row once per screen instead of once per lookup.
        rows = {name: frame.loc[screen] for name, frame in model_data.items()}
        for m in measures:
            values = {name: row[("Hit Rate Selected", m)]
                      for name, row in rows.items()}
            best_val = max(values.values())
            for name, val in values.items():
                if val == best_val:
                    best_counts[m][name] += 1

    # Grouped bar plot: one group per measure, one bar per model.
    x = np.arange(len(measures))
    bar_width = 0.2
    fig, ax = plt.subplots(figsize=(8, 6))
    for i, name in enumerate(models):
        counts = [best_counts[m][name] for m in measures]
        ax.bar(x + i * bar_width, counts, bar_width, label=name)
    # Put each tick under the middle of its group of bars.
    ax.set_xticks(x + (len(models) - 1) * bar_width / 2)
    ax.set_xticklabels(measures)
    ax.set_ylabel("Number of Screens")
    ax.set_title(f"Best Model Count at Threshold {thr}")
    ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:

# Count, per threshold and per statistic, how many screens each model wins
# outright on ("Hit Rate Selected", <statistic>). On a tie nobody scores.

# Reference frame: prefer "DINOv2", but `models` only contains that key when
# its entry is enabled, and models["DINOv2"] raised KeyError otherwise.
# Fall back to "ChAda", then to the first model in the mapping.
ref_name = next(
    (name for name in ("DINOv2", "ChAda") if name in models),
    next(iter(models)),
)
ref_df = models[ref_name]
ref_df = ref_df[ref_df[("N Hits", "mean")] > 3]
unique_thr = sorted(ref_df[("Threshold",)].unique())
measures = ["mean", "median", "max"]

for thr in unique_thr:
    # Rows of each model at this threshold, indexed by screen.
    # The loop variable is named `model_df` rather than `df` so that this cell
    # does not overwrite the `df` used by the single-model plot cells.
    model_data = {}
    for name, model_df in models.items():
        df_thr = model_df[model_df[("Threshold",)] == thr]
        model_data[name] = df_thr.set_index(("Screen",))

    # Only screens present for every model can be compared.
    screen_sets = [set(frame.index) for frame in model_data.values()]
    common_screens = sorted(set.intersection(*screen_sets)) if screen_sets else []

    # Count best screens per measure.
    # In case of equality, no model gets the point.
    best_counts = {m: {name: 0 for name in models} for m in measures}
    for screen in common_screens:
        # Fetch each model's row once per screen instead of once per lookup.
        rows = {name: frame.loc[screen] for name, frame in model_data.items()}
        for m in measures:
            values = {name: row[("Hit Rate Selected", m)]
                      for name, row in rows.items()}
            best_val = max(values.values())
            winners = [name for name, val in values.items() if val == best_val]
            if len(winners) == 1:
                best_counts[m][winners[0]] += 1

    # Grouped bar plot: one group per measure, one bar per model.
    x = np.arange(len(measures))
    bar_width = 0.2
    fig, ax = plt.subplots(figsize=(8, 6))
    for i, name in enumerate(models):
        counts = [best_counts[m][name] for m in measures]
        ax.bar(x + i * bar_width, counts, bar_width, label=name)
    # Put each tick under the middle of its group of bars.
    ax.set_xticks(x + (len(models) - 1) * bar_width / 2)
    ax.set_xticklabels(measures)
    ax.set_ylabel("Number of Screens")
    ax.set_title(f"Best Model Count at Threshold {thr}")
    ax.legend()
    plt.tight_layout()
    plt.show()


In [None]:


# Per threshold and per statistic, count for each model the screens it wins
# outright ("unique") and the screens where it shares the best value ("tie"),
# then draw both as stacked bars.
# `unique_thr`, `measures` and `models` come from the cells above.
for thr in unique_thr:
    # Rows of each model at this threshold, indexed by screen.
    # The loop variable is named `model_df` rather than `df` so that this cell
    # does not overwrite the `df` used by the single-model plot cells.
    model_data = {}
    for name, model_df in models.items():
        df_thr = model_df[model_df[("Threshold",)] == thr]
        model_data[name] = df_thr.set_index(("Screen",))

    # Only screens present for every model can be compared.
    screen_sets = [set(frame.index) for frame in model_data.values()]
    common_screens = sorted(set.intersection(*screen_sets)) if screen_sets else []

    # Count unique wins and tie wins per measure.
    best_counts = {m: {name: {"unique": 0, "tie": 0}
                       for name in models} for m in measures}
    for screen in common_screens:
        # Fetch each model's row once per screen instead of once per lookup.
        rows = {name: frame.loc[screen] for name, frame in model_data.items()}
        for m in measures:
            values = {name: row[("Hit Rate Selected", m)]
                      for name, row in rows.items()}
            best_val = max(values.values())
            winners = [name for name, val in values.items() if val == best_val]
            # A single winner scores "unique"; several winners each score "tie".
            kind = "unique" if len(winners) == 1 else "tie"
            for name in winners:
                best_counts[m][name][kind] += 1

    # Stacked bar plot: tie counts are drawn on top of the unique counts.
    x = np.arange(len(measures))
    n_mod = len(models)
    bar_width = 0.2
    fig, ax = plt.subplots(figsize=(10, 6))
    for i, name in enumerate(models):
        unique_vals = [best_counts[m][name]["unique"] for m in measures]
        tie_vals = [best_counts[m][name]["tie"] for m in measures]
        pos = x + i * bar_width
        ax.bar(pos, unique_vals, bar_width, label=f"{name} unique")
        ax.bar(pos, tie_vals, bar_width, bottom=unique_vals,
               label=f"{name} tie", hatch='//', alpha=0.7)
    # Put each tick under the middle of its group of bars.
    ax.set_xticks(x + (n_mod - 1) * bar_width / 2)
    ax.set_xticklabels(measures)
    ax.set_ylabel("Number of Screens")
    ax.set_title(f"Best Model Counts at Threshold {thr}")
    ax.legend()
    plt.tight_layout()
    plt.show()


In [None]:

# Hit rates of every model next to the random baseline, one figure per
# threshold. Screens and their order come from the reference frame `ref_df`.

# Get sorted unique thresholds from the reference DataFrame.
unique_thr = sorted(ref_df[("Threshold",)].unique())

# Each screen gets one random-baseline bar plus a (mean, max) pair per model.
# The bars of a group share 80% of the unit distance between two ticks, so
# neighbouring groups cannot overlap whatever the number of models. (With
# fixed 0.25 / 0.125 widths the last model's bars ran into the next screen's
# baseline bar as soon as three models were plotted.)
n_mod = len(models)
n_bars = 1 + 2 * n_mod
bar_width = 0.8 / n_bars

for thr in unique_thr:
    fig, ax = plt.subplots(figsize=(10, 6))
    # Reference rows for the current threshold, ordered by ascending mean
    # random hit rate (sorted into a new frame rather than in place).
    df_ref_thr = ref_df[ref_df[("Threshold",)] == thr].sort_values(
        by=("Hit Rate Random", "mean")
    )

    screens = df_ref_thr[("Screen",)].tolist()
    x = np.arange(len(screens))
    # Centre of the left-most bar of each group; bar k sits k * bar_width to
    # its right, which keeps the whole group centred on its tick.
    left = x - bar_width * (n_bars - 1) / 2

    # Plot the random hit rate once, as the first bar of each group.
    hit_rand = df_ref_thr[("Hit Rate Random", "mean")].values * 100
    ax.bar(left, hit_rand, bar_width, label="Hit Rate Random (%)")

    for i, (name, df_mod) in enumerate(models.items()):
        df_mod_thr = df_mod[df_mod[("Threshold",)] == thr]
        # Align on the reference screen order. reindex() yields NaN (no bar)
        # for a screen this model lacks, where .loc[screens] raised KeyError.
        df_mod_thr = df_mod_thr.set_index(("Screen",)).reindex(screens)
        hit_sel_mean = df_mod_thr[("Hit Rate Selected", "mean")].values * 100
        hit_sel_max = df_mod_thr[("Hit Rate Selected", "max")].values * 100

        # Slots 1 + 2i and 2 + 2i of the group belong to model i.
        pos_mean = left + (1 + 2 * i) * bar_width
        pos_max = left + (2 + 2 * i) * bar_width

        ax.bar(pos_mean, hit_sel_mean, bar_width,
               label=f"{name} Selected (mean) (%)")
        ax.bar(pos_max, hit_sel_max, bar_width,
               label=f"{name} Selected (max) (%)")

    ax.set_xlabel("Screen")
    ax.set_ylabel("Hit Rate (%)")
    ax.set_title(f"Hit Rates for Threshold = {thr}")
    ax.set_xticks(x)
    ax.set_xticklabels(screens, rotation=45)
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    plt.show()
