# Imports and functions 

In [1]:
import os
from pathlib import Path

import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Root directory that holds one results sub-folder per encoder.
# The default is the shared project location. Set the PHENOSEEKER_TMP_DIR
# environment variable to run the notebook from another checkout
# (e.g. /home/maxime/synrepos/phenoseeker/tmp/) without editing this cell.
base_path = Path(
    os.environ.get("PHENOSEEKER_TMP_DIR", "/projects/synsight/repos/phenoseeker/tmp/")
)

In [3]:
# Result tables of each encoder, relative to `base_path`.
# Every encoder provides two CSV files: `maps_jcp2022.csv` (bound to the
# `path_poscon_*` name) and `maps_plate.csv` (bound to the `path_plates_*` name).
path_poscon_openphenom, path_plates_openphenom = (
    Path("./openphenom/388_plates_3_combi_3/results/maps_jcp2022.csv"),
    Path("./openphenom/388_plates_3_combi_3/results/maps_plate.csv"),
)
path_poscon_dinov2_g, path_plates_dinov2_g = (
    Path("./dinov2_g/388_plates_3_combi_3/results/maps_jcp2022.csv"),
    Path("./dinov2_g/388_plates_3_combi_3/results/maps_plate.csv"),
)
path_poscon_dinov2_s, path_plates_dinov2_s = (
    Path("./dinov2_s/388_plates_3_combi_3/results/maps_jcp2022.csv"),
    Path("./dinov2_s/388_plates_3_combi_3/results/maps_plate.csv"),
)
path_poscon_chada, path_plates_chada = (
    Path("./chada/388_plates_3_combi_3/results/maps_jcp2022.csv"),
    Path("./chada/388_plates_3_combi_3/results/maps_plate.csv"),
)
# The two ResNet50 variants come from a different run (48 plates).
path_poscon_resnet50_mean, path_plates_resnet50_mean = (
    Path("./resnet50_mean/48_plates_2_combi/results/maps_jcp2022.csv"),
    Path("./resnet50_mean/48_plates_2_combi/results/maps_plate.csv"),
)
path_poscon_resnet50_median, path_plates_resnet50_median = (
    Path("./resnet50_median/48_plates_2_combi/results/maps_jcp2022.csv"),
    Path("./resnet50_median/48_plates_2_combi/results/maps_plate.csv"),
)

In [4]:
# Value stored under "Embeddings Random" for each entry of `paths`.
# Every "poscon_*" key shares one number and every "plates_*" key shares another.
# NOTE(review): 0.125 is presumably 1/8, i.e. chance level with eight positive
# controls; the origin of 0.002361 is not visible here - confirm both.
_RANDOM_POSCON = 0.125
_RANDOM_PLATES = 0.002361

embeddings_random_values = {
    "poscon_dinov2_g": _RANDOM_POSCON,
    "plates_dinov2_g": _RANDOM_PLATES,
    "poscon_dinov2_s": _RANDOM_POSCON,
    "plates_dinov2_s": _RANDOM_PLATES,
    "poscon_resnet50_mean": _RANDOM_POSCON,
    "plates_resnet50_mean": _RANDOM_PLATES,
    "poscon_resnet50_median": _RANDOM_POSCON,
    "plates_resnet50_median": _RANDOM_PLATES,
    "poscon_chada": _RANDOM_POSCON,
    "plates_chada": _RANDOM_PLATES,
    "poscon_openphenom": _RANDOM_POSCON,
    "plates_openphenom": _RANDOM_PLATES,
}

In [5]:
# Lookup table "<task>_<encoder>" -> CSV path, where <task> is "poscon" or
# "plates". The keys are the same ones used by `embeddings_random_values`.
paths = dict(
    poscon_openphenom=path_poscon_openphenom,
    plates_openphenom=path_plates_openphenom,
    poscon_dinov2_g=path_poscon_dinov2_g,
    plates_dinov2_g=path_plates_dinov2_g,
    poscon_dinov2_s=path_poscon_dinov2_s,
    plates_dinov2_s=path_plates_dinov2_s,
    poscon_resnet50_mean=path_poscon_resnet50_mean,
    plates_resnet50_mean=path_plates_resnet50_mean,
    poscon_resnet50_median=path_poscon_resnet50_median,
    plates_resnet50_median=path_plates_resnet50_median,
    poscon_chada=path_poscon_chada,
    plates_chada=path_plates_chada,
)

In [6]:
# Compound identifiers that get one figure each in the "Single molecule" section.
# Every entry must be unique: a repeated identifier produces the same figure
# twice and leaves one compound without a figure.
# NOTE(review): the seventh entry used to be a second copy of JCP2022_035095.
# JCP2022_025848 is the identifier that completes the set of eight JUMP-CP
# positive controls - confirm against the plate maps.
labels = [
    "JCP2022_085227",
    "JCP2022_064022",
    "JCP2022_050797",
    "JCP2022_046054",
    "JCP2022_037716",
    "JCP2022_035095",
    "JCP2022_025848",
    "JCP2022_012818",
]

In [7]:
def preprocess(path: Path, label: str = "Mean mAP"):
    """Load one mAP table and return the row stored under ``label``.

    The CSV is read from ``base_path / path``. Column headers are tidied by
    removing "mAP", "raw_" and parentheses, capitalising any remaining "raw"
    and trimming surrounding whitespace. The "Number of Queries" column is
    dropped, "Label" becomes the index and all remaining values are cast to
    float.

    Args:
        path: Location of the CSV file, relative to ``base_path``.
        label: Index value (entry of the "Label" column) of the row to return.

    Returns:
        The selected row, indexed by the tidied column names.

    Raises:
        KeyError: If "Number of Queries" or "Label" is not a column, or if
            ``label`` is not present in the "Label" column.
    """
    table = pd.read_csv(base_path / path)

    # Order matters: "raw_" has to be removed before "raw" is capitalised.
    substitutions = (("mAP", ""), ("raw_", ""), ("(", ""), (")", ""), ("raw", "Raw"))

    def tidy(header):
        for old, new in substitutions:
            header = header.replace(old, new)
        return header.strip()

    table.columns = [tidy(header) for header in table.columns]
    scores = table.drop(columns=["Number of Queries"]).set_index("Label").astype(float)
    return scores.loc[label]

In [8]:
def plot_curve_comparison(encoders_dict):
    """
    Scatter plate-retrieval mAP against positive-control-retrieval mAP for several encoders.

    Every entry of every encoder is drawn as a small grey point. On top of those,
    three points per encoder are emphasised with colours shared by all encoders:
    the raw embeddings ("Embeddings_Raw"), the random baseline
    ("Embeddings Random") and the normalisation designated as best. Encoders are
    told apart by marker shape. The legend lists both the shapes and the colours.

    Args:
        encoders_dict: Dictionary where keys are encoder names and values are
            three-item sequences
            [plates_values, poscon_values, best_normalisation_index].
            plates_values is plotted on the y axis and poscon_values on the
            x axis. Both must support label lookup (e.g. pandas Series) and
            contain "Embeddings_Raw", "Embeddings Random" and
            best_normalisation_index.

    Raises:
        KeyError: If one of the three highlighted labels is missing from an
            encoder's values.
    """
    fig, ax = plt.subplots(figsize=(14, 8))

    # Marker shapes are reused cyclically when there are more encoders than shapes.
    marker_styles = ["o", "s", "D", "^", "v", "P", "X"]  # Extend as needed

    # One colour per highlighted role, identical for every encoder.
    highlight_colors = {
        "No Normalisation": "#D55E00",
        "Random Values": "#F0E442",
        "Best Normalisation": "#009E73",
    }

    def marker_for(position):
        return marker_styles[position % len(marker_styles)]

    # First pass: grey background points of every encoder. Keeping this in its
    # own pass guarantees that no highlighted point ends up hidden underneath
    # the background points of an encoder drawn later.
    for position, (plates_values, poscon_values, _) in enumerate(
        encoders_dict.values()
    ):
        ax.scatter(
            poscon_values,
            plates_values,
            color="grey",
            alpha=1,
            s=20,
            marker=marker_for(position),
        )

    # Second pass: highlighted points plus one legend entry per encoder.
    encoder_handles = []
    for position, (encoder, values) in enumerate(encoders_dict.items()):
        plates_values, poscon_values, best_normalisation_index = values
        marker = marker_for(position)

        highlighted_rows = {
            "No Normalisation": "Embeddings_Raw",
            "Random Values": "Embeddings Random",
            "Best Normalisation": best_normalisation_index,
        }
        for role, row in highlighted_rows.items():
            ax.scatter(
                poscon_values[row],
                plates_values[row],
                color=highlight_colors[role],
                alpha=1.0,
                s=100,
                edgecolor="black",
                linewidth=1,
                marker=marker,
            )

        # Every encoder gets an entry, even when its marker shape is shared.
        encoder_handles.append(
            mlines.Line2D(
                [],
                [],
                color="black",
                marker=marker,
                linestyle="None",
                markersize=10,
                label=f"{encoder}",
            )
        )

    # Legend entries explaining the three highlight colours.
    color_handles = [
        mlines.Line2D(
            [],
            [],
            color=color,
            marker="o",
            markeredgecolor="black",
            linestyle="None",
            markersize=10,
            label=role,
        )
        for role, color in highlight_colors.items()
    ]

    # Add labels
    ax.set_xlabel("mAP - Positive Control Molecules Retrieval", fontsize=14, labelpad=10)
    ax.set_ylabel("mAP - Plates Retrieval", fontsize=14, labelpad=10)

    # Refine grid lines
    ax.grid(color="gray", linestyle="--", linewidth=0.5, alpha=0.6)

    # Customize ticks
    ax.tick_params(axis="both", labelsize=12)

    # The legend sits outside the axes, to the right of the plot.
    ax.legend(
        handles=encoder_handles + color_handles,
        loc="upper left",
        bbox_to_anchor=(1.05, 0.5),
        fontsize=12,
    )

    # Tight layout for better use of space
    fig.tight_layout()
    plt.show()

# Plot

In [9]:
# Load the default ("Mean mAP") row of every table listed in `paths` and append
# the random-baseline value under "Embeddings Random" when one is configured.
processed_data = {}

for key in paths:
    mean_scores = preprocess(paths[key])
    if key in embeddings_random_values:
        mean_scores["Embeddings Random"] = embeddings_random_values[key]
    processed_data[key] = mean_scores

FileNotFoundError: [Errno 2] No such file or directory: '/projects/synsight/repos/phenoseeker/tmp/openphenom/388_plates_3_combi_3/results/maps_jcp2022.csv'

In [None]:
# Plate-retrieval ("Batch_effect") and positive-control ("Poscon") scores of one
# encoder side by side; show the ten rows with the highest "Poscon" score.
encoder = "dinov2_s"

df = pd.concat(
    [processed_data["plates_" + encoder], processed_data["poscon_" + encoder]],
    axis=1,
    keys=["Batch_effect", "Poscon"],
)
df.sort_values("Poscon", ascending=False).head(10)

In [None]:
# Plate-retrieval ("Batch_effect") and positive-control ("Poscon") scores of one
# encoder side by side; show the twenty rows with the highest "Poscon" score.
encoder = "openphenom"

df = pd.concat(
    [processed_data["plates_" + encoder], processed_data["poscon_" + encoder]],
    axis=1,
    keys=["Batch_effect", "Poscon"],
)
df.sort_values("Poscon", ascending=False).head(20)

In [None]:
# Plate-retrieval ("Batch_effect") and positive-control ("Poscon") scores of one
# encoder side by side, ordered by decreasing "Poscon" score (all rows shown).
encoder = "dinov2_g"

df = pd.concat(
    [processed_data["plates_" + encoder], processed_data["poscon_" + encoder]],
    axis=1,
    keys=["Batch_effect", "Poscon"],
)
df.sort_values("Poscon", ascending=False)

In [None]:
# Both scores of the "Embeddings_Raw__ZCA_N_C" row of the table currently bound to `df`.
df.loc["Embeddings_Raw__ZCA_N_C", :]

In [None]:
# Plate-retrieval ("Batch_effect") and positive-control ("Poscon") scores of one
# encoder side by side; show the ten rows with the highest "Poscon" score.
encoder = "chada"

df = pd.concat(
    [processed_data["plates_" + encoder], processed_data["poscon_" + encoder]],
    axis=1,
    keys=["Batch_effect", "Poscon"],
)
df.sort_values("Poscon", ascending=False).head(10)

In [None]:
# Display name -> (suffix of the `processed_data` keys, row highlighted as the
# best normalisation for that encoder).
encoder_specs = {
    "ChAda": ("chada", "Embeddings_Raw__ZCA_N_C__Int"),
    "Dinov2_g": ("dinov2_g", "Embeddings_Raw__ZCA_C__Int"),
    "Dinov2_s": ("dinov2_s", "Embeddings_Raw__ZCA_C__Int"),
    "Open_Phenom": ("openphenom", "Embeddings_Raw__ZCA_N_C__Int"),
    # Currently left out of the figure:
    #   "Resnet50_mean": ("resnet50_mean", "Embeddings_Raw__ZCA-cor_C__rZMs"),
    #   "Resnet50_median": ("resnet50_median", "Embeddings_Raw__rZMs"),
}

# Input of `plot_curve_comparison`: [plates scores, poscon scores, best row] per encoder.
all_encoder = {
    name: [
        processed_data[f"plates_{suffix}"],
        processed_data[f"poscon_{suffix}"],
        best_row,
    ]
    for name, (suffix, best_row) in encoder_specs.items()
}

In [None]:
# Draw the encoder comparison for the mean mAP values.
plot_curve_comparison(encoders_dict=all_encoder)

# Single molecule

In [None]:
# One comparison figure per compound in `labels`, highlighting
# "Embeddings_Raw__ZCA_N_C__Int" as the best normalisation for every encoder.
#
# The loop uses its own names (`label_data`, `label_scores`, `label_encoders`)
# so that the notebook-level `df`, `processed_data` and `all_encoder` built in
# the "Plot" section are still intact for the cells that follow.
best_normalisation = "Embeddings_Raw__ZCA_N_C__Int"

# dict.fromkeys drops repeated labels while keeping their order, so no
# compound is plotted twice.
for label in dict.fromkeys(labels):
    print(label)
    label_data = {}

    for key, path in paths.items():
        # Tables whose path contains "jcp" are read at the row of the current
        # compound; all other tables are read at the default "Mean mAP" row.
        if "jcp" in str(path):
            label_scores = preprocess(path, label=label)
        else:
            label_scores = preprocess(path)
        if key in embeddings_random_values:
            label_scores["Embeddings Random"] = embeddings_random_values[key]
        label_data[key] = label_scores

    label_encoders = {
        "ChAda": [
            label_data["plates_chada"],
            label_data["poscon_chada"],
            best_normalisation,
        ],
        "Dinov2_g": [
            label_data["plates_dinov2_g"],
            label_data["poscon_dinov2_g"],
            best_normalisation,
        ],
        "Dinov2_s": [
            label_data["plates_dinov2_s"],
            label_data["poscon_dinov2_s"],
            best_normalisation,
        ],
        "Open_Phenom": [
            label_data["plates_openphenom"],
            label_data["poscon_openphenom"],
            best_normalisation,
        ],
        # Currently left out of the figure:
        #   'Resnet50_mean': [label_data["plates_resnet50_mean"], label_data["poscon_resnet50_mean"], 'Embeddings_Raw__ZCA-cor_C__rZMs'],
        #   'Resnet50_median': [label_data["plates_resnet50_median"], label_data["poscon_resnet50_median"], 'Embeddings_Raw__rZMs'],
    }

    plot_curve_comparison(label_encoders)

In [None]:
# One comparison figure per compound in `labels`, highlighting
# "Embeddings_Raw__ZCA_C__Int" as the best normalisation for every encoder.
#
# The loop uses its own names (`label_data`, `label_scores`, `label_encoders`)
# so that the notebook-level `df`, `processed_data` and `all_encoder` built in
# the "Plot" section are still intact for the cells that follow.
best_normalisation = "Embeddings_Raw__ZCA_C__Int"

# dict.fromkeys drops repeated labels while keeping their order, so no
# compound is plotted twice.
for label in dict.fromkeys(labels):
    print(label)
    label_data = {}

    for key, path in paths.items():
        # Tables whose path contains "jcp" are read at the row of the current
        # compound; all other tables are read at the default "Mean mAP" row.
        if "jcp" in str(path):
            label_scores = preprocess(path, label=label)
        else:
            label_scores = preprocess(path)
        if key in embeddings_random_values:
            label_scores["Embeddings Random"] = embeddings_random_values[key]
        label_data[key] = label_scores

    label_encoders = {
        "ChAda": [
            label_data["plates_chada"],
            label_data["poscon_chada"],
            best_normalisation,
        ],
        "Dinov2_g": [
            label_data["plates_dinov2_g"],
            label_data["poscon_dinov2_g"],
            best_normalisation,
        ],
        "Dinov2_s": [
            label_data["plates_dinov2_s"],
            label_data["poscon_dinov2_s"],
            best_normalisation,
        ],
        "Open_Phenom": [
            label_data["plates_openphenom"],
            label_data["poscon_openphenom"],
            best_normalisation,
        ],
        # Currently left out of the figure:
        #   'Resnet50_mean': [label_data["plates_resnet50_mean"], label_data["poscon_resnet50_mean"], 'Embeddings_Raw__ZCA-cor_C__rZMs'],
        #   'Resnet50_median': [label_data["plates_resnet50_median"], label_data["poscon_resnet50_median"], 'Embeddings_Raw__rZMs'],
    }

    plot_curve_comparison(label_encoders)

# Compare methods

In [None]:
# Display the table that the comparison cells below operate on.
# NOTE(review): `df` is not rebuilt anywhere in this section, so this shows
# whatever an earlier cell last bound to `df` - confirm which encoder table
# is intended here.
df

In [None]:
# Compare the rows of `df` whose name contains "Res01" with the rows whose name
# contains "Res11".
res01_df = df[df.index.str.contains("Res01")]
res11_df = df[df.index.str.contains("Res11")]

# Re-label both subsets with the tag removed so that rows are paired by label.
# Pairing by position would silently mix up points whenever the two subsets are
# ordered differently in `df`.
res01_keyed = res01_df.rename(index=lambda name: name.replace("Res01", ""))
res11_keyed = res11_df.rename(index=lambda name: name.replace("Res11", ""))
common_indices = res01_keyed.index.intersection(res11_keyed.index)

# Same labels, same order on both sides.
res01_filtered = res01_keyed.loc[common_indices]
res11_filtered = res11_keyed.loc[common_indices]

# One panel per score column, each with an identity line for reference.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]
for ax, (column, legend_label) in zip(axes, panels):
    x_values = res01_filtered[column]
    y_values = res11_filtered[column]

    # The identity line spans the joint range of both axes.
    low = min(x_values.min(), y_values.min())
    high = max(x_values.max(), y_values.max())

    ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
    ax.plot([low, high], [low, high], "k--")
    ax.set_xlabel("Res01")
    ax.set_ylabel("Res11")
    ax.set_title(f"Comparaison {legend_label} Res01 vs Res11")
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Compare the rows of `df` whose name contains "PCA" with the rows whose name
# contains "ZCA".
pca_df = df[df.index.str.contains("PCA")]
zca_df = df[df.index.str.contains("ZCA")]

# Re-label both subsets with the tag removed so that rows are paired by label.
# Pairing by position would silently mix up points whenever the two subsets are
# ordered differently in `df`.
pca_keyed = pca_df.rename(index=lambda name: name.replace("PCA", ""))
zca_keyed = zca_df.rename(index=lambda name: name.replace("ZCA", ""))
common_indices = pca_keyed.index.intersection(zca_keyed.index)

# Same labels, same order on both sides.
pca_filtered = pca_keyed.loc[common_indices]
zca_filtered = zca_keyed.loc[common_indices]

# One panel per score column, each with an identity line for reference.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]
for ax, (column, legend_label) in zip(axes, panels):
    x_values = pca_filtered[column]
    y_values = zca_filtered[column]

    # The identity line spans the joint range of both axes.
    low = min(x_values.min(), y_values.min())
    high = max(x_values.max(), y_values.max())

    ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
    ax.plot([low, high], [low, high], "k--")
    ax.set_xlabel("PCA")
    ax.set_ylabel("ZCA")
    ax.set_title(f"Comparaison {legend_label} PCA vs ZCA")
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Compare the rows of `df` whose name contains "ZCA_" with the rows whose name
# contains "ZCA-cor_". The trailing underscore keeps the two groups disjoint.
zca_df = df[df.index.str.contains("ZCA_")]
zca_cor_df = df[df.index.str.contains("ZCA-cor_")]

# Re-label both subsets with the tag removed so that rows are paired by label.
# Pairing by position would silently mix up points whenever the two subsets are
# ordered differently in `df`.
zca_keyed = zca_df.rename(index=lambda name: name.replace("ZCA_", ""))
zca_cor_keyed = zca_cor_df.rename(index=lambda name: name.replace("ZCA-cor_", ""))
common_indices = zca_keyed.index.intersection(zca_cor_keyed.index)

# Same labels, same order on both sides.
zca_filtered = zca_keyed.loc[common_indices]
zca_cor_filtered = zca_cor_keyed.loc[common_indices]

# One panel per score column, each with an identity line for reference.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]
for ax, (column, legend_label) in zip(axes, panels):
    x_values = zca_filtered[column]
    y_values = zca_cor_filtered[column]

    # The identity line spans the joint range of both axes.
    low = min(x_values.min(), y_values.min())
    high = max(x_values.max(), y_values.max())

    ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
    ax.plot([low, high], [low, high], "k--")
    ax.set_xlabel("ZCA")
    ax.set_ylabel("ZCA-cor")
    ax.set_title(f"Comparaison {legend_label} ZCA vs ZCA-cor")
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Two pairwise comparisons between groups of rows of `df`, one per figure row:
#   top:    names containing "rZM"    vs names containing "rZm"
#   bottom: names containing "rZm/Mi" vs names containing "rZm/Ms"
# NOTE(review): the bottom pair looks for a literal "/" in the row names;
# confirm that such names exist, otherwise the bottom panels stay empty.
comparisons = [("rZM", "rZm"), ("rZm/Mi", "rZm/Ms")]
panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

for row_axes, (x_tag, y_tag) in zip(axes, comparisons):
    # Re-label both subsets with the tag removed so that rows are paired by
    # label. Pairing by position would silently mix up points whenever the two
    # subsets are ordered differently in `df`.
    x_rows = df[df.index.str.contains(x_tag)].rename(
        index=lambda name: name.replace(x_tag, "")
    )
    y_rows = df[df.index.str.contains(y_tag)].rename(
        index=lambda name: name.replace(y_tag, "")
    )
    shared_names = x_rows.index.intersection(y_rows.index)
    x_rows = x_rows.loc[shared_names]
    y_rows = y_rows.loc[shared_names]

    # One panel per score column, each with an identity line for reference.
    for ax, (column, legend_label) in zip(row_axes, panels):
        x_values = x_rows[column]
        y_values = y_rows[column]

        # The identity line spans the joint range of both axes.
        low = min(x_values.min(), y_values.min())
        high = max(x_values.max(), y_values.max())

        ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
        ax.plot([low, high], [low, high], "k--")
        ax.set_xlabel(x_tag)
        ax.set_ylabel(y_tag)
        ax.set_title(f"Comparaison {legend_label} {x_tag} vs {y_tag}")
        ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Work on a re-labelled copy so the shared `df` keeps its original row names.
# The "Embeddings_" prefix has to go because it contains both an "i" and an
# "s", which would make the letter filters below match every row.
short_df = df.copy()
short_df.index = short_df.index.str.replace("^Embeddings_", "", regex=True)

# Keep only the rows whose name contains "rZ".
rz_df = short_df[short_df.index.str.contains("rZ")]

# Split into the names containing an "i" and the names containing an "s".
rzm_i_df = rz_df[rz_df.index.str.contains("i")]
rzm_s_df = rz_df[rz_df.index.str.contains("s")]

# Re-label both subsets with the distinguishing letter removed so that rows are
# paired by label. Pairing by position would silently mix up points whenever
# the two subsets are ordered differently in `df`.
rzm_i_keyed = rzm_i_df.rename(index=lambda name: name.replace("i", ""))
rzm_s_keyed = rzm_s_df.rename(index=lambda name: name.replace("s", ""))
common_indices_rzm = rzm_i_keyed.index.intersection(rzm_s_keyed.index)

# Same labels, same order on both sides.
rzm_i_filtered = rzm_i_keyed.loc[common_indices_rzm]
rzm_s_filtered = rzm_s_keyed.loc[common_indices_rzm]

# One panel per score column, each with an identity line for reference.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]
for ax, (column, legend_label) in zip(axes, panels):
    x_values = rzm_i_filtered[column]
    y_values = rzm_s_filtered[column]

    # The identity line spans the joint range of both axes.
    low = min(x_values.min(), y_values.min())
    high = max(x_values.max(), y_values.max())

    ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
    ax.plot([low, high], [low, high], "k--")
    ax.set_xlabel("rZm avec i")
    ax.set_ylabel("rZm avec s")
    ax.set_title(f"Comparaison {legend_label} rZm/i vs rZm/s")
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Among the rows whose name contains "rZ", compare each row tagged "_C" with
# the row that has the same name without the tag.
tag = "_C"

# Work on a re-labelled copy so the shared `df` keeps its original row names.
short_df = df.copy()
short_df.index = short_df.index.str.replace("^Embeddings_", "", regex=True)

rz_df = short_df[short_df.index.str.contains("rZ")]

# Rows carrying the tag.
with_tag_df = rz_df[rz_df.index.str.contains(tag)]

# Name each tagged row would have without the tag. Reindexing by those names
# yields the untagged partners in the same order as `with_tag_df`; a tagged
# row without a partner gets a row of NaN and is therefore not drawn.
names_without_tag = with_tag_df.index.str.replace(tag, "", regex=False)
without_tag_df = rz_df.reindex(names_without_tag)

# One panel per score column, each with an identity line for reference.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]
for ax, (column, legend_label) in zip(axes, panels):
    x_values = with_tag_df[column]
    y_values = without_tag_df[column]

    # The identity line spans the joint range of both axes, so it also covers
    # points whose y value lies outside the x range.
    low = min(x_values.min(), y_values.min())
    high = max(x_values.max(), y_values.max())

    ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
    ax.plot([low, high], [low, high], "k--")
    ax.set_xlabel(f"Avec {tag}")
    ax.set_ylabel(f"Sans {tag}")
    ax.set_title(f"Comparaison {legend_label} avec/sans {tag}")
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Among the rows whose name contains "ZCA", compare each row tagged "_C" with
# the row that has the same name without the tag.
tag = "_C"

# Work on a re-labelled copy so the shared `df` keeps its original row names.
short_df = df.copy()
short_df.index = short_df.index.str.replace("^Embeddings_", "", regex=True)

zca_rows_df = short_df[short_df.index.str.contains("ZCA")]

# Rows carrying the tag.
with_tag_df = zca_rows_df[zca_rows_df.index.str.contains(tag)]

# Name each tagged row would have without the tag. Reindexing by those names
# yields the untagged partners in the same order as `with_tag_df`; a tagged
# row without a partner gets a row of NaN and is therefore not drawn.
names_without_tag = with_tag_df.index.str.replace(tag, "", regex=False)
without_tag_df = zca_rows_df.reindex(names_without_tag)

# One panel per score column, each with an identity line for reference.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]
for ax, (column, legend_label) in zip(axes, panels):
    x_values = with_tag_df[column]
    y_values = without_tag_df[column]

    # The identity line spans the joint range of both axes, so it also covers
    # points whose y value lies outside the x range.
    low = min(x_values.min(), y_values.min())
    high = max(x_values.max(), y_values.max())

    ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
    ax.plot([low, high], [low, high], "k--")
    ax.set_xlabel(f"Avec {tag}")
    ax.set_ylabel(f"Sans {tag}")
    ax.set_title(f"Comparaison {legend_label} avec/sans {tag}")
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Among the rows whose name contains "ZCA", compare each row tagged "_N" with
# the row that has the same name without the tag.
tag = "_N"

# Work on a re-labelled copy so the shared `df` keeps its original row names.
short_df = df.copy()
short_df.index = short_df.index.str.replace("^Embeddings_", "", regex=True)

zca_rows_df = short_df[short_df.index.str.contains("ZCA")]

# Rows carrying the tag.
with_tag_df = zca_rows_df[zca_rows_df.index.str.contains(tag)]

# Name each tagged row would have without the tag. Reindexing by those names
# yields the untagged partners in the same order as `with_tag_df`; a tagged
# row without a partner gets a row of NaN and is therefore not drawn.
names_without_tag = with_tag_df.index.str.replace(tag, "", regex=False)
without_tag_df = zca_rows_df.reindex(names_without_tag)

# One panel per score column, each with an identity line for reference.
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

panels = [("Batch_effect", "Batch Effect"), ("Poscon", "Poscon")]
for ax, (column, legend_label) in zip(axes, panels):
    x_values = with_tag_df[column]
    y_values = without_tag_df[column]

    # The identity line spans the joint range of both axes, so it also covers
    # points whose y value lies outside the x range.
    low = min(x_values.min(), y_values.min())
    high = max(x_values.max(), y_values.max())

    ax.scatter(x_values, y_values, label=legend_label, alpha=0.7)
    ax.plot([low, high], [low, high], "k--")
    ax.set_xlabel(f"Avec {tag}")
    ax.set_ylabel(f"Sans {tag}")
    ax.set_title(f"Comparaison {legend_label} avec/sans {tag}")
    ax.legend()

fig.tight_layout()
plt.show()