# Imports

In [None]:
from phenoseeker import BioproxyEvaluator
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm import tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator

# Get data

In [None]:
# Folder containing the embeddings array and its metadata table.
# NOTE(review): absolute, user-specific path; adjust when running elsewhere.
base_path = Path("/home/maxime/data/jump_embeddings/dinov2_g/compounds/")
npy_file = base_path.joinpath("Embeddings_norm.npy")
parquet_metadata = base_path.joinpath("metadata.parquet")

In [None]:
# (source name, folder) pairs handed to the evaluator, in display order.
# NOTE: an older "ChemBL" entry pointing at
# /projects/synsight/repos/phenospace/bioproxy/screens_data_chembl is disabled;
# the "ChEMBL" entry below is the one in use.
_SCREEN_FOLDER_TABLE = (
    ("Curie", "/projects/synsight/repos/phenospace/bioproxy/screens_data_curie"),
    ("ChEMBL", "/projects/synsight/repos/phenoseeker/data/ChEMBL/assays_csv"),
    ("Lit-PCBA", "/projects/synsight/repos/phenoseeker/data/Lit_PCBA/csv_files"),
)
screens_folders = {name: Path(folder) for name, folder in _SCREEN_FOLDER_TABLE}

In [None]:
# Build the evaluator from the metadata table, the embeddings array and the
# per-source screen folders.
# NOTE(review): the name `eval` shadows Python's built-in eval(); it is kept
# because the rest of the notebook refers to the evaluator by this name.
eval = BioproxyEvaluator(
    parquet_metadata,
    npy_file,
    screens_folders,
    embeddings_name='Embeddings_dinov2',
    embeddings_entity='compound',
)

In [None]:

# Ask the evaluator for cosine distance matrices on the DINOv2 embedding column.
eval.compute_distance_matrix(
    embeddings_cols=["Embeddings_dinov2"],
    distance='cosine',
)

In [None]:
     
# Replace every stored 'Embeddings_mean' matrix m by 1 - m, for each screen of
# each source.
# NOTE(review): the next cell is identical; running both applies the transform
# twice, which cancels it (1 - (1 - m) == m). Confirm whether one or two passes
# are intended.
# NOTE(review): the key 'Embeddings_mean' differs from the column name passed
# to compute_distance_matrix; presumably set by the evaluator, please verify.
for source in eval.screens_data.keys():
    for screen in eval.screens_data[source].keys():
        screen_matrices = eval.distance_matrices[source][screen]
        screen_matrices['Embeddings_mean'] = 1 - screen_matrices['Embeddings_mean']

In [None]:
# Replace every stored 'Embeddings_mean' matrix m by 1 - m, for each screen of
# each source.
# NOTE(review): the previous cell is identical; running both applies the
# transform twice, which cancels it (1 - (1 - m) == m). Confirm whether this
# second pass is intended.
for source in eval.screens_data.keys():
    for screen in eval.screens_data[source].keys():
        screen_matrices = eval.distance_matrices[source][screen]
        screen_matrices['Embeddings_mean'] = 1 - screen_matrices['Embeddings_mean']

# Ranking

## Functions

In [None]:
def get_tanimono(df):
    """Return the Tanimoto similarity of every row's compound to the first row's.

    Each InChI in ``df["Metadata_InChI"]`` is parsed with RDKit and converted
    to a Morgan fingerprint (radius 2, 2048 bits). The first row is the
    reference: every fingerprint, including the reference itself, is compared
    against it.

    Args:
        df: DataFrame with a ``Metadata_InChI`` column.

    Returns:
        A list of floats, one per row and in row order. Rows whose InChI is
        missing or cannot be parsed get ``nan``. An empty frame yields ``[]``.

    Raises:
        ValueError: If the reference (first) InChI is missing or cannot be
            parsed, since no similarity can be computed without it.
    """
    inchi_list = df["Metadata_InChI"].tolist()
    if not inchi_list:
        return []

    gen = GetMorganGenerator(radius=2, fpSize=2048)
    fps = []
    for inchi in inchi_list:
        # Missing values arrive as non-strings (e.g. NaN), and MolFromInchi
        # returns None for an InChI it cannot parse; neither can be
        # fingerprinted, so both are recorded as None.
        mol = Chem.MolFromInchi(inchi) if isinstance(inchi, str) else None
        fps.append(gen.GetFingerprint(mol) if mol is not None else None)

    reference = fps[0]
    if reference is None:
        raise ValueError(
            f"Reference InChI (first row) could not be parsed: {inchi_list[0]!r}"
        )

    nan = float("nan")
    return [
        nan if fp is None else DataStructs.TanimotoSimilarity(reference, fp)
        for fp in fps
    ]

In [None]:
def calculate_sims(df):
    """Summarise ``tanimoto_to_target`` over three groups of rows.

    The first row of ``df`` is dropped (presumably the query compound itself;
    verify against the caller). With ``k = int(0.05 * n) + 1`` for the ``n``
    remaining rows, three means of ``tanimoto_to_target`` are computed.

    Args:
        df: DataFrame with ``Distance`` and ``tanimoto_to_target`` columns.

    Returns:
        A 3-tuple of means, in this order:
        the ``k`` rows with the smallest ``Distance``,
        the ``k`` rows with the largest ``tanimoto_to_target``,
        all remaining rows.
    """
    similarity_col = 'tanimoto_to_target'
    others = df.iloc[1:]
    k = int(len(others) * 0.05) + 1

    closest_by_distance = others.nsmallest(k, 'Distance')
    highest_similarity = others.nlargest(k, similarity_col)

    return (
        closest_by_distance[similarity_col].mean(),
        highest_similarity[similarity_col].mean(),
        others[similarity_col].mean(),
    )

In [None]:

def get_sims(source, screen, eval):
    """Compute the Tanimoto summary for the best query compound of one screen.

    The query is the compound with the highest ``EF`` value returned by
    ``eval.calculate_enrichment_factor(source, screen, 'Embeddings_mean', [5])``.
    Its ranking from ``eval.compute_ranking`` is joined with the screen's
    InChI strings, each row gets its Tanimoto similarity to the first row, and
    the result is summarised by ``calculate_sims``.

    Args:
        source: Key into ``eval.screens_data``.
        screen: Key into ``eval.screens_data[source]``.
        eval: Evaluator object providing ``screens_data``,
            ``calculate_enrichment_factor`` and ``compute_ranking``.

    Returns:
        The 3-tuple returned by ``calculate_sims``.
    """
    compound_col = 'Metadata_JCP2022'

    # One InChI per compound for this screen.
    screen_table = eval.screens_data[source][screen]
    inchi_by_compound = screen_table[[compound_col, 'Metadata_InChI']].drop_duplicates()

    # Query compound = top row once sorted by decreasing enrichment factor.
    enrichment = eval.calculate_enrichment_factor(source, screen, 'Embeddings_mean', [5])
    by_decreasing_ef = enrichment.sort_values(by='EF', ascending=False)
    best_jcp = by_decreasing_ef.iloc[0][compound_col]

    ranking = pd.DataFrame(
        eval.compute_ranking(source, screen, 'Embeddings_mean', best_jcp, plot=False)
    )
    ranked = ranking.merge(inchi_by_compound, on=compound_col).drop_duplicates()
    ranked['tanimoto_to_target'] = get_tanimono(ranked)
    return calculate_sims(ranked)


## Get all data

In [None]:
# Source/screen pair used for the single-screen check in the next cell.
source, screen = 'Curie', 'E15'

In [None]:
# Single-screen check: displays the tuple returned by get_sims, i.e. the mean
# Tanimoto similarity over (smallest-Distance rows, highest-Tanimoto rows,
# all rows) for the selected source and screen.
get_sims(source, screen, eval)

In [None]:
# Per-screen summaries for the Curie source. The three lists are index-aligned:
# entry i of each list comes from the same screen.
top_5_means_Curie, high_5_means_Curie, all_means_Curie = [], [], []

# Relies on `eval` and `get_sims` defined in earlier cells.
source = "Curie"
for screen in tqdm(eval.screens_data[source].keys()):
    top_5, high_5, all_mean = get_sims(source, screen, eval)
    top_5_means_Curie.append(top_5)
    high_5_means_Curie.append(high_5)
    all_means_Curie.append(all_mean)

In [None]:
# Per-screen summaries for the ChEMBL source. The three lists are
# index-aligned: entry i of each list comes from the same screen.
top_5_means_ChemBL = []
high_5_means_ChemBL = []
all_means_ChemBL = []

# The source key must match the one configured in `screens_folders`, which is
# spelled "ChEMBL"; the older "ChemBL" spelling only survives there as a
# commented-out entry, so looking it up would presumably raise KeyError.
# Relies on `eval` and `get_sims` defined in earlier cells.
source = "ChEMBL"
for screen in tqdm(eval.screens_data[source].keys()):
    top_5, high_5, all_mean = get_sims(source, screen, eval)
    top_5_means_ChemBL.append(top_5)
    high_5_means_ChemBL.append(high_5)
    all_means_ChemBL.append(all_mean)

# Plots

In [None]:
import numpy as np
from scipy.stats import shapiro, ttest_rel, wilcoxon

# Paired, one-sided comparison of the per-screen Curie summaries.
# The three input lists must be index-aligned (one value per screen).
all_means, top_5_means, high_5_means = (
    np.array(values)
    for values in (all_means_Curie, top_5_means_Curie, high_5_means_Curie)
)

# Paired tests need exactly one value per screen in every array.
assert len(all_means) == len(top_5_means) == len(high_5_means), "Arrays must be of the same length"

alpha = 0.05  # Significance level for both the normality checks and the tests


def _one_sided_paired_test(sample, reference, differences_look_normal):
    """Test whether `sample` is greater than `reference` on paired data.

    Uses a paired t-test when the paired differences passed the normality
    check, otherwise the Wilcoxon signed-rank test. Returns the test's display
    name together with the SciPy result.
    """
    if differences_look_normal:
        return "Paired t-test", ttest_rel(sample, reference, alternative='greater')
    return (
        "Wilcoxon signed-rank test",
        wilcoxon(sample, reference, alternative='greater', zero_method='wilcox'),
    )


# Differences between paired samples
diff_high_all = high_5_means - all_means
diff_high_top = high_5_means - top_5_means

# Step 1: Shapiro-Wilk normality check on each set of differences
stat_high_all, p_high_all = shapiro(diff_high_all)
print("Normality Test for High 5% vs. All Means Differences:")
print(f"Shapiro-Wilk Test Statistic: {stat_high_all:.10f}, p-value: {p_high_all:.10f}")

stat_high_top, p_high_top = shapiro(diff_high_top)
print("\nNormality Test for High 5% vs. Top 5% Means Differences:")
print(f"Shapiro-Wilk Test Statistic: {stat_high_top:.10f}, p-value: {p_high_top:.10f}")

# Step 2: run the matching test for each comparison and report the outcome.
# After the loop, t_statistic and p_value hold the second comparison's values.
for label, reference, p_normality in (
    ("All Means", all_means, p_high_all),
    ("Top 5% Means", top_5_means, p_high_top),
):
    test_name, (t_statistic, p_value) = _one_sided_paired_test(
        high_5_means, reference, p_normality > alpha
    )
    print(f"\n{test_name} for High 5% vs. {label}:")
    print(f"Test Statistic: {t_statistic:.10f}, p-value: {p_value:.10f}")

    if p_value < alpha:
        verdict = f"Statistically significant difference (High 5% > {label})"
    else:
        verdict = "No statistically significant difference"
    print(f"Result: {verdict}")



In [None]:

import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
def assign_stars(p_value):
    """Map a p-value to a significance marker.

    Thresholds are strict upper bounds, checked from the smallest up:
    below 1e-5 gives five stars, then 1e-4, 1e-3, 1e-2 and 0.05 give four,
    three, two and one star respectively. Anything else, including a p-value
    of exactly 0.05 or NaN, gives ``"ns"``.

    Args:
        p_value: The p-value to classify.

    Returns:
        A string of one to five asterisks, or ``"ns"``.
    """
    markers_by_upper_bound = (
        (0.00001, "*****"),
        (0.0001, "****"),
        (0.001, "***"),
        (0.01, "**"),
        (0.05, "*"),
    )
    return next(
        (stars for upper_bound, stars in markers_by_upper_bound if p_value < upper_bound),
        "ns",
    )

# Settings shared by the box plots in the following cells.
alpha = 0.05  # Significance level
# Labels and colours are listed in the same order as the plotted series.
labels = [
    'Across all Compounds',
    '5% Most Similar Phenotypes',
    '5% Most Similar Structure',
]
colors = [
    'skyblue',
    'lightgreen',
    'salmon',
]

In [None]:
# Box plot of the per-screen mean Tanimoto similarities for the Curie screens,
# annotated with one-sided paired Wilcoxon significance markers.
all_means = all_means_Curie
top_5_means = top_5_means_Curie
high_5_means = high_5_means_Curie

data = [all_means, top_5_means, high_5_means]  # same order as `labels` / `colors`


def _draw_significance_bracket(x_left, x_right, y_low, y_high, text, text_gap):
    """Draw a bracket between two box positions with `text` centred above it."""
    plt.plot(
        [x_left, x_left, x_right, x_right],
        [y_low, y_high, y_high, y_low],
        lw=1.5,
        color='black',
    )
    plt.text((x_left + x_right) * 0.5, y_high + text_gap, text, ha='center', fontsize=12)


# High 5% vs. All Means (computed here but not drawn on the figure)
stat_high_vs_all, p_high_vs_all = wilcoxon(high_5_means, all_means, alternative='greater')
significance_high_vs_all = assign_stars(p_high_vs_all)

# High 5% vs. Top 5% Means
stat_high_vs_top, p_high_vs_top = wilcoxon(high_5_means, top_5_means, alternative='greater')
significance_high_vs_top = assign_stars(p_high_vs_top)

# Top 5% vs. All Means
stat_top_vs_all, p_top_vs_all = wilcoxon(top_5_means, all_means, alternative='greater')
significance_top_vs_all = assign_stars(p_top_vs_all)

plt.figure(figsize=(4, 10))
box = plt.boxplot(data, labels=labels, patch_artist=True, showmeans=True)

# Colour each box
for box_patch, face_color in zip(box['boxes'], colors):
    box_patch.set_facecolor(face_color)
    box_patch.set_alpha(0.7)

# Overlay the individual per-screen values on top of each box
for position, series, point_color in zip(range(1, len(data) + 1), data, colors):
    plt.scatter([position] * len(series), series, color=point_color, alpha=0.7, edgecolor='k')

# Vertical placement of the significance brackets
h = 0.01  # Bracket height
star_offset = 0.002  # Gap between a bracket and its marker text
y_max_all_top5 = max(max(all_means), max(top_5_means)) + 0.01  # Level for All vs. Top 5%
y_max_top5_high5 = max(max(top_5_means), max(high_5_means))  # Level for Top 5% vs. High 5%

# All vs. Top 5%
_draw_significance_bracket(
    1, 2, y_max_all_top5, y_max_all_top5 + h, significance_top_vs_all, star_offset
)
# NOTE(review): fixed y-limits; brackets or points outside (0.03, 0.47) would be clipped.
plt.ylim(0.03, 0.47)
# Top 5% vs. High 5%, drawn one bracket height above the data maximum
_draw_significance_bracket(
    2, 3, y_max_top5_high5 + h, y_max_top5_high5 + 2 * h, significance_high_vs_top, star_offset
)

# Axis label and grid
plt.ylabel('Tanimoto Similarity')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
# Box plot of the per-screen mean Tanimoto similarities for the ChEMBL screens,
# annotated with one-sided paired Wilcoxon significance markers.
all_means = all_means_ChemBL
top_5_means = top_5_means_ChemBL
high_5_means = high_5_means_ChemBL

data = [all_means, top_5_means, high_5_means]  # same order as `labels` / `colors`


def _draw_significance_bracket(x_left, x_right, y_low, y_high, text, text_gap):
    """Draw a bracket between two box positions with `text` centred above it."""
    plt.plot(
        [x_left, x_left, x_right, x_right],
        [y_low, y_high, y_high, y_low],
        lw=1.5,
        color='black',
    )
    plt.text((x_left + x_right) * 0.5, y_high + text_gap, text, ha='center', fontsize=12)


# High 5% vs. All Means (computed here but not drawn on the figure)
stat_high_vs_all, p_high_vs_all = wilcoxon(high_5_means, all_means, alternative='greater')
significance_high_vs_all = assign_stars(p_high_vs_all)

# High 5% vs. Top 5% Means
stat_high_vs_top, p_high_vs_top = wilcoxon(high_5_means, top_5_means, alternative='greater')
significance_high_vs_top = assign_stars(p_high_vs_top)

# Top 5% vs. All Means
stat_top_vs_all, p_top_vs_all = wilcoxon(top_5_means, all_means, alternative='greater')
significance_top_vs_all = assign_stars(p_top_vs_all)

plt.figure(figsize=(4, 10))
box = plt.boxplot(data, labels=labels, patch_artist=True, showmeans=True)

# Colour each box
for box_patch, face_color in zip(box['boxes'], colors):
    box_patch.set_facecolor(face_color)
    box_patch.set_alpha(0.7)

# Overlay the individual per-screen values on top of each box
for position, series, point_color in zip(range(1, len(data) + 1), data, colors):
    plt.scatter([position] * len(series), series, color=point_color, alpha=0.7, edgecolor='k')

# Vertical placement of the significance brackets
h = 0.01  # Bracket height
star_offset = 0.002  # Gap between a bracket and its marker text
y_max_all_top5 = max(max(all_means), max(top_5_means)) + 0.01  # Level for All vs. Top 5%
y_max_top5_high5 = max(max(top_5_means), max(high_5_means))  # Level for Top 5% vs. High 5%

# All vs. Top 5%
_draw_significance_bracket(
    1, 2, y_max_all_top5, y_max_all_top5 + h, significance_top_vs_all, star_offset
)
# NOTE(review): fixed y-limits; brackets or points outside (0.03, 0.47) would be clipped.
plt.ylim(0.03, 0.47)
# Top 5% vs. High 5%, drawn one bracket height above the data maximum
_draw_significance_bracket(
    2, 3, y_max_top5_high5 + h, y_max_top5_high5 + 2 * h, significance_high_vs_top, star_offset
)

# Axis label and grid
plt.ylabel('Tanimoto Similarity')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()
