# Imports and Functions

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import random

# Core libraries
from pathlib import Path
from tqdm import tqdm

# RDKit for chemical informatics
from rdkit import Chem
from rdkit.Chem import Draw
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine, pdist
from scipy.stats import ttest_ind
from joblib import Parallel, delayed
# Visualization tools
import seaborn as sns
from IPython.display import display

# Machine learning and clustering
from sklearn.metrics.pairwise import cosine_similarity
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

from normalisation import JumpExplorer


In [2]:

def compute_average_precision_for_hit(hit_embedding, all_embeddings, all_labels):
    """Return the average precision of a cosine-similarity ranking for one query.

    Every row of ``all_embeddings`` is ranked by decreasing cosine similarity
    to ``hit_embedding``; the precision is evaluated at each rank where the
    label equals 1 and those precisions are averaged.

    Args:
        hit_embedding: Array holding the query embedding (reshaped to one row).
        all_embeddings: 2-D array with one embedding per row.
        all_labels: Binary array-like aligned with ``all_embeddings``
            (1 = relevant sample).

    Returns:
        float: The average precision, or 0.0 when no label equals 1.

    Note:
        No sample is removed from the ranking; if the query is itself a row
        of ``all_embeddings`` it takes part in the ranking like any other row.
    """
    # Compute cosine similarity of the query with all samples
    similarities = cosine_similarity(hit_embedding.reshape(1, -1), all_embeddings).flatten()

    # Labels reordered by decreasing similarity (asarray => positional indexing)
    ranked_labels = np.asarray(all_labels)[np.argsort(-similarities)]
    relevant = ranked_labels == 1
    if not relevant.any():
        return 0.0

    # precision@k = (#relevant within the first k ranks) / k, kept only at
    # the ranks k that hold a relevant sample
    relevant_seen = np.cumsum(relevant)
    ranks = np.arange(1, relevant.size + 1)
    return float(np.mean(relevant_seen[relevant] / ranks[relevant]))



def compute_phenotypic_similarity(df):
    """Return the pairwise cosine-similarity matrix of the rows of ``df``.

    Args:
        df: DataFrame with an ``Embeddings_mean`` column holding one
            embedding vector per row.

    Returns:
        The square matrix produced by ``cosine_similarity`` on the stacked
        embeddings (entry [i, j] compares row i with row j).
    """
    stacked_embeddings = np.stack(df['Embeddings_mean'])
    return cosine_similarity(stacked_embeddings)

def visualize_similarity_matrix(df, similarity_matrix, fontsize_sim=14):
    """
    Draw the cosine-similarity matrix as a heat map with the value of every
    cell printed on top of it.

    Args:
        df: pandas DataFrame with a Metadata_JCP2022 column (used as tick labels).
        similarity_matrix: 2-D numpy array of cosine similarities.
        fontsize_sim: font size of the values written inside the cells.
    """
    compound_ids = df['Metadata_JCP2022'].values
    tick_positions = range(len(compound_ids))

    fig, ax = plt.subplots(figsize=(10, 8))

    # Heat map; interpolation is disabled so every cell is a flat square
    heatmap = ax.imshow(similarity_matrix, cmap='viridis', interpolation='none')
    ax.set_title('Cosine Similarity Matrix', fontsize=14)
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(compound_ids, rotation=90, fontsize=8)
    ax.set_yticklabels(compound_ids, fontsize=8)

    # No minor ticks and no grid, to avoid white lines between the cells
    ax.set_xticks([], minor=True)
    ax.set_yticks([], minor=True)
    ax.grid(False)

    # Write each value, formatted with two decimals, at the centre of its cell
    n_rows, n_cols = similarity_matrix.shape[0], similarity_matrix.shape[1]
    for row, col in np.ndindex(n_rows, n_cols):
        cell_text = f"{similarity_matrix[row, col]:.2f}"
        ax.text(col, row, cell_text, ha='center', va='center', fontsize=fontsize_sim, color='white')

    # Colour scale
    plt.colorbar(heatmap, ax=ax)

    plt.tight_layout()
    plt.show()
    
def hierarchical_clustering_and_visualization(df, similarity_matrix, threshold=0.5):
    """
    Run average-linkage hierarchical clustering on the molecules and display
    the dendrogram plus one molecule grid per cluster.

    Args:
        df (pd.DataFrame): Molecules and their metadata; must provide the
            ``Metadata_JCP2022`` and ``Metadata_InChI`` columns. A ``Cluster``
            column is written into this DataFrame (the caller's object is
            modified).
        similarity_matrix (np.ndarray): Square similarity matrix between the
            molecules, in the same row order as ``df``.
        threshold (float): Distance at which the dendrogram is cut to form
            the flat clusters.

    Returns:
        pd.DataFrame: Copy of ``df`` sorted by ``Cluster`` with a fresh index.
    """
    # Local import: squareform is not among the file-level imports.
    from scipy.spatial.distance import squareform

    # Turn similarities into dissimilarities (new array, input left untouched)
    dissimilarity = 1 - np.asarray(similarity_matrix, dtype=float)

    # Remove floating-point noise so the matrix is a proper distance matrix:
    # symmetric, zero diagonal, no negative entries.
    dissimilarity = (dissimilarity + dissimilarity.T) / 2
    np.fill_diagonal(dissimilarity, 0.0)
    np.clip(dissimilarity, 0.0, None, out=dissimilarity)

    # linkage() reads a 2-D array as raw observation vectors; precomputed
    # distances have to be given in condensed (1-D) form.
    condensed = squareform(dissimilarity, checks=False)
    linkage_matrix = linkage(condensed, method='average')

    # Show the dendrogram with the cut threshold
    plt.figure(figsize=(10, 7))
    dendrogram(linkage_matrix, labels=df['Metadata_JCP2022'].values, leaf_rotation=90)
    plt.title("Dendrogramme de clustering hiérarchique")
    plt.xlabel("Molécules")
    plt.ylabel("Distance")
    plt.axhline(y=threshold, color='r', linestyle='--', label=f'Seuil {threshold}')
    plt.legend()
    plt.show()

    # Flat clusters obtained by cutting the tree at the threshold
    clusters = fcluster(linkage_matrix, t=threshold, criterion='distance')
    df['Cluster'] = clusters
    df_sorted = df.sort_values(by='Cluster').reset_index(drop=True)

    # Draw the molecules of each cluster
    for cluster_id in sorted(df['Cluster'].unique()):
        cluster_df = df[df['Cluster'] == cluster_id]
        mols = [Chem.MolFromInchi(inchi) for inchi in cluster_df['Metadata_InChI']]
        legends = list(cluster_df['Metadata_JCP2022'].astype(str))

        print(f"Cluster {cluster_id} - {len(cluster_df)} molécules")
        img = Draw.MolsToGridImage(mols, molsPerRow=5, subImgSize=(200, 200), legends=legends)
        display(img)
    return df_sorted


# Get cross data

## Binding DB

In [3]:
# Absolute, machine-specific location of the BindingDB TSV file.
# NOTE(review): the next cell rebinds this same variable to another path.
binding_db_path = "/home/maxime/data/cell_painting/paper_data/BindingDB_All.tsv"

In [4]:
# Rebinds binding_db_path (already set in the previous cell) to a zipped
# BindingDB TSV; this is the value in effect when both cells run in order.
binding_db_path = "/projects/synsight/repos/phenospace/normalisation/publication/pathways/data/BindingDB_All_202412_tsv(1).zip"

In [5]:
# Load the tab-separated BindingDB table; rows that cannot be parsed are
# skipped (on_bad_lines='skip') instead of raising.
df_bd = pd.read_csv(binding_db_path, sep='\t', on_bad_lines='skip')


## JUMP data

In [6]:
# Load the compound-level embedding table. Later cells read its
# Metadata_InChI, Metadata_JCP2022 and Embeddings_mean columns.
df_phenom = pd.read_parquet('/projects/synsight/data/openphenom/norm_2_compounds_embeddings.parquet')


## Cross-referencing BindingDB with JUMP

In [None]:
# List the available BindingDB column names.
print(df_bd.columns.to_list())

In [8]:
# BindingDB columns used in the rest of the notebook.
columns_to_keep = [
    'Ligand InChI',
    'UniProt (SwissProt) Entry Name of Target Chain',
    'UniProt (SwissProt) Recommended Name of Target Chain',
    'Target Source Organism According to Curator or DataSource',
    'Kd (nM)',
    'EC50 (nM)',
    'Article DOI',
]

# Explicit copy: a column is added to new_df later on, which must not be
# done on a selection that may still be tied to df_bd.
new_df = df_bd[columns_to_keep].copy()

In [None]:
# Keep the part of the UniProt entry name that precedes the first underscore.
# Non-string entries (e.g. missing values) are mapped to None explicitly
# rather than through a catch-all `except`.
genes = [
    entry_name.split('_')[0] if isinstance(entry_name, str) else None
    for entry_name in new_df['UniProt (SwissProt) Entry Name of Target Chain']
]
print(set(genes))

In [10]:
# Attach the extracted prefixes as a new column; `genes` was built by
# iterating over new_df, so it lines up with the rows one-to-one.
# NOTE(review): new_df is a column selection of df_bd, so pandas may emit a
# SettingWithCopyWarning here — confirm.
new_df['gene_symbol'] = genes

In [11]:
# Keep only the measurements whose target organism is annotated as human.
is_human_target = new_df['Target Source Organism According to Curator or DataSource'] == 'Homo sapiens'
filtered_df = new_df[is_human_target]

In [None]:
# Number of filtered rows per gene symbol.
filtered_df['gene_symbol'].value_counts()

In [13]:
# Remove fully duplicated rows. Rebinding (instead of inplace=True) avoids
# modifying an object that is itself a filtered selection of new_df.
filtered_df = filtered_df.drop_duplicates()

In [14]:
# Join BindingDB rows with the embedding table on exact InChI string
# equality (merge's default inner join: only matching rows are kept).
df_final = filtered_df.merge(df_phenom, left_on='Ligand InChI', right_on='Metadata_InChI')

In [None]:
# Number of distinct ligands present in the merged table.
df_final['Ligand InChI'].nunique()

In [None]:
# Distinct gene symbols present in the merged table.
set(df_final['gene_symbol'])

In [None]:
# Number of merged rows per gene symbol.
df_final['gene_symbol'].value_counts()

In [None]:
# Number of merged rows per JCP2022 identifier.
df_final['Metadata_JCP2022'].value_counts()

In [None]:
# Gene symbols (with row counts) associated with one specific compound.
df_final[df_final['Metadata_JCP2022']=='JCP2022_097466']['gene_symbol'].value_counts()

In [20]:
# One table per gene symbol, restricted to the identifier, InChI and embedding
# columns, with a single row per JCP2022 identifier.
genes_dfs = {
    gene: (
        df_final.loc[
            df_final['gene_symbol'] == gene,
            ['Metadata_JCP2022', 'Metadata_InChI', 'Embeddings_mean'],
        ]
        .drop_duplicates(subset='Metadata_JCP2022')
        .reset_index(drop=True)
    )
    for gene in df_final['gene_symbol'].unique()
}


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Count the distinct JCP2022 identifiers associated with each gene symbol
molecule_counts = df_final.groupby('gene_symbol')['Metadata_JCP2022'].nunique().reset_index()
molecule_counts.columns = ['gene_symbol', 'Number of JCP_ID']

# Sort by decreasing count and keep the ten largest
molecule_counts = molecule_counts.sort_values(by='Number of JCP_ID', ascending=False)[:10]

# Bar plot of the counts
plt.figure(figsize=(12, 6))
sns.barplot(x='gene_symbol', y='Number of JCP_ID', data=molecule_counts, palette="viridis")

# Axis labels, title and tick orientation
plt.xticks(rotation=90, ha='right')
plt.xlabel("gene_symbol")
plt.ylabel("Nombre de JCP_ID")
plt.title("Nombre de JCP_ID par gene_symbol")
plt.tight_layout()

# Render the figure
plt.show()


# Analyse this information

## BioProxy

In [22]:
# Rebind df_final to the per-gene table for GSK3B. From here on df_final
# only has the Metadata_JCP2022, Metadata_InChI and Embeddings_mean columns
# selected when genes_dfs was built; the merged table is no longer reachable
# under this name.
df_final = genes_dfs['GSK3B']

In [None]:
# Display the per-gene table selected above.
df_final

In [24]:
# InChI strings of the compounds in the selected table (used as hit list).
inchi_list = df_final['Metadata_InChI'].to_list()

In [25]:
# Flag each compound as 'hit' when its InChI is in inchi_list, otherwise use
# the literal string 'nan'. isin() does the membership test in one
# vectorised pass instead of scanning the list once per row.
df_phenom['Metadata_Bioactivity'] = np.where(
    df_phenom['Metadata_InChI'].isin(inchi_list), 'hit', 'nan'
)


In [None]:
# Number of compounds flagged 'hit' versus 'nan'.
df_phenom['Metadata_Bioactivity'].value_counts()

In [27]:

# Split the compounds by flag. Boolean filtering keeps the row order of
# df_phenom, so hits_embeddings[k] is the k-th flagged row of df_phenom.
hits = df_phenom[df_phenom['Metadata_Bioactivity'] == 'hit']
# NOTE(review): non_hits is not referenced again in the visible cells.
non_hits = df_phenom[df_phenom['Metadata_Bioactivity'] != 'hit']

# Stack the per-compound vectors into 2-D arrays, stored as float16
# (half precision: smaller arrays, reduced numeric precision).
hits_embeddings = np.stack(hits['Embeddings_mean'].values).astype(np.float16)
all_embeddings = np.stack(df_phenom['Embeddings_mean'].values).astype(np.float16)


In [None]:
# Binary labels aligned with the rows of df_phenom (1 = hit).
all_labels = (df_phenom['Metadata_Bioactivity'] == 'hit').astype(int).values

# One average-precision value per hit used as query, computed with
# 40 joblib workers.
average_precisions = Parallel(n_jobs=40)(
    delayed(compute_average_precision_for_hit)(hit_embedding, all_embeddings, all_labels)
    for hit_embedding in tqdm(hits_embeddings)
)

# Mean over all queries.
mAP = np.mean(average_precisions)

print(f"Mean Average Precision (mAP): {mAP}")

In [None]:
# Baseline: same queries, but labels randomly permuted.
# A seeded NumPy generator makes the baseline reproducible, and working on a
# permuted copy leaves all_labels intact for any cell that runs afterwards.
rng = np.random.default_rng(42)
shuffled_labels = rng.permutation(all_labels)

average_precisions = Parallel(n_jobs=40)(
    delayed(compute_average_precision_for_hit)(hit_embedding, all_embeddings, shuffled_labels)
    for hit_embedding in tqdm(hits_embeddings)
)

# NOTE: this overwrites the mAP computed with the real labels.
mAP = np.mean(average_precisions)

print(f"Mean Average Precision (mAP): {mAP}")

In [30]:
def compute_enrichment_factor_at_n(hit_embedding, all_embeddings, all_labels, n_percent=1):
    """
    Calculate the enrichment factor (EF) at n% for a given 'hit' embedding,
    excluding the top-ranked sample (treated as the query itself).

    EF = (hit rate within the top n% of the ranking) / (hit rate among all
    ranked candidates). Both rates are computed on the same candidate set,
    i.e. after the top-ranked sample has been removed.

    Parameters:
    - hit_embedding: numpy array, the embedding of the 'hit' sample (query).
    - all_embeddings: numpy array, embeddings of all samples (one per row).
    - all_labels: array-like, binary labels for all samples (1 for 'hit', 0 otherwise).
    - n_percent: float, the percentage (0-100) of the candidates to consider.

    Returns:
    - float, the Enrichment Factor at n%; 0.0 when there is no candidate left
      or no hit among the candidates.

    Note:
    - The sample removed is whichever ranks first; this is the query only if
      the query is a row of all_embeddings and no other row ties with it.
    """
    # Compute cosine similarity of the query with all samples
    similarities = cosine_similarity(hit_embedding.reshape(1, -1), all_embeddings).flatten()

    # Labels reordered by decreasing similarity (asarray => positional indexing)
    ranked_labels = np.asarray(all_labels)[np.argsort(-similarities)]

    # Drop the top-ranked entry (the query itself)
    candidate_labels = ranked_labels[1:]
    if candidate_labels.size == 0:
        return 0.0

    # Hits available among the candidates; without any, EF is defined as 0
    total_hits = np.sum(candidate_labels)
    if total_hits == 0:
        return 0.0

    # Size of the top n% slice, at least one sample
    n_top = max(1, int(len(candidate_labels) * (n_percent / 100)))
    hits_in_top_n = np.sum(candidate_labels[:n_top])

    # Background rate uses the same population as the numerator
    background_rate = total_hits / len(candidate_labels)
    return float((hits_in_top_n / n_top) / background_rate)


In [None]:
# Binary labels aligned with the rows of df_phenom (1 = hit).
all_labels = (df_phenom['Metadata_Bioactivity'] == 'hit').astype(int).values

# One enrichment factor per hit used as query (default n_percent of the
# function), computed with 40 joblib workers.
enrichment_factor = Parallel(n_jobs=40)(
    delayed(compute_enrichment_factor_at_n)(hit_embedding, all_embeddings, all_labels)
    for hit_embedding in tqdm(hits_embeddings)
)

mEF = np.mean(enrichment_factor)
maxEF = np.max(enrichment_factor)
print(f"Mean Normalized Enrichment Factor (mEF): {mEF}")
# The maximum was previously printed under the "(mEF)" label.
print(f"Max Normalized Enrichment Factor (maxEF): {maxEF}")

In [None]:
# Display the per-query enrichment factors.
enrichment_factor

In [None]:
# Index (within hits_embeddings) of the query with the highest enrichment factor
best_control_idx = np.argmax(enrichment_factor)

# Retrieve the corresponding best hit embedding and its cosine similarities
best_hit_embedding = hits_embeddings[best_control_idx]
similarities = cosine_similarity(best_hit_embedding.reshape(1, -1), all_embeddings).flatten()

# Rank the indices based on similarity (descending order).
# Nothing is removed here, so the query's own row is part of the ranking.
ranked_indices = np.argsort(-similarities)
ranked_similarities = similarities[ranked_indices]
ranked_labels = all_labels[ranked_indices]

# Prepare data for plotting
distances = np.arange(1, len(ranked_similarities) + 1)  # Rank positions 1..N (used as x axis)

# Updated plot with non-hits as semi-transparent gray and hits as red with larger markers

plt.figure(figsize=(10, 6))

# Plot non-hits
non_hits_mask = ranked_labels == 0
plt.scatter(
    distances[non_hits_mask],
    ranked_similarities[non_hits_mask],
    c='gray',
    alpha=0.01,
    s=1,  # Smaller marker size
    label="Non-Hits"
)

# Plot hits
hits_mask = ranked_labels == 1
plt.scatter(
    distances[hits_mask],
    ranked_similarities[hits_mask],
    c='red',
    alpha=1.0,
    s=2,  # Larger marker size
    label="Hits"
)

# Customize plot
plt.title("Distance Ranking vs Cosine Similarity", fontsize=14)
plt.xlabel("Distance Ranking", fontsize=12)
plt.ylabel("Cosine Similarity", fontsize=12)
plt.axhline(0, color="gray", linestyle="--", linewidth=0.8)  # Reference line at similarity 0
plt.legend()
plt.grid(alpha=0.3)
plt.show()

In [None]:
# Create data for hits and non-hits
hits_similarities = ranked_similarities[ranked_labels == 1]
non_hits_similarities = ranked_similarities[ranked_labels == 0]

# Plot 1: Kernel Density Estimate (KDE) of Cosine Similarity
plt.figure(figsize=(10, 6))
sns.kdeplot(hits_similarities, color="red", label="Hits", fill=True, alpha=0.5)
sns.kdeplot(non_hits_similarities, color="gray", label="Non-Hits", fill=True, alpha=0.5)
plt.title("KDE of Cosine Similarity for Hits vs Non-Hits", fontsize=14)
plt.xlabel("Cosine Similarity", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.legend()
plt.grid(alpha=0.3)
plt.show()



In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

# Labels and row positions of the hits inside the full dataset.
# hits_embeddings was built by filtering df_phenom, so its k-th row is the
# k-th flagged row of df_phenom; the loop index alone is NOT a valid row
# index into all_embeddings.
true_labels = (df_phenom['Metadata_Bioactivity'] == 'hit').astype(int).to_numpy()
hit_positions = np.flatnonzero(true_labels == 1)
if len(hit_positions) != len(hits_embeddings) or len(true_labels) != len(all_embeddings):
    raise ValueError(
        "hits_embeddings / all_embeddings are not aligned with df_phenom; "
        "re-run the cell that builds them."
    )

# Per-query ranked similarities and labels, accumulated as arrays
all_ranked_similarities = []
all_ranked_labels = []

# Use every hit in turn as the query (control positive)
for hit_idx, current_hit_embedding in enumerate(tqdm(hits_embeddings)):
    # Compute cosine similarities with all embeddings
    similarities = cosine_similarity(current_hit_embedding.reshape(1, -1), all_embeddings).flatten()

    # Remove the query's own row from the similarities and the labels
    query_position = hit_positions[hit_idx]
    similarities = np.delete(similarities, query_position)
    labels = np.delete(true_labels, query_position)

    # Rank the remaining samples by decreasing similarity
    ranked_indices = np.argsort(-similarities)
    ranked_similarities = similarities[ranked_indices]
    ranked_labels = labels[ranked_indices]

    all_ranked_similarities.append(ranked_similarities)
    all_ranked_labels.append(ranked_labels)

# Flatten to 1-D arrays (empty arrays when there is no hit at all)
all_ranked_similarities = (
    np.concatenate(all_ranked_similarities) if all_ranked_similarities else np.array([])
)
all_ranked_labels = (
    np.concatenate(all_ranked_labels) if all_ranked_labels else np.array([], dtype=int)
)

# Separate similarities for hits and non-hits
hits_similarities = all_ranked_similarities[all_ranked_labels == 1]
non_hits_similarities = all_ranked_similarities[all_ranked_labels == 0]

# Plot KDE for all hits as control positives excluding the control positive
plt.figure(figsize=(10, 6))
sns.kdeplot(hits_similarities, color="red", label="Hits", fill=True, alpha=0.5)
sns.kdeplot(non_hits_similarities, color="gray", label="Non-Hits", fill=True, alpha=0.5)
plt.title("KDE of Cosine Similarity for All Hits Excluding Control Positives", fontsize=14)
plt.xlabel("Cosine Similarity", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.legend()
plt.grid(alpha=0.3)
plt.show()


## Statistical tests

In [None]:
# Number of compounds flagged 'hit' versus 'nan'.
df_phenom['Metadata_Bioactivity'].value_counts()


In [None]:
# Persist the embedding table, including the Metadata_Bioactivity flag added
# above, to a new parquet file (the DataFrame index is not written).
df_phenom.to_parquet('/projects/synsight/data/openphenom/norm_2_compounds_embeddings_g2m.parquet', index=False)


# Display the table.
df_phenom


In [38]:
df_hits = df_phenom[df_phenom['Metadata_Bioactivity'] == 'hit']

# Draw a random sample of non-hits, ten per hit, capped at the number of
# non-hits available so that sample() cannot fail on a small dataset.
non_hit_pool = df_phenom[df_phenom['Metadata_Bioactivity'] == 'nan']
n_non_hits = min(10 * len(df_hits), len(non_hit_pool))
df_non_hits = non_hit_pool.sample(n=n_non_hits, random_state=42)

# Combine hits and sampled non-hits
df_phenom_test = pd.concat([df_hits, df_non_hits], ignore_index=True)

# Shuffle the rows to avoid any ordering bias
df_phenom_test = df_phenom_test.sample(frac=1, random_state=42).reset_index(drop=True)


In [39]:



def pairwise_similarities(group, metric="cosine", n_jobs=-1):
    """
    Return the cosine similarity of every unordered pair of distinct vectors
    in ``group``.

    Args:
        group: sequence of equal-length vectors (converted to float32).
        metric: accepted but not used; cosine similarity is always computed.
        n_jobs: accepted but not used.

    Returns:
        1-D array with the strict upper triangle of the similarity matrix
        (self-similarities on the diagonal are left out).
    """
    vectors = np.array(group, dtype=np.float32)
    full_matrix = cosine_similarity(vectors, vectors, dense_output=True)
    upper_rows, upper_cols = np.triu_indices(len(vectors), k=1)
    return full_matrix[upper_rows, upper_cols]


def compute_inter_group_similarities(group_a, group_b, n_jobs=-1):
    """
    Return the cosine similarity of every (a, b) pair with a taken from
    ``group_a`` and b from ``group_b``.

    Args:
        group_a: sequence of equal-length vectors (converted to float32).
        group_b: sequence of vectors of the same length (converted to float32).
        n_jobs: accepted but not used.

    Returns:
        1-D array of len(group_a) * len(group_b) similarities, flattened
        row by row.
    """
    vectors_a = np.array(group_a, dtype=np.float32)
    vectors_b = np.array(group_b, dtype=np.float32)
    cross_matrix = cosine_similarity(vectors_a, vectors_b, dense_output=True)
    return cross_matrix.flatten()



In [40]:
# Work on the balanced sample built above.
df = df_phenom_test

# Embeddings of the hits, one float32 array per compound.
group_a = [
    np.array(vector, dtype=np.float32)
    for vector in df.loc[df['Metadata_Bioactivity'] == 'hit', 'Embeddings_mean']
]

# Embeddings of the sampled non-hits, one float32 array per compound.
group_b = [
    np.array(vector, dtype=np.float32)
    for vector in df.loc[df['Metadata_Bioactivity'] == 'nan', 'Embeddings_mean']
]


In [None]:

# Intra-group values. Despite the variable names and printed labels, these
# are cosine SIMILARITIES (higher = more alike), not distances.
print("Calcul des distances intra-groupe...")

distances_a = pairwise_similarities(group_a, metric='cosine', n_jobs=-1)
print("Moyenne intra-groupe A :", np.mean(distances_a))
distances_b = pairwise_similarities(group_b, metric='cosine', n_jobs=-1)
print("Moyenne intra-groupe B :", np.mean(distances_b))

# Similarities between every hit and every sampled non-hit
print("Calcul des distances inter-groupe...")
all_distances = compute_inter_group_similarities(group_a, group_b, n_jobs=-1)

print("Moyenne inter-groupe A-B :", np.mean(all_distances))

# Welch's t-test (unequal variances) between the two intra-group samples.
# NOTE(review): pairwise values that share a compound are not independent
# observations, so the p-value should be read with caution.
print("Calcul du test statistique...")
t_stat, p_value = ttest_ind(distances_a, distances_b, equal_var=False)
print(f"T-statistique : {t_stat:.6f}")
print(f"P-value (format standard) : {p_value:.7f}")
# Fixed-point formatting shows 0.0000000 for any p-value below 5e-8;
# scientific notation keeps the order of magnitude visible.
print(f"P-value (notation scientifique) : {p_value:.3e}")

In [42]:
# Global NumPy print settings: 10 digits of precision.
np.set_printoptions(precision=10, suppress=False)  # suppress=False lets NumPy use scientific notation


In [None]:
import matplotlib.pyplot as plt

# Data for the box plot: intra-group and inter-group cosine similarities
data = [distances_a, distances_b, all_distances]
labels = ['Intra-groupe A (hits)', 'Intra-groupe B (non-hits)', 'Inter-groupe A-B']

# Box plot; tick labels are set through xticks (as in the violin plot cell)
# instead of the boxplot `labels` keyword.
plt.figure(figsize=(10, 6))
plt.boxplot(data, showmeans=True, meanline=True)
plt.xticks(ticks=range(1, len(labels) + 1), labels=labels)

# The plotted values are cosine similarities, so the title and axis say so
# (they previously read "distance").
plt.title('Distribution des similarités cosinus')
plt.ylabel('Cosine Similarity')
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Render the figure
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Data for the violin plot: intra-group and inter-group cosine similarities
data = [distances_a, distances_b, all_distances]
labels = ['Intra-groupe A (hits)', 'Intra-groupe B (non-hits)', 'Inter-groupe A-B']

plt.figure(figsize=(10, 6))

# One fill colour per group
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

# Violins with mean, median and extrema markers
parts = plt.violinplot(data, showmeans=True, showextrema=True, showmedians=True)

# Colour each violin body
for body, face_color in zip(parts['bodies'], colors):
    body.set_facecolor(face_color)
    body.set_edgecolor('black')
    body.set_alpha(0.8)

# Colours of the summary markers
parts['cmeans'].set_color('red')     # mean
parts['cmedians'].set_color('blue')  # median
parts['cmins'].set_color('black')    # minimum
parts['cmaxes'].set_color('black')   # maximum

# One tick label per violin
plt.xticks(ticks=range(1, len(labels) + 1), labels=labels)

# The plotted values are cosine similarities, so the title and axis say so
# (they previously read "distance").
plt.title('Distribution des similarités cosinus (avec couleurs)', fontsize=14, fontweight='bold')
plt.ylabel('Cosine Similarity', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Render the figure
plt.show()


In [None]:
import umap
import matplotlib.pyplot as plt

# Prepare the data and labels.
# NOTE(review): this rebinds the notebook-level names `all_embeddings`
# (previously the embeddings of the whole df_phenom) and `labels`; cells
# that expect the earlier values must not be re-run after this one.
all_embeddings = np.array(df['Embeddings_mean'].tolist(), dtype=np.float16)
labels = df['Metadata_Bioactivity'].apply(lambda x: 'hit' if x == 'hit' else 'non-hit')

# 2-D projection with UMAP using the cosine metric (fixed random_state)
umap_reducer = umap.UMAP(n_components=2, random_state=42, metric='cosine')
embeddings_2d = umap_reducer.fit_transform(all_embeddings)

# Scatter plot, one colour per label
plt.figure(figsize=(10, 8))
for label in labels.unique():
    mask = labels == label
    plt.scatter(embeddings_2d[mask, 0], embeddings_2d[mask, 1], label=label, alpha=0.6, s=10)
plt.title("Visualisation UMAP des embeddings")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.legend()
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Classification data: embeddings as features, hit flag as target.
# all_embeddings must be the array built from `df` (same rows, same order).
X = all_embeddings
y = (df['Metadata_Bioactivity'] == 'hit').astype(int)  # 1 = hit, 0 = non-hit

# Stratified 80/20 split; seeded so the reported AUC is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# L2-regularised logistic regression (the penalty is 'l2', so coefficients
# are shrunk but not driven to exactly zero). The saga solver is
# stochastic, hence the fixed random_state.
logreg = LogisticRegression(
    max_iter=500, solver='saga', penalty='l2', C=0.1, n_jobs=-1, random_state=42
)
logreg.fit(X_train, y_train)

# Rank the embedding dimensions by absolute coefficient
feature_importance = np.abs(logreg.coef_[0])
important_dimensions = np.argsort(feature_importance)[::-1][:10]  # Top 10 dimensions

print("Top 10 dimensions les plus importantes :", important_dimensions)
print("Scores associés :", feature_importance[important_dimensions])

# Evaluate on the held-out split using the predicted probability of class 1
y_pred = logreg.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
print("AUC-ROC :", roc_auc)


## JUMP Images

In [47]:
# JCP2022 identifiers of the compounds in df_final.
# NOTE(review): jcps is not referenced again in the visible cells.
jcps = df_final['Metadata_JCP2022'].to_list()
# Image browser from the project-local `normalisation` module.
jump_explo = JumpExplorer()
jump_explo.figsize = (10, 6)


In [None]:
# Image grid (3 columns x 3 rows requested, titles off) for compound JCP2022_005529.
jump_explo.plot_multiple_images(num_cols=3, num_rows=3,plot_title=False, Metadata_JCP2022="JCP2022_005529")

In [None]:
# Up to 3 images for compound JCP2022_005529 (channel plots off, perturbation plots on).
jump_explo.plot_images(max_images=3, title_metadata=[], plot_channels=False, plot_perturbations=True, Metadata_JCP2022="JCP2022_005529")

In [None]:
# Same view as the previous cell, for compound JCP2022_078377.
jump_explo.plot_images(max_images=3, title_metadata=[], plot_channels=False, plot_perturbations=True, Metadata_JCP2022="JCP2022_078377")

In [None]:
# Image grid (3 columns x 3 rows requested, titles on) for compound JCP2022_001983.
jump_explo.plot_multiple_images(num_cols=3, num_rows=3,plot_title=True, Metadata_JCP2022="JCP2022_001983")

In [None]:
# NOTE(review): when the notebook is run top to bottom, df_final was rebound
# to genes_dfs['GSK3B'], whose tables only keep Metadata_JCP2022,
# Metadata_InChI and Embeddings_mean — this lookup would then raise
# KeyError: 'gene_symbol'. It only works on the merged table.
df_final['gene_symbol'].value_counts()

In [None]:
# Display the current df_final.
df_final

In [None]:
# Display the embedding table (including the Metadata_Bioactivity flag).
df_phenom