# Imports

In [None]:
from pathlib import Path
import pandas as pd

In [None]:
# Name of the embedding model whose metadata table is loaded below.
model = "dinov2_g"
base_path = Path("/projects/synsight/data/jump_embeddings/compounds_embeddings/")
# The metadata file is looked up in the sub-folder named after the model.
parquet_metadata = base_path / model / "metadata.parquet"
jump_df = pd.read_parquet(parquet_metadata)

# Chemical Space Coverage

In [None]:
import csv
import pandas as pd

# Automatic delimiter detection.
file_path_1 = "/projects/synsight/repos/phenoseeker/data/DOWNLOAD-Z1ne6qrt4wu91Pko505qNL8HRHmP9w9SFGyvEarzIcM=.csv"
file_path_2 = "/projects/synsight/repos/phenoseeker/data/DOWNLOAD-Z1ne6qrt4wu91Pko505qNL8HRHmP9w9SFGyvEarzIcM=_part2.csv"

# The delimiter is sniffed from the first 1024 characters of the second file
# only; the resulting dialect is then applied to both files.
with open(file_path_2, "r", encoding="utf-8") as f:
    sample = f.read(1024)
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(sample)
    print("Délimiteur détecté :", dialect.delimiter)

# Read both CSV files with the detected delimiter, using the Python engine.
try:
    df_chembl_1 = pd.read_csv(
        file_path_1,
        delimiter=dialect.delimiter,
        engine="python",
        on_bad_lines="skip",  # or 'warn' to only emit a warning
        encoding="utf-8"
    )
    df_chembl_2 = pd.read_csv(
        file_path_2,
        delimiter=dialect.delimiter,
        engine="python",
        on_bad_lines="skip",  # or 'warn' to only emit a warning
        encoding="utf-8"
    )

except Exception as e:
    print("Erreur lors de la lecture du fichier CSV :", e)
    # Re-raise after reporting: continuing would leave df_chembl_1 /
    # df_chembl_2 undefined (or holding values from a previous run), and the
    # failure would only resurface later as a confusing, unrelated error.
    raise


In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import umap.umap_ as umap

##############################
# Fingerprint computation functions
##############################
def get_morgan_fp(inchi, radius=2, nBits=2048):
    """Compute the Morgan fingerprint of a molecule given as an InChI string.

    Parameters:
      inchi: InChI string describing the molecule.
      radius: Morgan radius forwarded to RDKit.
      nBits: Length of the fingerprint bit vector.

    Returns:
      The fingerprint as a NumPy integer array, or None when ``inchi`` is not
      a string or RDKit cannot build a molecule from it.
    """
    # Non-string values (e.g. NaN for a missing entry in a DataFrame column)
    # are rejected here instead of being passed to RDKit, which would raise.
    if not isinstance(inchi, str):
        return None
    mol = Chem.MolFromInchi(inchi)
    # get_morgan_fp_from_mol returns None for a None molecule, so parse
    # failures keep yielding None; the bit-vector conversion lives in one place.
    return get_morgan_fp_from_mol(mol, radius=radius, nBits=nBits)

def get_morgan_fp_from_mol(mol, radius=2, nBits=2048):
    """Return the Morgan fingerprint of an RDKit Mol object as a NumPy array.

    Parameters:
      mol: RDKit Mol object, or None.
      radius: Morgan radius forwarded to RDKit.
      nBits: Length of the fingerprint bit vector.

    Returns:
      An integer NumPy array filled from the RDKit bit vector, or None when
      ``mol`` is None.
    """
    if mol is not None:
        bit_vect = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
        # RDKit writes the bits into a pre-allocated array.
        bits = np.zeros((nBits,), dtype=int)
        DataStructs.ConvertToNumpyArray(bit_vect, bits)
        return bits
    return None

##############################
# Sampling functions
##############################
def uniform_sample(df, n=5000, random_state=42):
    """Cap a dataset at ``n`` rows.

    Parameters:
      df: DataFrame to sample from.
      n: Maximum number of rows to keep.
      random_state: Seed passed to ``DataFrame.sample``.

    Returns:
      ``df`` itself when it has at most ``n`` rows, otherwise a random sample
      of exactly ``n`` rows.
    """
    # Nothing to drop: hand back the very same object, not a copy.
    if len(df) <= n:
        return df
    return df.sample(n=n, random_state=random_state)

def proportional_sample(df, frac=0.1, random_state=42):
    """Draw a random fraction of a dataset.

    Parameters:
      df: DataFrame to sample from.
      frac: Fraction of rows to keep (default 0.1, i.e. 10%).
      random_state: Seed passed to ``DataFrame.sample``.

    Returns:
      A new DataFrame holding the sampled rows.
    """
    sampled = df.sample(frac=frac, random_state=random_state)
    return sampled


def _order_rows_for_plot(df, shuffle_points, random_state):
    """Return a copy of ``df`` with rows in the order used for embedding and drawing."""
    if shuffle_points:
        return df.copy().sample(frac=1, random_state=random_state)
    # Order rows by dataset size (largest first) via a temporary count column.
    ordered = df.copy()
    ds_counts = ordered["dataset"].value_counts()
    ordered["ds_count"] = ordered["dataset"].map(ds_counts)
    ordered = ordered.sort_values(by="ds_count", ascending=False)
    return ordered.drop(columns=["ds_count"])


def _dataset_colors(categories):
    """Map each dataset name to a viridis color spread evenly over the colormap."""
    if len(categories) > 1:
        last = len(categories) - 1
        return {cat: plt.cm.viridis(i / last) for i, cat in enumerate(categories)}
    # A single dataset gets the middle of the colormap (avoids dividing by zero).
    return {categories[0]: plt.cm.viridis(0.5)}


def plot_umap(df, title, shuffle_points=True, *, n_neighbors=15, min_dist=0.1,
              metric="jaccard", random_state=42):
    """
    Compute the UMAP embedding from molecular fingerprints and display the projection.

    Parameters:
      df: DataFrame containing at least columns "fp" (fingerprint) and "dataset".
          It is not modified; the function works on a copy.
      title: Title for the plot.
      shuffle_points: If True, randomly shuffles the rows. If False, orders the points
                      starting with the dataset having the most compounds and ending with
                      the one with the fewest compounds.
      n_neighbors: UMAP ``n_neighbors`` (keyword-only, default 15).
      min_dist: UMAP ``min_dist`` (keyword-only, default 0.1).
      metric: UMAP distance metric (keyword-only, default "jaccard").
      random_state: Seed used both for the row shuffle and for UMAP
                    (keyword-only, default 42).

    Raises:
      KeyError: if the "fp" or "dataset" column is missing.
      ValueError: if ``df`` has no rows.
    """
    # Fail early with a clear message instead of deep inside NumPy / indexing.
    missing = [col for col in ("fp", "dataset") if col not in df.columns]
    if missing:
        raise KeyError(f"plot_umap: missing required column(s): {missing}")
    if df.empty:
        raise ValueError("plot_umap: cannot embed an empty DataFrame")

    df_ordered = _order_rows_for_plot(df, shuffle_points, random_state)

    # Compute UMAP embedding from the fingerprint matrix (one row per compound).
    X = np.vstack(df_ordered["fp"].values)
    reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist,
                        metric=metric, random_state=random_state)
    embedding = reducer.fit_transform(X)
    df_ordered["umap_x"] = embedding[:, 0]
    df_ordered["umap_y"] = embedding[:, 1]

    # Sorted names give a stable dataset -> color assignment across calls.
    categories = sorted(df_ordered["dataset"].unique())
    color_dict = _dataset_colors(categories)

    # Map each row's dataset to its color.
    df_ordered["color"] = df_ordered["dataset"].map(color_dict)

    # Plot the UMAP projection; points are drawn in row order.
    plt.figure(figsize=(12, 10))
    plt.scatter(df_ordered["umap_x"], df_ordered["umap_y"],
                c=df_ordered["color"], alpha=0.6, s=10)

    # Build a legend that matches each dataset to its color.
    handles = [
        plt.Line2D([0], [0], marker='o', color='w',
                   label=cat, markerfacecolor=color_dict[cat], markersize=10)
        for cat in categories
    ]
    plt.legend(handles=handles, title="Dataset")
    plt.title(title)
    plt.xlabel("UMAP 1")
    plt.ylabel("UMAP 2")
    plt.show()


def compute_fp(row):
    """Compute the fingerprint of one row, choosing the input by source dataset.

    Parameters:
      row: Mapping (e.g. a DataFrame row) with a "dataset" key, plus "mol"
           for DrugBank rows or "inchi" for every other dataset.

    Returns:
      The fingerprint array, or None when the InChI is not a string or the
      fingerprint helper returns None.
    """
    if row["dataset"] == "DrugBank":
        # DrugBank rows carry the RDKit Mol object directly.
        return get_morgan_fp_from_mol(row["mol"])
    # JUMP-CP and ChemBL rows are described by the 'inchi' column.
    inchi = row["inchi"]
    # A missing InChI is not a string (e.g. NaN); skip it rather than letting
    # the InChI parser raise on a non-string argument.
    if not isinstance(inchi, str):
        return None
    return get_morgan_fp(inchi)

In [None]:

##############################
# 1. Load the JUMP-CP dataset
##############################
# Work on a deep copy of jump_df, expose its InChI column under the shared
# "inchi" name and tag every row with its source dataset.
df_jump = (
    jump_df.copy(deep=True)
    .rename(columns={"Metadata_InChI": "inchi"})
    .assign(dataset="JUMP-CP")
)


In [None]:

##############################
# 2. Load DrugBank from a multi-molecule SDF file
##############################
drugbank_file = "/projects/synsight/repos/phenoseeker/data/open structures.sdf"  # Path to the DrugBank SDF file
supplier = Chem.SDMolSupplier(drugbank_file)
drugbank_data = []
for mol in supplier:
    # Entries yielded as None by the supplier are skipped.
    if mol is not None:
        # Optional: keep the SMILES alongside the Mol for verification.
        smiles = Chem.MolToSmiles(mol)
        drugbank_data.append(dict(mol=mol, smiles=smiles))
df_drugbank = pd.DataFrame(drugbank_data).assign(dataset="DrugBank")
# Note: for DrugBank, fingerprints are computed from the Mol object directly.


In [None]:

##############################
# 3. Chargement des fichiers ChemBL et fusion
##############################
import csv
import pandas as pd

# Automatic delimiter detection.
file_path_1 = "/projects/synsight/repos/phenoseeker/data/DOWNLOAD-Z1ne6qrt4wu91Pko505qNL8HRHmP9w9SFGyvEarzIcM=.csv"
file_path_2 = "/projects/synsight/repos/phenoseeker/data/DOWNLOAD-Z1ne6qrt4wu91Pko505qNL8HRHmP9w9SFGyvEarzIcM=_part2.csv"

# The delimiter is sniffed from the first 1024 characters of the second file
# only; the resulting dialect is then applied to both files.
with open(file_path_2, "r", encoding="utf-8") as f:
    sample = f.read(1024)
    sniffer = csv.Sniffer()
    dialect = sniffer.sniff(sample)
    print("Délimiteur détecté :", dialect.delimiter)

# Read both CSV files with the detected delimiter, using the Python engine.
try:
    df_chembl_1 = pd.read_csv(
        file_path_1,
        delimiter=dialect.delimiter,
        engine="python",
        on_bad_lines="skip",  # or 'warn' to only emit a warning
        encoding="utf-8"
    )
    df_chembl_2 = pd.read_csv(
        file_path_2,
        delimiter=dialect.delimiter,
        engine="python",
        on_bad_lines="skip",  # or 'warn' to only emit a warning
        encoding="utf-8"
    )

except Exception as e:
    print("Erreur lors de la lecture du fichier CSV :", e)
    # Re-raise after reporting: the merge below needs both frames, so
    # continuing would either fail with a NameError or silently merge frames
    # left over from a previous run.
    raise

# Standardize the InChI column name, then merge both parts into one frame.
df_chembl1 = df_chembl_1.rename(columns={"Inchi": "inchi"})
df_chembl2 = df_chembl_2.rename(columns={"Inchi": "inchi"})
df_chembl = pd.concat([df_chembl1, df_chembl2], ignore_index=True)
df_chembl["dataset"] = "ChemBL"


In [None]:

##############################
# 4. Build the samples
##############################

random_state = 2

# Uniform sampling: at most 5000 molecules per dataset.
df_jump_uniform, df_drugbank_uniform, df_chembl_uniform = (
    uniform_sample(frame, n=5000, random_state=random_state)
    for frame in (df_jump, df_drugbank, df_chembl)
)

# Proportional sampling: 1% of each dataset (frac=0.01).
df_jump_prop, df_drugbank_prop, df_chembl_prop = (
    proportional_sample(frame, frac=0.01, random_state=random_state)
    for frame in (df_jump, df_drugbank, df_chembl)
)

# Concatenate the three datasets for each strategy.
df_uniform = pd.concat(
    [df_jump_uniform, df_drugbank_uniform, df_chembl_uniform],
    ignore_index=True,
)
df_proportional = pd.concat(
    [df_jump_prop, df_drugbank_prop, df_chembl_prop],
    ignore_index=True,
)


In [None]:

##############################
# 5. Calcul des empreintes pour chaque échantillon
##############################
def compute_fp(row):
    """Compute the fingerprint of one row, choosing the input by source dataset.

    DrugBank rows use the RDKit Mol stored under "mol"; every other dataset
    uses the string stored under "inchi". Returns None when the InChI is not
    a string or when the fingerprint computation raises.
    """
    is_drugbank = row["dataset"] == "DrugBank"
    if not is_drugbank:
        inchi_value = row["inchi"]
        # Reject anything that is not a string before calling the helper.
        if not isinstance(inchi_value, str):
            return None
    # Best-effort: any failure while computing the fingerprint yields None.
    try:
        if is_drugbank:
            return get_morgan_fp_from_mol(row["mol"])
        return get_morgan_fp(inchi_value)
    except Exception:
        return None
# Uniform sample: compute one fingerprint per row, then drop the rows for
# which no fingerprint could be computed.
df_uniform["fp"] = df_uniform.apply(compute_fp, axis=1)
df_uniform = df_uniform.loc[df_uniform["fp"].notna()].reset_index(drop=True)

# Proportional sample: same treatment.
df_proportional["fp"] = df_proportional.apply(compute_fp, axis=1)
df_proportional = df_proportional.loc[df_proportional["fp"].notna()].reset_index(drop=True)



In [None]:

##############################
# 6. UMAP projection and visualization
##############################

# UMAP plot for the uniform sample, rows shuffled.
plot_umap(
    df_uniform,
    "UMAP - Uniforme Sampling (5000 compounds per dataset)",
    shuffle_points=True,
)

# UMAP plot for the proportional sample (1% of each dataset), rows ordered
# from the largest dataset to the smallest.
plot_umap(
    df_proportional,
    "UMAP - Proportionnal Sampling (1% of each dataset)",
    shuffle_points=False,
)


In [None]:

##############################
# 6. UMAP projection and visualization
##############################

# UMAP plot for the uniform sample, rows shuffled.
plot_umap(
    df_uniform,
    "UMAP - Uniforme Sampling (5000 compounds per dataset)",
    shuffle_points=True,
)

# UMAP plot for the proportional sample (1% of each dataset), rows ordered
# from the largest dataset to the smallest.
plot_umap(
    df_proportional,
    "UMAP - Proportionnal Sampling (1% of each dataset)",
    shuffle_points=False,
)


In [None]:

##############################
# 6. UMAP projection and visualization
##############################

# UMAP plot for the uniform sample, rows shuffled.
plot_umap(
    df_uniform,
    "UMAP - Uniforme Sampling (5000 compounds per dataset)",
    shuffle_points=True,
)

# UMAP plot for the proportional sample (1% of each dataset), rows ordered
# from the largest dataset to the smallest.
plot_umap(
    df_proportional,
    "UMAP - Proportionnal Sampling (1% of each dataset)",
    shuffle_points=False,
)
