# Import 

In [1]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
from tqdm import tqdm

In [2]:
# Alternative metadata file, kept for reference:
# df_phenom = pd.read_parquet('/home/maxime/data/jump_embeddings/metadata_dinov2_g.parquet')
df_phenom = pd.read_parquet(
    '/projects/synsight/data/jump_embeddings/wells_embeddings/openphenom/metadata_openphenom.parquet'
)

# Keep one row per distinct (compound ID, InChI) pair. reset_index() without
# drop=True also stores the previous row labels in an "index" column.
df_jump = (
    df_phenom[["Metadata_JCP2022", "Metadata_InChI"]]
    .drop_duplicates()
    .reset_index()
)

In [3]:
# Morgan fingerprint generator shared by inchi_to_fp and smiles_to_fp:
# radius 2, 2048-bit fingerprints, chirality not encoded.
mg = AllChem.GetMorganGenerator(radius=2, fpSize=2048, includeChirality=False)

In [28]:
def inchi_to_fp(inchi):
    """Convert an InChI string to an RDKit Morgan fingerprint.

    Args:
        inchi: InChI string of the molecule. Non-string values (e.g. the
            None/NaN that a DataFrame uses for missing entries) and empty
            strings are treated as unparsable.

    Returns:
        The fingerprint produced by the module-level generator ``mg``, or
        None when the input is missing or RDKit cannot parse it.
    """
    # Missing metadata must not reach RDKit: the parser only accepts strings.
    if not isinstance(inchi, str) or not inchi:
        return None
    mol = Chem.MolFromInchi(inchi)
    if mol is None:
        return None
    return mg.GetFingerprint(mol)
 
def smiles_to_fp(smiles):
    """Convert a SMILES string to an RDKit Morgan fingerprint.

    Args:
        smiles: SMILES string of the molecule. Non-string values (e.g. the
            None/NaN that a DataFrame uses for missing entries) are treated
            as unparsable.

    Returns:
        The fingerprint produced by the module-level generator ``mg``, or
        None when the input is missing or RDKit cannot parse it.
    """
    # Missing values must not reach RDKit: the parser only accepts strings.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    return mg.GetFingerprint(mol)

def bulk_tanimoto_similarity(query_fp, list_of_fps):
    """Return the Tanimoto similarity of ``query_fp`` to every fingerprint in ``list_of_fps``.

    The candidates are copied into a plain Python list before being handed
    to RDKit's bulk routine, so any iterable of fingerprints is accepted.
    """
    candidates = [*list_of_fps]
    return DataStructs.BulkTanimotoSimilarity(query_fp, candidates)

def compute_similarity(query_fp, list_of_fps_jump, query_type='smiles'):
    """Compute Tanimoto similarities between a query fingerprint and reference fingerprints.

    Args:
        query_fp: Fingerprint of the query molecule.
        list_of_fps_jump: Iterable of reference fingerprints; None entries
            are dropped, so the result can be shorter than the input.
        query_type: Accepted for compatibility; not used by this function.

    Returns:
        The similarities returned by ``bulk_tanimoto_similarity``, one per
        non-None reference fingerprint, in input order.

    Raises:
        ValueError: If ``query_fp`` is None.
    """
    if query_fp is None:
        raise ValueError("Invalid query")

    usable_fps = []
    for candidate in list_of_fps_jump:
        if candidate is None:
            continue  # skip molecules whose fingerprint could not be built
        usable_fps.append(candidate)

    return bulk_tanimoto_similarity(query_fp, usable_fps)

In [None]:
# Fingerprint every JUMP compound from its InChI (None when conversion fails),
# then discard the rows that have no fingerprint.
list_of_fps_jump = list(map(inchi_to_fp, tqdm(df_jump['Metadata_InChI'].to_list())))
df_jump['Fps'] = list_of_fps_jump
df_jump.dropna(subset=['Fps'], inplace=True)

# Import mols from lit-pcba

In [None]:
import os
import pandas as pd

def _read_smi(path, is_active):
    """Read one space-separated .smi file and tag every row with its activity label."""
    frame = pd.read_csv(path, sep=" ", names=["smiles", "id_lit_pcba"])
    frame["Active"] = is_active
    return frame


def load_smi_files(base_path):
    """Load the actives.smi / inactives.smi files of every sub-folder.

    Args:
        base_path (str): Path of the folder whose sub-folders contain the
            .smi files.

    Returns:
        dict: Maps each sub-folder name to a DataFrame with the columns
        "smiles", "id_lit_pcba" and "Active" (True for rows read from
        actives.smi, False for rows read from inactives.smi; actives come
        first). Sub-folders holding neither file are omitted. Keys are
        inserted in sorted order so the result is reproducible.
    """
    data_dict = {}

    # os.listdir gives no ordering guarantee; sort for deterministic output.
    for folder in sorted(os.listdir(base_path)):
        folder_path = os.path.join(base_path, folder)

        # Only sub-folders are targets; ignore stray files.
        if not os.path.isdir(folder_path):
            continue

        frames = []
        for file_name, is_active in (("actives.smi", True), ("inactives.smi", False)):
            file_path = os.path.join(folder_path, file_name)
            # isfile (not exists) so a directory with that name is not read.
            if os.path.isfile(file_path):
                frames.append(_read_smi(file_path, is_active))

        # Store the target only when at least one file was found.
        if frames:
            data_dict[folder] = pd.concat(frames, ignore_index=True)

    return data_dict

# Path to the "data" folder that holds one sub-folder per target
base_path = "../data"

# Load the data: one DataFrame per sub-folder, keyed by sub-folder name
data_dict = load_smi_files(base_path)


In [None]:

# Compute fingerprints for the smaller targets only (fewer than 6000 molecules);
# larger targets are left without an 'Fps' column.
for key, df in data_dict.items():
    if len(df)<6000:
        print(f"\n📂 {key} (Total: {len(df)} molécules)")
        # Use a dedicated name so the JUMP fingerprint list `list_of_fps_jump`
        # built earlier is not overwritten with LIT-PCBA fingerprints.
        target_fps = [smiles_to_fp(smi) for smi in tqdm(df['smiles'].to_list())]
        df['Fps'] = target_fps
        # Drop molecules whose SMILES could not be converted (in place, so
        # the DataFrame stored in data_dict is updated too).
        df.dropna(subset='Fps', inplace=True)



In [None]:
# Display whatever DataFrame `df` is currently bound to
# (presumably the last one visited by the loop in the previous cell).
df

# Explore

In [31]:
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed

def compute_best_match(query_fp, list_of_fps_jump, jump_ids):
    """Find the JUMP molecule most similar to a query fingerprint.

    Args:
        query_fp: Fingerprint of the query molecule.
        list_of_fps_jump (list): Fingerprints of the JUMP molecules. None
            entries are ignored.
        jump_ids (list): JUMP identifiers, parallel to ``list_of_fps_jump``.

    Returns:
        tuple: (best similarity, ID of the matching JUMP molecule). On ties
        the first best entry wins.

    Raises:
        ValueError: If there is no usable reference fingerprint, or (raised
            by ``compute_similarity``) if ``query_fp`` is None.
    """
    if not list_of_fps_jump:
        raise ValueError("No reference fingerprints to compare against")

    similarities = compute_similarity(query_fp, list_of_fps_jump)
    if not similarities:
        raise ValueError("No reference fingerprints to compare against")

    # compute_similarity drops None fingerprints, which shifts positions.
    # Re-align the IDs so the winning index points at the right molecule.
    if len(similarities) != len(jump_ids):
        jump_ids = [
            jump_id
            for fp, jump_id in zip(list_of_fps_jump, jump_ids)
            if fp is not None
        ]

    best_index = max(range(len(similarities)), key=similarities.__getitem__)
    return similarities[best_index], jump_ids[best_index]

def find_best_match_parallel(df, df_jump, n_jobs=-1):
    """Find, in parallel with joblib, the closest JUMP molecule for each row of ``df``.

    Args:
        df (pd.DataFrame): Molecules to compare; must have an 'Fps' column.
        df_jump (pd.DataFrame): Reference molecules; must have the columns
            'Fps' and 'Metadata_JCP2022'.
        n_jobs (int): Number of workers passed to joblib (-1 = all cores).

    Returns:
        pd.DataFrame: The same ``df`` object, modified in place with two
        new columns:
            - Best_Similarity: the highest similarity found
            - Best_ID_Jump: the 'Metadata_JCP2022' ID of that molecule
    """
    # Nothing to compare: add the (empty) result columns and stop here,
    # since unpacking an empty result list below would fail.
    if df.shape[0] == 0:
        df['Best_Similarity'] = []
        df['Best_ID_Jump'] = []
        return df

    list_of_fps_jump = df_jump['Fps'].to_list()  # JUMP fingerprints
    jump_ids = df_jump['Metadata_JCP2022'].to_list()  # JUMP IDs

    # Only the 'Fps' column is needed, so iterate it directly rather than
    # materialising every row with iterrows().
    results = Parallel(n_jobs=n_jobs, backend="loky")(
        delayed(compute_best_match)(query_fp, list_of_fps_jump, jump_ids)
        for query_fp in tqdm(df['Fps'], total=df.shape[0])
    )

    # Split the (similarity, id) pairs into two parallel sequences.
    best_similarities, best_ids = zip(*results)

    df['Best_Similarity'] = best_similarities
    df['Best_ID_Jump'] = best_ids

    return df



In [None]:
# Run the best-match search on the 'TP53' target only; the function adds
# the Best_Similarity / Best_ID_Jump columns to that DataFrame in place.
find_best_match_parallel(data_dict['TP53'], df_jump)

In [None]:

# 🔥 Apply the best-match search to every target in the dictionary with joblib
for target, target_df in data_dict.items():
    # An earlier cell only adds 'Fps' to targets with fewer than 6000
    # molecules; skip the others instead of failing with a KeyError.
    if 'Fps' not in target_df.columns:
        print(f"⏭️ Skipping {target}: no 'Fps' column")
        continue
    print(f"🚀 Processing {target} with multiprocessing...")
    data_dict[target] = find_best_match_parallel(target_df, df_jump)

# Sanity check: show the first rows of every target
for key, df in data_dict.items():
    print(f"\n📂 {key} - Extrait avec similarités")
    print(df.head())


In [None]:
# Dictionary collecting one column of similarities per query molecule
similarity_results = {}
list_of_fps_jump = df_jump['Fps'].to_list()
for _, row in tqdm(df.iterrows(), total=df.shape[0]):
    # compute_similarity expects a fingerprint, not a raw SMILES string,
    # so convert the query first.
    query_fp = smiles_to_fp(row['SMILES'])
    if query_fp is None:
        continue  # unparsable SMILES: no column for this molecule
    similarities = compute_similarity(query_fp, list_of_fps_jump)

    # Store the similarities as a column named after the query identifier
    similarity_results[f'TC_to_{row["ZINC ID"]}'] = similarities

# Turn the dictionary into a DataFrame aligned on df_jump's index (passing
# the index at construction also works when no column was produced).
similarity_df = pd.DataFrame(similarity_results, index=df_jump.index)

# Append the similarity columns to df_jump
final_df = pd.concat([df_jump, similarity_df], axis=1)

In [35]:
# Names of the per-query similarity columns
tc_columns = [col for col in final_df.columns if col.startswith("TC_to_")]

# Find the maximum similarity and the corresponding identifier.
# NOTE(review): df_d4 is not defined in the visible part of this notebook.
# With axis=1 the reduction runs across the TC_to_* columns, giving one value
# per row of final_df; the assignment then aligns on index labels with df_d4.
# Confirm this is the intended axis.
df_d4["Max_TC"] = final_df[tc_columns].max(axis=1)  # max TC value
# NOTE(review): idxmax(axis=1) returns a TC_to_* column label, so after the
# prefix is stripped this holds the identifier used to name that column,
# which may not be a JCP ID. Verify against how final_df was built.
df_d4["Best_JCPID"] = final_df[tc_columns].idxmax(axis=1).str.replace("TC_to_", "")  # associated identifier


In [None]:
# Plot a histogram of the Max_TC column
df_d4["Max_TC"].hist()