In [1]:
import pandas as pd
import torch
import numpy as np

In [None]:
# Load the compound table and drop the listed columns before any further processing.
df = (
    pd.read_csv('/home/maxime/data/test.csv')
    .drop(
        columns=[
            'Unnamed: 0',
            'Replica_R1',
            'Replica_R2',
            'Mahalanobis_distance',
            'compound_id',
            'path_embedding',
        ]
    )
)

In [None]:
def string_to_tensor(tensor_str):
    """Parse the printed form of a tensor (e.g. ``"tensor([0.1, 0.2])"``) back into a tensor.

    Args:
        tensor_str (str): Text containing a (possibly nested) list of numbers,
            optionally wrapped in ``tensor(...)``.

    Returns:
        torch.Tensor: Tensor built from the parsed numbers.

    Raises:
        ValueError, SyntaxError: If, after stripping the wrapper, the text is not a
            plain Python literal (for example when it still carries ``dtype=...``).
    """
    import ast  # local import keeps this cell runnable on its own

    # Remove 'tensor(' and every ')' so that only the bracketed list of numbers is left
    clean_str = tensor_str.replace('tensor(', '').replace(')', '').strip()
    # literal_eval accepts literals only, so a crafted CSV cell cannot execute code
    # the way the previous eval() call would have allowed.
    return torch.tensor(ast.literal_eval(clean_str))

# Parse every string in 'Embeddings_mean' with string_to_tensor and store the
# resulting tensors in a new column, 'Embeddings_mean_tensor'.
df['Embeddings_mean_tensor'] = df['Embeddings_mean'].apply(string_to_tensor)


In [None]:
# Show the column labels of df (now including 'Embeddings_mean_tensor').
df.columns

In [None]:
# Second table; its 'Metadata_JCP2022' column supplies the reference identifiers used below.
df_hit = pd.read_csv('/home/maxime/data/test_2.csv')

In [None]:
# Display df_hit for inspection.
df_hit

In [None]:
# Reference set: the `Metadata_JCP2022` identifiers of the first 10 rows of df_hit.
ref = df_hit['Metadata_JCP2022'].head(10).to_list()

In [None]:
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
from rdkit import DataStructs
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt

# Function to calculate Tanimoto similarity
def calculate_tanimoto(fp1, fp2):
    """Return the Tanimoto similarity of two fingerprints, as computed by RDKit's ``DataStructs``."""
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

# Function to compute Morgan fingerprints from InChI strings
def compute_fingerprints(inchi_list, radius=2, n_bits=1024):
    """Compute one Morgan bit-vector fingerprint per InChI string.

    Args:
        inchi_list (iterable): InChI strings. Entries that are not strings
            (e.g. NaN/None coming from empty CSV cells) are treated as unparseable.
        radius (int): Morgan radius passed to RDKit (default 2, the previous fixed value).
        n_bits (int): Fingerprint length in bits (default 1024, the previous fixed value).

    Returns:
        list: Same length and order as ``inchi_list``; each entry is the fingerprint,
        or ``None`` when no molecule could be built from that entry.
    """
    from rdkit import Chem

    fingerprints = []
    for inchi in inchi_list:
        # Only hand real strings to RDKit; anything else yields None just like an
        # InChI that RDKit cannot parse, so callers keep a position-aligned list.
        mol = Chem.MolFromInchi(inchi) if isinstance(inchi, str) else None
        if mol is None:
            fingerprints.append(None)
        else:
            fingerprints.append(
                AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
            )
    return fingerprints

# Cosine similarity between two embeddings (called by every nearest-neighbour search in this notebook)
def calculate_cosine_similarity(embedding_a, embedding_b):
    """Return the cosine similarity of two embeddings as a Python float.

    Args:
        embedding_a, embedding_b: Tensors (or anything ``torch.as_tensor`` accepts)
            holding the same number of elements; both are flattened before comparison.

    Returns:
        float: Cosine of the angle between the two flattened vectors.
    """
    vec_a = torch.as_tensor(embedding_a, dtype=torch.float64).flatten()
    vec_b = torch.as_tensor(embedding_b, dtype=torch.float64).flatten()
    return torch.nn.functional.cosine_similarity(vec_a, vec_b, dim=0).item()


# Function to compute similarities for a set of reference molecules (it does not plot)
def compute_and_plot_similarity_with_ref(df, reference_molecules, top_fraction=0.05):
    """
    For each reference molecule, compute (a) its mean Tanimoto similarity to every
    molecule in ``df`` and (b) its mean Tanimoto similarity to the molecules whose
    embeddings are closest to it by cosine similarity.

    The reference molecule itself is a row of ``df`` and is therefore part of both
    averages. Molecules whose InChI gives no fingerprint are skipped.

    Args:
        df (pd.DataFrame): The full DataFrame containing all molecules. Reads the
            columns 'Metadata_JCP2022', 'inchi' and 'Embeddings_mean_tensor'.
        reference_molecules (list): List of `Metadata_JCP2022` identifiers to use as reference.
        top_fraction (float): Fraction (0 < f <= 1) of molecules, ranked by cosine
            similarity, that count as "closest". Defaults to 0.05, the value that
            was previously hard-coded.

    Returns:
        pd.DataFrame: Copy of the reference rows with two added columns,
        'Average_Tanimoto' and 'Average_Tanimoto_Top10Percent'. The second name is
        kept for compatibility with existing plotting code even though the fraction
        is ``top_fraction``. Values are None for references without a fingerprint and
        0 when there is nothing to average.

    Raises:
        ValueError: If ``top_fraction`` is not in (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f"top_fraction must be in (0, 1], got {top_fraction!r}")

    # Work on a copy: the result columns added below must not be written into a slice of df
    reference_df = df[df['Metadata_JCP2022'].isin(reference_molecules)].copy()

    # Step 1: Compute fingerprints from InChI for all molecules and for the references
    fingerprints_all = compute_fingerprints(df['inchi'])
    fingerprints_ref = compute_fingerprints(reference_df['inchi'])

    # Step 2: Extract embeddings for all molecules and reference molecules
    embeddings_all = df['Embeddings_mean_tensor'].tolist()
    embeddings_ref = reference_df['Embeddings_mean_tensor'].tolist()

    # Positions of molecules that have a usable fingerprint; identical for every reference
    valid_positions = [j for j, fp in enumerate(fingerprints_all) if fp is not None]

    all_tanimoto_averages = []
    closest_averages = []

    for i, ref_fp in enumerate(fingerprints_ref):
        if ref_fp is None:
            all_tanimoto_averages.append(None)
            closest_averages.append(None)
            continue

        # Both lists are built over the same positions, so index k refers to the same molecule in each
        tanimotos = [calculate_tanimoto(ref_fp, fingerprints_all[j]) for j in valid_positions]
        cosines = [
            calculate_cosine_similarity(embeddings_ref[i], embeddings_all[j])
            for j in valid_positions
        ]

        # Average Tanimoto similarity to all molecules
        all_tanimoto_averages.append(sum(tanimotos) / len(tanimotos) if tanimotos else 0)

        # Average Tanimoto similarity to the top `top_fraction` closest in cosine similarity
        n_closest = int(len(cosines) * top_fraction)
        closest = sorted(range(len(cosines)), key=cosines.__getitem__, reverse=True)[:n_closest]
        closest_tanimotos = [tanimotos[k] for k in closest]
        closest_averages.append(
            sum(closest_tanimotos) / len(closest_tanimotos) if closest_tanimotos else 0
        )

    # Step 3: Add results to the reference DataFrame
    reference_df['Average_Tanimoto'] = all_tanimoto_averages
    reference_df['Average_Tanimoto_Top10Percent'] = closest_averages

    return reference_df


# Example usage: similarity summary for the reference identifiers in `ref`, kept in `updated_df`
updated_df = compute_and_plot_similarity_with_ref(df, ref)


In [None]:
# Sanity check of sklearn's cosine_similarity: two identical vectors should give a similarity of 1.
X = [[1, 0, 0]]

Y = [[1, 0, 0]]

cosine_similarity(X, Y)

In [None]:

def plot_with_bars_and_error(df, error_tanimoto=None, error_tanimoto_top10=None):
    """Draw grouped bars of the two Tanimoto averages, one group per row of ``df``.

    Args:
        df (pd.DataFrame): Must contain 'Average_Tanimoto' and
            'Average_Tanimoto_Top10Percent'; missing values are plotted as 0.
            The index values are used as x tick labels.
        error_tanimoto (array-like, optional): Error-bar sizes for the first series,
            one per row. No error bars are drawn when omitted.
        error_tanimoto_top10 (array-like, optional): Same, for the second series.

    Returns:
        None. The figure is shown with ``plt.show()``.

    Note:
        Earlier versions drew error bars from random numbers as a placeholder, which
        made the figure change on every call and suggested a spread that was never
        measured. Error bars now appear only when real values are passed in.
    """
    # Extract values for plotting
    indices = df.index
    avg_tanimoto = df['Average_Tanimoto'].fillna(0).values
    avg_tanimoto_top10 = df['Average_Tanimoto_Top10Percent'].fillna(0).values

    # Bar width
    bar_width = 0.4

    # One group per row; the two bars sit half a width either side of the tick so
    # they touch at the tick and leave a gap before the next group.
    x = np.arange(len(indices))

    # Plot bars (with error bars only if supplied)
    plt.figure(figsize=(7, 14))
    plt.bar(x - bar_width / 2, avg_tanimoto, bar_width, yerr=error_tanimoto,
            label='Average Tanimoto Similarity', alpha=0.7, color='blue', capsize=4)
    plt.bar(x + bar_width / 2, avg_tanimoto_top10, bar_width, yerr=error_tanimoto_top10,
            label='Average Tanimoto Similarity (Top 5%)', alpha=0.7, color='red', capsize=4)

    # Add labels and legend
    plt.ylabel('Tanimoto Similarity', fontsize=14)
    plt.xticks(x, indices, rotation=45, fontsize=10)
    plt.title('Tanimoto Similarity Metrics', fontsize=16)
    plt.legend(fontsize=12)

    # Add grid
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Tight layout for better spacing
    plt.tight_layout()
    plt.show()
# Plot only the first row of updated_df
plot_with_bars_and_error(updated_df[:1])


In [None]:
def compute_and_plot_similarity_with_hit_ref(df, reference_molecules, top_fraction=0.05):
    """
    For each reference molecule, compute (a) its mean Tanimoto similarity to every
    molecule in ``df`` and (b) its mean Tanimoto similarity to the "hit" molecules
    (``Metadata_Bioactivity == 'hit'``) whose embeddings are closest to it by cosine
    similarity.

    Molecules whose InChI gives no fingerprint are skipped. If a reference molecule
    is itself a hit, it is part of the hit set it is compared against.

    Args:
        df (pd.DataFrame): The full DataFrame containing all molecules. Reads the
            columns 'Metadata_JCP2022', 'Metadata_Bioactivity', 'inchi' and
            'Embeddings_mean_tensor'.
        reference_molecules (list): List of `Metadata_JCP2022` identifiers to use as reference.
        top_fraction (float): Fraction (0 < f <= 1) of hits, ranked by cosine
            similarity, that count as "closest". Defaults to 0.05, the value that
            was previously hard-coded.

    Returns:
        pd.DataFrame: Copy of the reference rows with two added columns,
        'Average_Tanimoto' and 'Average_Tanimoto_Top10Percent' (name kept for
        compatibility; the fraction is ``top_fraction``). Values are None for
        references without a fingerprint and 0 when there is nothing to average.

    Raises:
        ValueError: If ``top_fraction`` is not in (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f"top_fraction must be in (0, 1], got {top_fraction!r}")

    # Copy the reference rows so the result columns are not written into a slice of df
    reference_df = df[df['Metadata_JCP2022'].isin(reference_molecules)].copy()
    hit_df = df[df['Metadata_Bioactivity'] == 'hit']

    # Step 1: Compute fingerprints from InChI for all molecules, references and hits
    fingerprints_all = compute_fingerprints(df['inchi'])
    fingerprints_ref = compute_fingerprints(reference_df['inchi'])
    fingerprints_hits = compute_fingerprints(hit_df['inchi'])

    # Step 2: Extract embeddings for reference molecules and hits
    embeddings_ref = reference_df['Embeddings_mean_tensor'].tolist()
    embeddings_hits = hit_df['Embeddings_mean_tensor'].tolist()

    # Keep only entries with a usable fingerprint. Each hit keeps its fingerprint and
    # embedding together so a rank found from the embeddings points at the right fingerprint.
    valid_all = [fp for fp in fingerprints_all if fp is not None]
    valid_hits = [
        (fp, emb) for fp, emb in zip(fingerprints_hits, embeddings_hits) if fp is not None
    ]

    all_tanimoto_averages = []
    closest_averages = []

    for i, ref_fp in enumerate(fingerprints_ref):
        if ref_fp is None:
            all_tanimoto_averages.append(None)
            closest_averages.append(None)
            continue

        # Average Tanimoto similarity to all molecules
        tanimotos_all = [calculate_tanimoto(ref_fp, fp) for fp in valid_all]
        all_tanimoto_averages.append(
            sum(tanimotos_all) / len(tanimotos_all) if tanimotos_all else 0
        )

        # Rank the hits by cosine similarity of their embeddings to the reference
        cosines_hits = [
            calculate_cosine_similarity(embeddings_ref[i], emb) for _, emb in valid_hits
        ]
        n_closest = int(len(cosines_hits) * top_fraction)
        closest = sorted(
            range(len(cosines_hits)), key=cosines_hits.__getitem__, reverse=True
        )[:n_closest]

        # Tanimoto similarity to those same hits. The earlier code looked these
        # indices up in the all-molecules list, i.e. in a different ordering.
        closest_tanimotos = [calculate_tanimoto(ref_fp, valid_hits[k][0]) for k in closest]
        closest_averages.append(
            sum(closest_tanimotos) / len(closest_tanimotos) if closest_tanimotos else 0
        )

    # Step 3: Add results to the reference DataFrame
    reference_df['Average_Tanimoto'] = all_tanimoto_averages
    reference_df['Average_Tanimoto_Top10Percent'] = closest_averages

    return reference_df


In [None]:
# Reference molecules: the `Metadata_JCP2022` identifiers collected in `ref` above

# Compute similarities, using the "hit" molecules as the pool of closest neighbours
reference_df = compute_and_plot_similarity_with_hit_ref(df, ref)

# The results are plotted in the next cell



In [None]:
# Bar chart of the two Tanimoto averages for every reference molecule
plot_with_bars_and_error(reference_df)


In [None]:
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit import Chem
import matplotlib.pyplot as plt

def compute_and_compare_scaffold_percentages(df, reference_molecules, top_fraction=0.1):
    """
    Compute the percentage of unique Murcko scaffolds among the "hit" molecules
    closest (by cosine similarity of embeddings) to each reference molecule, compare
    it to the percentage of unique scaffolds in the entire dataset, and plot both.

    Side effect: a 'Murcko_Scaffold' column (scaffold SMILES, or None) is written
    onto ``df`` itself.

    Args:
        df (pd.DataFrame): The full DataFrame containing all molecules. Reads the
            columns 'Metadata_JCP2022', 'Metadata_Bioactivity', 'inchi' and
            'Embeddings_mean_tensor'.
        reference_molecules (list): List of `Metadata_JCP2022` identifiers to use as reference.
        top_fraction (float): Fraction (0 < f <= 1) of hits, ranked by cosine
            similarity, that count as "closest". Defaults to 0.1, the value that
            was previously hard-coded.

    Returns:
        pd.DataFrame: Copy of the reference rows with 'Closest_Scaffold_Percentage'
        (None for references without a fingerprint) and 'Overall_Scaffold_Percentage'.

    Raises:
        ValueError: If ``top_fraction`` is not in (0, 1].
    """
    if not 0 < top_fraction <= 1:
        raise ValueError(f"top_fraction must be in (0, 1], got {top_fraction!r}")

    # Helper to calculate Murcko scaffolds (as SMILES); None when no molecule can be built
    def calculate_murcko_scaffold(inchi):
        mol = Chem.MolFromInchi(inchi) if isinstance(inchi, str) else None
        if mol:
            return Chem.MolToSmiles(MurckoScaffold.GetScaffoldForMol(mol))
        return None

    # Reference rows are copied before the scaffold column is added, so the returned
    # frame has the same columns as before and is safe to extend.
    reference_df = df[df['Metadata_JCP2022'].isin(reference_molecules)].copy()

    # Precompute Murcko scaffolds once for the whole dataset; the hit subset is
    # sliced afterwards so it carries the column without a second pass.
    df['Murcko_Scaffold'] = df['inchi'].apply(calculate_murcko_scaffold)
    hit_df = df[df['Metadata_Bioactivity'] == 'hit']

    # Fingerprints are only used here to tell which molecules are parseable
    fingerprints_ref = compute_fingerprints(reference_df['inchi'])
    fingerprints_hits = compute_fingerprints(hit_df['inchi'])

    # Extract embeddings for reference molecules and hits
    embeddings_ref = reference_df['Embeddings_mean_tensor'].tolist()
    embeddings_hits = hit_df['Embeddings_mean_tensor'].tolist()

    # Keep each usable hit's embedding and scaffold together, so a rank computed
    # from the embeddings always points at that same hit's scaffold.
    valid_hits = [
        (emb, scaffold)
        for fp, emb, scaffold in zip(
            fingerprints_hits, embeddings_hits, hit_df['Murcko_Scaffold'].tolist()
        )
        if fp is not None
    ]

    # Compute percentage of unique scaffolds in the entire dataset
    all_unique_scaffolds = set(df['Murcko_Scaffold'].dropna())
    all_scaffold_percentage = (len(all_unique_scaffolds) / len(df)) * 100 if len(df) else 0.0

    closest_scaffold_percentages = []

    for i, ref_fp in enumerate(fingerprints_ref):
        if ref_fp is None:
            closest_scaffold_percentages.append(None)
            continue

        # Rank the usable hits by cosine similarity to this reference
        cosines_hits = [
            calculate_cosine_similarity(embeddings_ref[i], emb) for emb, _ in valid_hits
        ]
        n_closest = int(len(cosines_hits) * top_fraction)
        closest = sorted(
            range(len(cosines_hits)), key=cosines_hits.__getitem__, reverse=True
        )[:n_closest]
        closest_scaffolds = [
            valid_hits[k][1] for k in closest if not pd.isna(valid_hits[k][1])
        ]

        # Calculate the percentage of unique scaffolds in the closest molecules
        if closest_scaffolds:
            scaffold_percentage = len(set(closest_scaffolds)) / len(closest_scaffolds) * 100
        else:
            scaffold_percentage = 0
        closest_scaffold_percentages.append(scaffold_percentage)

    # Add results to the reference DataFrame
    reference_df['Closest_Scaffold_Percentage'] = closest_scaffold_percentages
    reference_df['Overall_Scaffold_Percentage'] = all_scaffold_percentage

    # Plot the results
    plt.figure(figsize=(10, 6))

    # Bar plot for closest scaffold percentages
    plt.bar(reference_df['Metadata_JCP2022'], reference_df['Closest_Scaffold_Percentage'], color='blue', alpha=0.7, label='Closest Scaffold Percentage')

    # Line for overall scaffold percentage
    plt.axhline(y=all_scaffold_percentage, color='red', linestyle='--', label='Overall Scaffold Percentage')

    # Plot settings
    plt.xlabel('Reference Molecules', fontsize=14)
    plt.ylabel('Scaffold Percentage (%)', fontsize=14)
    plt.title('Unique Scaffold Percentage Comparison', fontsize=16)
    plt.xticks(rotation=45)
    plt.legend(fontsize=12)
    plt.tight_layout()
    plt.show()

    return reference_df


In [None]:

# Compute scaffold percentages and compare; note this rebinds `reference_df` to the new result
reference_df = compute_and_compare_scaffold_percentages(df, ref)


In [None]:
from rdkit.Chem import Draw
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit import Chem
import matplotlib.pyplot as plt
from IPython.display import display

def plot_closest_scaffolds_and_molecules(df, reference_molecules, num_closest=5):
    """
    Display the reference molecule, its Murcko scaffold, and the closest molecules
    (based on cosine similarity of embeddings) along with their scaffolds.

    The search runs over every row of ``df``, so the reference molecule itself is a
    candidate and will normally be among the closest results.

    Side effect: the 'Murcko_Scaffold' column of ``df`` is (over)written with RDKit
    scaffold molecules (or None where the InChI cannot be parsed).

    Args:
        df (pd.DataFrame): The full DataFrame containing all molecules. Reads the
            columns 'Metadata_JCP2022', 'inchi' and 'Embeddings_mean_tensor'.
        reference_molecules (list): List of `Metadata_JCP2022` identifiers to use as reference.
        num_closest (int): Number of closest molecules to display for each reference molecule.

    Returns:
        None. Images are shown with ``display``.
    """
    # Helper: build a molecule from an InChI; non-string entries (e.g. NaN) give None
    def parse_inchi(inchi):
        return Chem.MolFromInchi(inchi) if isinstance(inchi, str) else None

    # Helper to calculate Murcko scaffolds
    def calculate_murcko_scaffold(inchi):
        mol = parse_inchi(inchi)
        if mol:
            return MurckoScaffold.GetScaffoldForMol(mol)
        return None

    # Precompute embeddings and scaffolds
    df['Murcko_Scaffold'] = df['inchi'].apply(calculate_murcko_scaffold)
    embeddings_all = df['Embeddings_mean_tensor'].tolist()

    # Iterate over reference molecules
    for ref in reference_molecules:
        ref_row = df[df['Metadata_JCP2022'] == ref]
        if ref_row.empty:
            print(f"No reference molecule found for {ref}")
            continue

        # Use the first matching row; parse its InChI once and derive the scaffold from it
        first_match = ref_row.iloc[0]
        ref_embedding = first_match['Embeddings_mean_tensor']
        ref_mol = parse_inchi(first_match['inchi'])
        ref_scaffold = MurckoScaffold.GetScaffoldForMol(ref_mol) if ref_mol else None

        # Compute cosine similarity to all molecules (-1 ranks rows without an embedding last)
        cosine_similarities = [
            calculate_cosine_similarity(ref_embedding, emb) if emb is not None else -1
            for emb in embeddings_all
        ]

        # Get the indices of the top closest molecules
        closest_indices = sorted(
            range(len(cosine_similarities)),
            key=cosine_similarities.__getitem__,
            reverse=True,
        )[:num_closest]
        closest_molecules = df.iloc[closest_indices]

        # Plot the reference molecule and its scaffold
        print(f"Reference Molecule: {ref}")
        if ref_mol:
            print("Reference Molecule Structure and Scaffold:")
            display(Draw.MolsToGridImage([ref_mol, ref_scaffold], molsPerRow=2, subImgSize=(300, 300), legends=['Reference Molecule', 'Reference Scaffold']))

        # Parse each neighbour once and drop the ones RDKit cannot read
        parsed = (parse_inchi(inchi) for inchi in closest_molecules['inchi'])
        mol_images = [mol for mol in parsed if mol]
        scaffold_images = [scaf for scaf in closest_molecules['Murcko_Scaffold'] if scaf is not None]

        # Draw molecular structures of closest molecules
        if mol_images:
            print("Closest Molecules:")
            mol_grid = Draw.MolsToGridImage(mol_images, molsPerRow=3, subImgSize=(300, 300), legends=[f"Mol {i}" for i in range(len(mol_images))])
            display(mol_grid)

        # Draw Murcko scaffolds of closest molecules
        if scaffold_images:
            print("Scaffolds of Closest Molecules:")
            scaffold_grid = Draw.MolsToGridImage(scaffold_images, molsPerRow=3, subImgSize=(300, 300), legends=[f"Scaffold {i}" for i in range(len(scaffold_images))])
            display(scaffold_grid)


In [None]:
# Placeholder list of `Metadata_JCP2022` identifiers.
# NOTE: this variable is not passed to the call below, which uses `ref` instead.
reference_molecules = ['Mol1', 'Mol2', 'Mol3']  # Replace with actual identifiers

# Plot the closest molecules and their scaffolds for the identifiers in `ref`
plot_closest_scaffolds_and_molecules(df, ref, num_closest=5)


In [None]:
from rdkit.Chem import Draw
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit import Chem
import matplotlib.pyplot as plt
from IPython.display import display

def plot_closest_scaffolds_and_molecules_side_by_side(df, reference_molecules, num_closest=5):
    """
    Display the reference molecule with its Murcko scaffold, then each of the closest
    molecules (by cosine similarity of embeddings) next to its own scaffold.

    The search runs over every row of ``df``, so the reference molecule itself is a
    candidate and will normally be among the closest results.

    Side effect: the 'Murcko_Scaffold' column of ``df`` is (over)written with RDKit
    scaffold molecules (or None where the InChI cannot be parsed).

    Args:
        df (pd.DataFrame): The full DataFrame containing all molecules. Reads the
            columns 'Metadata_JCP2022', 'inchi' and 'Embeddings_mean_tensor'.
        reference_molecules (list): List of `Metadata_JCP2022` identifiers to use as reference.
        num_closest (int): Number of closest molecules to display for each reference molecule.

    Returns:
        None. Images are shown with ``display``.
    """
    # Helper: build a molecule from an InChI; non-string entries (e.g. NaN) give None
    def parse_inchi(inchi):
        return Chem.MolFromInchi(inchi) if isinstance(inchi, str) else None

    # Helper to calculate Murcko scaffolds
    def calculate_murcko_scaffold(inchi):
        mol = parse_inchi(inchi)
        if mol:
            return MurckoScaffold.GetScaffoldForMol(mol)
        return None

    # Precompute embeddings and scaffolds
    df['Murcko_Scaffold'] = df['inchi'].apply(calculate_murcko_scaffold)
    embeddings_all = df['Embeddings_mean_tensor'].tolist()

    # Iterate over reference molecules
    for ref in reference_molecules:
        ref_row = df[df['Metadata_JCP2022'] == ref]
        if ref_row.empty:
            print(f"No reference molecule found for {ref}")
            continue

        # Use the first matching row; parse its InChI once and derive the scaffold from it
        first_match = ref_row.iloc[0]
        ref_embedding = first_match['Embeddings_mean_tensor']
        ref_mol = parse_inchi(first_match['inchi'])
        ref_scaffold = MurckoScaffold.GetScaffoldForMol(ref_mol) if ref_mol else None

        # Compute cosine similarity to all molecules (-1 ranks rows without an embedding last)
        cosine_similarities = [
            calculate_cosine_similarity(ref_embedding, emb) if emb is not None else -1
            for emb in embeddings_all
        ]

        # Get the indices of the top closest molecules
        closest_indices = sorted(
            range(len(cosine_similarities)),
            key=cosine_similarities.__getitem__,
            reverse=True,
        )[:num_closest]
        closest_molecules = df.iloc[closest_indices]

        # Build the molecule/scaffold pairs row by row, so each molecule is always
        # shown beside the scaffold stored for that same row.
        paired_images = []
        legends = []
        for inchi, scaffold in zip(closest_molecules['inchi'], closest_molecules['Murcko_Scaffold']):
            mol = parse_inchi(inchi)
            if mol and scaffold is not None:
                paired_images.extend([mol, scaffold])
                legends.extend(["Molecule", "Murcko Scaffold"])

        # Plot the reference molecule and its scaffold
        print(f"Reference Molecule: {ref}")
        if ref_mol:
            print("Reference Molecule Structure and Scaffold:")
            display(Draw.MolsToGridImage([ref_mol, ref_scaffold], molsPerRow=2, subImgSize=(300, 300), legends=['Reference Molecule', 'Reference Scaffold']))

        # Plot paired molecules and scaffolds
        if paired_images:
            print("Closest Molecules and Their Scaffolds:")
            paired_grid = Draw.MolsToGridImage(paired_images, molsPerRow=2, subImgSize=(300, 300), legends=legends)
            display(paired_grid)


In [None]:
# Show each of the 5 closest molecules next to its scaffold, for every identifier in `ref`
plot_closest_scaffolds_and_molecules_side_by_side(df, ref, num_closest=5)
