# Imports and Functions

In [None]:
import pandas as pd
from normalisation import BioproxyEvaluator
from tqdm import tqdm
from rdkit import Chem, DataStructs
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import umap
import matplotlib.pyplot as plt

In [None]:
# Load the compound embeddings table and register it with a fresh evaluator.
df_phenom = pd.read_parquet('/projects/synsight/data/openphenom/norm_2_compounds_embeddings.parquet')
evaluator = BioproxyEvaluator()
# NOTE(review): embeddings_cols=None presumably lets the evaluator pick its
# default embedding columns — confirm against BioproxyEvaluator.add_embeddings.
evaluator.add_embeddings(df_phenom, embeddings_cols=None)
# Last expression of the cell: displays the evaluator's repr.
evaluator

In [None]:
import pandas as pd
from scipy.spatial.distance import cosine

def compute_distances_and_select(dataframe, ref_id, n=14):
    """
    Rank every compound by cosine distance to a reference and flag the nearest ones.

    The input DataFrame is modified in place (two columns are added or
    overwritten) and the very same object is returned.

    Parameters:
    dataframe (pd.DataFrame): Must contain the 'Metadata_JCP2022' and 'Embeddings_mean' columns.
    ref_id (str): Compound ID, looked up in 'Metadata_JCP2022', used as the reference.
    n (int): How many of the closest non-reference rows are flagged as selected (default 14).

    Returns:
    pd.DataFrame: The input DataFrame with 'Cosine_Distance' (distance to the reference)
    and 'Selection' ('Positive Control', 'Selected Compounds' or 'Not Selected Compounds').

    Raises:
    ValueError: If ref_id does not occur in 'Metadata_JCP2022'.
    """
    id_column = dataframe['Metadata_JCP2022']
    if ref_id not in id_column.values:
        raise ValueError(f"Reference ID {ref_id} not found in 'Metadata_JCP2022'.")

    is_reference = id_column == ref_id

    # If the ID occurs on several rows, the first one supplies the reference vector.
    reference_vector = dataframe.loc[is_reference, 'Embeddings_mean'].iloc[0]

    def distance_to_reference(vector):
        return cosine(reference_vector, vector)

    dataframe['Cosine_Distance'] = dataframe['Embeddings_mean'].apply(distance_to_reference)

    # The reference rows are excluded before picking the n smallest distances.
    candidates = dataframe[id_column != ref_id]
    nearest_labels = candidates.nsmallest(n, 'Cosine_Distance').index

    # Default label first, then overwrite the reference and its neighbours.
    dataframe['Selection'] = 'Not Selected Compounds'
    dataframe.loc[is_reference, 'Selection'] = 'Positive Control'
    dataframe.loc[nearest_labels, 'Selection'] = 'Selected Compounds'

    return dataframe


In [None]:
# Shared Morgan fingerprint generator (radius 2, 2048-bit fingerprints).
gen = GetMorganGenerator(radius=2, fpSize=2048)


def compute_tanimoto(inchi_list: list):
    """Return the Tanimoto similarity of every molecule to the first one.

    Args:
        inchi_list (list): InChI strings. The first entry is the reference
            every other entry is compared with.

    Returns:
        list: One similarity per input, in input order. The first value
        compares the reference with itself. An empty input gives an empty
        list instead of an IndexError.
    """
    if not inchi_list:
        return []

    # NOTE(review): Chem.MolFromInchi presumably returns None for an InChI it
    # cannot parse, which would make GetFingerprint fail — confirm upstream
    # that all InChIs are valid.
    mols = [Chem.MolFromInchi(inchi) for inchi in inchi_list]
    fps = [gen.GetFingerprint(mol) for mol in mols]
    reference_fp = fps[0]
    return [DataStructs.TanimotoSimilarity(fp, reference_fp) for fp in fps]

def compute_ranking(
    source: str,
    screen: str,
    embeddings_name: str,
    JCP2022_id: str,
) -> pd.DataFrame:
    """
    Rank the compounds of a screen by their distance to a target compound.

    Reads the screen table and the distance matrix from the module-level
    `evaluator`.

    Args:
        source (str): Source of the screen data.
        screen (str): Name of the screen.
        embeddings_name (str): Name of the embedding.
        JCP2022_id (str): ID of the target to rank against.

    Returns:
        pd.DataFrame: Deep copy of the screen table with an extra
        "distance_to_target" column, sorted by that column in ascending order.

    Raises:
        IndexError: If JCP2022_id is not present in the screen.
    """
    screen_df = evaluator.screens_data[source][screen]
    distances = evaluator.distance_matrices[source][screen][embeddings_name]

    matching_labels = screen_df.index[screen_df["Metadata_JCP2022"] == JCP2022_id]
    if matching_labels.empty:
        # Same exception type as the bare `.index[0]` lookup, with a usable message.
        raise IndexError(
            f"{JCP2022_id} not found in 'Metadata_JCP2022' of screen {screen!r} ({source!r})."
        )
    target_index = matching_labels[0]

    results_df = screen_df.copy(deep=True)
    # NOTE(review): target_index is an index *label* used to pick a row of the
    # distance matrix; this assumes the screen's index labels equal row
    # positions — confirm.
    results_df["distance_to_target"] = distances[target_index]
    return results_df.sort_values(by="distance_to_target")

def get_group_data_norm(source, per, max_hit_rate=45):
    """
    Aggregate enrichment-factor statistics per assay for every screen of a source.

    For each screen registered under `source` in the module-level `evaluator`,
    enrichment factors are computed on "Embeddings_mean" and summarised.

    Args:
        source (str): Key into `evaluator.screens_data`.
        per: Threshold forwarded to `calculate_enrichment_factor` as `[per]`.
            NOTE(review): presumably a percentage — confirm against the evaluator.
        max_hit_rate (float): Assays whose hit rate (in percent) is greater than
            or equal to this value are left out of `group_data`. Defaults to 45,
            the previously hard-coded cut-off.

    Returns:
        tuple: `(group_data, combined_data, results_dict)` where
            - group_data is one row per retained assay (max/mean/median of EF
              and Norm_EF, N_hit, N_compounds, Max_theorical_EF, "Hit Rate"),
              sorted by ascending hit rate with "Assay" as an ordered categorical;
            - combined_data is the concatenated per-assay EF rows (all assays);
            - results_dict maps screen name to the raw enrichment DataFrame.
    """
    data = []
    hit_rates = []
    results_dict = {}
    for screen_name, screen_df in evaluator.screens_data[source].items():
        # Calculate enrichment factors
        results_df = evaluator.calculate_enrichment_factor(
            source,
            screen_name,
            "Embeddings_mean",
            [per],
            norm=True,
            plot=False,
        )
        results_dict[screen_name] = results_df

        # Hit count = distinct compound IDs in the enrichment results
        N_hit = results_df["Metadata_JCP2022"].nunique()
        N_compounds = len(screen_df)

        # Extract relevant data
        assay_data = results_df[["EF", "Norm_EF", "Max_EF"]].copy()
        assay_data.columns = ["EF", "Norm_EF", "Max_theorical_EF"]
        assay_data["Assay"] = screen_name
        assay_data["N_hit"] = N_hit
        assay_data["N_compounds"] = N_compounds
        data.append(assay_data)

        hit_rate = round(N_hit / N_compounds * 100, 1)
        if hit_rate < max_hit_rate:
            hit_rates.append({"Assay": screen_name, "Hit Rate": hit_rate})

    # Combine all assay data into a single DataFrame
    combined_data = pd.concat(data, ignore_index=True)
    # Explicit columns keep the merge below valid even when no assay passes the cut-off.
    hit_rates_df = pd.DataFrame(hit_rates, columns=["Assay", "Hit Rate"])

    # Group by "Assay" and calculate max, mean, and median for Norm_EF and EF
    group_data = (
        combined_data.groupby("Assay").agg(
            Max_Norm_EF=("Norm_EF", "max"),
            Mean_Norm_EF=("Norm_EF", "mean"),
            Median_Norm_EF=("Norm_EF", "median"),
            Max_EF=("EF", "max"),
            Mean_EF=("EF", "mean"),
            Median_EF=("EF", "median"),
            N_hit=("N_hit", "first"),  # N_hit is the same across rows for an Assay
            N_compounds=("N_compounds", "first"),  # Same logic applies
            Max_theorical_EF=("Max_theorical_EF", "first"),  # Same logic applies
        ).reset_index()
    )

    # Inner merge: assays at or above max_hit_rate have no hit-rate row and are dropped here.
    group_data = group_data.merge(hit_rates_df, on="Assay")

    # Sort by hit rate
    group_data = group_data.sort_values("Hit Rate", ascending=True)

    # Ensure Assay is a categorical variable for plotting or ordering
    assay_order = group_data["Assay"]
    group_data["Assay"] = pd.Categorical(
        group_data["Assay"], categories=assay_order, ordered=True
    )

    return group_data, combined_data, results_dict


# Get Screen data

In [None]:
df_screen = evaluator.screens_data['ChemBL']['chembl_688293']

In [None]:
evaluator.plot_dimensionality_reduction('Embeddings_mean', 'ChemBL', 'chembl_688293', "UMAP")

In [None]:
evaluator.compute_assays_distances(embeddings_cols=["Embeddings_mean"], distance='cosine')   


In [None]:
# Replace every 'Embeddings_mean' matrix with (1 - matrix), in place.
# NOTE(review): the matrices were computed with distance='cosine', so this
# presumably turns distances into similarities while keeping the "distance"
# name — confirm that downstream ranking expects that.
# NOTE: re-running this cell applies 1 - (1 - x) and restores the original values.
for source in evaluator.distance_matrices.keys():
    for screen in evaluator.distance_matrices[source].keys():
        evaluator.distance_matrices[source][screen]['Embeddings_mean'] = 1 - evaluator.distance_matrices[source][screen]['Embeddings_mean'] 

In [None]:
ChemBL_5, all_ChemBL_5, res_dict_5 = get_group_data_norm("ChemBL", 5)

In [None]:
df_screen.head(5)

In [None]:
results_chembl_688293 = res_dict_5['chembl_688293'] 

In [None]:
df_screen = pd.read_csv("/home/maxime/data/cell_painting/paper_data/screen_df_chembl_688293.csv")

In [None]:
df_screen

In [None]:
embeddings = np.load('/home/maxime/data/cell_painting/paper_data/profiles_chembl_688293.npy')

In [None]:
# One entry of `embeddings` (iterated along its first axis) per row of df_screen.
df_screen['Embeddings_mean'] = list(embeddings)

In [None]:
res = evaluator.compute_ranking('ChemBL', 'chembl_688293', 'Embeddings_mean', 'JCP2022_010404', True)

In [None]:
res['Distance']

In [None]:
res['Metadata_JCP2022'][:15]

In [34]:
df_screen[df_screen['Metadata_JCP2022'].isin(res['Metadata_JCP2022'][:15])].reset_index().to_csv("/projects/synsight/repos/phenospace/normalisation/publication/pathways/data/selected_mol_egfr.csv", index=False)

In [29]:
# InChI of the first df_screen row matching each of the first 15 IDs in `res`.
inch_list = [
    df_screen.loc[df_screen['Metadata_JCP2022'] == jcp, 'Metadata_InChI'].iloc[0]
    for jcp in res['Metadata_JCP2022'][:15]
]

In [None]:
inch_list

# Get pathway data 

In [None]:
df_pathways = pd.read_csv('/projects/synsight/repos/phenospace/normalisation/publication/pathways/data/relations_result_02_12_24.tsv', sep='\t')

In [None]:
gene_protein_list = list(set(df_pathways['ENTITYA'].to_list() + df_pathways['ENTITYB'].to_list()))

In [None]:

# Reference list of genes/proteins
ref_ = [
    "ERK1/2", "HRAS", "EGF", "PTPN11", "PIP3", "SRC", "PDPK1", "MAPK8",
    "MAP2K4", "AKT", "EGFR", "NCK1", "BRAF", "PAK1", "JAK2", "GAB1",
    "ELK1", "SHC1", "GRB2", "STAT3", "MAP3K1", "ERRFI1", "PIK3CA",
    "mTORC1", "SOS1", "JUN", "MYC", "FOS"
]


In [None]:
# Print every pathway entity that is absent from the reference list.
for entity in gene_protein_list:
    if entity in ref_:
        continue
    print(entity)

In [None]:
gene_protein_list.remove("Cell_growth")
gene_protein_list.remove("Proliferation")

In [None]:
gene_protein_list = gene_protein_list + ['ERK1', 'ERK2', 'MEK1', 'MEK2']

In [None]:
# Hard-coded target list. Assigning it here replaces whatever
# gene_protein_list held before this cell ran.
gene_protein_list = ['PDPK1',
 'MAPK8',
 'PIP3',
 'HRAS',
 'EGFR',
 'PIK3CA',
 'ERRFI1',
 'SHC1',
 'SOS1',
 'JUN',
 'EGF',
 'SRC',
 'AKT',
 'NCK1',
 'GAB1',
 'mTORC1',
 'CBLB',
 'ERK1/2',
 'FOS',
 'MAP3K1',
 'PTPN11',
 'STAT3',
 'ELK1',
 'MEK1/2',
 'MAP2K4',
 'MYC',
 'BRAF',
 'GRB2',
 'PAK1',
 'JAK2',
 'ERK1',
 'ERK2',
 'MEK1',
 'MEK2']

# Get binding data

## Binding DB

In [None]:
binding_db_path = "/projects/synsight/repos/phenospace/normalisation/publication/pathways/data/BindingDB_All.tsv"

In [None]:
binding_db_path = "/home/maxime/data/cell_painting/paper_data/BindingDB_All.tsv"

In [None]:
df_bd = pd.read_csv(binding_db_path, sep='\t', on_bad_lines='skip')


In [None]:
print(list(df_bd.columns))

In [None]:
import re

# BindingDB columns searched for the target names
target_columns = [
    'Target Name',
    'UniProt (SwissProt) Recommended Name of Target Chain',
    'UniProt (SwissProt) Entry Name of Target Chain',
    'UniProt (SwissProt) Primary ID of Target Chain'
]

# A row is kept when any target name occurs as a (case-sensitive) substring of
# any of the target columns. The names are escaped and OR-ed into one pattern so
# each column is scanned once, vectorised, instead of running a Python-level
# loop over every row of the table.
# NOTE: substring matching means short names also hit longer ones
# (e.g. 'EGF' is contained in 'EGFR').
target_filter = pd.Series(False, index=df_bd.index)
if gene_protein_list:
    target_pattern = "|".join(re.escape(target) for target in gene_protein_list)
    for column in target_columns:
        target_filter |= df_bd[column].astype(str).str.contains(
            target_pattern, regex=True, na=False
        )

# Filter the DataFrame to only include rows with matching targets
df_filtered = df_bd[target_filter]

# Save or inspect the filtered results
df_filtered.to_csv('filtered_bindingdb_results.csv', index=False)
print("Filtered results saved to 'filtered_bindingdb_results.csv'.")

In [None]:
df_filtered = pd.read_csv('filtered_bindingdb_results.csv')

## ChemBL

In [None]:

# Query the ChEMBL API for the targets matching a search term
def fetch_chembl_target(target, timeout=30):
    """Search ChEMBL targets by free text.

    Args:
        target (str): Search term (gene or protein name).
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list: The 'targets' entries of the JSON response, or an empty list
        when the request fails, returns a non-200 status, or has no 'targets' key.
    """
    url = "https://www.ebi.ac.uk/chembl/api/data/target/search.json"
    try:
        # `params` URL-encodes the term, so names containing '/', '&' or spaces are sent intact.
        response = requests.get(url, params={"q": target}, timeout=timeout)
    except requests.RequestException:
        print(f"Erreur lors de la récupération des données pour {target}")
        return []
    if response.status_code != 200:
        print(f"Erreur lors de la récupération des données pour {target}")
        return []
    # NOTE(review): only the first page of results is read; the API appears to
    # be paginated — confirm whether more pages are needed.
    return response.json().get('targets', [])

# Retrieve the activity records (and thus molecules) associated with a target
def fetch_chembl_molecules(target_chembl_id, timeout=30):
    """Fetch ChEMBL activity records for a target.

    Args:
        target_chembl_id (str): ChEMBL identifier of the target.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        list: The 'activities' entries of the JSON response, or an empty list
        when the request fails, returns a non-200 status, or has no 'activities' key.
    """
    url = "https://www.ebi.ac.uk/chembl/api/data/activity.json"
    try:
        response = requests.get(
            url, params={"target_chembl_id": target_chembl_id}, timeout=timeout
        )
    except requests.RequestException:
        print(f"Erreur lors de la récupération des molécules pour {target_chembl_id}")
        return []
    if response.status_code != 200:
        print(f"Erreur lors de la récupération des molécules pour {target_chembl_id}")
        return []
    # NOTE(review): only the first page of activities is read; the API appears
    # to be paginated — confirm whether more pages are needed.
    return response.json().get('activities', [])

# Retrieve the structure identifiers (InChI/InChIKey) of a molecule
def fetch_molecule_details(molecule_chembl_id, timeout=30):
    """Fetch the standard InChI and InChIKey of a ChEMBL molecule.

    Args:
        molecule_chembl_id (str): ChEMBL identifier of the molecule.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: {"InChI": str, "InChIKey": str}; both values are empty strings
        when the request fails, returns a non-200 status, or the record
        carries no structure.
    """
    url = f"https://www.ebi.ac.uk/chembl/api/data/molecule/{molecule_chembl_id}.json"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        print(f"Erreur lors de la récupération des détails pour la molécule {molecule_chembl_id}")
        return {"InChI": "", "InChIKey": ""}
    if response.status_code != 200:
        print(f"Erreur lors de la récupération des détails pour la molécule {molecule_chembl_id}")
        return {"InChI": "", "InChIKey": ""}

    # 'molecule_structures' may be present but null; `or {}` covers both the
    # missing-key and the null case instead of failing on None.get(...).
    structures = response.json().get('molecule_structures') or {}
    return {
        "InChI": structures.get('standard_inchi') or '',
        "InChIKey": structures.get('standard_inchi_key') or '',
    }


In [None]:

# Collected interaction records, one dict per (target entry, activity) pair
all_interactions = []
# Details already fetched, keyed by molecule ID: the same molecule usually
# appears in many activity rows and only needs one request.
molecule_details_cache = {}

for target in tqdm(gene_protein_list):
    # Step 1: look up the target entries matching the gene/protein name
    targets = fetch_chembl_target(target)
    for target_entry in targets:
        target_chembl_id = target_entry.get('target_chembl_id', '')
        target_name = target_entry.get('pref_name', '')
        target_description = target_entry.get('description', '')

        # Step 2: fetch the activities (molecules) recorded for that target
        molecules = fetch_chembl_molecules(target_chembl_id)
        for molecule in molecules:
            molecule_chembl_id = molecule.get('molecule_chembl_id', '')
            interaction_type = molecule.get('type', '')

            # Step 3: fetch the molecule details (InChI/InChIKey), once per molecule
            if molecule_chembl_id not in molecule_details_cache:
                molecule_details_cache[molecule_chembl_id] = fetch_molecule_details(
                    molecule_chembl_id
                )
            molecule_details = molecule_details_cache[molecule_chembl_id]

            all_interactions.append({
                "Gene/Protein": target,
                "Target_Name": target_name,
                "Target_Description": target_description,
                "ChEMBL_ID": target_chembl_id,
                "Molecule_ChEMBL_ID": molecule_chembl_id,
                "InChI": molecule_details['InChI'],
                "InChIKey": molecule_details['InChIKey'],
                "Interaction_Type": interaction_type
            })

# Convert to a DataFrame and save as CSV
df_interactions = pd.DataFrame(all_interactions)
df_interactions.to_csv('known_interactions.csv', index=False)
print("Fichier 'known_interactions.csv' créé avec succès.")


In [None]:
df_interactions.info()

In [None]:
df_interactions = pd.read_csv('known_interactions.csv')

# Cross-reference the information

In [None]:
df_screen.info()

In [None]:
df_interactions.info()

In [None]:
df_filtered.info()

In [None]:
# Give the InChI / InChIKey columns the same names in all three tables.
# NOTE(review): df_screen_2 is not defined anywhere in the visible part of
# this notebook — confirm where it is created (it may be meant to be df_screen).
df_screen_2.rename(columns={"Metadata_InChIKey": "InChIKey", "Metadata_InChI": "InChI"}, inplace=True)
# This mapping renames each column to its own name, so df_interactions is left unchanged.
df_interactions.rename(columns={"InChI": "InChI", "InChIKey": "InChIKey"}, inplace=True)
df_filtered.rename(columns={"Ligand InChI": "InChI", "Ligand InChI Key": "InChIKey"}, inplace=True)


In [None]:
df_screen_2['Metadata_Bioactivity'].value_counts()

In [None]:
df_bindbd = pd.merge(df_screen_2, df_filtered, on=["InChIKey", "InChI"], how="inner", suffixes=("_df1", "_df3"))

In [None]:
df_chembl = pd.merge(df_screen_2, df_interactions, on=["InChIKey", "InChI"], how="inner", suffixes=("_df1", "_df2"))

In [None]:
df_chembl['Metadata_JCP2022'].nunique()

In [None]:
df_bindbd['Metadata_JCP2022'].nunique()

In [None]:
# For each compound matched in BindingDB, print its ID with the distinct
# 'Metadata_Bioactivity' values found on its rows.
for idx in df_bindbd['Metadata_JCP2022'].unique():
    print(idx, df_bindbd[df_bindbd['Metadata_JCP2022']==idx]['Metadata_Bioactivity'].unique())

In [None]:
# Print the compounds matched in ChEMBL but not in BindingDB.
# The BindingDB IDs are collected once instead of on every iteration.
bindbd_ids = set(df_bindbd['Metadata_JCP2022'].unique())
for idx in df_chembl['Metadata_JCP2022'].unique():
    if idx not in bindbd_ids:
        print(idx)

In [None]:
df_bindbd_cleaned = df_bindbd.dropna(axis=1, how='all')

In [None]:
df_bindbd_cleaned.columns

# Analyse this information

## First

In [None]:
df_chembl.columns

In [None]:
# Keep only the identifier, bioactivity, embedding and ChEMBL target columns.
df_chembl = df_chembl[['Metadata_JCP2022', 'InChIKey', 'Metadata_Bioactivity', 'Embeddings_mean',
       'InChI', 'Gene/Protein',
       'Target_Name', 'ChEMBL_ID', 'Molecule_ChEMBL_ID',
       'Interaction_Type']]

In [None]:
# One sub-table of df_chembl per compound ID, keyed by that ID.
dfs_chembl = {
    jcp: df_chembl[df_chembl['Metadata_JCP2022'] == jcp]
    for jcp in df_chembl['Metadata_JCP2022'].unique()
}

JCP2022_052804 H&I M-phase inducer phosphatase 3	

JCP2022_098853 N&I P40763 binder from STAT3

JCP2022_078761 N&I JAK2

JCP2022_072343 H&I  very good MYC inhibitor


JCP2022_021857 N&I O60674 ie JAK2

JCP2022_073156 H&I # ERK2 MEK1/2	

JCP2022_054618 N&I CDC25C M-phase inducer phosphatase 3	

In [None]:
dfs_chembl['JCP2022_073156']

In [None]:
df_bindbd_cleaned.columns

In [None]:
df_bindbd_cleaned[df_bindbd_cleaned['Metadata_JCP2022']=='JCP2022_098853'][['Target Name',
       'Target Source Organism According to Curator or DataSource',  'Institution', 
       'PDB ID(s) for Ligand-Target Complex', 'PubChem CID', 'PubChem SID',
       'ChEBI ID of Ligand', 'ChEMBL ID of Ligand', 'DrugBank ID of Ligand',
        'PDB ID(s) of Target Chain',
       'UniProt (SwissProt) Recommended Name of Target Chain',
       'UniProt (SwissProt) Primary ID of Target Chain']]

In [None]:
df_bindbd_cleaned = df_bindbd_cleaned[df_bindbd_cleaned["Target Source Organism According to Curator or DataSource"]=="Homo sapiens"]

In [None]:
df_bindbd_cleaned['Metadata_JCP2022'].nunique()

In [None]:
df_chembl['Metadata_JCP2022'].nunique()

In [None]:
# Print the compounds matched in ChEMBL but absent from the cleaned BindingDB table.
# The BindingDB IDs are collected once instead of on every iteration.
bindbd_cleaned_ids = set(df_bindbd_cleaned['Metadata_JCP2022'].unique())
for idx in df_chembl['Metadata_JCP2022'].unique():
    if idx not in bindbd_cleaned_ids:
        print(idx)

In [None]:
idx_set = set(df_bindbd_cleaned['Metadata_JCP2022'].to_list() + df_chembl['Metadata_JCP2022'].to_list())
len(idx_set)

In [None]:
# Print the compounds present in both the cleaned BindingDB table and ChEMBL.
# The ChEMBL IDs are collected once instead of on every iteration.
chembl_ids = set(df_chembl['Metadata_JCP2022'].unique())
for idx in df_bindbd_cleaned['Metadata_JCP2022'].unique():
    if idx in chembl_ids:
        print(idx)

## Select a compound

In [None]:
molecule = 'JCP2022_010404'

In [None]:
df_bindbd_cleaned[df_bindbd_cleaned['Metadata_JCP2022']==molecule]['UniProt (SwissProt) Recommended Name of Target Chain'].iloc[0]

In [None]:
df_bindbd_cleaned[df_bindbd_cleaned['Metadata_JCP2022']==molecule]['InChI'].iloc[0]

In [None]:
df_bindbd_cleaned[df_bindbd_cleaned['Metadata_JCP2022']==molecule]

In [None]:
dfs_chembl[molecule]['InChI'].unique()

In [None]:
dfs_chembl[molecule]

# Plots of the screen 

## UMAP

In [None]:
idx_set

In [None]:
df_screen.info()

In [None]:



# Create a new column with the specified logic
def classify(row, idx_set):
    """Label a screen row from its bioactivity and its membership in idx_set.

    Args:
        row: Mapping with "Metadata_Bioactivity" and "Metadata_JCP2022" entries.
        idx_set: Collection of compound IDs treated as known inhibitors.

    Returns:
        One of the four labels below, or None when the bioactivity is
        neither "hit" nor "none".
    """
    activity = row["Metadata_Bioactivity"]
    if activity == "hit":
        label_if_known, label_otherwise = "hit and know inhibitor", "Only hit"
    elif activity == "none":
        label_if_known, label_otherwise = "Non hit and know inhibitor", "Nothing"
    else:
        return None
    return label_if_known if row["Metadata_JCP2022"] in idx_set else label_otherwise

df_screen["Classification"] = df_screen.apply(lambda row: classify(row, idx_set), axis=1)


In [None]:
# Create a new column with the specified logic
def classify_(row, idx_set):
    """Return "known inhibitor" when the row's compound ID is in idx_set, else "Nothing"."""
    return "known inhibitor" if row["Metadata_JCP2022"] in idx_set else "Nothing"
df_screen["Classification"] = df_screen.apply(lambda row: classify_(row, idx_set), axis=1)


In [None]:
df_screen['Classification'].value_counts()

In [None]:
# Work on a deep copy so the UMAP columns are not added to df_screen itself.
df = df_screen.copy(deep=True)
# Stack the per-row embedding vectors into one 2-D array for UMAP
embeddings = np.vstack(df["Embeddings_mean"].values)
# Fixed random_state so the projection is reproducible between runs
umap_reducer = umap.UMAP(random_state=42)
embedding_umap = umap_reducer.fit_transform(embeddings)

# Add the two UMAP coordinates to the DataFrame
df["UMAP_1"] = embedding_umap[:, 0]
df["UMAP_2"] = embedding_umap[:, 1]



In [None]:
plt.figure(figsize=(10, 8))
# One colour per class: class number i gets the tab10 colour at position i / n_classes
unique_classes = df["Classification"].unique()
colors = {cls: plt.cm.tab10(i / len(unique_classes)) for i, cls in enumerate(unique_classes)}
# Plot UMAP with coloring by the classification column
scatter = plt.scatter(
    df["UMAP_1"], 
    df["UMAP_2"], 
    c=df["Classification"].map(colors),  # Map classification to consistent colors
)

# Build the legend by hand (one marker per class) so it uses the same colours as the points
legend_handles = [
    plt.Line2D([0], [0], marker='o', color=color, markersize=10, label=cls, linestyle='')
    for cls, color in colors.items()
]
plt.legend(handles=legend_handles, title="Classification", loc="best")

plt.title("UMAP Embeddings Colored by Classification")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.grid()
plt.show()

In [None]:
# Open a dedicated figure, same size as the classification plot
plt.figure(figsize=(10, 8))
# One colour per bioactivity value
unique_classes = df["Metadata_Bioactivity"].unique()
colors = {cls: plt.cm.tab10(i / len(unique_classes)) for i, cls in enumerate(unique_classes)}
# Plot UMAP with coloring by the bioactivity column
scatter = plt.scatter(
    df["UMAP_1"], 
    df["UMAP_2"], 
    c=df["Metadata_Bioactivity"].map(colors),  # Map bioactivity to consistent colors
)

# Add a legend with matching colors
legend_handles = [
    plt.Line2D([0], [0], marker='o', color=color, markersize=10, label=cls, linestyle='')
    for cls, color in colors.items()
]
plt.legend(handles=legend_handles, title="Metadata_Bioactivity", loc="best")

# The title names the column actually used for the colours
plt.title("UMAP Embeddings Colored by Metadata_Bioactivity")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.grid()
plt.show()

In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
import umap
import plotly.io as pio
# Open plotly figures in the system browser for the rest of the session
pio.renderers.default = "browser"  # Use "notebook" for Jupyter

# Requires df to exist already, with "Embeddings_mean", "Classification"
# and "Metadata_JCP2022" columns.

# Stack the per-row embedding vectors into one 2-D array for UMAP
embeddings = np.vstack(df["Embeddings_mean"].values)
umap_reducer = umap.UMAP(random_state=42)
embedding_umap = umap_reducer.fit_transform(embeddings)

# Add (or overwrite) the UMAP coordinates in the DataFrame
df["UMAP_1"] = embedding_umap[:, 0]
df["UMAP_2"] = embedding_umap[:, 1]

# Create an interactive plot with Plotly
fig = px.scatter(
    df,
    x="UMAP_1",
    y="UMAP_2",
    color="Classification",  # Color points by the classification column
    hover_data=["Metadata_JCP2022"],  # Display Metadata_JCP2022 when hovering
    title="Interactive UMAP Embeddings Colored by Classification",
    labels={"UMAP_1": "UMAP Dimension 1", "UMAP_2": "UMAP Dimension 2"},
)

# Show the interactive plot
fig.show()


In [None]:
results_df = compute_ranking("ChemBL", 'chembl_688293', 'Embeddings_mean', 'JCP2022_007012')
# compute_tanimoto compares every molecule with the first one of the list,
# i.e. with the first row of the ranking.
results_df["tanimoto_to_target"] = compute_tanimoto(results_df["InChI"].to_list())
# Not displayed (not the last expression of the cell); kept as a column-presence check.
results_df[['Metadata_JCP2022' , 'Metadata_Bioactivity','distance_to_target', 'tanimoto_to_target']].head(10)
# One colour per 'Classification' value
bioactivity_categories = results_df['Classification'].unique()


colors = {cls: plt.cm.tab10(i / len(bioactivity_categories)) for i, cls in enumerate(bioactivity_categories)}
# Create scatter plot with colors; [1:] skips the first row of the ranking
plt.figure(figsize=(10, 6))
plt.scatter(
    1 - results_df['distance_to_target'][1:],
    results_df["tanimoto_to_target"][1:],
    c=results_df["Classification"][1:].map(colors),
    alpha=0.7,  # Adjust transparency
    label='Bioactivity'
)

# Add legend; its title names the column the colours come from
handles = [plt.Line2D([0], [0], marker='o', color='w', label=key, 
                      markerfacecolor=color, markersize=10) for key, color in colors.items()]
plt.legend(handles=handles, title="Classification")

# Set labels and title; the x axis shows 1 - distance, so label it as such
plt.grid(visible=True)
plt.xlabel('1 - Distance to Target')
plt.ylabel('Tanimoto to Target')
plt.title('Scatter Plot Colored by Classification')
# Show the plot
plt.show()

In [None]:
import plotly.graph_objects as go

# One rgba colour per 'Classification' value (red channel spread over the classes)
classification_values = results_df["Classification"].unique()
colors = {cls: f'rgba({int(255*i/len(classification_values))}, 100, 200, 0.7)' 
          for i, cls in enumerate(classification_values)}

# Create scatter plot with Plotly
fig = go.Figure()

# Add trace for the scatter points; [1:] skips the first row of the ranking
fig.add_trace(go.Scatter(
    x=1 - results_df['distance_to_target'][1:],  # Inverted x-axis values
    y=results_df["tanimoto_to_target"][1:],
    mode='markers',
    marker=dict(
        color=[colors[cls] for cls in results_df["Classification"][1:]],
        size=10,
        opacity=0.7
    ),
    text=results_df["Classification"][1:],  # Hover text
    name="Bioactivity"
))

# Update layout for grid inversion and labels; titles name the column used for the colours
fig.update_layout(
    title="Interactive Scatter Plot Colored by Classification",
    xaxis=dict(
        title="1 - Distance to Target",
        gridcolor="lightgray",
        zeroline=False,
        autorange="reversed"  # Reverse x-axis
    ),
    yaxis=dict(
        title="Tanimoto to Target",
        gridcolor="lightgray",
        zeroline=False
    ),
    legend_title="Classification",
    template="plotly_white"
)

# Show the plot
fig.show()


In [None]:
results_chembl_688293.sort_values('EF', ascending=False).head(20)

In [None]:
#

## Distance 

In [None]:
# Rank the screen against JCP2022_010404, then add the Tanimoto similarity of
# every row to the first row of that ranking.
results_df = compute_ranking("ChemBL", 'chembl_688293', 'Embeddings_mean', 'JCP2022_010404')
results_df["tanimoto_to_target_JCP2022_010404"] = compute_tanimoto(results_df["InChI"].to_list())
# Displays the first ten rows of the ranking
results_df[['Metadata_JCP2022' , 'Metadata_Bioactivity','distance_to_target', 'tanimoto_to_target_JCP2022_010404']].head(10)

In [None]:
# One colour per 'Classification' value
bioactivity_categories = results_df['Classification'].unique()


colors = {cls: plt.cm.tab10(i / len(bioactivity_categories)) for i, cls in enumerate(bioactivity_categories)}
# Create scatter plot with colors; [1:] skips the first row of the ranking
plt.figure(figsize=(10, 6))
plt.scatter(
    results_df['distance_to_target'][1:],
    results_df["tanimoto_to_target_JCP2022_010404"][1:],
    c=results_df["Classification"][1:].map(colors),
    alpha=0.5,  # Adjust transparency
    label='Bioactivity'
)

# Add legend; its title names the column the colours come from
handles = [plt.Line2D([0], [0], marker='o', color='w', label=key, 
                      markerfacecolor=color, markersize=10) for key, color in colors.items()]
plt.legend(handles=handles, title="Classification")

# Set labels and title
plt.grid(visible=True)
plt.xlabel('Distance to Target')
plt.ylabel('Tanimoto to Target')
plt.title('Scatter Plot Colored by Classification')
# Show the plot
plt.show()

In [None]:
# Rank the screen against JCP2022_073458, then add the Tanimoto similarity of
# every row to the first row of that ranking.
results_df = compute_ranking("ChemBL", 'chembl_688293', 'Embeddings_mean', 'JCP2022_073458')
results_df["tanimoto_to_target_JCP2022_073458"] = compute_tanimoto(results_df["InChI"].to_list())
# Displays the first ten rows of the ranking
results_df[['Metadata_JCP2022' , 'Metadata_Bioactivity','distance_to_target', 'tanimoto_to_target_JCP2022_073458']].head(10)

In [None]:
# One colour per 'Classification' value
bioactivity_categories = results_df['Classification'].unique()


colors = {cls: plt.cm.tab10(i / len(bioactivity_categories)) for i, cls in enumerate(bioactivity_categories)}
# Create scatter plot with colors; [1:] skips the first row of the ranking
plt.figure(figsize=(10, 6))
plt.scatter(
    results_df['distance_to_target'][1:],
    results_df["tanimoto_to_target_JCP2022_073458"][1:],
    c=results_df["Classification"][1:].map(colors),
    alpha=0.5,  # Adjust transparency
    label='Bioactivity'
)

# Add legend; its title names the column the colours come from
handles = [plt.Line2D([0], [0], marker='o', color='w', label=key, 
                      markerfacecolor=color, markersize=10) for key, color in colors.items()]
plt.legend(handles=handles, title="Classification")

# Set labels and title
plt.grid(visible=True)
plt.xlabel('Distance to Target')
plt.ylabel('Tanimoto to Target')
plt.title('Scatter Plot Colored by Classification')
# Show the plot
plt.show()

In [None]:
results_chembl_688293.sort_values('EF', ascending=False).head(10)

In [None]:
results_df = compute_ranking("ChemBL", 'chembl_688293', 'Embeddings_mean', 'JCP2022_031167')
# compute_tanimoto compares every molecule with the first row of the ranking.
results_df["tanimoto_to_target"] = compute_tanimoto(results_df["InChI"].to_list())
# Not displayed (not the last expression of the cell); kept as a column-presence check.
results_df[['Metadata_JCP2022' , 'Metadata_Bioactivity','distance_to_target', 'tanimoto_to_target']].head(10)
# One colour per 'Classification' value
bioactivity_categories = results_df['Classification'].unique()


colors = {cls: plt.cm.tab10(i / len(bioactivity_categories)) for i, cls in enumerate(bioactivity_categories)}
# Create scatter plot with colors; [1:] skips the first row of the ranking
plt.figure(figsize=(10, 6))
plt.scatter(
    results_df['distance_to_target'][1:],
    results_df["tanimoto_to_target"][1:],
    c=results_df["Classification"][1:].map(colors),
    alpha=0.7,  # Adjust transparency
    label='Bioactivity'
)

# Add legend; its title names the column the colours come from
handles = [plt.Line2D([0], [0], marker='o', color='w', label=key, 
                      markerfacecolor=color, markersize=10) for key, color in colors.items()]
plt.legend(handles=handles, title="Classification")

# Set labels and title
plt.grid(visible=True)
plt.xlabel('Distance to Target')
plt.ylabel('Tanimoto to Target')
plt.title('Scatter Plot Colored by Classification')
# Show the plot
plt.show()

In [None]:
results_df = compute_ranking("ChemBL", 'chembl_688293', 'Embeddings_mean', 'JCP2022_047982')
# compute_tanimoto compares every molecule with the first row of the ranking.
results_df["tanimoto_to_target"] = compute_tanimoto(results_df["InChI"].to_list())
# Not displayed (not the last expression of the cell); kept as a column-presence check.
results_df[['Metadata_JCP2022' , 'Metadata_Bioactivity','distance_to_target', 'tanimoto_to_target']].head(10)
# One colour per 'Classification' value
bioactivity_categories = results_df['Classification'].unique()


colors = {cls: plt.cm.tab10(i / len(bioactivity_categories)) for i, cls in enumerate(bioactivity_categories)}
# Create scatter plot with colors; [1:] skips the first row of the ranking
plt.figure(figsize=(10, 6))
plt.scatter(
    results_df['distance_to_target'][1:],
    results_df["tanimoto_to_target"][1:],
    c=results_df["Classification"][1:].map(colors),
    alpha=0.7,  # Adjust transparency
    label='Bioactivity'
)

# Add legend; its title names the column the colours come from
handles = [plt.Line2D([0], [0], marker='o', color='w', label=key, 
                      markerfacecolor=color, markersize=10) for key, color in colors.items()]
plt.legend(handles=handles, title="Classification")

# Set labels and title
plt.grid(visible=True)
plt.xlabel('Distance to Target')
plt.ylabel('Tanimoto to Target')
plt.title('Scatter Plot Colored by Classification')
# Show the plot
plt.show()

# My new positive control

In [None]:
df = compute_distances_and_select(df_screen, 'JCP2022_010404', 15)

In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
import umap
import plotly.io as pio
pio.renderers.default = "browser"  # Use "notebook" for Jupyter

# Assume df is already defined and contains "Embeddings_mean" and "Classification"
# Example data setup (if needed)
# df = ...


In [None]:
# for jcp in ['JCP2022_021533', 'JCP2022_010404', 'JCP2022_046511', 'JCP2022_073458', 'JCP2022_041800']:
for jcp in ["JCP2022_010404"]:
    # compute_distances_and_select annotates df_screen in place and returns it;
    # the plot then works on a deep copy of that annotated table.
    df = compute_distances_and_select(df_screen, jcp, 15).copy(deep=True)

    # Project the embeddings to 2-D with a fixed seed
    embeddings = np.vstack(df["Embeddings_mean"].values)
    umap_reducer = umap.UMAP(random_state=42)
    embedding_umap = umap_reducer.fit_transform(embeddings)
    df["UMAP_1"], df["UMAP_2"] = embedding_umap[:, 0], embedding_umap[:, 1]

    # Interactive scatter, coloured by the selection label, compound ID on hover
    fig = px.scatter(
        df,
        x="UMAP_1",
        y="UMAP_2",
        color="Selection",
        hover_data=["Metadata_JCP2022"],
        title="Interactive UMAP Embeddings Colored by Selection",
        labels={"UMAP_1": "UMAP Dimension 1", "UMAP_2": "UMAP Dimension 2"},
    )
    fig.show()


In [None]:
# Work on a deep copy so the UMAP columns are not added to df_screen itself.
df = df_screen.copy(deep=True)
# Stack the per-row embedding vectors into one 2-D array for UMAP
embeddings = np.vstack(df["Embeddings_mean"].values)
# Fixed random_state so the projection is reproducible between runs
umap_reducer = umap.UMAP(random_state=42)
embedding_umap = umap_reducer.fit_transform(embeddings)

# Add the two UMAP coordinates to the DataFrame
df["UMAP_1"] = embedding_umap[:, 0]
df["UMAP_2"] = embedding_umap[:, 1]


In [None]:
plt.figure(figsize=(10, 8))
# One colour per selection label: label number i gets the tab10 colour at position i / n_labels
unique_classes = df["Selection"].unique()
colors = {cls: plt.cm.tab10(i / len(unique_classes)) for i, cls in enumerate(unique_classes)}
# Plot UMAP with coloring by the selection column
scatter = plt.scatter(
    df["UMAP_1"], 
    df["UMAP_2"], 
    c=df["Selection"].map(colors),  # Map selection label to consistent colors
)

# Build the legend by hand (one marker per label) so it uses the same colours as the points
legend_handles = [
    plt.Line2D([0], [0], marker='o', color=color, markersize=10, label=cls, linestyle='')
    for cls, color in colors.items()
]
plt.legend(handles=legend_handles, title="Selection", loc="best")

plt.title("UMAP Embeddings Colored by Selection")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.grid()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib as mpl

# Set publication-quality style (applies to every later figure of the session)
mpl.rcParams.update({
    'font.size': 12,               # General font size
    'axes.titlesize': 14,          # Title font size
    'axes.labelsize': 12,          # Axis label size
    'legend.fontsize': 10,         # Legend font size
    'xtick.labelsize': 10,         # X-axis tick size
    'ytick.labelsize': 10,         # Y-axis tick size
    'figure.dpi': 300,             # High resolution for publication
    'figure.figsize': (14, 8),      # Figure size
    'axes.grid': True,             # Grid enabled
    'grid.linestyle': '--',        # Grid line style
    'grid.alpha': 0.5              # Grid transparency
})

# Explicitly set colors, one per 'Selection' label
colors = {
    "Positive Control": "red",
    "Selected Compounds": "blue",
    "Not Selected Compounds": "grey"
}

# Smaller markers for the background compounds, larger ones for the control
# and the selected compounds. The background label is "Not Selected Compounds"
# (the labels used in `colors`); comparing against "Other" never matched, so
# every point was drawn at size 80.
point_sizes = df["Selection"].apply(lambda x: 50 if x == "Not Selected Compounds" else 80)

# Create the scatter plot, one layer per label so each gets its own legend entry
plt.figure()
for category, color in colors.items():
    subset = df[df["Selection"] == category]
    plt.scatter(
        subset["UMAP_1"], 
        subset["UMAP_2"], 
        c=color, 
        label=category, 
        s=point_sizes[subset.index],  # Adjust size
        edgecolor='k',                # Black edges for clarity
        alpha=0.8                     # Transparency for overlap
    )

# Legend outside the plot
plt.legend(
    title="Selection", 
    loc="center left", 
    bbox_to_anchor=(1, 0.5),  # Move legend outside the plot
    frameon=False             # Remove legend box
)

# Add titles and labels
plt.title("UMAP Embeddings Colored by Selection")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")

# Adjust layout to fit legend outside
plt.tight_layout()

# Show the figure
plt.show()


# Clusters 

In [None]:
df.info()

In [None]:
from sklearn.cluster import KMeans
import pandas as pd
# Keep only the rows with a classification other than 'Nothing'. The explicit
# copy makes df an independent table, so the column assignments below do not
# write into a slice of the previous df (SettingWithCopyWarning).
df = df[df['Classification']!='Nothing'].copy()
# Extract embeddings from 'Embeddings_mean'
embeddings = np.vstack(df["Embeddings_mean"].values)

# Perform clustering using KMeans
num_clusters = 3 # You can adjust the number of clusters
kmeans = KMeans(n_clusters=num_clusters, random_state=54)
clusters = kmeans.fit_predict(embeddings)

# Add cluster labels to the DataFrame
df["Cluster"] = clusters

# Calculate the distribution of 'Classification' within each cluster
cluster_classification_distribution = df.groupby("Cluster")["Classification"].value_counts().unstack()
cluster_classification_distribution

In [None]:
df_screen['Metadata_Bioactivity'].value_counts()

In [None]:
df[(df["Cluster"] == 1) & (df['Classification']!='Nothing')][['Metadata_JCP2022', 
       'Classification']]

In [None]:

# Project the embeddings of the clustered rows (built in the KMeans cell) to 2-D
umap_reducer = umap.UMAP(random_state=42)
embedding_umap = umap_reducer.fit_transform(embeddings)

# Add UMAP results to the DataFrame
df["UMAP_1"] = embedding_umap[:, 0]
df["UMAP_2"] = embedding_umap[:, 1]

# Create a scatter plot of the clusters in UMAP space.
# Iterate over the cluster labels actually present, so the plot follows the
# number of clusters chosen for KMeans instead of assuming three.
plt.figure(figsize=(10, 8))
for cluster in sorted(df["Cluster"].unique()):
    cluster_data = df[df["Cluster"] == cluster]
    plt.scatter(
        cluster_data["UMAP_1"],
        cluster_data["UMAP_2"],
        label=f"Cluster {cluster}",
        alpha=0.7
    )

plt.title("UMAP Visualization of Clusters in Embedding Space")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.legend(title="Clusters")
plt.grid(True)
plt.show()

In [None]:
df.sample(frac=1)

In [None]:
# Scatter plot in UMAP space with one layer per 'Classification' value
plt.figure(figsize=(10, 8))
for classification in df['Classification'].unique():
    # sample(frac=1) shuffles the rows of the class; it only affects drawing order
    class_data = df[df["Classification"] == classification].sample(frac=1)
    plt.scatter(
        class_data["UMAP_1"],
        class_data["UMAP_2"],
        label=f"Classification : {classification}",
        alpha=0.7
    )

# Title and legend name the grouping actually plotted (Classification, not clusters)
plt.title("UMAP Visualization of Classifications in Embedding Space")
plt.xlabel("UMAP Dimension 1")
plt.ylabel("UMAP Dimension 2")
plt.legend(title="Classification")
plt.grid(True)
plt.show()