# Imports and paths

In [1]:
import os
from multiprocessing import Pool, cpu_count
from pathlib import Path

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Matrix that a later cell loads into `matrix` and uses as the distance matrix.
npy_file = "/projects/synsight/data/website_data/jump_compounds_matrix.npy"
# Parquet table whose `Metadata_JCP2022` column supplies the molecule IDs.
METADATA_FILE = "/projects/synsight/data/website_data/jump_compounds_matrix_metadata.parquet"
# Output HDF5 file that receives the nearest-neighbour results.
output_file = "/projects/synsight/data/website_data/nearest_neighbors_test.h5"

# Precompute distances

In [3]:
# NOTE(review): a later cell rebinds this to `m=9` just ahead of `process_row`,
# so when the notebook is run top to bottom 1000 is not the value used there.
m = 1000  # Number of nearest neighbors

In [4]:
# CSV with the test molecules and their InChI / InChIKey / PubChem / ChEMBL columns.
TEST_METADATA_CSV = '/projects/synsight/repos/phenoseeker-website/dev/molecules_with_pubchem_chembl.csv'
test_metadata = pd.read_csv(TEST_METADATA_CSV)

In [5]:
test_metadata

Unnamed: 0,Metadata_JCP2022,Metadata_InChI,path_embedding,InChIKey,PubChem_ID,ChEMBL_ID
0,JCP2022_108820,InChI=1S/C18H17NO7/c1-11(20)26-14-6-7-16(15(9-...,/projects/synsight/data/openphenom/norm_2_comp...,YJGWMUPKRXDZJL-UHFFFAOYSA-N,3566022,
1,JCP2022_078486,InChI=1S/C15H28N4O3S/c1-12-7-10-19(11-8-12)9-5...,/projects/synsight/data/openphenom/norm_2_comp...,RHVVFNOWSUHOMT-UHFFFAOYSA-N,86778624,CHEMBL4924980
2,JCP2022_086327,InChI=1S/C19H19BrN4O/c1-13-18(15-6-4-3-5-7-15)...,/projects/synsight/data/openphenom/norm_2_comp...,SYBJQAWDEXICLR-UHFFFAOYSA-N,122161574,
3,JCP2022_008667,"InChI=1S/C12H14N2O2S2/c1-8-4-6-11(7-5-8)18(15,...",/projects/synsight/data/openphenom/norm_2_comp...,BUAFJXXXUDMLJO-UHFFFAOYSA-N,548045,CHEMBL1382626
4,JCP2022_012583,InChI=1S/C22H33N3O3/c1-17(2)9-16-28-19-5-3-18(...,/projects/synsight/data/openphenom/norm_2_comp...,CPCMBIPMYCFDNT-UHFFFAOYSA-N,124204801,
5,JCP2022_070427,InChI=1S/C23H23NO2/c1-2-17-8-9-21-20(15-23(25)...,/projects/synsight/data/openphenom/norm_2_comp...,PRCKGPKGLNUWJI-UHFFFAOYSA-N,2461072,CHEMBL1525486
6,JCP2022_031449,InChI=1S/C17H22N6O2/c1-21-15(12-3-4-12)13(11-1...,/projects/synsight/data/openphenom/norm_2_comp...,HOAZQKOKYJTJRK-UHFFFAOYSA-N,49025680,CHEMBL3469234
7,JCP2022_022677,InChI=1S/C14H15N3O3S/c1-2-20-13(19)10-3-5-11(6...,/projects/synsight/data/openphenom/norm_2_comp...,FSODNBIPUNZDQT-UHFFFAOYSA-N,1922396,CHEMBL1531273
8,JCP2022_101439,InChI=1S/C20H23N5O2/c1-4-16(19(26)21-14-10-11-...,/projects/synsight/data/openphenom/norm_2_comp...,WVIFQTZSBSHZGG-UHFFFAOYSA-N,20919750,CHEMBL1452584
9,JCP2022_073507,InChI=1S/C11H10N6/c1-2-4-9-8(3-1)13-10(12-7-5-...,/projects/synsight/data/openphenom/norm_2_comp...,QHQAYTZYYODIIJ-UHFFFAOYSA-N,6623390,


In [6]:
# Load the compound embedding array from disk.
# Presumably one row per compound and one column per feature; TODO confirm.
embeddings = np.load("/projects/synsight/data/website_data/jump_compounds_embeddings.npy")

In [7]:
# Keep only the first 10 rows so the cells below work on a small test matrix.
# This rebinds `embeddings`; reload it from disk to get the full array back.
embeddings = embeddings[:10, :]

In [10]:


# Scale every row to unit L2 norm so that a dot product equals cosine similarity.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized_embeddings = embeddings / norms

# Pairwise cosine similarity between all rows: (n, d) dot (d, n) = (n, n)
cosine_similarity = normalized_embeddings.dot(normalized_embeddings.T)

# Turn similarity into a distance: cosine distance = 1 - cosine similarity
cosine_distance = 1 - cosine_similarity



In [11]:
# Use the test cosine-distance matrix computed above as `matrix`.
# NOTE(review): a later cell rebinds `matrix` to `np.load(npy_file)`, so which
# matrix `process_row` sees depends on which of the two cells ran last.
matrix = cosine_distance

In [12]:
# Persist the test matrix; the path is relative, so the file is written to the
# kernel's current working directory.
np.save('test_matrix.npy', matrix)

In [None]:
# Load metadata
# NOTE(review): the next cell rebinds `metadata` to a subset of `test_metadata`,
# while `metadata_ids` keeps the IDs read from METADATA_FILE here. Confirm that
# these IDs line up with the rows of whichever `matrix` is in use.
metadata = pd.read_parquet(METADATA_FILE)
metadata_ids = metadata['Metadata_JCP2022'].values  # Unique molecule IDs


In [None]:
# Keep only the identifier columns of the test metadata, in this exact order.
_identifier_columns = [
    'Metadata_InChI',
    'Metadata_JCP2022',
    'InChIKey',
    'PubChem_ID',
    'ChEMBL_ID',
]
metadata = test_metadata[_identifier_columns]

In [None]:
# Positional rename that adds the `Metadata_` prefix to the last three columns.
# The list order must match the column order selected in the previous cell.
metadata.columns = ['Metadata_InChI', 'Metadata_JCP2022', 'Metadata_InChIKey', 'Metadata_PubChem_ID', 'Metadata_ChEMBL_ID']

In [None]:
metadata.info()

In [None]:
metadata['Metadata_JCP2022']

In [None]:
# Load the matrix stored at `npy_file`. This rebinds `matrix`, replacing the
# test cosine-distance matrix assigned to the same name in an earlier cell.
matrix = np.load(npy_file)

In [None]:
matrix

In [None]:
# Number of nearest neighbours that `process_row` returns; this overrides the
# `m = 1000` assigned in an earlier cell.
m=9

In [None]:
def process_row(i):
    """
    Find the m nearest neighbours of row ``i`` of the distance matrix and the
    distance from that row to a fixed reference molecule.

    Reads the module-level globals ``matrix`` (distance matrix, assumed square
    with rows and columns in the same order as ``metadata_ids``),
    ``metadata_ids`` (array of molecule IDs) and ``m`` (number of neighbours).

    Args:
        i (int): Index of the row in the distance matrix.

    Returns:
        tuple: (molecule_id, closest_ids, closest_distances, dmso_distance)
            ``closest_ids`` and ``closest_distances`` are ordered by increasing
            distance and never contain row ``i`` itself. They hold ``m``
            entries, or fewer when the row has fewer than ``m`` other columns.
            ``dmso_distance`` is the distance from row ``i`` to the reference
            molecule.

    Raises:
        IndexError: If the reference molecule ID is not in ``metadata_ids``.
    """
    distances = matrix[i]

    # Distance to the reference molecule.
    # NOTE(review): the previous comment named JCP2022_033924 while the code
    # looks up JCP2022_031449; confirm which ID is the intended reference.
    target_index = np.where(metadata_ids == 'JCP2022_031449')[0][0]
    dmso_distance = distances[target_index]

    # Never ask for more neighbours than there are other columns.
    n_columns = distances.shape[0]
    k = max(0, min(m, n_columns - 1))

    # Take the k + 1 smallest entries so that k remain once self is removed.
    # argpartition gives no order inside the partition, so self is dropped by
    # index rather than by assuming it sits in position 0.
    if k + 1 < n_columns:
        candidate_indices = np.argpartition(distances, k)[:k + 1]
    else:
        candidate_indices = np.arange(n_columns)
    candidate_indices = candidate_indices[candidate_indices != i]

    # Order the candidates by distance and keep the k closest.
    order = np.argsort(distances[candidate_indices], kind='stable')
    closest_indices = candidate_indices[order][:k]
    closest_distances = distances[closest_indices]

    # Get IDs for the closest molecules
    closest_ids = metadata_ids[closest_indices]

    # Return the results
    return metadata_ids[i], closest_ids, closest_distances, dmso_distance

# Parallel processing: one `process_row` task per matrix row.
n_rows = matrix.shape[0]
with Pool(processes=cpu_count()) as pool:
    # imap yields results in row order; tqdm wraps the iterator to show progress
    row_results = pool.imap(process_row, range(n_rows))
    results = list(tqdm(row_results, total=n_rows))



# Write one HDF5 group per molecule; opening with 'w' replaces any existing file.
with h5py.File(output_file, 'w') as h5f:
    for molecule_id, closest_ids, closest_distances, dmso_distance in results:
        group = h5f.create_group(molecule_id)
        _datasets = (
            ('closest_ids', closest_ids.astype('S')),  # IDs stored as byte strings
            ('distances', closest_distances),
            ('dmso_distance', dmso_distance),
        )
        for _name, _data in _datasets:
            group.create_dataset(_name, data=_data)


In [None]:
output_file

In [None]:
# Rewrite the HDF5 output from `results` (one group per molecule).
# This repeats the save done at the end of the cell that builds `results`.
with h5py.File(output_file, 'w') as h5f:
    for record in results:
        molecule_id, closest_ids, closest_distances, dmso_distance = record
        group = h5f.create_group(molecule_id)
        # Neighbour IDs are converted to byte strings before being stored.
        group.create_dataset('closest_ids', data=closest_ids.astype('S'))
        group.create_dataset('distances', data=closest_distances)
        group.create_dataset('dmso_distance', data=dmso_distance)

In [None]:
import h5py

# Read back from the file that was just written.
H5_DISTANCE_FILE = output_file

# Molecule to look up; replace with the ID you want to query.
query_id = "JCP2022_078486"

# Print the stored `dmso_distance` for the query molecule, if it has a group.
with h5py.File(H5_DISTANCE_FILE, 'r') as h5f:
    if query_id not in h5f:
        print(f"Molecule ID {query_id} not found in the HDF5 file.")
    else:
        # `[()]` reads the dataset's whole value out of the file
        dmso_distance = h5f[f"{query_id}/dmso_distance"][()]
        print(f"DMSO distance for {query_id}: {dmso_distance}")


# Convert to pg10

In [None]:
import psycopg2


In [None]:

np.load('/projects/synsight/repos/phenoseeker-website/dev/distance_matrix.npy').shape

In [None]:
# Connect to your PostgreSQL database.
# The password is read from the PGPASSWORD environment variable when it is set,
# so real credentials never have to be written into the notebook. The original
# placeholder is kept as the fallback.
# NOTE(review): the host contains '!', which is not a valid hostname character
# and looks like a typo; confirm the intended server name.
conn = psycopg2.connect(
    dbname="phenoseeker",
    user="phenosk",
    password=os.environ.get("PGPASSWORD", "yourpassword"),
    host="pg10!dmsi.biologie.ens.fr",
)
cur = conn.cursor()