# Imports and paths

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
from tqdm import tqdm
from multiprocessing import Pool, cpu_count
from pathlib import Path

In [None]:
# Inputs: the matrix stored as .npy and the metadata table that accompanies it
npy_file = "/projects/synsight/data/website_data/jump_compounds_matrix.npy"
METADATA_FILE = "/projects/synsight/data/website_data/jump_compounds_matrix_metadata.parquet"
# Output: HDF5 file that receives the nearest-neighbor results
output_file = '/projects/synsight/data/website_data/nearest_neighbors.h5'

# Precompute distances

In [None]:
# Read as a module-level global by process_row() below.
m = 1000  # Number of nearest neighbors kept per molecule

In [None]:
# Load metadata
metadata = pd.read_parquet(METADATA_FILE)
# process_row() indexes this array with row/column indices of `matrix`, so it is
# presumably in the same order as the matrix rows — TODO confirm.
metadata_ids = metadata['Metadata_JCP2022'].values  # Molecule IDs (expected to be unique)


In [None]:
# Quick sanity check: print a concise summary of the metadata DataFrame.
metadata.info()

In [None]:
# Load the matrix whose rows process_row() treats as per-molecule distances.
# NOTE(review): presumably square (n_molecules x n_molecules) — confirm.
matrix = np.load(npy_file)

In [None]:
# Display the loaded array as the notebook cell output.
matrix

In [None]:
# NOTE(review): `test_metadata` is not referenced in the visible part of this notebook.
test_metadata = pd.read_csv('/projects/synsight/repos/phenoseeker-website/dev/molecules_with_pubchem_chembl.csv')

In [None]:
def process_row(i, target_id='JCP2022_033924'):
    """
    Find the m nearest neighbors of one molecule and its distance to a target.

    Reads the module-level globals ``matrix`` (row ``i`` is used as the
    distances from molecule ``i`` to every molecule), ``metadata_ids``
    (molecule IDs, indexed like the matrix columns) and ``m`` (number of
    neighbors to keep).

    The row's own index is removed from the neighbor list explicitly. If
    fewer than ``m`` other molecules exist, all of them are returned.

    Args:
        i (int): Index of the row in the distance matrix.
        target_id (str): ID of the reference molecule whose distance is
            reported separately. The variable names suggest the default is
            the DMSO control — confirm against the metadata.

    Returns:
        tuple: (molecule_id, closest_ids, closest_distances, dmso_distance)
            where ``closest_ids`` and ``closest_distances`` are sorted by
            increasing distance.

    Raises:
        IndexError: If ``target_id`` is not present in ``metadata_ids``.
    """
    distances = matrix[i]

    # Distance to the target molecule
    target_matches = np.where(metadata_ids == target_id)[0]
    if target_matches.size == 0:
        raise IndexError(f"Target molecule {target_id!r} not found in metadata_ids")
    dmso_distance = distances[target_matches[0]]

    # Select the m + 1 smallest distances: m neighbors plus (normally) the row
    # itself. argpartition leaves these candidates in arbitrary order, so the
    # self index must be removed by value, not by position.
    n_candidates = min(m + 1, distances.shape[0])
    candidates = np.argpartition(distances, n_candidates - 1)[:n_candidates]
    candidates = candidates[candidates != i]

    # Sort the candidates by distance and keep at most m of them. When the
    # self index was not among the candidates (e.g. ties), this drops the
    # farthest candidate instead.
    order = np.argsort(distances[candidates])
    closest_indices = candidates[order][:m]
    closest_distances = distances[closest_indices]

    # Get IDs for the closest molecules
    closest_ids = metadata_ids[closest_indices]

    return metadata_ids[i], closest_ids, closest_distances, dmso_distance

# Parallel processing: run process_row on every matrix row, one worker per CPU.
with Pool(processes=cpu_count()) as pool:
    # imap yields results in row order, so tqdm can track progress while the
    # list is built. An explicit chunksize batches rows per worker round-trip
    # instead of the default of one row per task.
    results = list(tqdm(
        pool.imap(process_row, range(matrix.shape[0]), chunksize=64),
        total=matrix.shape[0],
    ))






In [None]:
# Save results to HDF5: one group per molecule, named by its molecule ID,
# holding three datasets (closest_ids, distances, dmso_distance).
with h5py.File(output_file, 'w') as h5f:
    for molecule_id, closest_ids, closest_distances, dmso_distance in results:
        group = h5f.create_group(molecule_id)
        group.create_dataset('closest_ids', data=closest_ids.astype('S'))  # Cast IDs to NumPy byte strings ('S' dtype) before writing
        group.create_dataset('distances', data=closest_distances)  # Same order as closest_ids
        group.create_dataset('dmso_distance', data=dmso_distance)  # Single value per molecule, as returned by process_row

In [None]:
import h5py

# Parameters
molecule_id = 'JCP2022_080538'  # Example molecule ID

# Read back the stored neighbors of one molecule as a sanity check
with h5py.File(output_file, 'r') as h5f:
    # Check if the molecule_id exists in the HDF5 file
    if molecule_id in h5f:
        print(f"Molecule ID {molecule_id} found.")

        # Neighbor IDs live in the 'closest_ids' dataset and were written as
        # byte strings, so convert them back to str.
        closest_ids = h5f[f'{molecule_id}/closest_ids'][:].astype(str)
        distances = h5f[f'{molecule_id}/distances'][:]

        # Print the results
        print(f'Closest molecules to {molecule_id}:')
        print('IDs:', closest_ids)
        print('Distances:', distances)
    else:
        print(f"Molecule ID {molecule_id} not found in the HDF5 file.")


In [None]:
import h5py

# HDF5 file produced by the save step above
H5_DISTANCE_FILE = output_file

# Molecule whose stored `dmso_distance` is looked up
query_id = "JCP2022_080538"  # Replace with the molecule ID you want to query

# Look the molecule up and print its `dmso_distance`, or report that it is absent
with h5py.File(H5_DISTANCE_FILE, mode='r') as h5f:
    if query_id not in h5f:
        print(f"Molecule ID {query_id} not found in the HDF5 file.")
    else:
        # `[()]` reads the dataset's full contents
        dmso_distance = h5f[f"{query_id}/dmso_distance"][()]
        print(f"DMSO distance for {query_id}: {dmso_distance}")


# Convert to pg10