# Imports and paths

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import h5py
from tqdm import tqdm
from multiprocessing import Pool, cpu_count


In [3]:
# Shared-project locations of the compound matrix and of its metadata table.
npy_file = "/projects/synsight/data/website_data/jump_compounds_matrix.npy"
METADATA_FILE = "/projects/synsight/data/website_data/jump_compounds_matrix_metadata.parquet"
# HDF5 file that the nearest-neighbour results are written to.
output_file = "/projects/synsight/data/website_data/nearest_neighbors.h5"

In [2]:
# Per-user locations of the same three files; running this cell rebinds
# npy_file, METADATA_FILE and output_file.
npy_file = "/users/biocomp/msanchez/Documents/website-data/jump_compounds_matrix.npy"
METADATA_FILE = "/users/biocomp/msanchez/Documents/website-data/jump_compounds_matrix_metadata.parquet"
# HDF5 file that the nearest-neighbour results are written to.
output_file = "/users/biocomp/msanchez/Documents/website-data/nearest_neighbors.h5"

# Histograms

In [4]:


# Load the matrix from the .npy file. No mmap_mode is given, so the whole
# array is read into memory here.
matrix = np.load(npy_file)


In [5]:
# 1-D view of every matrix entry, used as histogram input below.
# ravel() returns a view when the array is contiguous, whereas flatten()
# always allocates a second full-size copy; the values are only read afterwards.
all_values = matrix.ravel()

In [None]:

# Histogram of all matrix values; log=True puts the counts on a logarithmic
# axis so that sparsely populated bins remain visible.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(all_values, bins=5000, log=True, color='blue', alpha=0.3)
ax.set_title('Distribution des valeurs dans la matrice')
ax.set_xlabel('Valeurs')
ax.set_ylabel('Fréquence')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:

# Histogram of all matrix values with the counts on a linear axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(all_values, bins=5000, color='blue', alpha=0.3)
ax.set_title('Distribution des valeurs dans la matrice')
ax.set_xlabel('Valeurs')
ax.set_ylabel('Fréquence')
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Precompute distances

In [3]:
m = 100  # Number of nearest neighbors kept per molecule (read by process_row)

In [4]:
# Read the metadata table and pull out the JCP2022 identifier column.
metadata = pd.read_parquet(METADATA_FILE)
id_column = metadata['Metadata_JCP2022']
# Molecule IDs as an array; process_row indexes it with matrix row/column
# positions, so it is presumably in the same order as the matrix (verify).
metadata_ids = id_column.values


In [None]:
# Notebook display: echo the array of molecule IDs.
metadata_ids

In [6]:
# Open the .npy file as a read-only memory map so the stored array is not read
# into RAM up front, then build a float16 copy of it: astype() returns a new
# in-memory array, so after this line `matrix` is a regular ndarray.
# NOTE(review): float16 keeps roughly 3 significant decimal digits and
# overflows above 65504 - confirm this is acceptable for the stored distances.
matrix = np.load(npy_file, mmap_mode='r').astype(np.float16)

In [None]:
def process_row(i):
    """
    Find the m nearest neighbors of row ``i`` of the distance matrix and the
    distance from that row to the reference molecule JCP2022_033924.

    Reads the module-level ``matrix``, ``metadata_ids`` and ``m``.

    Args:
        i (int): Index of the row in the distance matrix.

    Returns:
        tuple: (molecule_id, closest_ids, closest_distances, dmso_distance)
            where ``closest_ids`` / ``closest_distances`` are sorted by
            increasing distance, never contain row ``i`` itself, and hold
            ``m`` entries (fewer only if the row has fewer than ``m`` other
            entries).
    """
    distances = matrix[i]
    n_entries = distances.shape[0]

    # Distance to the target molecule (JCP2022_033924)
    target_index = np.where(metadata_ids == 'JCP2022_033924')[0][0]
    dmso_distance = distances[target_index]

    # Take the m + 1 smallest entries so that m candidates remain once the row
    # itself is removed. argpartition leaves the selected entries in arbitrary
    # order, so the self-match cannot be dropped by slicing off position 0; it
    # has to be removed by index.
    n_candidates = min(m + 1, n_entries)
    if n_candidates < n_entries:
        candidate_indices = np.argpartition(distances, n_candidates - 1)[:n_candidates]
    else:
        # Fewer than m + 1 entries in total: every entry is a candidate.
        candidate_indices = np.arange(n_entries)
    candidate_indices = candidate_indices[candidate_indices != i]

    # Order the candidates by distance and keep the m closest.
    order = np.argsort(distances[candidate_indices], kind='stable')
    closest_indices = candidate_indices[order][:m]
    closest_distances = distances[closest_indices]

    # Get IDs for the closest molecules
    closest_ids = metadata_ids[closest_indices]

    return metadata_ids[i], closest_ids, closest_distances, dmso_distance

# Parallel processing: one worker process per CPU. imap yields the per-row
# results in row order, and tqdm shows progress as they arrive.
n_rows = matrix.shape[0]
with Pool(processes=cpu_count()) as pool:
    row_results = pool.imap(process_row, range(n_rows))
    results = list(tqdm(row_results, total=n_rows))






In [25]:
# Write one HDF5 group per molecule, holding its neighbor IDs, the matching
# distances, and its distance to the reference molecule.
with h5py.File(output_file, 'w') as h5f:
    for mol_id, neighbor_ids, neighbor_distances, reference_distance in results:
        mol_group = h5f.create_group(mol_id)
        # astype('S') converts the IDs to byte strings before they are stored.
        mol_group.create_dataset('closest_ids', data=neighbor_ids.astype('S'))
        mol_group.create_dataset('distances', data=neighbor_distances)
        mol_group.create_dataset('dmso_distance', data=reference_distance)

In [None]:
import h5py

# Parameters
molecule_id = 'JCP2022_080538'  # Example molecule ID

# Read one molecule's stored neighbors back from the HDF5 file as a sanity check.
with h5py.File(output_file, 'r') as h5f:
    # Each molecule is stored as a group named after its ID.
    if molecule_id in h5f:
        print(f"Molecule ID {molecule_id} found.")

        # The neighbor IDs live in the 'closest_ids' dataset (stored as byte
        # strings, hence the conversion back to str); 'dmso_distance' is a
        # separate scalar dataset and must not be read here.
        closest_ids = h5f[f'{molecule_id}/closest_ids'][:].astype(str)
        distances = h5f[f'{molecule_id}/distances'][:]

        # Print the results
        print(f'Closest molecules to {molecule_id}:')
        print('IDs:', closest_ids)
        print('Distances:', distances)
    else:
        print(f"Molecule ID {molecule_id} not found in the HDF5 file.")


In [None]:
import h5py

# File path
H5_DISTANCE_FILE = output_file

# Molecule whose stored reference distance is looked up.
query_id = "JCP2022_080538"  # Replace with the molecule ID you want to query

# Look up the scalar `dmso_distance` dataset of that molecule's group.
with h5py.File(H5_DISTANCE_FILE, 'r') as h5f:
    if query_id not in h5f:
        print(f"Molecule ID {query_id} not found in the HDF5 file.")
    else:
        # [()] reads the whole (scalar) dataset.
        dmso_distance = h5f[f"{query_id}/dmso_distance"][()]
        print(f"DMSO distance for {query_id}: {dmso_distance}")


# Convert to pg10

In [8]:
import psycopg2

def _get_or_create_molecule_id(conn, cur, metadata_id):
    """Return the ``molecules.id`` for ``metadata_id``, inserting the row first if it is absent."""
    cur.execute("SELECT id FROM molecules WHERE metadata_id = %s", (metadata_id,))
    row = cur.fetchone()
    if row:
        return row[0]

    cur.execute("INSERT INTO molecules (metadata_id) VALUES (%s) RETURNING id", (metadata_id,))
    new_id = cur.fetchone()[0]
    # Commit right away, as before, so the new molecule row is persisted even
    # if a later statement in the same call fails and triggers a rollback.
    conn.commit()
    return new_id


def insert_data_parallel(args):
    """
    Insert one molecule and its neighbors into PostgreSQL.

    A dedicated connection is opened and closed on every call, so the function
    can be mapped over a process pool. SQL and unexpected errors are printed
    and the current transaction is rolled back; they are not re-raised.

    Args:
        args (tuple): (molecule_id, closest_ids, closest_distances,
            dmso_distance), i.e. one element of ``results``. The last item is
            not written to the database.

    Returns:
        None
    """
    molecule_id, closest_ids, closest_distances, _dmso_distance = args

    # Dedicated connection for this call.
    # NOTE(review): credentials are hard-coded here; consider reading them
    # from the environment or a config file.
    conn = psycopg2.connect(
        dbname="phenoseeker",
        user="phenouser",
        password="phenopass",
        host="localhost",
        port="5432"
    )
    cur = conn.cursor()

    try:
        molecule_db_id = _get_or_create_molecule_id(conn, cur, molecule_id)

        # Insert the neighbors
        for neighbor_id, raw_distance in zip(closest_ids, closest_distances):
            distance = float(raw_distance)  # NumPy scalar -> plain Python float
            neighbor_db_id = _get_or_create_molecule_id(conn, cur, neighbor_id)

            # Insert the distance relation; conflicting rows are skipped.
            cur.execute("""
                INSERT INTO distances (molecule_id, neighbor_id, distance)
                VALUES (%s, %s, %s) ON CONFLICT DO NOTHING
            """, (molecule_db_id, neighbor_db_id, distance))

        conn.commit()

    except psycopg2.Error as e:
        print(f"Erreur SQL : {e}")
        conn.rollback()  # Reset the transaction after an error

    except Exception as e:
        print(f"Erreur inattendue : {e}")
        conn.rollback()  # Reset the transaction after an error

    finally:
        cur.close()
        conn.close()  # Close the connection to avoid leaks


In [None]:
from multiprocessing import Pool, cpu_count
from tqdm import tqdm

# Leave two cores free, but never ask for fewer than one worker: Pool raises
# ValueError when processes < 1, which cpu_count() - 2 would produce on a
# machine with one or two cores.
n_job = max(1, cpu_count() - 2)

# Create a process pool and insert every result; tqdm tracks progress.
with Pool(processes=n_job) as pool:
    list(tqdm(pool.imap(insert_data_parallel, results), total=len(results)))

# Add molecular ids (zinc, chembl, pubchem)

In [None]:
# Show the row(s) whose InChI equals this string (formula C2H6OS, presumably
# DMSO - confirm). `data` is not defined in the visible cells.
inchi_matches = data['Metadata_InChI'] == 'InChI=1S/C2H6OS/c1-4(2)3/h1-2H3'
data[inchi_matches]

In [None]:
# Draw 100 rows at random; no random_state is given, so the sample differs between runs.
data_test = data.sample(n=100)

In [None]:
# Notebook display: show the sampled rows.
data_test

In [None]:
# Create the three identifier columns as placeholders filled with None; the
# logic that populates them with real values is not part of this cell.
for placeholder_column in ('Zinc_id', 'Canonical_SMILES', 'PubChem_id'):
    data[placeholder_column] = None



In [None]:

# Write the table, including the new columns, to a Parquet file without the index.
updated_parquet_file_path = "updated_file.parquet"  # Replace with your desired output file path
data.to_parquet(path=updated_parquet_file_path, index=False)

# Printed after the write call returns.
print("Metadata columns added successfully!")
