# Imports

In [1]:
import os
from pathlib import Path
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
from tqdm import tqdm


In [2]:
# Well-level OpenPhenom embedding index (one row per well, with a path to the saved embedding).
df_phenom = pd.read_parquet('/projects/synsight/data/openphenom/raw_well_embeddings/wells_em_openphenom.parquet')

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The original call emitted a warning on this line
# (presumably a mixed-type DtypeWarning - confirm), which chunked inference causes
# when a column mixes numeric-looking and alphanumeric values.
df_meta = pd.read_csv(
    '/projects/cpjump1/jump/metadata/complete_metadata.csv',
    low_memory=False,
)

  df_meta = pd.read_csv('/projects/cpjump1/jump/metadata/complete_metadata.csv')


In [35]:

# Locations of the well-level embedding tables produced by the other models.
dinov2_small_parquet = '/projects/synsight/data/dinov2_small/raw_well_embeddings/wells_em_dinov2s.parquet'
dinov2_parquet = '/projects/synsight/data/dinov2/raw_well_embeddings/wells_em_dinov2.parquet'
resnet_parquet = '/projects/cpjump1/jump/images_embeddings/resnet50_raw/metadata/load_wells/df_meta_raw_well.parquet'
chada_parquet = '/projects/synsight/data/chada/well_embeddings/metadata_raw_mean.parquet'

# Load each table into its own frame, in the same order as the paths above.
df_dinov2_s = pd.read_parquet(dinov2_small_parquet)
df_dinov2_g = pd.read_parquet(dinov2_parquet)
df_resnet = pd.read_parquet(resnet_parquet)
df_chada = pd.read_parquet(chada_parquet)


In [3]:
# Plates removed from the reference table further down.
# NOTE(review): the name suggests these plates lack DMSO control wells - confirm.
no_dmso_plates = [
    "Dest210823-174240",
    "Dest210628-162003",
    "Dest210823-174422",
]

In [4]:
# Root directory under which the exported embeddings/metadata are written.
base_path = Path('/projects/synsight/data/jump_embeddings/wells_embeddings')

# Reference compound table

In [5]:
images_df_path = "/projects/cpjump1/jump/load_data/final"
images_df = pd.read_parquet(images_df_path)

# Cast the identifier columns to str on both frames so that the merge keys
# compare as strings regardless of how each file was parsed.
for key_col in ("Metadata_Batch", "Metadata_Plate", "Metadata_Source", "Metadata_Well"):
    images_df[key_col] = images_df[key_col].astype(str)
    df_meta[key_col] = df_meta[key_col].astype(str)

# Metadata_Batch is present in both frames but is not a join key, so pandas
# suffixes it: Metadata_Batch_x comes from images_df (the left frame).
merged_df = pd.merge(
    images_df,
    df_meta,
    on=["Metadata_Source", "Metadata_Plate", "Metadata_Well"],
)

# Keep only rows from COMPOUND plates and the six identifying columns,
# restoring the un-suffixed batch column name.
is_compound_plate = merged_df["Metadata_PlateType"] == 'COMPOUND'
df = merged_df.loc[
    is_compound_plate,
    [
        'Metadata_Source',
        'Metadata_Batch_x',
        'Metadata_Plate',
        'Metadata_Well',
        'Metadata_JCP2022',
        'Metadata_InChI',
    ],
].rename(columns={'Metadata_Batch_x': 'Metadata_Batch'})

In [6]:
# Compound IDs that do not identify a real compound and are excluded from the reference set.
# NOTE(review): presumably sentinel/unknown entries - confirm against the JUMP metadata docs.
excluded_jcp_ids = ['JCP2022_999999', 'JCP2022_UNKNOWN']

# One row per unique (source, batch, plate, well, compound) combination.
# The original re-applied the same boolean mask (built on `df`) after drop_duplicates();
# every surviving row already satisfied it, so the second filter changed nothing and
# only triggered pandas' "Boolean Series key will be reindexed" warning.
ref_compound_df = (
    df[~df['Metadata_JCP2022'].isin(excluded_jcp_ids)]
    .drop_duplicates()
    .reset_index(drop=True)
)

  ref_compound_df = df[~df['Metadata_JCP2022'].isin(['JCP2022_999999', 'JCP2022_UNKNOWN'])].drop_duplicates()[~df['Metadata_JCP2022'].isin(['JCP2022_999999', 'JCP2022_UNKNOWN'])].reset_index().drop(columns='index')


In [7]:
# Remove every well that sits on one of the plates listed in no_dmso_plates,
# then renumber the rows from 0.
on_excluded_plate = ref_compound_df['Metadata_Plate'].isin(no_dmso_plates)
ref_compound_df = ref_compound_df[~on_excluded_plate].reset_index(drop=True)

In [8]:
# Sanity check: row count, column names and null counts of the reference table.
ref_compound_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745884 entries, 0 to 745883
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Metadata_Source   745884 non-null  object
 1   Metadata_Batch    745884 non-null  object
 2   Metadata_Plate    745884 non-null  object
 3   Metadata_Well     745884 non-null  object
 4   Metadata_JCP2022  745884 non-null  object
 5   Metadata_InChI    745884 non-null  object
dtypes: object(6)
memory usage: 34.1+ MB


# Pre-processing

### Helper function

In [9]:

def _load_embedding_as_numpy(embedding_path):
    """Load one saved torch tensor and return it as a NumPy array, or None if loading fails."""
    try:
        # map_location="cpu" lets tensors saved from a GPU load on a CPU-only machine.
        tensor = torch.load(embedding_path, map_location="cpu", weights_only=True)
        # detach() is required before numpy() if the tensor was saved with requires_grad=True.
        return tensor.detach().cpu().numpy()
    except Exception as e:
        # Best effort: report the failure and let the caller drop this row.
        print(f"Error loading embedding at {embedding_path}: {e}")
        return None


def filter_and_save_embeddings_with_metadata(
    df: pd.DataFrame,
    ref_compound_df: pd.DataFrame,
    embedding_col: str,
    name: str,
    metadata_cols: List[str],
    storage_folder: Union[str, Path],
    num_jobs: int = -1,
) -> Optional[Tuple[pd.DataFrame, np.ndarray]]:
    """
    Keep the rows of `df` that appear in a reference table, load their embeddings,
    and save the embeddings and the matching metadata side by side.

    Row i of the saved/returned metadata always describes row i of the
    saved/returned embeddings array. Rows whose embedding file cannot be loaded
    are dropped from both.

    Args:
        df (pd.DataFrame): The DataFrame containing embedding paths and metadata.
        ref_compound_df (pd.DataFrame): Reference DataFrame; only rows of `df` whose
            `metadata_cols` values occur in it are kept.
        embedding_col (str): The column of `df` containing paths to saved torch tensors.
        name (str): Suffix used in the output file names
            (`embeddings_{name}.npy`, `metadata_{name}.parquet`).
        metadata_cols (List[str]): Columns used as join keys and written as metadata.
        storage_folder (Union[str, Path]): Folder to save the outputs (created if missing).
        num_jobs (int): Number of parallel joblib workers used to load embeddings.

    Returns:
        (filtered_metadata, embeddings) on success, where `filtered_metadata` has a
        0..n-1 index and `embeddings` is the stacked NumPy array; None if no row of
        `df` matches the reference table (nothing is written in that case).

    Raises:
        ValueError: if `embedding_col` or any of `metadata_cols` is missing from `df`.
        KeyError: if `ref_compound_df` lacks any of `metadata_cols` (raised by pandas).
    """
    # Ensure the storage folder exists
    storage_folder = Path(storage_folder)
    storage_folder.mkdir(parents=True, exist_ok=True)

    # Validate embedding and metadata columns
    if embedding_col not in df.columns:
        raise ValueError(f"Column '{embedding_col}' not found in the DataFrame.")

    missing_metadata_cols = [col for col in metadata_cols if col not in df.columns]
    if missing_metadata_cols:
        raise ValueError(f"Missing metadata columns: {missing_metadata_cols}")

    # Join only on the de-duplicated key columns of the reference table: this is a
    # filter, so duplicate reference rows must not multiply rows of `df`, and extra
    # reference columns must not collide with (and rename) `embedding_col`.
    ref_keys = ref_compound_df[metadata_cols].drop_duplicates()
    filtered_df = df.merge(ref_keys, on=metadata_cols, how="inner")

    if filtered_df.empty:
        print("No matching rows found between the DataFrame and the reference DataFrame.")
        return None

    embedding_paths = filtered_df[embedding_col].tolist()

    # Parallel returns results in input order, so loaded[i] belongs to filtered_df row i.
    loaded = Parallel(n_jobs=num_jobs)(
        delayed(_load_embedding_as_numpy)(path) for path in tqdm(embedding_paths)
    )

    # Compute the surviving positions against the *unfiltered* result list so that
    # they still index the correct rows of filtered_df. (The original removed the
    # failed entries first, which shifted every later index and misaligned metadata.)
    valid_indices = [i for i, emb in enumerate(loaded) if emb is not None]
    n_failed = len(loaded) - len(valid_indices)
    if n_failed:
        print(f"Skipped {n_failed} embedding(s) that failed to load.")

    embeddings = np.array([loaded[i] for i in valid_indices])
    # reset_index so that index label i corresponds to embeddings[i].
    filtered_metadata = filtered_df.iloc[valid_indices][metadata_cols].reset_index(drop=True)

    # Save embeddings as a single NumPy array
    embeddings_path = storage_folder / f"embeddings_{name}.npy"
    np.save(embeddings_path, embeddings)

    # Save filtered metadata as a Parquet file
    metadata_parquet_path = storage_folder / f"metadata_{name}.parquet"
    filtered_metadata.to_parquet(metadata_parquet_path, index=False)

    print(f"Embeddings saved to: {embeddings_path}")
    print(f"Metadata saved to: {metadata_parquet_path}")
    return filtered_metadata, embeddings


# Columns shared by every embedding table; passed below as `metadata_cols`.
common_columns = [
    "Metadata_Source",
    "Metadata_Batch",
    "Metadata_Plate",
    "Metadata_Well",
    "Metadata_JCP2022",
    "Metadata_InChI",
]




### OpenPhenom

In [10]:
# Output folder for the OpenPhenom exports, beneath the shared base directory.
openphenom_path = base_path.joinpath('openphenom')
# Show which columns the OpenPhenom well table provides.
df_phenom.columns

Index(['Metadata_Well', 'Metadata_Source', 'Metadata_Batch', 'Metadata_Plate',
       'Metadata_JCP2022', 'Metadata_InChI', 'Metadata_Is_Control',
       'Metadata_Row', 'Metadata_Col', 'path_embedding'],
      dtype='object')

In [14]:
# Pick a few plates at random for a quick smoke test. A fixed random_state makes
# the selection reproducible across kernel restarts, so cells that hard-code
# plate IDs chosen from this output stay traceable to it.
ref_compound_df.sample(n=4, random_state=0)['Metadata_Plate'].unique()

array(['ATSJUM122', 'UL001713', 'AETJUM108', 'UL001669'], dtype=object)

In [15]:
# Preview the reference rows belonging to three hand-picked plates.
preview_plates = ['A1166172', 'AETJUM108', '1053600896']
ref_compound_df.loc[ref_compound_df['Metadata_Plate'].isin(preview_plates)]

Unnamed: 0,Metadata_Source,Metadata_Batch,Metadata_Plate,Metadata_Well,Metadata_JCP2022,Metadata_InChI
223720,source_2,20210614_Batch_1,1053600896,A01,JCP2022_085227,InChI=1S/C17H30N2O5/c1-6-23-17(22)14-13(24-14)...
223721,source_2,20210614_Batch_1,1053600896,A02,JCP2022_033924,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3
223722,source_2,20210614_Batch_1,1053600896,A03,JCP2022_106987,InChI=1S/C26H21FN2O3S/c27-21-15-17-22(18-16-21...
223723,source_2,20210614_Batch_1,1053600896,A04,JCP2022_011874,InChI=1S/C32H29NO5/c34-25(20-11-3-1-4-12-20)19...
223724,source_2,20210614_Batch_1,1053600896,A05,JCP2022_058580,InChI=1S/C21H25FN4O4S/c1-15-6-7-19(16(2)12-15)...
...,...,...,...,...,...,...
597205,source_8,J4,A1166172,P20,JCP2022_009816,InChI=1S/C20H23FN2O4S/c1-15(24)22-20(16-6-4-3-...
597206,source_8,J4,A1166172,P21,JCP2022_024818,InChI=1S/C19H18Cl2N4O/c1-24(13-18(26)23-19-16(...
597207,source_8,J4,A1166172,P22,JCP2022_103073,InChI=1S/C17H16ClN3O/c1-11-3-4-13(7-16(11)18)2...
597208,source_8,J4,A1166172,P23,JCP2022_033924,InChI=1S/C2H6OS/c1-4(2)3/h1-2H3


In [17]:
# Smoke test: export OpenPhenom embeddings for three plates only.
on_test_plate = ref_compound_df['Metadata_Plate'].isin(['A1166172', 'AETJUM108', '1053600896'])
filtered_metadata, embeddings = filter_and_save_embeddings_with_metadata(
    df_phenom,
    ref_compound_df[on_test_plate],
    "path_embedding",
    'openphenom_test_3_plates',
    common_columns,
    openphenom_path,
    num_jobs=10,
)

# Confirm the exported metadata has the expected columns and no nulls.
filtered_metadata.info()

100%|██████████| 1151/1151 [00:19<00:00, 58.51it/s]


Embeddings saved to: /projects/synsight/data/jump_embeddings/wells_embeddings/openphenom/embeddings_openphenom_test_3_plates.npy
Metadata saved to: /projects/synsight/data/jump_embeddings/wells_embeddings/openphenom/metadata_openphenom_test_3_plates.parquet
<class 'pandas.core.frame.DataFrame'>
Index: 1151 entries, 0 to 1150
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Metadata_Source   1151 non-null   object
 1   Metadata_Batch    1151 non-null   object
 2   Metadata_Plate    1151 non-null   object
 3   Metadata_Well     1151 non-null   object
 4   Metadata_JCP2022  1151 non-null   object
 5   Metadata_InChI    1151 non-null   object
dtypes: object(6)
memory usage: 62.9+ KB
