In [1]:
# Imports
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile, join
from pathlib import Path

In [2]:
# get current working directory
cwd = os.getcwd()
# find path to assay data directory
full_path = cwd
assay_info_dir = "clustered_output_27_02_2024/"
# create a list of filenames in assay data directory
file_list = [f for f in listdir(assay_info_dir) if isfile(join(assay_info_dir, f))]

In [3]:
file_list

['AML_biochemical_clustered.csv',
 'AML_mitochondrion_clustered.csv',
 'AML_protein_complex_clustered.csv',
 'AML_cell-free_clustered.csv',
 'AML_small-molecule_physicochemical_clustered.csv',
 'AML_subcellular_clustered.csv',
 'AML_nucleic_acid_clustered.csv',
 'AML_cell_membrane_clustered.csv',
 'AML_tissue-based_clustered.csv',
 'AML_microsome_clustered.csv',
 'AML_organism-based_clustered.csv',
 'AML_cell-based_clustered.csv',
 'AML_protein_clustered.csv',
 'AML_single_protein_clustered.csv']

In [4]:
# dictionary to store all dataframes
dataframe_dict = {}
# loop read_csv to store every csv as a dataframe in dictionary
for filename in file_list:
    assay_df = pd.read_csv(f"{assay_info_dir}/{filename}",sep=",")
    dataframe_dict[filename] = assay_df  # new key, add

In [5]:
keys = list(dataframe_dict.keys())

In [6]:
max_cluster = 0

for key in keys:
    assay_df = dataframe_dict[key]
    
    assay_df.embedding_cluster = assay_df.embedding_cluster + max_cluster
    dataframe_dict.update({key: assay_df})
    
    max_cluster = assay_df.embedding_cluster.max()

In [10]:
# concatenate dataframes into single dataframe and sort by assay chEMBL ID
assays_clustered = pd.concat(dataframe_dict.values(), ignore_index=True)
assays_clustered = assays_clustered.sort_values(by="ChEMBL ID")
# export as csv
assays_clustered.to_csv("clustered_output_27_02_2024/AML_assays_clustered.csv", index=False)

In [8]:
assays_clustered.embedding_cluster.unique().max()

1421

In [9]:
assays_clustered

Unnamed: 0,ChEMBL ID,Assay Type,Description,Organism,BAO Format ID,BAO Format,Confidence Score,Confidence Label,Strain,Source,...,Assay Classification L1,Assay Classification L2,Assay Classification L3,Variant Sequence Accession,Variant Sequence Mutation,embeddings,embedding_cluster,parent_compound_chembl_id,target_chembl_id,check_assay_chembl_id
3031,CHEMBL1001041,F,Cytotoxicity against human HL60 cells after 72...,Homo sapiens,BAO_0000219,cell-based format,1.0,1 - Target assigned is non-molecular,,Scientific Literature,...,,,,,,[ 8.54650885e-02 -1.13341741e-01 -3.96913290e-...,616,CHEMBL53463,CHEMBL383,CHEMBL1001041
1160,CHEMBL1014323,B,Binding affinity to poly(dA).poly(dT) DNA asse...,Not specified,BAO_0000225,nucleic acid format,3.0,3 - Target assigned is molecular non-protein t...,,Scientific Literature,...,,,,,,[ 2.81251609e-01 4.58082706e-02 -1.69811621e-...,125,CHEMBL58,CHEMBL345,CHEMBL1014323
1096,CHEMBL1014323,B,Binding affinity to poly(dA).poly(dT) DNA asse...,Not specified,BAO_0000225,nucleic acid format,3.0,3 - Target assigned is molecular non-protein t...,,Scientific Literature,...,,,,,,[ 2.81251609e-01 4.58082706e-02 -1.69811621e-...,125,CHEMBL53463,CHEMBL345,CHEMBL1014323
2951,CHEMBL1016754,B,Inhibition of LFA1/ICAM1-mediated HL60 cell ag...,,BAO_0000219,cell-based format,6.0,6 - Homologous protein complex subunits assigned,,Scientific Literature,...,,,,,,[-8.41736645e-02 -1.94282368e-01 -1.41950279e-...,706,CHEMBL384467,CHEMBL2096661,CHEMBL1016754
2952,CHEMBL1016755,A,Cytotoxicity against human HL60 cells by XTT a...,Homo sapiens,BAO_0000219,cell-based format,1.0,1 - Target assigned is non-molecular,,Scientific Literature,...,,,,,,[ 8.13143421e-03 1.91921443e-02 -4.55224872e-...,702,CHEMBL384467,CHEMBL383,CHEMBL1016755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3938,CHEMBL996019,B,Inhibition of Escherichia coli Eco R1 assessed...,Escherichia coli,BAO_0000357,single protein format,9.0,9 - Direct single protein target assigned,,Scientific Literature,...,,,,,,[-7.92199373e-02 -8.29450712e-02 1.32942289e-...,1376,CHEMBL53463,CHEMBL5729,CHEMBL996019
3939,CHEMBL996020,B,Inhibition of Haemophilus influenzae Hind 3 as...,Haemophilus influenzae,BAO_0000357,single protein format,8.0,8 - Homologous single protein target assigned,,Scientific Literature,...,,,,,,[-9.44212303e-02 -8.45549628e-02 -9.39499065e-...,1419,CHEMBL53463,CHEMBL5004,CHEMBL996020
3940,CHEMBL996021,B,Inhibition of Providencia stuartii Pst 1 asses...,Providencia stuartii,BAO_0000357,single protein format,9.0,9 - Direct single protein target assigned,,Scientific Literature,...,,,,,,[ 4.95547988e-02 -1.20248385e-01 -3.45167220e-...,1421,CHEMBL53463,CHEMBL5713,CHEMBL996021
3036,CHEMBL997800,F,Cytotoxicity against human HL60 cells after 72...,Homo sapiens,BAO_0000219,cell-based format,1.0,1 - Target assigned is non-molecular,,Scientific Literature,...,,,,,,[ 8.54650885e-02 -1.13341741e-01 -3.96913290e-...,616,CHEMBL53463,CHEMBL383,CHEMBL997800
