In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch
from math import sqrt
from functions import *

In [2]:
# Load the assay table and the reference list of AML cell lines
# (the cell-line CSV keeps its values in a column named '0').
assay_data = pd.read_csv("AML_assays_with_embeddings.csv")
aml_reference = pd.read_csv("AML_cell_types.csv")
AML_cell_lines = aml_reference['0'].to_list()

# Restrict to cell-based assays, then keep only those of their cell types
# that also appear in the imported AML cell line list.
is_cell_based = assay_data["BAO Format"] == "cell-based format"
assay_data_ = assay_data[is_cell_based]
cell_lines_all = assay_data_["Cell Type"].to_list()
shared_cell_lines = set(cell_lines_all).intersection(AML_cell_lines)
cell_lines_AML = list(shared_cell_lines)

## Exploration

In [3]:
# Distinct assay formats present in the dataset, in order of first appearance
bao_format_column = assay_data["BAO Format"]
bao_format_column.unique()

array(['cell-based format', 'assay format', 'organism-based format',
       'nucleic acid format', 'single protein format', 'cell-free format',
       'small-molecule physicochemical format', 'tissue-based format',
       'mitochondrion format', 'protein format', 'biochemical format',
       'cell membrane format', 'microsome format', 'subcellular format',
       'protein complex format'], dtype=object)

In [6]:
# Choose the assay format to cluster by NAME rather than by its position in
# `unique()`: that position depends on the row order of the CSV and would
# silently point at a different format if the data were reordered or refreshed.
# In the listing above, index 5 is "cell-free format"; index 4
# ("single protein format") is earmarked for a later run.
assay_type = "cell-free format"

# Filter once and reuse the subset for both checks below.
assays_of_type = assay_data[assay_data["BAO Format"] == assay_type]
if assays_of_type.empty:
    raise ValueError(
        f"No assays with BAO Format {assay_type!r}; "
        f"available formats: {list(assay_data['BAO Format'].unique())}"
    )

print(assay_type)
print(len(assays_of_type))

# Last expression of the cell so the notebook actually displays it
# (previously this value was computed mid-cell and discarded).
assays_of_type["Cell Type"].unique()

cell-free format
27


## Clustering

In [7]:
# Reload the assay table, then turn the string-encoded embeddings into numpy arrays
assay_data = pd.read_csv("AML_assays_with_embeddings.csv")
assay_data = convert_embeddings(assay_data)

# Pick the subsetting column: use "Cell Type" only when more than two of the
# known AML cell lines occur for the chosen assay format, else fall back to "Organism"
cell_types_in_format = assay_data.loc[assay_data["BAO Format"] == assay_type, "Cell Type"].unique()
n_aml_lines_in_format = len(set(cell_types_in_format).intersection(AML_cell_lines))
subset_col = "Cell Type" if n_aml_lines_in_format > 2 else "Organism"
print(f"Grouping by {subset_col}")

# Build the dictionary of assay subsets keyed on subset_col
if subset_col == "Organism":
    # missing organisms are given an explicit label before subsetting
    assay_data = assay_data.fillna({"Organism": "Not specified"})
    subset_dict = create_subset_dict_2(assay_data, assay_type, subset_col)
else:
    # "Cell Type" grouping is restricted to the AML cell lines found earlier
    subset_dict = create_subset_dict_2(assay_data, assay_type, subset_col, cell_lines_AML)

# Assign clusters within each assay subset (hierarchical clustering)
assay_data_with_clusters = assign_clusters(subset_dict)

# Stack the clustered subsets into one dataframe and discard columns not needed downstream
unneeded_columns = [
    "Unnamed: 0", "Compounds", "Activities",
    "PubMed ID", "DOI", "Year", "Journal", "Volume",
    "First Page", "Last Page",
]
assay_data_clustered = pd.concat(assay_data_with_clusters).drop(columns=unneeded_columns)

# Attach the drugs and targets associated with each assay ID
assay_data_clustered_ = add_drugs_and_targets(assay_data_clustered)

# Derive the output filename from the assay format and write the result as .csv
assay_type_name = assay_type.replace(" format", "").replace(" ", "_")
filename = f"AML_{assay_type_name}_clustered.csv"
assay_data_clustered_.to_csv(filename, index=False)

Grouping by Organism

Homo sapiens_A
Calculating pairwise cosine similarities
n = 14
t = 7.237624155400388

Homo sapiens_B
Calculating pairwise cosine similarities
n = 2
t = 1.681792830507429

Rattus norvegicus_A
Calculating pairwise cosine similarities
n = 7
t = 4.303517070658851

Rattus norvegicus_B

Canis lupus familiaris_A

Canis lupus familiaris_B

Mus musculus_A

Mus musculus_B

Not specified_A
Calculating pairwise cosine similarities
n = 2
t = 1.681792830507429

Not specified_B


## Check clustering

In [None]:
# How many distinct assay IDs ended up in the clustered table (shown as a 1-tuple)
chembl_ids = assay_data_clustered_["ChEMBL ID"]
chembl_ids.unique().shape

In [None]:
# Distinct cell types present among the clustered assays
clustered_cell_types = assay_data_clustered_["Cell Type"]
clustered_cell_types.unique()

In [None]:
# How many distinct cluster labels were assigned (shown as a 1-tuple)
cluster_labels = assay_data_clustered_["embedding_cluster"]
cluster_labels.unique().shape

In [None]:
import numpy as np

# Tally how many assays fall into each embedding cluster
x = assay_data_clustered_.loc[:, "embedding_cluster"]
unique, counts = np.unique(x, return_counts=True)

# One row per cluster: [cluster label, number of assays]
cluster_size_table = np.asarray([unique, counts]).transpose()
print(cluster_size_table)

In [None]:
import numpy as np

# Count assays per embedding cluster
x = assay_data_clustered_.loc[:, "embedding_cluster"]
unique, counts = np.unique(x, return_counts=True)

# Report only the clusters holding more than 25 assays
is_large = counts > 25
for cluster, count in zip(unique[is_large], counts[is_large]):
    print(cluster, count)

In [None]:
# Print every assay description belonging to one specific cluster (here: cluster 15)
in_cluster = assay_data_clustered_["embedding_cluster"] == 15
for txt in assay_data_clustered_.loc[in_cluster, "Description"]:
    print(txt)

In [None]:
# Histogram of cluster labels as a sanity check on the clustering:
# the distribution should be relatively even; if it is not, the clustering is likely bad
import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.hist(assay_data_clustered_["embedding_cluster"])
ax.set_title("histogram")
plt.show()