## Clustering using embeddings

### Load embeddings data

In [None]:
import numpy as np
import pandas as pd

# Read the assay table, including its 'embeddings' column, from the working directory
embeddings_csv = "assays_with_embeddings.csv"
processed_data_embeddings = pd.read_csv(embeddings_csv)
# Bare expression: renders the loaded frame as the cell output
processed_data_embeddings

### Clean data

In [None]:
import itertools

# The 'embeddings' column comes back from the CSV as text: a bracketed,
# whitespace-separated list of numbers that may span several lines
# (e.g. "[0.1 0.2\n 0.3]"). Each cell is parsed back into a 1-D float array.
#
# Parsing is done per string rather than with chained
# `df['embeddings'].replace(..., inplace=True)` calls: in-place methods on a
# column selection are deprecated in pandas and do not update the parent frame
# once Copy-on-Write is enabled.
new_embeddings = [
    # str.split() with no argument splits on any run of whitespace (spaces and
    # newlines alike) and never yields empty tokens, so no separate strip or
    # empty-string filtering step is required.
    np.array(text.replace('[', '').replace(']', '').split(), dtype=float)
    for text in processed_data_embeddings['embeddings']
]

# swap in the parsed arrays for the original strings
processed_data_embeddings['embeddings'] = new_embeddings

In [None]:
# Sanity check: print the shape of the parsed embedding stored under row label 3
sample_embedding = processed_data_embeddings.loc[3, 'embeddings']
print(sample_embedding.shape)

### Find embeddings distance and create distance matrix

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Worked example: cosine similarity between the embeddings stored under
# row labels 0 and 1000.
first_embedding = processed_data_embeddings['embeddings'][0]
second_embedding = processed_data_embeddings['embeddings'][1000]
# cosine_similarity expects 2-D input (samples x features), so each 1-D
# embedding is lifted to a single-row matrix before the call.
first_row = first_embedding[np.newaxis, :]
second_row = second_embedding[np.newaxis, :]
# The result is a 1x1 matrix; take its only entry.
similarity = cosine_similarity(first_row, second_row)[0, 0]
print(similarity)

In [None]:
# Rows were duplicated for each compound, so collapse to one row per assay to
# keep the pairwise comparison small. The full frame stays reachable under the
# trailing-underscore name; this is a second reference to the same object,
# not an independent copy.
processed_data_embeddings_ = processed_data_embeddings
processed_data_embeddings = processed_data_embeddings_.drop_duplicates(subset='assay_chembl_id')

In [None]:
# Assay IDs label both axes of the similarity matrix (two separate lists),
# while the embeddings themselves stay as a pandas Series.
similarity_cols = list(processed_data_embeddings['assay_chembl_id'])
similarity_index = similarity_cols.copy()
embeddings_list = processed_data_embeddings['embeddings']

In [None]:
# Number of assays to include in the similarity matrix.
# len(embeddings_list) uses every unique assay; lower it (e.g. 500-1000) for a
# quick trial run.
n = len(embeddings_list)

# Stack the first n embeddings into a single (n, embedding_dim) matrix.
# .iloc makes the positional slice explicit, since the deduplicated Series
# keeps the non-contiguous row labels of the original frame.
embedding_matrix = np.vstack(embeddings_list.iloc[:n].to_list())

# One call computes all n x n pairwise cosine similarities at once: entry
# [i, j] is the similarity between embedding i and embedding j. This replaces
# a Python double loop that issued n**2 separate cosine_similarity calls on
# single-row inputs (reported to take roughly an hour for ~5000 embeddings).
# similarity_frame is now an (n, n) NumPy array instead of a list of lists;
# pd.DataFrame accepts either form.
similarity_frame = cosine_similarity(embedding_matrix)

In [None]:
# Wrap the pairwise similarities in a DataFrame labelled by assay ID on both axes
cosine_similarity_matrix = pd.DataFrame(
    data=similarity_frame,
    index=similarity_index[:n],
    columns=similarity_cols[:n],
)
# Bare expression: renders the matrix as the cell output
cosine_similarity_matrix

### Clustering using similarity matrix

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into 100 groups; swap in another estimator or
# different parameters here if desired.
# NOTE(review): with these settings each row of the similarity matrix is passed
# to fit() as that assay's feature vector rather than as a precomputed
# distance - confirm this is the intended use.
clusterer = AgglomerativeClustering(n_clusters=100)
# fit() returns the fitted estimator itself
clustering = clusterer.fit(cosine_similarity_matrix)

In [None]:
# Keep only the assays that appear in the similarity matrix (all of them when
# n == len(embeddings_list), otherwise the first n).
in_similarity_matrix = processed_data_embeddings['assay_chembl_id'].isin(cosine_similarity_matrix.columns)
# The explicit .copy() makes data_subset an independent frame, so adding the
# cluster-label column to it afterwards does not raise pandas'
# SettingWithCopyWarning or depend on view-vs-copy semantics.
data_subset = processed_data_embeddings[in_similarity_matrix].copy()

In [None]:
# Attach the fitted cluster label of each assay as a new column;
# labels are assigned to the rows of data_subset positionally.
cluster_labels = clustering.labels_
data_subset['embedding_cluster'] = cluster_labels

In [None]:
# histogram plot to check distribution (should be relatively even, if not, clustering is likely bad)
from matplotlib import pyplot as plt 
import numpy as np  
   
# Use one bin per cluster label (edges at -0.5, 0.5, ..., max_label + 0.5).
# matplotlib's default of 10 bins would merge many clusters into each bar
# (about 10 per bar with 100 clusters) and hide uneven cluster sizes, which is
# exactly what this plot is meant to reveal.
label_bin_edges = np.arange(clustering.labels_.max() + 2) - 0.5
plt.hist(clustering.labels_, bins=label_bin_edges)
plt.title("histogram")
plt.show()

In [None]:
# Spot-check cluster 0: print every description assigned to it, one per line,
# to judge by eye whether the grouping is meaningful
in_cluster = data_subset['embedding_cluster'] == 0
for description in data_subset.loc[in_cluster, 'description']:
    print(description)

In [None]:
# Spot-check cluster 10: print every description assigned to it, one per line,
# to judge by eye whether the grouping is meaningful
in_cluster = data_subset['embedding_cluster'] == 10
for description in data_subset.loc[in_cluster, 'description']:
    print(description)

In [None]:
# Spot-check cluster 20: print every description assigned to it, one per line,
# to judge by eye whether the grouping is meaningful
in_cluster = data_subset['embedding_cluster'] == 20
for description in data_subset.loc[in_cluster, 'description']:
    print(description)

In [None]:
# Lookup table from assay ID to the cluster label it was assigned
keys = data_subset['assay_chembl_id']
vals = data_subset['embedding_cluster']
# Pair the two columns row by row; a repeated key would keep its last value
cluster_dict = {assay_id: cluster_label for assay_id, cluster_label in zip(keys, vals)}

In [None]:
# Assay ID of every row in the full (non-deduplicated) frame, as a Series
all_assay_IDs = processed_data_embeddings_['assay_chembl_id']

In [None]:
# Look up the cluster label for every row of the full frame, in row order.
# IDs that were not clustered (possible when n was set below the number of
# unique assays) get NaN instead of being skipped: skipping made this list
# shorter than the frame, so assigning it as a column failed with a length
# mismatch. One entry per ID keeps the labels aligned with their rows.
all_cluster_labels = [cluster_dict.get(ID, float("nan")) for ID in all_assay_IDs]

In [None]:
# Add the per-row cluster labels to the full (non-deduplicated) frame
expanded_labels = all_cluster_labels
processed_data_embeddings_["embedding_cluster"] = expanded_labels

In [None]:
# Bare expression: renders the full frame, now including the
# 'embedding_cluster' column, as the cell output
processed_data_embeddings_