In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# deprecation warnings raised by pandas and scikit-learn - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [3]:
# Read the embeddings and the mode-of-action (MoA) annotations.
file_name = 'embeddings.csv'
df_embedding = pd.read_csv(file_name)
df_moa = pd.read_csv('moa.csv').loc[:, ['moa', 'Image_FileName_DAPI']]

# Left join: every embedding row is kept; rows without an annotation get NaN in `moa`.
df = df_embedding.merge(df_moa, how='left', on='Image_FileName_DAPI')

In [4]:
# List the distinct MoA labels present after the merge.
df['moa'].unique()



array(['DMSO', 'Actin disruptors', 'Microtubule stabilizers', nan],
      dtype=object)

Let's replace the NaN values in the `moa` column with "unknown".

In [5]:
# Label the rows that have no MoA annotation as 'unknown', then re-check the labels.
df['moa'] = df['moa'].fillna('unknown')
df['moa'].unique()

array(['DMSO', 'Actin disruptors', 'Microtubule stabilizers', 'unknown'],
      dtype=object)

In [36]:
def scout_kmeans(df, n_cluster = [4, 5, 6, 7, 8], init = ['k-means++', 'random'], max_iter = [300, 500, 1000], random_state = 0):
    """Scouts KMeans hyperparameters and scores each fit against the known mode of action (MoA).

    One KMeans model is fitted for every combination of n_cluster x init x max_iter.
    For each fit, every MoA is scored by the fraction of its rows that fall into its
    single most frequent cluster (1.0 means all rows of that MoA share one cluster).

    Arguments:
        - df: pd.DataFrame that contains the embedding data. It must have a 'moa' column.
          The features are selected by position as df.iloc[:, 4:-1], i.e. the first four
          columns and the last column are not used for clustering.
          Rows whose 'moa' is missing are clustered but not scored.
        - n_cluster: list. The values to be tested for hyperparameter n_clusters
        - init: list. The values to be tested for hyperparameter init
        - max_iter: list. The values to be tested for hyperparameter max_iter
        - random_state: value passed to KMeans as random_state for every fit
    Returns:
        - summary_df: pd.DataFrame object. One row per combination tested, with columns
          'n_cluster', 'init', 'max_iter' followed by one overlap column per MoA
          (in order of first appearance in df)."""
    # NOTE: the list defaults are only iterated, never mutated, so sharing them is safe.

    # get the values of interest from the df (positional slice, see docstring)
    matrix = df.iloc[:, 4:-1].values

    # MoA of every row, plus the distinct labels in order of first appearance;
    # that order fixes the order of the overlap columns in the summary
    moa_values = df['moa'].to_numpy()
    moa_order = list(df['moa'].dropna().unique())

    # one dict per hyperparameter combination; turned into a DataFrame once at the end
    records = []

    for cluster in n_cluster:
        for i in init:
            for iter_ in max_iter:
                print('testing {} clusters, {} inits, {} max_iter'.format(cluster, i, iter_))

                # fit kmeans for this combination
                km = KMeans(n_clusters = cluster, init = i, max_iter = iter_, random_state = random_state)
                km.fit(matrix)

                # pair each row's MoA with its cluster label; no copy of the full df needed
                assignments = pd.DataFrame({'moa': moa_values, 'cluster': km.labels_})

                # share of each MoA captured by its most frequent cluster.
                # value_counts sorts by descending frequency, so .iloc[0] is the top share;
                # normalising here avoids depending on any other column of df.
                overlap = (assignments.groupby('moa')['cluster']
                           .agg(lambda c: c.value_counts(normalize=True).iloc[0])
                           .reindex(moa_order))

                records.append({'n_cluster': cluster,
                                'init': i,
                                'max_iter': iter_,
                                **overlap.to_dict()})

    # explicit columns keep the layout stable even when no combination was tested
    summary_df = pd.DataFrame(records, columns=['n_cluster', 'init', 'max_iter', *moa_order])

    return summary_df
                

# Scout 4 to 10 clusters with both initialisation strategies at a fixed max_iter of 300.
summary_df = scout_kmeans(
    df,
    n_cluster=list(range(4, 11)),
    init=['k-means++', 'random'],
    max_iter=[300],
    random_state=0,
)

testing 4 clusters, k-means++ inits, 300 max_iter
testing 4 clusters, random inits, 300 max_iter
testing 5 clusters, k-means++ inits, 300 max_iter
testing 5 clusters, random inits, 300 max_iter
testing 6 clusters, k-means++ inits, 300 max_iter
testing 6 clusters, random inits, 300 max_iter
testing 7 clusters, k-means++ inits, 300 max_iter
testing 7 clusters, random inits, 300 max_iter
testing 8 clusters, k-means++ inits, 300 max_iter
testing 8 clusters, random inits, 300 max_iter
testing 9 clusters, k-means++ inits, 300 max_iter
testing 9 clusters, random inits, 300 max_iter
testing 10 clusters, k-means++ inits, 300 max_iter
testing 10 clusters, random inits, 300 max_iter


In [39]:
# Display the hyperparameter/overlap table returned by scout_kmeans.
summary_df

Unnamed: 0,n_cluster,init,max_iter,DMSO,Actin disruptors,Microtubule stabilizers,unknown
0,4,k-means++,300,0.75,0.34375,1.0,0.78125
1,4,random,300,0.666667,0.375,1.0,0.6875
2,5,k-means++,300,0.75,0.375,1.0,0.40625
3,5,random,300,0.916667,0.3125,1.0,0.5
4,6,k-means++,300,0.75,0.375,0.73913,0.40625
5,6,random,300,0.833333,0.40625,0.956522,0.65625
6,7,k-means++,300,0.666667,0.3125,0.956522,0.34375
7,7,random,300,0.5,0.3125,0.521739,0.8125
8,8,k-means++,300,0.5,0.3125,0.608696,0.53125
9,8,random,300,0.583333,0.3125,1.0,0.34375


In [40]:
# Refit a single configuration (5 clusters, random init) and inspect how the
# cluster labels are distributed within each MoA.
matrix = df.iloc[:, 4:-1].to_numpy()
km = KMeans(n_clusters=5, init='random', max_iter=300, random_state=0).fit(matrix)
labels = km.labels_

# Attach the labels to a copy so the original frame stays untouched.
df_copy = df.assign(cluster=labels)
df_copy.groupby('moa')['cluster'].value_counts()

moa                      cluster
Actin disruptors         0          10
                         2          10
                         4          10
                         3           2
DMSO                     2          11
                         0           1
Microtubule stabilizers  3          23
unknown                  1          16
                         2          15
                         0           1
Name: cluster, dtype: int64

In [41]:
# Fraction of each MoA's rows that fall into its most populated cluster.
# value_counts sorts by descending frequency, so .iloc[0] is the top share; normalising
# there avoids dividing by the non-null count of the unrelated `TableNumber` column.
df_copy.groupby('moa')['cluster'].agg(lambda c: c.value_counts(normalize=True).iloc[0])

moa
Actin disruptors           0.312500
DMSO                       0.916667
Microtubule stabilizers    1.000000
unknown                    0.500000
dtype: float64