In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty score table with one column per (metric, clustering algorithm) pair.
# Rows are added later, one per feature-matrix method.
df_metrics = pd.DataFrame(
    columns=[
        metric + '_' + algorithm
        for metric in ('ARI', 'AMI', 'Homogeneity')
        for algorithm in ('Louvain', 'kmeans', 'HC')
    ]
)

In [3]:
# Locations used by the notebook, all relative to the current working
# directory. The trailing slashes are kept so the values stay identical for
# any code that appends file names to them.
workdir = './output/'
path_fm = os.path.join(workdir,'feature_matrices/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')

# Create the two output directories (and any missing parents) without
# spawning a shell: portable across platforms, raises on real failures
# instead of returning an ignored exit status, and is a no-op when the
# directory already exists.
os.makedirs(path_clusters, exist_ok=True)
os.makedirs(path_metrics, exist_ok=True)

0

In [4]:
# Tab-separated metadata table; its first column becomes the index and it
# must provide a 'label' column.
metadata = pd.read_csv('./input/metadata.tsv', index_col=0, sep='\t')

# Number of distinct values in the 'label' column.
num_clusters = np.unique(metadata['label']).size

In [5]:
# Keep every directory entry under path_fm whose name begins with 'FM',
# in the order os.listdir reports them, then show how many were found.
files = list(filter(lambda entry: entry.startswith('FM'), os.listdir(path_fm)))
len(files)

17

In [6]:
# Display the collected file names (entries under path_fm starting with 'FM').
files

['FM_Control_Erynoisyp4.rds',
 'FM_BROCKMAN_Erynoisyp4.rds',
 'FM_Cusanovich2018_Erynoisyp4.rds',
 'FM_cisTopic_Erynoisyp4.rds',
 'FM_chromVAR_Erynoisyp4_kmers.rds',
 'FM_chromVAR_Erynoisyp4_motifs.rds',
 'FM_chromVAR_Erynoisyp4_kmers_pca.rds',
 'FM_chromVAR_Erynoisyp4_motifs_pca.rds',
 'FM_GeneScoring_Erynoisyp4.rds',
 'FM_GeneScoring_Erynoisyp4_pca.rds',
 'FM_Cicero_Erynoisyp4.rds',
 'FM_Cicero_Erynoisyp4_pca.rds',
 'FM_SnapATAC_Erynoisyp4.rds',
 'FM_Scasat_Erynoisyp4.rds',
 'FM_scABC_Erynoisyp4.rds',
 'FM_SCRAT_Erynoisyp4.rds',
 'FM_SCRAT_Erynoisyp4_pca.rds']

In [7]:
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
    """Bisect the Louvain resolution until `n_cluster` clusters are obtained.

    Each step runs ``sc.tl.louvain`` at the midpoint of the current
    resolution interval, counts the distinct values in
    ``adata.obs['louvain']`` and then halves the interval: the upper bound
    moves down when there are too many clusters, the lower bound moves up
    when there are too few. The search assumes the cluster count grows with
    the resolution; progress is printed at every step.

    Parameters
    ----------
    adata
        Object handed to ``sc.tl.louvain``; its ``obs['louvain']`` entry is
        read after every run, so it always holds the latest clustering.
    n_cluster : int
        Desired number of clusters.
    range_min, range_max : float
        Bounds of the resolution interval to search.
    max_steps : int
        Maximum number of bisection steps; must be at least 1.

    Returns
    -------
    tuple
        ``(resolution, adata)`` where ``resolution`` is the value used for
        the last Louvain run. When the target count is not reached within
        ``max_steps`` a notice is printed and the last solution tried is
        returned (previously this path returned ``None``).

    Raises
    ------
    ValueError
        If ``max_steps`` is smaller than 1, because no clustering would be
        run and there would be nothing to report or return.
    """
    if max_steps < 1:
        raise ValueError('max_steps must be at least 1')

    this_min = float(range_min)
    this_max = float(range_max)
    for this_step in range(max_steps):
        print('step ' + str(this_step))
        # Midpoint of the current search interval.
        this_resolution = this_min + ((this_max-this_min)/2)
        sc.tl.louvain(adata,resolution=this_resolution)
        this_clusters = adata.obs['louvain'].nunique()

        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))

        if this_clusters == n_cluster:
            return(this_resolution, adata)
        if this_clusters > n_cluster:
            # Too many clusters: search the lower half.
            this_max = this_resolution
        else:
            # Too few clusters: search the upper half.
            this_min = this_resolution

    print('Cannot find the number of clusters')
    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))
    # Return the same shape as the success path so callers can always unpack.
    return(this_resolution, adata)

In [8]:
# rpy2 setup does not depend on the file being processed, so do it once:
# enable the pandas <-> R conversion and look up R's readRDS function.
pandas2ri.activate()
readRDS = robjects.r['readRDS']

for file in files:
    # Build the row label from the file name: the second '_'-separated field,
    # plus any fields after the third one (with the extension stripped),
    # e.g. 'FM_chromVAR_Erynoisyp4_kmers_pca.rds' -> 'chromVAR_kmers_pca'.
    file_split = file.split('_')
    method = file_split[1]
    if len(file_split) > 3:
        method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
    print(method)

    # Read the R object and convert it to a pandas DataFrame by way of
    # R's as.matrix / data.frame.
    df_rds = readRDS(os.path.join(path_fm,file))
    fm_mat = pandas2ri.ri2py(robjects.r['data.frame'](robjects.r['as.matrix'](df_rds)))
    # Columns are relabelled positionally with the metadata index.
    # NOTE(review): this assumes the matrix columns are in the same order as
    # the metadata rows -- confirm against how the matrices were produced.
    fm_mat.columns = metadata.index

    # Transpose so the metadata index entries become the AnnData observations.
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = metadata.loc[adata.obs.index,]
    # Pre-create this method's row; the scores are filled in further down.
    df_metrics.loc[method,] = ""

    # Louvain: getNClusters leaves its latest clustering in adata.obs['louvain'].
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
    getNClusters(adata,n_cluster=num_clusters)
    # k-means
    kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    # hierarchical clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')

    # Clustering metrics, each comparing the 'label' column with one clustering.

    # adjusted Rand index
    ari_louvain = adjusted_rand_score(adata.obs['label'], adata.obs['louvain'])
    ari_kmeans = adjusted_rand_score(adata.obs['label'], adata.obs['kmeans'])
    ari_hc = adjusted_rand_score(adata.obs['label'], adata.obs['hc'])
    # adjusted mutual information
    ami_louvain = adjusted_mutual_info_score(adata.obs['label'], adata.obs['louvain'],average_method='arithmetic')
    ami_kmeans = adjusted_mutual_info_score(adata.obs['label'], adata.obs['kmeans'],average_method='arithmetic')
    ami_hc = adjusted_mutual_info_score(adata.obs['label'], adata.obs['hc'],average_method='arithmetic')
    # homogeneity
    homo_louvain = homogeneity_score(adata.obs['label'], adata.obs['louvain'])
    homo_kmeans = homogeneity_score(adata.obs['label'], adata.obs['kmeans'])
    homo_hc = homogeneity_score(adata.obs['label'], adata.obs['hc'])

    df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [ari_louvain,ari_kmeans,ari_hc]
    df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [ami_louvain,ami_kmeans,ami_hc]
    df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [homo_louvain,homo_kmeans,homo_hc]
    # Save the three cluster assignments for this method as a TSV file.
    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

Control


  res = PandasDataFrame.from_items(items)


step 0
got 9 at resolution 1.5
step 1
got 13 at resolution 2.25
step 2
got 9 at resolution 1.875
step 3
got 11 at resolution 2.0625
step 4
got 12 at resolution 2.15625
BROCKMAN


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 11 at resolution 2.25
step 2
got 14 at resolution 2.625
step 3
got 13 at resolution 2.4375
step 4
got 12 at resolution 2.34375
Cusanovich2018


  res = PandasDataFrame.from_items(items)


step 0
got 9 at resolution 1.5
step 1
got 13 at resolution 2.25
step 2
got 10 at resolution 1.875
step 3
got 10 at resolution 2.0625
step 4
got 12 at resolution 2.15625
cisTopic


  res = PandasDataFrame.from_items(items)


step 0
got 7 at resolution 1.5
step 1
got 10 at resolution 2.25
step 2
got 11 at resolution 2.625
step 3
got 12 at resolution 2.8125
chromVAR_kmers


  res = PandasDataFrame.from_items(items)


step 0
got 6 at resolution 1.5
step 1
got 17 at resolution 2.25
step 2
got 10 at resolution 1.875
step 3
got 14 at resolution 2.0625
step 4
got 11 at resolution 1.96875
step 5
got 13 at resolution 2.015625
step 6
got 14 at resolution 1.9921875
step 7
got 11 at resolution 1.98046875
step 8
got 13 at resolution 1.986328125
step 9
got 12 at resolution 1.9833984375
chromVAR_motifs


  res = PandasDataFrame.from_items(items)


step 0
got 8 at resolution 1.5
step 1
got 18 at resolution 2.25
step 2
got 12 at resolution 1.875
chromVAR_kmers_pca


  res = PandasDataFrame.from_items(items)


step 0
got 6 at resolution 1.5
step 1
got 12 at resolution 2.25
chromVAR_motifs_pca


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 16 at resolution 2.25
step 2
got 11 at resolution 1.875
step 3
got 15 at resolution 2.0625
step 4
got 13 at resolution 1.96875
step 5
got 12 at resolution 1.921875
GeneScoring


  res = PandasDataFrame.from_items(items)


step 0
got 33 at resolution 1.5
step 1
got 2 at resolution 0.75
step 2
got 15 at resolution 1.125
step 3
got 5 at resolution 0.9375
step 4
got 10 at resolution 1.03125
step 5
got 13 at resolution 1.078125
step 6
got 11 at resolution 1.0546875
step 7
got 11 at resolution 1.06640625
step 8
got 12 at resolution 1.072265625
GeneScoring_pca


  res = PandasDataFrame.from_items(items)


step 0
got 13 at resolution 1.5
step 1
got 6 at resolution 0.75
step 2
got 10 at resolution 1.125
step 3
got 11 at resolution 1.3125
step 4
got 11 at resolution 1.40625
step 5
got 11 at resolution 1.453125
step 6
got 12 at resolution 1.4765625
Cicero


  res = PandasDataFrame.from_items(items)


step 0
got 37 at resolution 1.5
step 1
got 1 at resolution 0.75
step 2
got 18 at resolution 1.125
step 3
got 7 at resolution 0.9375
step 4
got 14 at resolution 1.03125
step 5
got 11 at resolution 0.984375
step 6
got 10 at resolution 1.0078125
step 7
got 12 at resolution 1.01953125
Cicero_pca


  res = PandasDataFrame.from_items(items)


step 0
got 13 at resolution 1.5
step 1
got 7 at resolution 0.75
step 2
got 9 at resolution 1.125
step 3
got 11 at resolution 1.3125
step 4
got 13 at resolution 1.40625
step 5
got 12 at resolution 1.359375
SnapATAC


  res = PandasDataFrame.from_items(items)


step 0
got 11 at resolution 1.5
step 1
got 12 at resolution 2.25
Scasat


  res = PandasDataFrame.from_items(items)


step 0
got 7 at resolution 1.5
step 1
got 11 at resolution 2.25
step 2
got 14 at resolution 2.625
step 3
got 12 at resolution 2.4375
scABC


  res = PandasDataFrame.from_items(items)


step 0
got 8 at resolution 1.5
step 1
got 28 at resolution 2.25
step 2
got 17 at resolution 1.875
step 3
got 13 at resolution 1.6875
step 4
got 8 at resolution 1.59375
step 5
got 10 at resolution 1.640625
step 6
got 11 at resolution 1.6640625
step 7
got 9 at resolution 1.67578125
step 8
got 8 at resolution 1.681640625
step 9
got 9 at resolution 1.6845703125
step 10
got 9 at resolution 1.68603515625
step 11
got 9 at resolution 1.686767578125
step 12
got 9 at resolution 1.6871337890625
step 13
got 13 at resolution 1.68731689453125
step 14
got 13 at resolution 1.687225341796875
step 15
got 13 at resolution 1.6871795654296875
step 16
got 9 at resolution 1.6871566772460938
step 17
got 13 at resolution 1.6871681213378906
step 18
got 13 at resolution 1.6871623992919922
step 19
got 13 at resolution 1.687159538269043
Cannot find the number of clusters
Clustering solution from last iteration is used:13 at resolution 1.687159538269043
SCRAT


  res = PandasDataFrame.from_items(items)


step 0
got 9 at resolution 1.5
step 1
got 13 at resolution 2.25
step 2
got 10 at resolution 1.875
step 3
got 11 at resolution 2.0625
step 4
got 12 at resolution 2.15625
SCRAT_pca


  res = PandasDataFrame.from_items(items)


step 0
got 10 at resolution 1.5
step 1
got 15 at resolution 2.25
step 2
got 11 at resolution 1.875
step 3
got 12 at resolution 2.0625


In [9]:
# Write the score table to clustering_scores.csv inside path_metrics.
# os.path.join yields the correct path whether or not path_metrics ends with
# a path separator, which plain string concatenation does not.
df_metrics.to_csv(os.path.join(path_metrics, 'clustering_scores.csv'))

In [10]:
# Display the table of clustering scores (one row per method).
df_metrics

Unnamed: 0,ARI_Louvain,ARI_kmeans,ARI_HC,AMI_Louvain,AMI_kmeans,AMI_HC,Homogeneity_Louvain,Homogeneity_kmeans,Homogeneity_HC
Control,0.65972,0.636097,0.591576,0.79335,0.780363,0.756768,0.789569,0.77741,0.7553
BROCKMAN,0.641155,0.532434,0.576959,0.784956,0.73782,0.758699,0.78375,0.736047,0.756478
Cusanovich2018,0.692886,0.592626,0.618372,0.814455,0.787994,0.771733,0.817862,0.777585,0.760077
cisTopic,0.721536,0.720576,0.654783,0.824928,0.838678,0.784448,0.826919,0.835139,0.787615
chromVAR_kmers,0.392939,0.362402,0.375366,0.552959,0.565858,0.539151,0.54188,0.562366,0.542549
chromVAR_motifs,0.177749,0.212574,0.167491,0.359959,0.4059,0.325384,0.364856,0.415697,0.337029
chromVAR_kmers_pca,0.407467,0.405397,0.3525,0.554217,0.583439,0.53764,0.547964,0.590954,0.541055
chromVAR_motifs_pca,0.214423,0.184863,0.151007,0.394982,0.374467,0.324611,0.399325,0.386423,0.334291
GeneScoring,0.00284294,0.1755,0.175657,0.00441611,0.289236,0.282009,0.0252127,0.266834,0.279538
GeneScoring_pca,0.149945,0.184965,0.197432,0.327187,0.334777,0.34613,0.334635,0.33875,0.346018
