In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty score table. Columns are every (metric, clustering algorithm) pair,
# grouped by metric; rows are added later, one per feature matrix.
df_metrics = pd.DataFrame(
    columns=[
        metric + '_' + algorithm
        for metric in ('ARI', 'AMI', 'Homogeneity')
        for algorithm in ('Louvain', 'kmeans', 'HC')
    ]
)

In [3]:
# Directory layout for this analysis, all relative to `workdir`:
#   feature_matrices/  input feature matrices (read by later cells)
#   clusters/          per-matrix cluster assignments (written by later cells)
#   metrics/           clustering score table (written by later cells)
workdir = './peaks_intensity_results/'
path_fm = os.path.join(workdir,'feature_matrices/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
# Create the output directories from Python rather than shelling out to
# `mkdir -p`: this is portable, safe for paths containing spaces, and raises on
# real failures instead of returning an ignored exit status.
os.makedirs(path_clusters, exist_ok=True)
os.makedirs(path_metrics, exist_ok=True)

0

In [4]:
# Load the tab-separated annotation table; its first column becomes the index.
metadata = pd.read_csv(
    '../../input/metadata.tsv',
    sep='\t',
    index_col=0,
)
# The number of distinct values in the 'label' column is the cluster count
# requested from every clustering algorithm below.
num_clusters = np.unique(metadata['label']).size
print(num_clusters)

10


In [5]:
# Collect the feature-matrix files: every entry of `path_fm` whose name starts
# with 'FM', in the order returned by os.listdir.
files = list(filter(lambda fname: fname.startswith('FM'), os.listdir(path_fm)))
len(files)

11

In [6]:
# Display the feature-matrix file names that the clustering loop will process.
files

['FM_control1.rds',
 'FM_control0.8.rds',
 'FM_control0.6.rds',
 'FM_control0.4.rds',
 'FM_control0.2.rds',
 'FM_control0.1.rds',
 'FM_control0.08.rds',
 'FM_control0.06.rds',
 'FM_control0.04.rds',
 'FM_control0.02.rds',
 'FM_control0.01.rds']

In [7]:
def getNClusters(adata,n_cluster,range_min=0,range_max=3,max_steps=20):
    """Search for a Louvain resolution that yields exactly `n_cluster` clusters.

    The resolution interval [range_min, range_max] is bisected: each step runs
    `sc.tl.louvain` at the midpoint, counts the distinct values in
    `adata.obs['louvain']`, and keeps the lower half when there are too many
    clusters or the upper half when there are too few.

    Parameters
    ----------
    adata
        Object passed to `sc.tl.louvain`; its `obs['louvain']` column is
        overwritten on every step, so on return it holds the labels of the
        last resolution tried.
    n_cluster : int
        Desired number of clusters.
    range_min, range_max : float
        Bounds of the resolution interval to search.
    max_steps : int
        Maximum number of bisection steps (must be at least 1).

    Returns
    -------
    (resolution, adata)
        The last resolution tried and the same `adata` object. If the target
        count was never hit, a message is printed and the solution of the
        final step is returned instead of an implicit None.

    Raises
    ------
    ValueError
        If `max_steps` is smaller than 1 (no clustering would be computed).
    """
    if max_steps < 1:
        # Without this guard the fall-through message below would reference
        # variables that were never assigned.
        raise ValueError('max_steps must be at least 1, got ' + str(max_steps))

    this_step = 0
    this_min = float(range_min)
    this_max = float(range_max)
    while this_step < max_steps:
        print('step ' + str(this_step))
        this_resolution = this_min + ((this_max-this_min)/2)
        sc.tl.louvain(adata,resolution=this_resolution)
        this_clusters = adata.obs['louvain'].nunique()

        print('got ' + str(this_clusters) + ' at resolution ' + str(this_resolution))

        # The cluster count is treated as increasing with resolution; that is
        # a heuristic, so the search may fail to converge.
        if this_clusters > n_cluster:
            this_max = this_resolution
        elif this_clusters < n_cluster:
            this_min = this_resolution
        else:
            return(this_resolution, adata)
        this_step += 1

    print('Cannot find the number of clusters')
    print('Clustering solution from last iteration is used:' + str(this_clusters) + ' at resolution ' + str(this_resolution))
    # Return the same shape as the success path so callers can always unpack.
    return(this_resolution, adata)

In [8]:
# The R bridge is loop-invariant: enable the pandas converter and look up the R
# functions once, rather than on every iteration.
pandas2ri.activate()
readRDS = robjects.r['readRDS']
r_as_matrix = robjects.r['as.matrix']
r_data_frame = robjects.r['data.frame']

# For every feature matrix: cluster with Louvain, k-means and hierarchical
# clustering, score each result against metadata['label'], and save the labels.
for file in files:
    # File names look like 'FM_<method>.rds'. Strip the extension with splitext
    # (rather than assuming it is exactly four characters long) and use the
    # token after the 'FM' prefix as the row label.
    file_split = os.path.splitext(file)[0].split('_')
    method = file_split[1]
    print(method)

    # Read the feature matrix from the RDS file and convert it to pandas.
    df_rds = readRDS(os.path.join(path_fm,file))
    fm_mat = pandas2ri.ri2py(r_data_frame(r_as_matrix(df_rds)))
    fm_mat.fillna(0,inplace=True)
    # NOTE(review): assumes the matrix columns are in the same order as the
    # metadata rows -- confirm against the code that produced the RDS files.
    fm_mat.columns = metadata.index

    # Transpose so that the former columns become the observations.
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = metadata.loc[adata.obs.index,]
    # Create the row for this matrix before filling in the individual scores.
    df_metrics.loc[method,] = ""

    # Louvain: build the neighbour graph on the raw matrix, then search for a
    # resolution giving `num_clusters` clusters (labels land in obs['louvain']).
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
    getNClusters(adata,n_cluster=num_clusters)
    # k-means (fixed seed for reproducibility)
    kmeans = KMeans(n_clusters=num_clusters, random_state=2019).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    # hierarchical clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')

    # Clustering metrics: compare each algorithm's labels with the reference
    # labels. The order (louvain, kmeans, hc) matches the column lists below.
    truth = adata.obs['label']
    predictions = [adata.obs[col] for col in ('louvain','kmeans','hc')]

    # adjusted Rand index
    df_metrics.loc[method,['ARI_Louvain','ARI_kmeans','ARI_HC']] = [
        adjusted_rand_score(truth, pred) for pred in predictions
    ]
    # adjusted mutual information
    df_metrics.loc[method,['AMI_Louvain','AMI_kmeans','AMI_HC']] = [
        adjusted_mutual_info_score(truth, pred, average_method='arithmetic')
        for pred in predictions
    ]
    # homogeneity
    df_metrics.loc[method,['Homogeneity_Louvain','Homogeneity_kmeans','Homogeneity_HC']] = [
        homogeneity_score(truth, pred) for pred in predictions
    ]

    # Persist the three label columns for this feature matrix.
    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

control1


  res = PandasDataFrame.from_items(items)


step 0
got 16 at resolution 1.5
step 1
got 12 at resolution 0.75
step 2
got 5 at resolution 0.375
step 3
got 10 at resolution 0.5625
control0.8


  res = PandasDataFrame.from_items(items)


step 0
got 18 at resolution 1.5
step 1
got 12 at resolution 0.75
step 2
got 5 at resolution 0.375
step 3
got 9 at resolution 0.5625
step 4
got 12 at resolution 0.65625
step 5
got 10 at resolution 0.609375
control0.6


  res = PandasDataFrame.from_items(items)


step 0
got 20 at resolution 1.5
step 1
got 11 at resolution 0.75
step 2
got 5 at resolution 0.375
step 3
got 11 at resolution 0.5625
step 4
got 9 at resolution 0.46875
step 5
got 9 at resolution 0.515625
step 6
got 9 at resolution 0.5390625
step 7
got 11 at resolution 0.55078125
step 8
got 10 at resolution 0.544921875
control0.4


  res = PandasDataFrame.from_items(items)


step 0
got 18 at resolution 1.5
step 1
got 12 at resolution 0.75
step 2
got 5 at resolution 0.375
step 3
got 11 at resolution 0.5625
step 4
got 8 at resolution 0.46875
step 5
got 10 at resolution 0.515625
control0.2


  res = PandasDataFrame.from_items(items)


step 0
got 17 at resolution 1.5
step 1
got 12 at resolution 0.75
step 2
got 5 at resolution 0.375
step 3
got 10 at resolution 0.5625
control0.1


  res = PandasDataFrame.from_items(items)


step 0
got 16 at resolution 1.5
step 1
got 10 at resolution 0.75
control0.08


  res = PandasDataFrame.from_items(items)


step 0
got 17 at resolution 1.5
step 1
got 11 at resolution 0.75
step 2
got 6 at resolution 0.375
step 3
got 8 at resolution 0.5625
step 4
got 9 at resolution 0.65625
step 5
got 10 at resolution 0.703125
control0.06


  res = PandasDataFrame.from_items(items)


step 0
got 14 at resolution 1.5
step 1
got 9 at resolution 0.75
step 2
got 12 at resolution 1.125
step 3
got 11 at resolution 0.9375
step 4
got 9 at resolution 0.84375
step 5
got 9 at resolution 0.890625
step 6
got 9 at resolution 0.9140625
step 7
got 9 at resolution 0.92578125
step 8
got 9 at resolution 0.931640625
step 9
got 10 at resolution 0.9345703125
control0.04


  res = PandasDataFrame.from_items(items)


step 0
got 13 at resolution 1.5
step 1
got 8 at resolution 0.75
step 2
got 10 at resolution 1.125
control0.02


  res = PandasDataFrame.from_items(items)


step 0
got 12 at resolution 1.5
step 1
got 6 at resolution 0.75
step 2
got 10 at resolution 1.125
control0.01


  res = PandasDataFrame.from_items(items)


step 0
got 11 at resolution 1.5
step 1
got 7 at resolution 0.75
step 2
got 8 at resolution 1.125
step 3
got 10 at resolution 1.3125


In [9]:
# Write the score table into the metrics directory.
scores_csv = os.path.join(path_metrics, 'clustering_scores.csv')
df_metrics.to_csv(scores_csv)

In [10]:
# Display the score table: one row per feature matrix, one column per
# (metric, clustering algorithm) pair.
df_metrics

Unnamed: 0,ARI_Louvain,ARI_kmeans,ARI_HC,AMI_Louvain,AMI_kmeans,AMI_HC,Homogeneity_Louvain,Homogeneity_kmeans,Homogeneity_HC
control1,0.179826,0.0234096,0.0314772,0.349128,0.0308791,0.0587904,0.362356,0.0324073,0.0540063
control0.8,0.198269,0.0236603,0.0225297,0.367389,0.0300587,0.0475972,0.384257,0.0319474,0.0451048
control0.6,0.152558,0.0237327,0.0422615,0.321883,0.0304497,0.0724074,0.333292,0.0321494,0.064689
control0.4,0.146743,0.023586,0.027486,0.318021,0.0304841,0.0402353,0.332556,0.0320985,0.040072
control0.2,0.182738,0.0246456,0.0231816,0.347361,0.0373328,0.0413083,0.361275,0.0403357,0.0405167
control0.1,0.133545,0.0261013,0.016136,0.267155,0.0470087,0.0425216,0.283382,0.0471308,0.0411577
control0.08,0.120395,0.0253612,0.0285373,0.247703,0.0367279,0.0370956,0.263102,0.0398583,0.037751
control0.06,0.0847967,0.0226808,0.0280202,0.208772,0.0297222,0.0455833,0.221557,0.0315745,0.0479385
control0.04,0.0919906,0.0261321,0.0284394,0.208795,0.0367532,0.0461805,0.223945,0.0398588,0.0450928
control0.02,0.0498526,0.0266725,0.0227106,0.126866,0.0374406,0.0317983,0.141032,0.0406044,0.0325217
