In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty results table: one column per (metric, clustering method) pair.
# Column order is metric-major: all ARI columns, then AMI, then Homogeneity.
metric_prefixes = ['ARI', 'AMI', 'Homogeneity']
clustering_suffixes = ['lovain', 'kmeans', 'hc']
metric_columns = [prefix + '_' + suffix
                  for prefix in metric_prefixes
                  for suffix in clustering_suffixes]
df_metrics = pd.DataFrame(columns=metric_columns)

In [3]:
# Directory layout for this run: feature matrices are read from path_fm,
# cluster assignments and metrics are written to path_clusters / path_metrics.
workdir = './erythropoesis_noisy04_results/'
path_fm = os.path.join(workdir,'feature_matrix/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
# Create the output directories from Python rather than shelling out to
# `mkdir -p`: no shell string building, works on any OS, and a failure
# raises an exception instead of being reported only as an exit code.
for output_dir in (path_clusters, path_metrics):
    os.makedirs(output_dir, exist_ok=True)

0

In [4]:
# Number of clusters requested from k-means and hierarchical clustering.
# NOTE(review): presumably the number of cell types in the simulation - confirm.
num_clusters = 12
# File name (inside path_metrics) that the metrics table is saved under.
file_metric = 'metrics_erythropoesis_noisy04.csv'

In [5]:
# Enable rpy2's pandas conversion, then load the simulated matrix from the
# RDS file and turn it into a pandas DataFrame.
pandas2ri.activate()
readRDS = robjects.r['readRDS']
r_as_matrix = robjects.r['as.matrix']
r_data_frame = robjects.r['data.frame']

df_rds = readRDS('../../Datasets/Simulation/erythropoesis/erythropoesis_noisy_0.4.rds')
# Coerce to a dense R matrix, wrap it in an R data.frame, then convert to pandas.
df_rds = pandas2ri.ri2py(r_data_frame(r_as_matrix(df_rds)))


  res = PandasDataFrame.from_items(items)


In [6]:
# Per-cell metadata indexed by the column names of df_rds. The cell type is
# the part of each column name before the first underscore.
cell_names = df_rds.columns
celltype_labels = [cell_name.split('_')[0] for cell_name in cell_names]
df_metadata = pd.DataFrame({'celltype': celltype_labels}, index=cell_names)

In [7]:
# Collect the feature-matrix files (names starting with 'FM') found in path_fm,
# in the order os.listdir returns them, and show how many were found.
files = list(filter(lambda file_name: file_name.startswith('FM'), os.listdir(path_fm)))
len(files)

18

In [8]:
# Display the feature-matrix file names collected from path_fm.
files

['FM_Control_Erynoisy04.tsv',
 'FM_BROCKMAN_Erynoisy04.tsv',
 'FM_LSI_Erynoisy04.tsv',
 'FM_cisTopic_Erynoisy04.tsv',
 'FM_chromVAR_Erynoisy04_kmers.tsv',
 'FM_chromVAR_Erynoisy04_motifs.tsv',
 'FM_chromVAR_Erynoisy04_kmers_pca.tsv',
 'FM_chromVAR_Erynoisy04_motifs_pca.tsv',
 'FM_GeneScoring_Erynoisy04.tsv',
 'FM_GeneScoring_Erynoisy04_pca.tsv',
 'FM_Cicero_Erynoisy04.tsv',
 'FM_Cicero_Erynoisy04_pca.tsv',
 'FM_SnapATAC_Erynoisy04.tsv',
 'FM_Scasat_Erynoisy04.tsv',
 'FM_scABC_Erynoisy04.tsv',
 'FM_scABC_Erynoisy04_pca.tsv',
 'FM_SCRAT_Erynoisy04.tsv',
 'FM_SCRAT_Erynoisy04_pca.tsv']

In [9]:
#dict_colors = {'HSC':'#00441B','MPP':'#46A040','CMP':'#FFC179','MEP':'#F6313E','P1':'#3b82ae','P2':'#547294',"P3":"#6d617a", 
#                    "P4":"#865160", "P5":"#9f4046", "P6":"#b8302c", "P7":"#d11f12", "P8":"#de1705"}
#dict_colors = {'CD4':'#0081C9','CD8':'#001588','CMP':'#FFC179','Ery':'#8F1336','HSC': '#00441B','NK':'#490C65'} 
# list_colors = list()
# for x in adata.obs['celltype']:
#     list_colors.append(dict_colors[x])
# adata.obs['celltype_color'] = list_colors

In [10]:
def method_from_filename(file_name):
    """Return the method label encoded in a feature-matrix file name.

    File names are expected to look like
    ``FM_<method>_<dataset>[_<variant>...].<ext>``. The label is ``<method>``
    followed by any variant tokens, joined with underscores; the dataset token
    and the file extension are dropped.

    Raises:
        ValueError: if the name has no ``<method>`` token after the prefix.
    """
    # splitext removes only the final extension, so variant tokens that
    # contain a dot are not truncated.
    stem = os.path.splitext(file_name)[0]
    parts = stem.split('_')
    if len(parts) < 2:
        raise ValueError('cannot parse method name from file: ' + file_name)
    return '_'.join([parts[1]] + parts[3:])


def adjusted_mutual_info_arithmetic(labels_true, labels_pred):
    """Adjusted mutual information using the arithmetic-mean normalisation."""
    return adjusted_mutual_info_score(labels_true, labels_pred, average_method='arithmetic')


# (column prefix, scoring function) pairs; each function takes
# (true labels, predicted labels).
metric_functions = [('ARI', adjusted_rand_score),              # adjusted Rand index
                    ('AMI', adjusted_mutual_info_arithmetic),  # adjusted mutual information
                    ('Homogeneity', homogeneity_score)]
# (column suffix, adata.obs column) pairs. The 'lovain' spelling is kept on
# purpose so the keys match the existing df_metrics column names.
clustering_columns = [('lovain', 'louvain'), ('kmeans', 'kmeans'), ('hc', 'hc')]

for file in files:
    method = method_from_filename(file)
    print(method)
    # header=0 together with names= replaces the file's own header row with
    # the cell names from df_metadata.
    fm_mat = pd.read_csv(os.path.join(path_fm,file),sep='\t',header=0,names=df_metadata.index)
    # Transpose so that cells become observations (rows of adata).
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = df_metadata.loc[adata.obs.index,]

    # Louvain clustering on a 15-nearest-neighbour graph built from adata.X
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
    sc.tl.louvain(adata)
    # k-means
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    # hierarchical clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')

    # Score every clustering against the known cell types.
    scores = {}
    for prefix, score_function in metric_functions:
        for suffix, obs_column in clustering_columns:
            scores[prefix + '_' + suffix] = score_function(adata.obs['celltype'], adata.obs[obs_column])
    # Write the whole row at once (aligned on column names), so a failure
    # above never leaves a row of empty-string placeholders in the table.
    df_metrics.loc[method] = pd.Series(scores)

    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

Control
BROCKMAN
LSI
cisTopic
chromVAR_kmers
chromVAR_motifs
chromVAR_kmers_pca
chromVAR_motifs_pca
GeneScoring


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GeneScoring_pca
Cicero
Cicero_pca
SnapATAC
Scasat
scABC
scABC_pca
SCRAT
SCRAT_pca


In [11]:
# Display the clustering metrics table (one row per method).
df_metrics

Unnamed: 0,ARI_lovain,ARI_kmeans,ARI_hc,AMI_lovain,AMI_kmeans,AMI_hc,Homogeneity_lovain,Homogeneity_kmeans,Homogeneity_hc
Control,0.49084,0.320551,0.408736,0.724214,0.549148,0.62007,0.618245,0.552567,0.618774
BROCKMAN,0.412712,0.368855,0.398771,0.632785,0.584616,0.616543,0.594597,0.591606,0.615464
LSI,0.46608,0.424971,0.422502,0.674312,0.633199,0.646904,0.619538,0.620661,0.624585
cisTopic,0.410425,0.366879,0.385679,0.643243,0.587753,0.587705,0.530944,0.590069,0.58102
chromVAR_kmers,0.17553,0.152891,0.142596,0.38028,0.310716,0.285681,0.270824,0.318135,0.292698
chromVAR_motifs,0.0348019,0.062142,0.0488341,0.0696394,0.137131,0.0991929,0.0682246,0.15456,0.117037
chromVAR_kmers_pca,0.187769,0.175249,0.142378,0.369111,0.34892,0.29158,0.286704,0.361179,0.303151
chromVAR_motifs_pca,0.0811831,0.063366,0.0425691,0.148586,0.125071,0.0902918,0.133929,0.143045,0.108083
GeneScoring,0.00174021,0.120981,0.0895474,0.00420637,0.229257,0.148245,0.0251994,0.191817,0.153057
GeneScoring_pca,0.0992469,0.104298,0.0740414,0.154774,0.158861,0.130406,0.166737,0.169774,0.144749


In [12]:
# Save the metrics table as CSV inside the metrics output directory.
metrics_csv_path = os.path.join(path_metrics, file_metric)
df_metrics.to_csv(metrics_csv_path)