In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty results table. Columns are <metric>_<clustering> for every combination,
# grouped by metric; 'lovain' is the spelling the rest of the notebook uses for
# the Louvain columns, so it is kept as-is.
df_metrics = pd.DataFrame(
    columns=[
        metric + '_' + clustering
        for metric in ('ARI', 'AMI', 'Homogeneity')
        for clustering in ('lovain', 'kmeans', 'hc')
    ]
)

In [3]:
# Directory layout for this run: feature matrices are read from path_fm,
# cluster assignments and metrics are written to path_clusters / path_metrics.
workdir = './bonemarrow_noisy02_results/'
path_fm = os.path.join(workdir,'feature_matrix/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
# Create the output directories (and any missing parents) without shelling out:
# portable, safe for paths containing spaces, and raises on real failures
# instead of returning an exit code that nobody checks.
os.makedirs(path_clusters, exist_ok=True)
os.makedirs(path_metrics, exist_ok=True)

0

In [4]:
# Number of clusters requested from k-means and hierarchical clustering.
# NOTE(review): presumably the number of cell types in the dataset - confirm.
num_clusters = 6
# File name (inside path_metrics) that the metrics table is saved under.
file_metric = 'metrics_bonemarrow_noisy02.csv'

In [5]:
# Turn on rpy2's pandas conversion layer before calling into R.
pandas2ri.activate()
# Handles to the R functions used below.
readRDS = robjects.r['readRDS']
r_data_frame = robjects.r['data.frame']
r_as_matrix = robjects.r['as.matrix']
# Read the serialized R object, then coerce it matrix -> data.frame on the R
# side and convert the result to a pandas object.
df_rds = readRDS('../../Datasets/Simulation/bonemarrow/bonemarrow_noisy_0.2.rds')
df_rds = pandas2ri.ri2py(r_data_frame(r_as_matrix(df_rds)))


  res = PandasDataFrame.from_items(items)


In [6]:
# Metadata table indexed by the column names of df_rds. The 'celltype' value
# is the part of each column name before the first underscore.
# NOTE(review): presumably each column is one cell named <celltype>_<id> - confirm.
df_metadata = pd.DataFrame(
    {'celltype': [column_name.split('_')[0] for column_name in df_rds.columns]},
    index=df_rds.columns,
)

In [7]:
# Feature-matrix files are the entries of path_fm whose name starts with 'FM'.
# NOTE(review): os.listdir returns entries in arbitrary order, so the order in
# which methods are processed follows the filesystem, not the file names.
files = list(filter(lambda entry: entry.startswith('FM'), os.listdir(path_fm)))
len(files)

18

In [8]:
# Show the feature-matrix file names that will be processed.
files

['FM_Control_BMnosiy02.tsv',
 'FM_BROCKMAN_BMnosiy02.tsv',
 'FM_LSI_BMnosiy02.tsv',
 'FM_cisTopic_BMnosiy02.tsv',
 'FM_chromVAR_BMnosiy02_kmers.tsv',
 'FM_chromVAR_BMnosiy02_motifs.tsv',
 'FM_chromVAR_BMnosiy02_kmers_pca.tsv',
 'FM_chromVAR_BMnosiy02_motifs_pca.tsv',
 'FM_GeneScoring_BMnosiy02.tsv',
 'FM_GeneScoring_BMnosiy02_pca.tsv',
 'FM_Cicero_BMnosiy02.tsv',
 'FM_Cicero_BMnosiy02_pca.tsv',
 'FM_SnapATAC_BMnosiy02.tsv',
 'FM_Scasat_BMnosiy02.tsv',
 'FM_scABC_BMnosiy02.tsv',
 'FM_scABC_BMnosiy02_pca.tsv',
 'FM_SCRAT_BMnosiy02.tsv',
 'FM_SCRAT_BMnosiy02_pca.tsv']

In [9]:
#dict_colors = {'HSC':'#00441B','MPP':'#46A040','CMP':'#FFC179','MEP':'#F6313E','P1':'#3b82ae','P2':'#547294',"P3":"#6d617a", 
#                    "P4":"#865160", "P5":"#9f4046", "P6":"#b8302c", "P7":"#d11f12", "P8":"#de1705"}
#dict_colors = {'CD4':'#0081C9','CD8':'#001588','CMP':'#FFC179','Ery':'#8F1336','HSC': '#00441B','NK':'#490C65'} 
# list_colors = list()
# for x in adata.obs['celltype']:
#     list_colors.append(dict_colors[x])
# adata.obs['celltype_color'] = list_colors

In [10]:
# For every feature matrix: cluster it with Louvain, k-means and hierarchical
# clustering, score each clustering against the 'celltype' labels, store the
# scores as one row of df_metrics and save the cluster assignments to disk.

# (adata.obs column holding the labels, suffix used in the df_metrics columns)
clusterings = (('louvain', 'lovain'), ('kmeans', 'kmeans'), ('hc', 'hc'))

for file in files:
    # File names look like FM_<method>_<dataset>[_<variant>...].<ext>; the row
    # label is <method> plus any variant tokens, skipping the dataset token.
    name_tokens = os.path.splitext(file)[0].split('_')
    method = '_'.join(name_tokens[1:2] + name_tokens[3:])
    print(method)
    # header=0 together with names= replaces the file's own header row, so the
    # columns are labelled with the index of df_metadata.
    fm_mat = pd.read_csv(os.path.join(path_fm,file),sep='\t',header=0,names=df_metadata.index)
    # Transpose so that the columns of fm_mat become the observations.
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = df_metadata.loc[adata.obs.index,]
    # Louvain, on a 15-nearest-neighbour graph computed directly from X
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
    sc.tl.louvain(adata)
    # k-means (fixed seed for repeatable labels)
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    # hierarchical (agglomerative) clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')
    # Clustering metrics: adjusted Rand index, adjusted mutual information and
    # homogeneity, each comparing the cell-type labels with one clustering.
    truth = adata.obs['celltype']
    scores = {}
    for obs_key, suffix in clusterings:
        predicted = adata.obs[obs_key]
        scores['ARI_' + suffix] = adjusted_rand_score(truth, predicted)
        scores['AMI_' + suffix] = adjusted_mutual_info_score(truth, predicted, average_method='arithmetic')
        scores['Homogeneity_' + suffix] = homogeneity_score(truth, predicted)
    # Write the whole row in one step, aligned on column names. The row only
    # appears once every score has been computed, and no "" placeholders end up
    # in the numeric columns.
    df_metrics.loc[method] = pd.Series(scores)
    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

Control
BROCKMAN
LSI
cisTopic
chromVAR_kmers
chromVAR_motifs
chromVAR_kmers_pca
chromVAR_motifs_pca
GeneScoring


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GeneScoring_pca
Cicero
Cicero_pca
SnapATAC
Scasat
scABC
scABC_pca
SCRAT
SCRAT_pca


In [11]:
# Show the metrics table: one row per method, one column per metric/clustering pair.
df_metrics

Unnamed: 0,ARI_lovain,ARI_kmeans,ARI_hc,AMI_lovain,AMI_kmeans,AMI_hc,Homogeneity_lovain,Homogeneity_kmeans,Homogeneity_hc
Control,0.621098,0.584772,0.600179,0.725243,0.718615,0.730611,0.770647,0.719173,0.727779
BROCKMAN,0.470165,0.492454,0.495961,0.613577,0.631445,0.662692,0.673418,0.630321,0.643514
LSI,0.832713,0.690402,0.703785,0.856452,0.79538,0.808632,0.856647,0.773493,0.780179
cisTopic,0.484049,0.551658,0.448938,0.617721,0.650094,0.630044,0.578082,0.651286,0.619538
chromVAR_kmers,0.471124,0.43072,0.41619,0.69987,0.579016,0.53646,0.548763,0.576935,0.533612
chromVAR_motifs,0.429585,0.351304,0.309445,0.561214,0.486006,0.399504,0.496113,0.487616,0.397724
chromVAR_kmers_pca,0.481874,0.429166,0.418962,0.655359,0.574486,0.550234,0.571177,0.576219,0.547765
chromVAR_motifs_pca,0.439587,0.34593,0.346701,0.574483,0.477018,0.433265,0.505222,0.478624,0.420158
GeneScoring,0.00323195,0.0688879,0.101819,0.00513762,0.142082,0.194013,0.0135763,0.11997,0.163015
GeneScoring_pca,0.19794,0.135241,0.10172,0.257704,0.220421,0.188547,0.29921,0.205379,0.170881


In [12]:
# Save the metrics table as CSV inside path_metrics. os.path.join gives the same
# path as before while no longer depending on path_metrics ending with a slash.
df_metrics.to_csv(os.path.join(path_metrics, file_metric))