In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty results table: one column per (metric, clustering algorithm) pair,
# ordered metric-major (all ARI columns, then AMI, then Homogeneity).
# The 'lovain' spelling is kept because the column labels are used verbatim elsewhere.
metric_prefixes = ('ARI', 'AMI', 'Homogeneity')
clustering_suffixes = ('lovain', 'kmeans', 'hc')
df_metrics = pd.DataFrame(
    columns=[prefix + '_' + suffix
             for prefix in metric_prefixes
             for suffix in clustering_suffixes]
)

In [3]:
# Root folder holding the inputs and outputs of this analysis.
workdir = './bonemarrow_clean_results/'
# Input: feature matrices (files named FM_*), read later in the notebook.
path_fm = os.path.join(workdir,'feature_matrix/')
# Outputs: per-method cluster assignments and the metrics table.
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')

# Create the output folders (and any missing parents) without spawning a shell:
# os.makedirs is portable, is not affected by spaces or shell metacharacters in
# the path, and raises on a real failure instead of returning an exit code that
# nobody checks. exist_ok=True keeps the cell safe to re-run.
for out_dir in (path_clusters, path_metrics):
    os.makedirs(out_dir, exist_ok=True)

0

In [4]:
# Number of clusters requested from k-means and hierarchical clustering.
num_clusters = 6
# File name of the CSV that receives the metrics table.
file_metric = 'metrics_bonemarrow_clean.csv'

In [5]:
# Enable rpy2's pandas conversion before handling any R object.
pandas2ri.activate()

# Handles to the R functions used below.
readRDS = robjects.r['readRDS']
r_as_matrix = robjects.r['as.matrix']
r_data_frame = robjects.r['data.frame']

# Read the serialized R object, coerce it matrix -> data.frame on the R side,
# then convert the result to a pandas DataFrame.
rds_object = readRDS('../../Datasets/Simulation/bonemarrow/bonemarrow_clean.rds')
df_rds = pandas2ri.ri2py(r_data_frame(r_as_matrix(rds_object)))


  res = PandasDataFrame.from_items(items)


In [6]:
# Per-column metadata: the cell-type label is the text before the first
# underscore of each column name of df_rds.
celltype_labels = [col_name.split('_')[0] for col_name in df_rds.columns]
df_metadata = pd.DataFrame({'celltype': celltype_labels}, index=df_rds.columns)

In [7]:
# Keep every entry of the feature-matrix folder whose name begins with 'FM'.
files = list(filter(lambda entry: entry.startswith('FM'), os.listdir(path_fm)))
len(files)

18

In [8]:
# Show the collected feature-matrix file names (in the order os.listdir returned them).
files

['FM_Control_BMclean.tsv',
 'FM_BROCKMAN_BMclean.tsv',
 'FM_LSI_BMclean.tsv',
 'FM_cisTopic_BMclean.tsv',
 'FM_chromVAR_BMclean_kmers.tsv',
 'FM_chromVAR_BMclean_motifs.tsv',
 'FM_chromVAR_BMclean_kmers_pca.tsv',
 'FM_chromVAR_BMclean_motifs_pca.tsv',
 'FM_GeneScoring_BMclean.tsv',
 'FM_GeneScoring_BMclean_pca.tsv',
 'FM_Cicero_BMclean.tsv',
 'FM_Cicero_BMclean_pca.tsv',
 'FM_SnapATAC_BMclean.tsv',
 'FM_Scasat_BMclean.tsv',
 'FM_scABC_BMclean.tsv',
 'FM_scABC_BMclean_pca.tsv',
 'FM_SCRAT_BMclean.tsv',
 'FM_SCRAT_BMclean_pca.tsv']

In [9]:
#dict_colors = {'HSC':'#00441B','MPP':'#46A040','CMP':'#FFC179','MEP':'#F6313E','P1':'#3b82ae','P2':'#547294',"P3":"#6d617a", 
#                    "P4":"#865160", "P5":"#9f4046", "P6":"#b8302c", "P7":"#d11f12", "P8":"#de1705"}
#dict_colors = {'CD4':'#0081C9','CD8':'#001588','CMP':'#FFC179','Ery':'#8F1336','HSC': '#00441B','NK':'#490C65'} 
# list_colors = list()
# for x in adata.obs['celltype']:
#     list_colors.append(dict_colors[x])
# adata.obs['celltype_color'] = list_colors

In [10]:
# Metric-column prefix -> scoring function called as scorer(true_labels, predicted_labels).
metric_scorers = [
    ('ARI', adjusted_rand_score),                      # adjusted Rand index
    ('AMI', lambda truth, pred: adjusted_mutual_info_score(truth, pred, average_method='arithmetic')),
    ('Homogeneity', homogeneity_score),
]
# Metric-column suffix -> adata.obs column holding that clustering.
# ('lovain' is the spelling used by the df_metrics column names.)
clustering_keys = [('lovain', 'louvain'), ('kmeans', 'kmeans'), ('hc', 'hc')]

for fm_file in files:
    # File names look like FM_<method>_<dataset>[_<variant>...].tsv
    name_parts = fm_file.split('_')
    dataset = name_parts[2].split('.')[0]
    method = name_parts[1]
    if len(name_parts) > 3:
        # Append the variant suffix (e.g. 'kmers_pca'), minus the file extension.
        variant = '_'.join(name_parts[3:]).split('.')[0]
        method = '_'.join((method, variant))
    print(method)

    # Load the feature matrix, relabel its columns with the names stored in
    # df_metadata.index, and transpose so those names index adata.obs.
    fm_mat = pd.read_csv(os.path.join(path_fm, fm_file), sep='\t', header=0, names=df_metadata.index)
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = df_metadata.loc[adata.obs.index,]

    # Create the row for this method before filling in the scores.
    df_metrics.loc[method,] = ""

    # Louvain clustering on a 15-nearest-neighbour graph computed from X itself.
    sc.pp.neighbors(adata, n_neighbors=15, use_rep='X')
    sc.tl.louvain(adata)

    # k-means, then hierarchical (agglomerative) clustering, each with num_clusters clusters.
    estimators = [
        ('kmeans', KMeans(n_clusters=num_clusters, random_state=0)),
        ('hc', AgglomerativeClustering(n_clusters=num_clusters)),
    ]
    for obs_key, estimator in estimators:
        cluster_labels = estimator.fit(adata.X).labels_
        adata.obs[obs_key] = pd.Series(cluster_labels, index=adata.obs.index).astype('category')

    # Score every clustering against the cell-type labels, one metric at a time.
    true_labels = adata.obs['celltype']
    for prefix, scorer in metric_scorers:
        metric_columns = [prefix + '_' + suffix for suffix, obs_key in clustering_keys]
        metric_values = [scorer(true_labels, adata.obs[obs_key]) for suffix, obs_key in clustering_keys]
        df_metrics.loc[method, metric_columns] = metric_values

    # Save the three cluster assignments for this method.
    cluster_columns = [obs_key for suffix, obs_key in clustering_keys]
    adata.obs[cluster_columns].to_csv(os.path.join(path_clusters, method + '_clusters.tsv'), sep='\t')

Control
BROCKMAN
LSI
cisTopic
chromVAR_kmers
chromVAR_motifs
chromVAR_kmers_pca
chromVAR_motifs_pca
GeneScoring


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GeneScoring_pca
Cicero
Cicero_pca
SnapATAC
Scasat
scABC
scABC_pca
SCRAT
SCRAT_pca


In [11]:
# Display the metrics table filled in by the clustering loop (one row per method).
df_metrics

Unnamed: 0,ARI_lovain,ARI_kmeans,ARI_hc,AMI_lovain,AMI_kmeans,AMI_hc,Homogeneity_lovain,Homogeneity_kmeans,Homogeneity_hc
Control,0.628747,0.641598,0.643455,0.73815,0.748854,0.766331,0.805433,0.749091,0.739945
BROCKMAN,0.552898,0.590986,0.581306,0.687584,0.70564,0.705595,0.774055,0.705947,0.697308
LSI,0.872591,0.744099,0.767224,0.909084,0.85214,0.873581,0.964938,0.826848,0.836618
cisTopic,0.944198,0.951095,0.923793,0.939813,0.944068,0.925704,0.940133,0.944368,0.925757
chromVAR_kmers,0.477218,0.631083,0.531254,0.720782,0.715327,0.642416,0.564475,0.716025,0.637126
chromVAR_motifs,0.473373,0.398188,0.378635,0.638185,0.555339,0.516852,0.562279,0.555846,0.517452
chromVAR_kmers_pca,0.493088,0.610011,0.496139,0.655705,0.708011,0.628545,0.577247,0.709475,0.622575
chromVAR_motifs_pca,0.478305,0.394793,0.406932,0.641398,0.550418,0.548449,0.56446,0.55264,0.537866
GeneScoring,0.00629564,0.19916,0.188128,0.00808917,0.293056,0.258134,0.018147,0.26782,0.242123
GeneScoring_pca,0.257835,0.149146,0.152961,0.314762,0.230005,0.241335,0.360456,0.219529,0.228297


In [12]:
# Write the metrics table to <path_metrics>/<file_metric>.
metrics_csv_path = os.path.join(path_metrics, file_metric)
df_metrics.to_csv(metrics_csv_path)