In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty results table: one column per (metric, clustering) pair; rows are added per method later.
# Note: the 'lovain' spelling is part of the column names referenced by other cells, so it is kept.
metric_prefixes = ['ARI', 'AMI', 'Homogeneity']
clustering_suffixes = ['lovain', 'kmeans', 'hc']
df_metrics = pd.DataFrame(
    columns=[prefix + '_' + suffix
             for prefix in metric_prefixes
             for suffix in clustering_suffixes]
)

In [3]:
# Directory layout for this run, all rooted at `workdir`:
#   feature_matrix/ - input feature matrices read later (not created here)
#   clusters/       - per-method cluster assignments written by the clustering loop
#   metrics/        - summary metrics table
workdir = './erythropoesis_noisy02_results/'
path_fm = os.path.join(workdir,'feature_matrix/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')
# Create the output directories in-process instead of shelling out to `mkdir -p`:
# this is portable, raises on real failures, and is safe for paths containing
# spaces or shell metacharacters. exist_ok=True keeps the cell re-runnable.
for out_dir in (path_clusters, path_metrics):
    os.makedirs(out_dir, exist_ok=True)

0

In [4]:
# Number of clusters requested from k-means and hierarchical clustering.
# (Presumably the number of annotated cell types in this dataset - confirm.)
num_clusters = 12
# File name of the metrics table written into the metrics directory.
file_metric = 'metrics_erythropoesis_noisy02.csv'

In [5]:
# Enable rpy2's pandas conversion and grab handles to the R functions used below.
pandas2ri.activate()
readRDS = robjects.r['readRDS']
r_as_matrix = robjects.r['as.matrix']
r_data_frame = robjects.r['data.frame']

# Read the serialized R object, then coerce it in R (as.matrix -> data.frame)
# before converting the result to a pandas object with ri2py.
df_rds = readRDS('../../Datasets/Simulation/erythropoesis/erythropoesis_noisy_0.2.rds')
df_rds = pandas2ri.ri2py(r_data_frame(r_as_matrix(df_rds)))


  res = PandasDataFrame.from_items(items)


In [6]:
# Per-cell metadata indexed by the column names of df_rds.
# The cell-type label is the part of each column name before the first underscore.
celltype_labels = df_rds.columns.str.split('_').str[0].tolist()
df_metadata = pd.DataFrame({'celltype': celltype_labels}, index=df_rds.columns)

In [7]:
# Feature-matrix files are the entries of path_fm whose names start with 'FM'.
files = list(filter(lambda fname: fname.startswith('FM'), os.listdir(path_fm)))
# Show how many feature matrices were found.
len(files)

18

In [8]:
# Display the list of feature-matrix file names that the clustering loop will iterate over.
files

['FM_Control_Erynoisy02.tsv',
 'FM_BROCKMAN_Erynoisy02.tsv',
 'FM_LSI_Erynoisy02.tsv',
 'FM_cisTopic_Erynoisy02.tsv',
 'FM_chromVAR_Erynoisy02_kmers.tsv',
 'FM_chromVAR_Erynoisy02_motifs.tsv',
 'FM_chromVAR_Erynoisy02_kmers_pca.tsv',
 'FM_chromVAR_Erynoisy02_motifs_pca.tsv',
 'FM_GeneScoring_Erynoisy02.tsv',
 'FM_GeneScoring_Erynoisy02_pca.tsv',
 'FM_Cicero_Erynoisy02.tsv',
 'FM_Cicero_Erynoisy02_pca.tsv',
 'FM_SnapATAC_Erynoisy02.tsv',
 'FM_Scasat_Erynoisy02.tsv',
 'FM_scABC_Erynoisy02.tsv',
 'FM_scABC_Erynoisy02_pca.tsv',
 'FM_SCRAT_Erynoisy02.tsv',
 'FM_SCRAT_Erynoisy02_pca.tsv']

In [9]:
#dict_colors = {'HSC':'#00441B','MPP':'#46A040','CMP':'#FFC179','MEP':'#F6313E','P1':'#3b82ae','P2':'#547294',"P3":"#6d617a", 
#                    "P4":"#865160", "P5":"#9f4046", "P6":"#b8302c", "P7":"#d11f12", "P8":"#de1705"}
#dict_colors = {'CD4':'#0081C9','CD8':'#001588','CMP':'#FFC179','Ery':'#8F1336','HSC': '#00441B','NK':'#490C65'} 
# list_colors = list()
# for x in adata.obs['celltype']:
#     list_colors.append(dict_colors[x])
# adata.obs['celltype_color'] = list_colors

In [10]:
# (df_metrics column suffix, adata.obs column holding the cluster labels).
# 'lovain' is the spelling used in the df_metrics column names; 'louvain' is the obs column.
clusterings = [('lovain', 'louvain'), ('kmeans', 'kmeans'), ('hc', 'hc')]
# (df_metrics column prefix, scoring function, extra keyword arguments):
# adjusted Rand index, adjusted mutual information, homogeneity.
scorers = [
    ('ARI', adjusted_rand_score, {}),
    ('AMI', adjusted_mutual_info_score, {'average_method': 'arithmetic'}),
    ('Homogeneity', homogeneity_score, {}),
]

# For every feature matrix: cluster the cells three ways, score each clustering
# against the 'celltype' labels, record the scores in df_metrics and save the labels.
for fm_file in files:
    # File names look like FM_<method>_<dataset>[_<variant>...].tsv
    name_parts = fm_file.split('_')
    method = name_parts[1]
    dataset = name_parts[2].split('.')[0]
    if len(name_parts) > 3:
        # Append everything after the dataset token (extension dropped) to the method name.
        variant = '_'.join(name_parts[3:]).split('.')[0]
        method = '_'.join([method, variant])
    print(method)

    # Load the matrix with its columns relabelled by df_metadata's index, then
    # transpose so those labels become the AnnData observations.
    fm_path = os.path.join(path_fm, fm_file)
    fm_mat = pd.read_csv(fm_path, sep='\t', header=0, names=df_metadata.index)
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = df_metadata.loc[adata.obs.index,]
    # Create (or reset) this method's row before the scores are filled in.
    df_metrics.loc[method,] = ""

    # Louvain clustering on a 15-nearest-neighbour graph built directly from adata.X
    sc.pp.neighbors(adata, n_neighbors=15, use_rep='X')
    sc.tl.louvain(adata)

    # k-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(adata.X)
    kmeans_labels = pd.Series(kmeans.labels_, index=adata.obs.index)
    adata.obs['kmeans'] = kmeans_labels.astype('category')

    # hierarchical (agglomerative) clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    hc_labels = pd.Series(hc.labels_, index=adata.obs.index)
    adata.obs['hc'] = hc_labels.astype('category')

    # Score each clustering against the reference cell types, one metric at a time.
    reference = adata.obs['celltype']
    for prefix, score_fn, score_kwargs in scorers:
        metric_columns = [prefix + '_' + suffix for suffix, _ in clusterings]
        metric_values = [score_fn(reference, adata.obs[obs_key], **score_kwargs)
                         for _, obs_key in clusterings]
        df_metrics.loc[method, metric_columns] = metric_values

    # Persist the three label columns for this method.
    clusters_path = os.path.join(path_clusters, method + '_clusters.tsv')
    adata.obs[['louvain','kmeans','hc']].to_csv(clusters_path, sep='\t')

Control
BROCKMAN
LSI
cisTopic
chromVAR_kmers
chromVAR_motifs
chromVAR_kmers_pca
chromVAR_motifs_pca
GeneScoring


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GeneScoring_pca
Cicero
Cicero_pca
SnapATAC
Scasat
scABC
scABC_pca
SCRAT
SCRAT_pca


In [11]:
# Display the metrics table: one row per method, one column per (metric, clustering) pair.
df_metrics

Unnamed: 0,ARI_lovain,ARI_kmeans,ARI_hc,AMI_lovain,AMI_kmeans,AMI_hc,Homogeneity_lovain,Homogeneity_kmeans,Homogeneity_hc
Control,0.544332,0.452346,0.458572,0.743676,0.673295,0.681317,0.690554,0.67089,0.672394
BROCKMAN,0.595912,0.485748,0.519722,0.760082,0.695243,0.708187,0.7311,0.694942,0.705067
LSI,0.566812,0.466543,0.494391,0.754737,0.688944,0.710459,0.718352,0.668787,0.682226
cisTopic,0.538706,0.587413,0.525717,0.76254,0.744125,0.704136,0.671395,0.745063,0.704839
chromVAR_kmers,0.233987,0.345586,0.26855,0.487036,0.519764,0.450276,0.350566,0.521857,0.452756
chromVAR_motifs,0.17652,0.137565,0.101946,0.30593,0.265737,0.228283,0.253281,0.278864,0.239636
chromVAR_kmers_pca,0.296193,0.354575,0.265437,0.568101,0.543943,0.449636,0.43339,0.549798,0.452528
chromVAR_motifs_pca,0.142875,0.137592,0.120723,0.300887,0.276421,0.262542,0.265082,0.290582,0.274939
GeneScoring,-0.000615578,0.0907439,0.147936,-0.000626394,0.214534,0.230612,0.0167404,0.168084,0.225709
GeneScoring_pca,0.119578,0.132378,0.157512,0.223969,0.254133,0.259264,0.222783,0.260458,0.258191


In [12]:
# Write the metrics table to <path_metrics>/<file_metric>.
# os.path.join (as used for the cluster files) instead of string concatenation,
# so the path stays correct even if path_metrics has no trailing separator.
df_metrics.to_csv(os.path.join(path_metrics, file_metric))