In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics.cluster import homogeneity_score
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri

In [2]:
# Empty results table: one row will be added per method, one column per
# (metric, clustering) pair, grouped by metric.
metric_names = ['ARI', 'AMI', 'Homogeneity']
clustering_names = ['lovain', 'kmeans', 'hc']
df_metrics = pd.DataFrame(
    columns=[metric + '_' + clustering
             for metric in metric_names
             for clustering in clustering_names]
)

In [3]:
# Directory layout under the results folder:
#   feature_matrix/ is read by later cells (it is not created here),
#   clusters/ and metrics/ receive this notebook's outputs.
workdir = './erythropoesis_clean_results/'
path_fm = os.path.join(workdir,'feature_matrix/')
path_clusters = os.path.join(workdir,'clusters/')
path_metrics = os.path.join(workdir,'metrics/')

# Create the output directories without going through a shell: this works on
# any platform, raises on failure instead of returning an exit code, and is a
# no-op when the directory already exists.
for out_dir in (path_clusters, path_metrics):
    os.makedirs(out_dir, exist_ok=True)

In [4]:
# Run configuration.
# num_clusters: number of clusters requested from k-means and hierarchical clustering.
# file_metric:  file name of the metrics table written into path_metrics.
num_clusters = 12
file_metric = 'metrics_erythropoesis_clean.csv'

In [5]:
# Load the simulated dataset from its .rds file through R and bring it into
# pandas: R object -> as.matrix -> data.frame -> pandas DataFrame.
pandas2ri.activate()
readRDS = robjects.r['readRDS']
r_as_matrix = robjects.r['as.matrix']
r_data_frame = robjects.r['data.frame']

rds_object = readRDS('../../Datasets/Simulation/erythropoesis/erythropoesis_clean.rds')
df_rds = pandas2ri.ri2py(r_data_frame(r_as_matrix(rds_object)))


  res = PandasDataFrame.from_items(items)


In [6]:
# Per-cell metadata indexed by the column names of df_rds. The cell type is
# the part of each column name before the first underscore.
celltype_labels = df_rds.columns.str.split('_').str[0].tolist()
df_metadata = pd.DataFrame({'celltype': celltype_labels}, index=df_rds.columns)

In [7]:
# Feature matrices are the entries of path_fm whose name starts with 'FM'.
files = list(filter(lambda entry: entry.startswith('FM'), os.listdir(path_fm)))
len(files)

18

In [8]:
# Display the feature-matrix file names that the clustering loop will process.
files

['FM_Control_Eryclean.tsv',
 'FM_BROCKMAN_Eryclean.tsv',
 'FM_LSI_Eryclean.tsv',
 'FM_cisTopic_Eryclean.tsv',
 'FM_chromVAR_Eryclean_kmers.tsv',
 'FM_chromVAR_Eryclean_motifs.tsv',
 'FM_chromVAR_Eryclean_kmers_pca.tsv',
 'FM_chromVAR_Eryclean_motifs_pca.tsv',
 'FM_GeneScoring_Eryclean.tsv',
 'FM_GeneScoring_Eryclean_pca.tsv',
 'FM_Cicero_Eryclean.tsv',
 'FM_Cicero_Eryclean_pca.tsv',
 'FM_SnapATAC_Eryclean.tsv',
 'FM_Scasat_Eryclean.tsv',
 'FM_scABC_Eryclean.tsv',
 'FM_scABC_Eryclean_pca.tsv',
 'FM_SCRAT_Eryclean.tsv',
 'FM_SCRAT_Eryclean_pca.tsv']

In [9]:
#dict_colors = {'HSC':'#00441B','MPP':'#46A040','CMP':'#FFC179','MEP':'#F6313E','P1':'#3b82ae','P2':'#547294',"P3":"#6d617a", 
#                    "P4":"#865160", "P5":"#9f4046", "P6":"#b8302c", "P7":"#d11f12", "P8":"#de1705"}
#dict_colors = {'CD4':'#0081C9','CD8':'#001588','CMP':'#FFC179','Ery':'#8F1336','HSC': '#00441B','NK':'#490C65'} 
# list_colors = list()
# for x in adata.obs['celltype']:
#     list_colors.append(dict_colors[x])
# adata.obs['celltype_color'] = list_colors

In [10]:
# Scoring functions keyed by the prefix used in the df_metrics column names.
# Each is called as scorer(labels_true, labels_pred).
metric_scorers = {
    'ARI': adjusted_rand_score,  # adjusted Rand index
    'AMI': lambda truth, pred: adjusted_mutual_info_score(truth, pred, average_method='arithmetic'),
    'Homogeneity': homogeneity_score,
}
# df_metrics column suffix -> adata.obs column holding that clustering.
# ('lovain' is the spelling used by the existing df_metrics column names.)
cluster_columns = {'lovain': 'louvain', 'kmeans': 'kmeans', 'hc': 'hc'}

for file in files:
    # File names look like FM_<method>_<dataset>[_<variant>...].<ext>;
    # the label is <method>, plus any variant tokens with the extension removed.
    file_split = file.split('_')
    method = file_split[1]
    if len(file_split) > 3:
        method = method + '_' + '_'.join(file_split[3:]).split('.')[0]
    print(method)

    # The file's own header row is replaced by the cell names in df_metadata.
    # NOTE(review): this assumes the file's columns are in the same order as
    # df_metadata.index -- confirm against how the feature matrices are written.
    fm_mat = pd.read_csv(os.path.join(path_fm,file),sep='\t',header=0,names=df_metadata.index)
    # Transpose so that cells are observations (rows) and features are variables.
    adata = sc.AnnData(fm_mat.T)
    adata.var_names_make_unique()
    adata.obs = df_metadata.loc[adata.obs.index,]

    # Louvain on a 15-nearest-neighbour graph built directly on the feature
    # matrix; the labels are read back from adata.obs['louvain'] below.
    sc.pp.neighbors(adata, n_neighbors=15,use_rep='X')
    sc.tl.louvain(adata)
    # k-means with a fixed seed
    kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(adata.X)
    adata.obs['kmeans'] = pd.Series(kmeans.labels_,index=adata.obs.index).astype('category')
    # hierarchical (agglomerative) clustering
    hc = AgglomerativeClustering(n_clusters=num_clusters).fit(adata.X)
    adata.obs['hc'] = pd.Series(hc.labels_,index=adata.obs.index).astype('category')

    # Score every clustering against the known cell types and store the whole
    # row at once (no placeholder strings are written first). The Series is
    # aligned to df_metrics' columns by name.
    scores = {}
    for prefix, scorer in metric_scorers.items():
        for suffix, obs_key in cluster_columns.items():
            scores[prefix + '_' + suffix] = scorer(adata.obs['celltype'], adata.obs[obs_key])
    df_metrics.loc[method] = pd.Series(scores)

    # Save the three cluster assignments for this method.
    adata.obs[['louvain','kmeans','hc']].to_csv(os.path.join(path_clusters ,method + '_clusters.tsv'),sep='\t')

Control
BROCKMAN
LSI
cisTopic
chromVAR_kmers
chromVAR_motifs
chromVAR_kmers_pca
chromVAR_motifs_pca
GeneScoring


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


GeneScoring_pca
Cicero
Cicero_pca
SnapATAC
Scasat
scABC
scABC_pca
SCRAT
SCRAT_pca


In [11]:
# Display the metrics table: one row per method, one column per metric/clustering pair.
df_metrics

Unnamed: 0,ARI_lovain,ARI_kmeans,ARI_hc,AMI_lovain,AMI_kmeans,AMI_hc,Homogeneity_lovain,Homogeneity_kmeans,Homogeneity_hc
Control,0.675334,0.557691,0.655914,0.822063,0.751807,0.811664,0.804406,0.751899,0.805884
BROCKMAN,0.639125,0.537834,0.576181,0.814972,0.722333,0.750222,0.775316,0.725856,0.751454
LSI,0.771333,0.59339,0.669699,0.884286,0.782251,0.829786,0.847462,0.772054,0.810234
cisTopic,0.637157,0.638866,0.649876,0.84454,0.785071,0.789733,0.76138,0.786983,0.786667
chromVAR_kmers,0.36405,0.402035,0.394884,0.658258,0.588633,0.56649,0.526064,0.579511,0.566992
chromVAR_motifs,0.213969,0.179487,0.177086,0.395437,0.379945,0.365965,0.328583,0.390344,0.372967
chromVAR_kmers_pca,0.376414,0.491253,0.408893,0.680746,0.668015,0.602586,0.54356,0.673105,0.593645
chromVAR_motifs_pca,0.210264,0.197429,0.171264,0.393746,0.378278,0.356757,0.328589,0.389335,0.363676
GeneScoring,0.00193703,0.109126,0.17522,0.00237623,0.221779,0.295543,0.0133464,0.19905,0.280839
GeneScoring_pca,0.198715,0.170814,0.202343,0.362484,0.327493,0.34633,0.329254,0.33427,0.340341


In [12]:
# Write the metrics table to <path_metrics>/<file_metric>. os.path.join is used
# so the result is correct whether or not path_metrics ends with a separator.
df_metrics.to_csv(os.path.join(path_metrics, file_metric))