## Make and plot cluster-specific markers
This notebook compares each cluster with all the remaining clusters (as a whole) and pulls out the top X markers for each cluster.

In [1]:
import pandas as pd
import numpy as np
import cellstates as cs
from cellstates.chelpers import marker_scores
import scipy.io as sio
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pickle as pkl

In [2]:
# Base directory of this analysis run; the cells below read inputs from it and
# write marker lists and plots underneath it.
path='/scicore/home/doetsch/GROUP/scigrp/vargasSingleCell3_2males_added/'
# Base directory of the older run; only its promoter/gene table is read from
# here (second loadGeneNames call further down).
pathold='/scicore/home/doetsch/GROUP/scigrp/vargasSingleCell3/'

In [3]:
# Restore the pickled analysis state. The objects are read back one by one, so
# the order of the loads must match the order in which they were dumped.
# NOTE: unpickling can execute arbitrary code -- only load files produced by
# this project's own pipeline.
# NOTE(review): the file name is spelled "varagas..." while the directories
# are spelled "vargas..." -- confirm this is the intended file.
with open("varagasSingleCell3_2males_added.pkl",'rb') as pklfile:
    # The context manager closes the file even if one of the loads fails.
    df=pkl.load(pklfile)
    clusters=pkl.load(pklfile)
    hierarchy_df=pkl.load(pklfile)
    score_df=pkl.load(pklfile)
    annotation=pkl.load(pklfile)
    n_scale=pkl.load(pklfile)
    lmbd=pkl.load(pklfile)

# Integer NumPy copy of df; this is the array handed to cs.Cluster below.
data = df.to_numpy().astype(int)

In [4]:
# Build the cellstates Cluster object from the loaded data, the lmbd parameter
# and the stored cluster assignment. max_clusters leaves room for every
# cluster label from 0 up to the largest one present.
clst = cs.Cluster(
    data,
    lmbd,
    clusters,
    max_clusters=max(clusters)+1,
    num_threads=12,
    n_cache=1000,
)

This sets the colors for the top row of the ballplot.

In [5]:
# Colour assigned to each annotation label.
colordict = {
    "Tom": "#1f77b4",
    "Adam": "#17bec7",
    "Viole": "#e377c2",
    "Ana": "#d62728",
    "Eve": "#a62728",
    "Fiona": "#d41f7d",
    "John": "#4287f5",
    "Melvin": "#03255c",
}
# One colour per distinct annotation, in the sorted order np.unique returns;
# a label missing from colordict yields None.
colors = [colordict.get(label) for label in np.unique(annotation)]
print(np.unique(annotation))
# Distinct cluster labels and the number of cells carrying each of them.
cl, clsizes = np.unique(clusters, return_counts=True)

['Adam' 'Ana' 'Eve' 'Fiona' 'John' 'Melvin' 'Tom' 'Viole']


In [6]:
# Run utils.py inside the notebook namespace so that the names it defines
# become notebook globals (helpers such as loadGeneNames and makeBallPlot are
# not imported anywhere else, so they presumably come from this file).
with open("/scicore/home/doetsch/GROUP/scigrp/vargasSingleCell3/utils.py") as f:
    # Compiling with the real file name makes tracebacks raised inside the
    # helpers point at utils.py and its line numbers instead of "<string>".
    exec(compile(f.read(), f.name, "exec"))

In [7]:
# Load the promoter/gene tables of the current run, then extend the same
# tables with the entries of the older run.
(promidtable,genetable)=loadGeneNames(path+"OUTPUT/concat_mat/prom_expr_promoters_w_genes.tsv")
(promidtable,genetable)=loadGeneNames(pathold+"OUTPUT/concat_mat/prom_expr_promoters_w_genes.tsv",promidtable,genetable)

promids = score_df.columns.values

# Build two parallel label lists for the promoter columns of score_df:
#   geneids                - the gene name, or the promoter id if no gene is found
#   geneidswithpromoterids - "<gene>_<promoter id>", or the bare promoter id
# Both lists are filled in a single pass so genetable is scanned only once per
# promoter. The first gene whose genetable entry contains the promoter id wins.
# NOTE(review): the first three columns of score_df are skipped -- presumably
# they are not promoter columns; confirm against how score_df was built.
geneids = []
geneidswithpromoterids = []
for promid in promids[3:]:
    for gene in genetable:
        if promid in genetable[gene]:
            geneids.append(gene)
            geneidswithpromoterids.append(gene+"_"+promid)
            break
    else:
        # No gene lists this promoter: fall back to the promoter id itself.
        geneids.append(promid)
        geneidswithpromoterids.append(promid)

In [8]:
nc = 96 # number of clusters

# Cluster assignment after applying the hierarchy up to the point where nc
# clusters remain, and the tree spanned by the last nc-1 merge steps.
merged_clusters = cs.clusters_from_hierarchy(
    hierarchy_df,
    cluster_init=clusters,
    steps=1 - nc,
)
newick_string = cs.hierarchy_to_newick(
    hierarchy_df[1 - nc:],
    merged_clusters,
    cell_leaves=False,
)
t = Tree(newick_string, format=1)
ts = get_TreeStyle_attributes(
    t,
    merged_clusters,
    annotation,
    colors=colors,
    leaf_scale=0.05,
    normalize=True,
    showInternalNodeNames=True,
)

# Map every leaf name of the tree to "merged<nc>C<position>_<leaf name>",
# where the position follows the order in which the tree yields its leaves.
new_leaf_names = ["merged"+str(nc)+"C"+str(i) for i in np.arange(nc)]
name_dict = {
    leaf: label+"_"+leaf
    for leaf, label in zip(t.iter_leaf_names(), new_leaf_names)
}

In [9]:
# Read the supercluster label listed on each line of the file and keep every
# distinct label once, in order of first appearance.
with open(path+"averaging/superclusters96.txt",'r') as super96f:
    stripped_lines = (line.strip() for line in super96f)
    # dict.fromkeys de-duplicates while preserving insertion order, which
    # avoids rescanning a growing list for every line. Blank lines (e.g. a
    # trailing newline at the end of the file) are skipped instead of
    # crashing int().
    clusters96 = list(dict.fromkeys(int(label) for label in stripped_lines if label))

The following cell compares each cluster (at a given level of superclustering) to all the other clusters and records the top 30 markers each time. At the end, a list of these markers, stripped of duplicates, is written.

In [None]:
# Marker scores of every supercluster against all remaining clusters.
# c1 holds the supercluster's own id plus every cluster merged into it; c2 is
# every other cluster id. All superclusters are handled the same way, so the
# first one also contributes its own id to c1 (it used to be left out there
# and therefore ended up in the comparison set c2).
# NOTE(review): the full hierarchy_df is passed to resolveHierarchy here,
# while the per-subcluster cell at the end of the notebook passes
# hierarchy_df[:-96]; confirm which one is intended.
cl_marker_table=[]
for cluster in clusters96:
    c1=[cluster]
    c1.extend(resolveHierarchy(hierarchy_df,cluster))
    c1_members=set(c1)  # constant-time membership test for the filter below
    c2=[c for c in range(len(clsizes)) if c not in c1_members]
    cl_marker_table.append(marker_scores(clst,c1,c2))

# One row per supercluster (same order as clusters96), one column per gene.
cl_marker_table = pd.DataFrame(cl_marker_table, columns=geneids)

# Write one line per supercluster: "<id> <tree label> <comma-separated top 30>".
# Genes are ranked by the absolute value of their score. `markers` collects
# all selected names (duplicates included) for the next cell.
markers=[]
with open(path+"single_clusters_markers_96_top30.txt",'w') as cl_topmarkers:
    for index,row in cl_marker_table.iterrows():
        sortedindex=row.abs().sort_values(ascending=False).index[:30]
        top_names=list(map(str,sortedindex))
        markers.extend(top_names)
        cl_topmarkers.write(str(clusters96[index])+" "+name_dict["C"+str(clusters96[index])]+" "+",".join(top_names)+"\n")

In [None]:
# Collapse the collected marker names to a sorted array of distinct names.
markers=np.unique(markers)

# One marker name per line; the context manager guarantees the file is
# flushed and closed even if the write fails.
with open(path+"single_cluster_markers_96_top30_unique_list.txt", 'w') as fp:
    fp.write("\n".join(markers))

This cell does the same as above, but adds the promoter ids to the list.

In [None]:
# Marker scores of every supercluster against all remaining clusters, this
# time labelled with "<gene>_<promoter id>" column names.
# c1 holds the supercluster's own id plus every cluster merged into it; c2 is
# every other cluster id. All superclusters are handled the same way, so the
# first one also contributes its own id to c1 (it used to be left out there
# and therefore ended up in the comparison set c2).
# NOTE(review): the full hierarchy_df is passed to resolveHierarchy here,
# while the per-subcluster cell at the end of the notebook passes
# hierarchy_df[:-96]; confirm which one is intended.
cl_marker_table=[]
for cluster in clusters96:
    c1=[cluster]
    c1.extend(resolveHierarchy(hierarchy_df,cluster))
    c1_members=set(c1)  # constant-time membership test for the filter below
    c2=[c for c in range(len(clsizes)) if c not in c1_members]
    cl_marker_table.append(marker_scores(clst,c1,c2))

# One row per supercluster (same order as clusters96), one column per promoter.
cl_marker_table = pd.DataFrame(cl_marker_table, columns=geneidswithpromoterids)

# Write one line per supercluster: "<id> <tree label> <comma-separated top 30>",
# ranked by the absolute value of the score.
with open(path+"single_clusters_markers_96_top30_with_promoterids.txt",'w') as cl_topmarkers:
    for index,row in cl_marker_table.iterrows():
        sortedindex=row.abs().sort_values(ascending=False).index[:30]
        cl_topmarkers.write(str(clusters96[index])+" "+name_dict["C"+str(clusters96[index])]+" "+",".join(list(map(str,sortedindex)))+"\n")

This cell makes a ballplot for the above generated marker lists.

In [None]:
import os

# Select Qt's "offscreen" platform plugin so plotting works without a display.
os.environ['QT_QPA_PLATFORM']='offscreen'

# Hand the de-duplicated marker list to makeBallPlot as an open file; the plot
# goes to the notebook's plot directory.
with open(path+"single_cluster_markers_96_top30_unique_list.txt", 'r') as genelist:
    makeBallPlot(
        96,
        genelist,
        'single_cluster_markers_96_top30_expression.pdf',
        plotpath=path+"nb/plots/",
        addExpression=True,
        minweight=0.1,
    )

This cell makes a separate ballplot plus expression matrix of the top 30 markers for each cluster.

In [10]:
import os
# Select Qt's "offscreen" platform plugin so plotting works without a display.
os.environ['QT_QPA_PLATFORM']='offscreen'

# Marker scores of every supercluster against all remaining clusters.
# c1 holds the supercluster's own id plus every cluster merged into it; c2 is
# every other cluster id. All superclusters are handled the same way, so the
# first one also contributes its own id to c1 (it used to be left out there
# and therefore ended up in the comparison set c2).
# NOTE(review): the full hierarchy_df is passed to resolveHierarchy here,
# while the per-subcluster cell at the end of the notebook passes
# hierarchy_df[:-96]; confirm which one is intended.
cl_marker_table=[]
for cluster in clusters96:
    c1=[cluster]
    c1.extend(resolveHierarchy(hierarchy_df,cluster))
    c1_members=set(c1)  # constant-time membership test for the filter below
    c2=[c for c in range(len(clsizes)) if c not in c1_members]
    cl_marker_table.append(marker_scores(clst,c1,c2))
# One row per supercluster (same order as clusters96), one column per gene.
cl_marker_table = pd.DataFrame(cl_marker_table, columns=geneids)

In [None]:
# Superclusters for which an individual marker list and ballplot are produced.
selected_clusters={5,203,11,13,34,96,183,148,441,221}

for index,row in cl_marker_table.iterrows():
    # Rows of cl_marker_table are in the same order as clusters96.
    cluster_id=clusters96[index]
    if cluster_id not in selected_clusters:
        continue
    # Top 30 genes ranked by the absolute value of their marker score.
    sortedindex=row.abs().sort_values(ascending=False).index[:30]
    basename="single_clusters_markers_96_Cluster_"+str(cluster_id)+"top30"
    marker_file=path+basename+".txt"
    # Write the list (one gene per line) and close the file before it is
    # reopened for reading; the context manager also closes it on errors.
    with open(marker_file,'w') as cl_topmarkers:
        for s in sortedindex:
            cl_topmarkers.write(str(s)+"\n")
    with open(marker_file, 'r') as genelist:
        makeBallPlot(96,genelist,basename+".pdf",plotpath=path+"nb/plots/",addExpression=True,minweight=0.1)

/scicore/home/doetsch/GROUP/scigrp/vargasSingleCell3_2males_added/nb/plots/0_single_clusters_markers_96_Cluster_221top30.pdf


QStandardPaths: XDG_RUNTIME_DIR not set, defaulting to '/scratch/saktho00/slurm-job.17995583/runtime-saktho00'


/scicore/home/doetsch/GROUP/scigrp/vargasSingleCell3_2males_added/nb/plots/0_single_clusters_markers_96_Cluster_441top30.pdf
/scicore/home/doetsch/GROUP/scigrp/vargasSingleCell3_2males_added/nb/plots/0_single_clusters_markers_96_Cluster_148top30.pdf


Here we can pull out the specific markers for a list of clusters.
The list of cluster names, which is only valid for a given level of superclustering, here 96, is set in the first line.

In [None]:
# Cluster names to analyse; the numeric cluster id is whatever follows the
# leading "C" (it is parsed with int(name[1:]) in the loop below).
subclusters=["C40","C185","C98","C90"]
def resolveHierarchy(hierarchy_df,c):
    """Collect every cluster that was merged, directly or indirectly, into c.

    hierarchy_df needs the columns "cluster_old" and "cluster_new"; a row
    records that cluster_old was merged into cluster_new.

    Returns a list that starts with the clusters merged directly into c (in
    row order), followed by the recursively resolved descendants of each of
    them in turn. c itself is not part of the result; the list is empty when
    nothing was merged into c.
    """
    merged_into_c = hierarchy_df["cluster_new"] == c
    children = list(hierarchy_df.loc[merged_into_c, "cluster_old"])
    descendants = list(children)
    for child in children:
        descendants += resolveHierarchy(hierarchy_df, child)
    return descendants

# For every requested cluster: compare it (together with everything merged
# into it) against all other clusters and export the 1000 strongest markers.
# The loop variable is deliberately not called `cl`, and the score table not
# `cl_marker_table`, so that the notebook-level objects of those names are not
# overwritten by running this cell.
for cluster_name in subclusters:
    cluster_id = int(cluster_name[1:])
    # The last 96 rows of the hierarchy are excluded from the lookup.
    # NOTE(review): the tree cell uses the last nc-1 = 95 merge steps; confirm
    # that dropping 96 rows (rather than 95) is intended.
    c1 = resolveHierarchy(hierarchy_df[:-96],cluster_id)
    c1.append(cluster_id)
    c1_members = set(c1)  # constant-time membership test for the filter below
    c2=[c for c in range(len(clsizes)) if c not in c1_members]

    # Record which clusters were compared against which.
    with open(path+"nb/plots/"+cluster_name+"_subclusters.txt",'w') as subsfile:
        subsfile.write("\n".join(["C"+str(c) for c in c1]))
        subsfile.write("\nvs\n")
        subsfile.write("\n".join(["C"+str(c) for c in c2]))

    # Single row of scores labelled by gene; rank once by absolute value and
    # reuse the ranking for both output files.
    spec_scores = pd.DataFrame([marker_scores(clst,c1,c2)], columns=geneids).iloc[0]
    top_scores = spec_scores.abs().sort_values(ascending=False).iloc[:1000]

    # Gene names, one per line (with trailing newline).
    with open(path+"nb/plots/spec_markers"+cluster_name+".txt",'w') as cl_topmarkers:
        cl_topmarkers.write("\n".join(list(map(str,top_scores.index)))+"\n")
    # Matching absolute scores, one per line (no trailing newline).
    with open(path+"nb/plots/spec_markerscores"+cluster_name+".txt",'w') as markerscores:
        markerscores.write("\n".join(list(map(str,top_scores))))
    # Plotting of these lists is currently disabled:
    #with open(path+"nb/plots/spec_markers"+cluster_name+".txt", 'r') as genelist:
    #    makeBallPlot(genelist,subclusters,"spec_markers_plot_"+cluster_name+".pdf",plotpath=path+"nb/plots/",addExpression=False)