In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from scipy.special import softmax
from scipy.spatial.distance import euclidean

# MS: TODO - the biomart import below is commented out for now
#import biomart


import umap
import pickle
import scipy.spatial as sp
import seaborn as sns
import itertools

from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import load_iris

from scipy.spatial.distance import pdist,squareform
from scipy.cluster import hierarchy

from numpy.linalg import eig

import glob

In [10]:
from API import Visualizations, Resolver, TFProcessStaticBeta, TFProcessAnnealing, Genome, GenomeFromTFDicts,GenomeFromEnAssociations

# API

# Figure 2 - Tabula Muris and reprogramming

In [11]:
# Inputs for figure 2: reprogramming recipes, gene-name lookup tables and the
# Tabula Muris FACS annotations/metadata. All paths are relative to the
# notebook's working directory.

# Each entry of the `factors` column is a comma-separated list of factor names.
reprog_paths = pd.read_csv("../tabula_muris/direct_reprogramming_recipes.csv")

# Flatten every recipe into one sorted array of distinct factor names.
# NOTE(review): names are split on "," without stripping whitespace - confirm
# the CSV never contains "A, B"-style entries.
reprog_facts = np.unique(
    [factor for recipe in reprog_paths.factors.values for factor in recipe.split(",")]
)

# Lookup tables used elsewhere in the notebook: `inv_mapping` is queried with
# the row labels of the Tabula Muris tables, `mapping` with the Haemopedia
# column labels. Context managers make sure the file handles are closed.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open('../inv_mapping.pickle','rb') as fh:
    inv_mapping = pickle.load(fh)
with open('../mapping.pickle','rb') as fh:
    mapping = pickle.load(fh)

# Per-cell annotations, indexed by cell identifier, plus the FACS metadata table.
annotations = pd.read_csv("../tabula_muris/annotations_facs.csv").set_index('cell')
metadata = pd.read_csv("../tabula_muris/metadata_FACS.csv")

# Row labels (cell types) selected from the cell-type-averaged table for figure 2.
rel_ctypes = [
    'Bergmann glial cell',
    'Brush cell of epithelium proper of large intestine',
    'Kupffer cell',
    'astrocyte',
    'basal cell',
    'basal cell of epidermis',
    'basophil',
    'bladder cell',
    'bladder urothelial cell',
    'brain pericyte',
    'cardiac muscle cell',
    'ciliated columnar cell of tracheobronchial tree',
    'endocardial cell',
    'endothelial cell of hepatic sinusoid',
    'enterocyte of epithelium of large intestine',
    'enteroendocrine cell',
    'epidermal cell',
    'epithelial cell of large intestine',
    'epithelial cell of lung',
    'epithelial cell of proximal tubule',
    'erythrocyte',
    'fibroblast',
    'hepatocyte',
    'B cell', 'T cell',
    'natural killer cell',
    'keratinocyte',
    'kidney collecting duct epithelial cell',
    'large intestine goblet cell',
    'luminal epithelial cell of mammary gland',
    'lung endothelial cell',
    'macrophage',
    'mesenchymal cell',
    'microglial cell',
    'monocyte',
    'myofibroblast cell',
    'neuron',
    'oligodendrocyte',
    'pancreatic A cell',
    'pancreatic D cell',
    'pancreatic PP cell',
    'pancreatic acinar cell',
    'pancreatic ductal cell',
    'pancreatic stellate cell',
    'professional antigen presenting cell',
    'skeletal muscle satellite cell',
    'smooth muscle cell',
    'stromal cell',
    'type B pancreatic cell',
]

FileNotFoundError: [Errno 2] No such file or directory: '../tabula_muris/direct_reprogramming_recipes.csv'

In [None]:
def initialize_ctypes(facs_glob="tabula_muris/FACS/*"):
    """Build the cell-type-averaged TF table and cache it as a CSV file.

    Every file matched by ``facs_glob`` is read, reduced to the selected gene
    rows, transposed to cells x genes and concatenated. Cells without an entry
    in ``annotations`` are dropped, the remaining cells are averaged per
    ``cell_ontology_class`` and the result is written to
    ``../tabula_muris/avgd_by_ctype.csv`` (the file ``load_ctype_patterns``
    reads back).

    The previous version only *defined* the nested helpers and never called
    them, so invoking ``initialize_ctypes()`` did nothing.

    Relies on the notebook-level names ``inv_mapping``, ``annotations`` and
    ``tfs``; in this notebook ``tfs`` is created in the Figure 3 data cell,
    which therefore has to run first.

    Parameters
    ----------
    facs_glob : str
        Glob pattern for the per-tissue FACS count tables.
        NOTE(review): the default has no ``../`` prefix, unlike every other
        path in this notebook - confirm it is the intended location.

    Raises
    ------
    FileNotFoundError
        If ``facs_glob`` matches no files.
    """
    def load_tabmuris_data(fs):
        # One cells x genes frame per input file, restricted to the kept genes.
        per_file = []
        for f in fs:
            data = pd.read_csv(f, index_col=0)
            # Keep a gene when its mapped identifier is listed in `tfs`;
            # "Spi1" and "Lmo2" are always kept.
            kept_genes = [
                gene for gene in data.index
                if (gene in inv_mapping.keys() and inv_mapping[gene] in tfs.index)
                or gene in ("Spi1", "Lmo2")
            ]
            per_file.append(data.loc[kept_genes].T)
        return pd.concat(per_file)

    def gen_ctype_lst(dataset):
        # Boolean mask of annotated cells plus their cell types, in row order.
        annotated = dataset.index.isin(annotations.index)
        ctypes = annotations.loc[dataset.index[annotated], "cell_ontology_class"].tolist()
        return ctypes, annotated.tolist()

    def load_and_generate_avgd_by_celltypes():
        files = glob.glob(facs_glob)
        if not files:
            # pd.concat([]) would otherwise fail with an unhelpful message.
            raise FileNotFoundError(
                "no Tabula Muris FACS files match {!r}".format(facs_glob)
            )
        total_data_tfs = load_tabmuris_data(files)
        ctypes, lst = gen_ctype_lst(total_data_tfs)
        total_data_tfs = total_data_tfs[lst]
        avgd_by_ctype = total_data_tfs.groupby(ctypes).mean()
        avgd_by_ctype.to_csv("../tabula_muris/avgd_by_ctype.csv")

    load_and_generate_avgd_by_celltypes()
        
def load_ctype_patterns():
    """Read the cached cell-type-averaged table, using its first column as index."""
    return pd.read_csv("../tabula_muris/avgd_by_ctype.csv", index_col=0)

def preprocess_ctype_patterns(patterns, extra_tfs=None):
    """Log-transform expression patterns and keep variable plus extra columns.

    A column is kept when, after ``log1p``, its maximum is positive and both
    its mean and its standard deviation exceed ``log1p(3)``. The columns in
    ``extra_tfs`` are kept regardless of those filters.

    The original indexed ``.loc`` with a ``set``: pandas >= 2.0 rejects that
    with a ``TypeError`` and older versions return the columns in arbitrary
    order. The selection is now an ordered, de-duplicated list: variable
    columns in their original order, followed by the remaining extras.

    Parameters
    ----------
    patterns : pandas.DataFrame
        Non-negative expression values (rows x columns to filter).
    extra_tfs : iterable of column labels, optional
        Columns that are always retained. Defaults to the notebook-level
        ``reprog_facts``.

    Returns
    -------
    pandas.DataFrame
        ``log1p(patterns)`` restricted to the selected columns.

    Raises
    ------
    KeyError
        If an extra column is not present in ``patterns``.
    """
    log_patterns = np.log1p(patterns)
    threshold = np.log1p(3)

    is_variable = (
        (log_patterns.max() > 0)
        & (log_patterns.mean() > threshold)
        & (log_patterns.std() > threshold)
    )
    variable_tfs = log_patterns.columns[is_variable]

    if extra_tfs is None:
        extra_tfs = reprog_facts

    # dict.fromkeys de-duplicates while preserving first-seen order.
    rel_tfs = list(dict.fromkeys([*variable_tfs, *extra_tfs]))
    return log_patterns.loc[:, rel_tfs]

In [7]:
# Restrict the cached cell-type averages to the figure-2 cell types, then
# log-transform and filter the columns.
df = preprocess_ctype_patterns(load_ctype_patterns().loc[rel_ctypes])

# Export the row labels and the column labels, one label per line, without
# a header row or an index column.
for _labels, _out_path in (
    (df.index, "../outputs/TM_Celltypes.csv"),
    (df.columns, "../outputs/TM_TFs.csv"),
):
    pd.DataFrame(_labels).to_csv(_out_path, header=False, index=False)


NameError: name 'preprocess_ctype_patterns' is not defined

In [8]:
# Figure 2: pattern heatmap, noisy-input trajectories and one reprogramming
# run (input pulse, PCA trajectory, TF levels, enhancer activity).

# Index of the reprogramming run (within the list returned by
# process_reprogramming_input) that is shown in the detailed panels.
reprog_idx = 2
enhancer_net = TFProcessStaticBeta(df,np.zeros(df.shape[0]))

# Define the plot grid layout
fig = plt.figure(figsize=(12,12))
grid = fig.add_gridspec(4, 3)

# Create the individual subplots
empty = fig.add_subplot(grid[1,-1])
heatmap_ax = fig.add_subplot(grid[:2,:-1])
# Lower-cased row labels; not used in this cell, kept in case other cells read it.
idxs=[x.lower() for x in df.index]
Visualizations.plot_patterns(heatmap_ax,enhancer_net.patterns,aspect=.8)
heatmap_ax.set_yticks([])
heatmap_ax.set_xticks([])
# The axis labels were previously set twice with identical text.
heatmap_ax.set_ylabel("TFs")
heatmap_ax.set_xlabel("enhancers, correspond to terminal cell types")

noised_attractor_ax = fig.add_subplot(grid[0,-1])

pulse_plot = fig.add_subplot(grid[2,0])
pca_ax = fig.add_subplot(grid[3,0 ])
tf_axs = fig.add_subplot(grid[2:,1])
en_axs = fig.add_subplot(grid[2:,2])

# Blank placeholder panel.
empty.set_facecolor('white')
empty.set_axis_off()

# Trajectories from noised inputs, all of them drawn in PCA space.
resolvers=enhancer_net.generate_noised_input_trajectories()
Visualizations.plot_pca_trajs(noised_attractor_ax,enhancer_net.patterns,resolvers,range(len(resolvers)))

# Reprogramming runs; `resolvers` is deliberately rebound here, and only the
# run selected by reprog_idx is drawn.
resolvers=enhancer_net.process_reprogramming_input(reprog_paths,verbose=False)
Visualizations.plot_pca_trajs(pca_ax,enhancer_net.patterns,resolvers,[reprog_idx,])

solver=resolvers[reprog_idx]


def pulse(t):
    """Return 1 strictly inside the middle third of the solver's time span, else 0."""
    t_end = solver.t.max()
    return 1*(t>t_end/3)*(t<2*t_end/3)


pulse_plot.set_facecolor('white')
pulse_plot.plot(solver.t,pulse(solver.t),c='k')
pulse_plot.set_yticks([])
pulse_plot.set_xlabel('time')
pulse_plot.set_ylabel('reprogramming factors')
pulse_plot.set_ylim([-0.1,2])

Visualizations.plot_enhancer_probs(en_axs,solver,aspect=2,cbar=True,label_annealing=False)
en_axs.set_ylabel("enhancer")
en_axs.set_xlabel("time")
en_axs.set_title("enhancer activity")

Visualizations.plot_tf_levels(tf_axs,solver,aspect=2,cbar=True,label_annealing=False)
tf_axs.set_ylabel("transcription factor")
tf_axs.set_xlabel("time")
tf_axs.set_title("TF expression")

fig.tight_layout()

#plt.savefig('../figures/enhancer_selection.png', dpi=600)


NameError: name 'TFProcessStaticBeta' is not defined

# Figure 3

In [None]:
# Figure 3 inputs: Haemopedia mouse RNA-seq, averaged per cell lineage and
# reduced to variable transcription factors.

# Lineage labels as they appear in the samples table (not referenced below).
terminal_lineages = ['B Cell Lineage', 'Basophil Lineage', 'Dendritic Cell Lineage',
                     'Eosinophil Lineage', 'Erythrocyte Lineage', 'Macrophage Lineage',
                     'Mast Cell Lineage', 'Megakaryocyte Lineage','NK Cell Lineage', 'Neutrophil Lineage',
                     'T Cell Lineage']

# Gene identifiers of the TFs (one per line), the genes x samples TPM matrix
# and the per-sample metadata.
tfs = pd.read_csv("../haemopoiesis/mouse_ensemble_tfs_from_lambertetal_isyes.unique.txt",names=["geneId",]).set_index("geneId")
data = pd.read_csv("../haemopoiesis/Haemopedia-Mouse-RNASeq_tpm.txt",sep='\t').set_index("geneId")
samples = pd.read_csv("../haemopoiesis/Haemopedia-Mouse-RNASeq_samples.txt",sep='\t')

# Attach the lineage to every sample and average the expression per lineage.
# numeric_only=True is required because the merged frame still carries string
# columns ("sampleId", "cell Type Description"): pandas >= 2.0 raises a
# TypeError on them instead of silently dropping them as older versions did.
data_reduced_merged_lineage = (
    pd.merge(
        samples.loc[:,["sampleId","cell Type Description","cell_lineage"]],
        data.T.reset_index().rename(columns={'index': 'sampleId'}),
    )
    .groupby("cell_lineage")
    .mean(numeric_only=True)
)
# Drop the two progenitor groups from the lineage table.
data_reduced_merged_lineage = data_reduced_merged_lineage.drop(["Multi Potential Progenitor","Restricted Potential Progenitor"])

data_reduced_merged = data_reduced_merged_lineage.copy()
data_reduced_merged_log = np.log1p(data_reduced_merged)

# get all variable TFs: positive maximum, and mean and standard deviation
# above log1p(3) on the log scale
data_reduced =data_reduced_merged_log.loc[:,np.intersect1d(tfs.index.values,data_reduced_merged_log.columns)]
data_reduced = data_reduced.loc[:,data_reduced.max()>0]
data_reduced = data_reduced.loc[:,data_reduced.mean()>np.log1p(3)]
data_reduced = data_reduced.loc[:,data_reduced.std()>np.log1p(3)]

# Strip the " Lineage" suffix from the row labels.
data_reduced.index=[x.split(" Lineage")[0] for x in data_reduced.index]

# Export row labels and the mapped column labels, one per line.
pd.DataFrame(data_reduced.index).to_csv("../outputs/HEM_Celltypes.csv",header=False, index=False)
pd.DataFrame([mapping[x] for x in data_reduced.columns]).to_csv("../outputs/HEM_TFs.csv",header=False, index=False)

In [None]:



def annealing_simple_model(en_ax,tf_ax,patterns_ax):
    """Run annealing on a small hand-built pattern set and plot the result.

    Six binary patterns over nine bits are built; each pattern has two
    adjacent bits set, and patterns (0, 1), (2, 3) and (4, 5) share one bit.

    Parameters
    ----------
    en_ax : matplotlib Axes
        Receives ``Visualizations.plot_enhancer_probs``.
    tf_ax : matplotlib Axes
        Receives ``Visualizations.plot_tf_levels``.
    patterns_ax : matplotlib Axes
        Receives ``Visualizations.plot_patterns``.
    """
    N = 9 # bits
    K=6 # overall patterns

    # The original drew a random matrix and zeroed it straight away; start
    # from zeros directly.
    df = np.zeros((K,N))
    df[0][:2] = 1
    df[1][1:3] = 1
    df[2][3:5] = 1
    df[3][4:6] = 1
    df[4][6:8] = 1
    df[5][7:9] = 1
    df = pd.DataFrame(df)

    # Second constructor argument: zero for all patterns except pattern 4.
    # NOTE(review): presumably a per-pattern weight/bias - confirm against the API.
    w = np.repeat(0.0,K)
    w[4]=0.05

    enhancer_net = TFProcessAnnealing(df,w,beta_max=8)
    # Bug fix: the original passed `resolver.patterns[0]`, reading `resolver`
    # before it was assigned (UnboundLocalError). The patterns live on the
    # network object.
    # NOTE(review): confirm that the first pattern is the intended start state.
    resolver=enhancer_net.annealing(enhancer_net.patterns[0])

    Visualizations.plot_enhancer_probs(en_ax,resolver)
    Visualizations.plot_tf_levels(tf_ax,resolver)

    Visualizations.plot_patterns(patterns_ax,enhancer_net.patterns)
    

In [None]:
# Figure 3: perform and plot hierarchical clustering, together with the toy
# annealing model and the differentiation trajectories.
# Layout: an 8 x 10 grid on a 19 x 16 inch figure.
fig = plt.figure(figsize=(19,16))
grid = fig.add_gridspec(8, 10)

# Top-left area is left as a blank placeholder panel.
empty = fig.add_subplot(grid[:2,:4])
empty.set_facecolor('white')
empty.set_axis_off()


# Bottom half: dendrogram, heatmap and UMAP panels. Despite the `_fig`
# suffix these names hold Axes objects returned by add_subplot.
dendro_fig = fig.add_subplot(grid[4:,:2])
heatmap_fig = fig.add_subplot(grid[4:,2:5])
umap_fig = fig.add_subplot(grid[4:,5:])

# Upper-right panels for the toy annealing model.
en_ax = fig.add_subplot(grid[2:4,7:])
tf_ax = fig.add_subplot(grid[2:4,4:7])
patterns_ax = fig.add_subplot(grid[:2,4:7])

annealing_simple_model(en_ax,tf_ax,patterns_ax)

# Annealing network built from the filtered Haemopedia table (data_reduced).
# NOTE(review): no random seed is set anywhere in the notebook; if the
# trajectories are stochastic (sigma=0.01 suggests so) the figure will differ
# between runs - confirm.
enhancer_net = TFProcessAnnealing(data_reduced,sigma=0.01,beta_max=50,frac_init=0)
Visualizations.plot_heatmap_dendrogram(enhancer_net,heatmap_fig,dendro_fig)


# Differentiation trajectories, shown in the UMAP panel.
resolvers = enhancer_net.produce_balanced_differentiation_trajectories()
Visualizations.umap_plot_diff_trajs(umap_fig,resolvers)

fig.tight_layout()

#plt.savefig('../figures/heatmap.png', dpi=600)


# Figure 4

In [None]:
def run_all(genome,axs,beta=50):
    """Run four fixed initial states through ``genome`` and plot the TF levels.

    For each scenario a zero vector of length ``genome.N`` is filled with the
    listed values, passed to ``genome.run_x0(x0, beta=beta)`` and the result is
    drawn with ``Visualizations.plot_tf_levels`` on the matching axis.

    Scenarios, in axis order:
      0. "ancestral cell": entries 0-3 set to 1/4
      1. "cell type A":    entries 0-2 set to 1/3
      2. "cell type B":    entries 0, 1 and 3 set to 1/3
      3. "cell type C":    entries 2 and 4 set to 1/2

    Every panel used to be titled "ancestral cell" (copy-paste slip); the
    titles now follow the scenario.
    NOTE(review): the wording "cell type A/B/C" is inferred from the former
    helper names ``run_ctype_A/B/C`` - adjust if the figure uses other labels.

    Parameters
    ----------
    genome : object exposing ``N`` and ``run_x0(x0, beta=...)``
    axs : sequence of at least four matplotlib Axes
    beta : value forwarded unchanged to ``genome.run_x0``
    """
    # (panel title, [(index or slice into x0, value), ...])
    scenarios = [
        ("ancestral cell", [(slice(0, 4), 1/4)]),
        ("cell type A", [(slice(0, 3), 1/3)]),
        ("cell type B", [(slice(0, 2), 1/3), (3, 1/3)]),
        ("cell type C", [(2, 0.5), (4, 0.5)]),
    ]

    for panel, (title, assignments) in enumerate(scenarios):
        x0 = np.zeros(genome.N)
        for position, value in assignments:
            x0[position] = value
        result = genome.run_x0(x0,beta=beta)
        Visualizations.plot_tf_levels(axs[panel],result,aspect=1.5,vmax=0.5)
        axs[panel].set_title(title)
    


In [None]:
# Figure 4: one toy genome, plotted with run_all three times - as built, and
# after each of two rounds of zeroing entries of genome.xi_mat.
enhancer_binding_profiles = {
    "EN0": [0, 1, 2, 3],
    "EN1": [0],
    "EN2": [2],
    "EN3": [2, 4],
}
enhancer_locations = {
    "EN0": [0, 0, 1, 1, 2, 3],
    "EN1": [0],
    "EN2": [2],
    "EN3": [2, 4],
}

genome = GenomeFromEnAssociations(enhancer_binding_profiles, enhancer_locations)

fig = plt.figure(figsize=(12, 12))
grid = fig.add_gridspec(4, 6)

# The first grid column is a blank placeholder spanning all rows.
empty = fig.add_subplot(grid[:, 0])
empty.set_facecolor('white')
empty.set_axis_off()


def _panel_row(row):
    """Create the four axes in grid columns 2-5 of the given grid row."""
    return [fig.add_subplot(grid[row, col]) for col in range(2, 6)]


def _zero_entries(entries):
    """Set genome.xi_mat[i][j] to 0 for every (i, j) pair, in order."""
    for i, j in entries:
        genome.xi_mat[i][j] = 0


# Grid row 1: genome as constructed.
run_all(genome, _panel_row(1))

# Grid row 2: after zeroing two entries (the genome is modified in place).
_zero_entries([(5, 2), (4, 3)])
run_all(genome, _panel_row(2))

# Grid row 3: after zeroing four more entries on top of the previous ones.
_zero_entries([(0, 2), (1, 3), (2, 2), (3, 3)])
run_all(genome, _panel_row(3))

fig.tight_layout()

#plt.savefig('../figures/new_ctype.png', dpi=600)
