# Clustering of all data

Read in all necessary packages:

In [None]:
# Widen the notebook container to 90% of the browser window so that wide
# figures and tables are not squeezed.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pandas.io.parsers import read_csv
import scvelo as scv
import scanpy as sc
import numpy as np
from functools import reduce
from anndata import AnnData, read_h5ad
import singlecellmultiomics.bamProcessing.bamToRNACounts
import loompy

In [None]:
# Logging: both toolkits use the same scale,
# errors (0), warnings (1), info (2), hints (3); show everything.
for toolkit in (scv, sc):
    toolkit.settings.verbosity = 3
sc.logging.print_header()

# Figure defaults.
scv.settings.presenter_view = True  # set max width size for presenter view
#scv.set_figure_params('scvelo')  # for beautified visualization
sc.set_figure_params(color_map='viridis', dpi=100)

# Load the dataset

In [None]:
pathToData = '/Users/m.blotenburg/Documents/Projects/TCHIC/data/'

In [None]:
pathToSaveData = '/Users/m.blotenburg/Documents/Projects/TCHIC/data/rep3_rep4/dataframes/'

#only need to do this once


def _library_loom(mark, day, rep, number):
    """Return the RNA-count loom path of one rep3/rep4 library.

    mark is 'K27me3' or 'K4me3', day the gastruloid day, rep the replicate
    label ('rep3'/'rep4') and number the library number. The library name
    appears twice in the path: as the directory and as the file stem.
    """
    library = 'PZ-MB-TChIC-Gastd' + str(day) + '-' + rep + '-H3' + mark + '-' + str(number)
    return (pathToData
            + 'rep3_rep4/20210615_OUD6104_OUD5886_day345_fixed/'
            + mark.lower()
            + '/processed_transcriptome/' + library
            + '/rna_counts/' + library + '.loom')


#REP2 day 5
day5 = pathToData + 'rep2/20210215_OUD5651_K27me3K4me3_day5_20201023fixed/dataframes/20210215_OUD5651_full_dataset.loom'

#REP2 day 5, 6, 7
p1 = pathToData + 'rep2/all/dataframes/20210404_OUD5651_OUD5771_OUD5772_rep2_day567.loom'

#REP3 day 5 (H3K27me3 libraries 5-6)
p2, p3 = [_library_loom('K27me3', 5, 'rep3', n) for n in (5, 6)]

#REP4 day 3 (H3K27me3 libraries 1-6, H3K4me3 libraries 1-6)
p4, p5, p6, p7, p8, p9 = [_library_loom('K27me3', 3, 'rep4', n) for n in range(1, 7)]
p10, p11, p12, p13, p14, p15 = [_library_loom('K4me3', 3, 'rep4', n) for n in range(1, 7)]

#REP4 day 4 (H3K27me3 libraries 1-6, H3K4me3 libraries 1-2)
p16, p17, p18, p19, p20, p21 = [_library_loom('K27me3', 4, 'rep4', n) for n in range(1, 7)]
p22, p23 = [_library_loom('K4me3', 4, 'rep4', n) for n in (1, 2)]

#REP4 day 5 (H3K27me3 libraries 2-6)
p24, p25, p26, p27, p28 = [_library_loom('K27me3', 5, 'rep4', n) for n in range(2, 7)]


# Groupings used when combining the loom files.
paths_rep3 = [p2, p3]

paths_rep4 = [p4, p5, p6, p7, p8, p9, p10,
              p11, p12, p13, p14, p15, p16, p17, p18, p19, p20,
              p21, p22, p23, p24, p25, p26, p27, p28]

paths_all = [p1] + paths_rep3 + paths_rep4

paths_day5 = [day5, p2, p3, p24, p25, p26, p27, p28]

Here we combine the loom files for all separate libraries into one big loom file ready to import as adata for scanpy. We only need to do this once, afterwards we can just import the .loom directly into adata.

#only need to do this once
# Output file name (appended to pathToSaveData) -> input loom files to merge.
combined_looms = {
    '/20210615_all_OUD5651_OUD5771_OUD5772_OUD6104_OUD5886_rep234_day34567.loom': paths_all,
    '/20210615_OUD6104_OUD5886_rep3_day5.loom': paths_rep3,
    '/20210615_OUD6104_OUD5886_rep4_day345.loom': paths_rep4,
    '/20210615_OUD5651_OUD5771_OUD5772_OUD6104_OUD5886_rep234_day5.loom': paths_day5,
}

# Merge each group of looms into one file, matching rows on "Accession".
for loom_name, loom_inputs in combined_looms.items():
    loompy.combine(loom_inputs, (pathToSaveData + loom_name), key="Accession")

In [None]:
# Load the combined loom file into an AnnData object, using the 'CellID'
# attribute as observation names.
combined_loom = pathToSaveData + '/20210615_all_OUD5651_OUD5771_OUD5772_OUD6104_OUD5886_rep234_day34567.loom'
adata_test = scv.read_loom(combined_loom, obs_names='CellID')
adata_test

In [None]:
# define a file to write results to
results_file = pathToSaveData + '20210630_all_OUD5651_OUD5771_OUD5772_OUD6104_OUD5886_rep234_day34567_Scanpy.h5ad'  # the file that will store the analysis results

# Scanpy

## Preprocessing of dataset 

In [None]:
adata = adata_test

adata

Check the amount of spliced/unspliced reads in the data.

In [None]:
scv.pl.proportions(adata)

In [None]:
#Make names unique
adata.var_names_make_unique()

Show the genes that account for the highest fraction of counts in each single cell, across all cells.

In [None]:
sc.pl.highest_expr_genes(adata, n_top=20)

In [None]:
adata

Basic filtering. You can play around with these cut-offs to see what works best.

In [None]:
# Cell thresholds (applied in this order), followed by the gene threshold.
minCount = 1000
for cell_cutoff in ({'min_counts': minCount}, {'min_genes': 1000}):
    sc.pp.filter_cells(adata, **cell_cutoff)
sc.pp.filter_genes(adata, min_cells=10)

In [None]:
#maxCount = 40000
#sc.pp.filter_cells(adata, max_counts=maxCount)
#sc.pp.filter_cells(adata, max_genes=4000)

In [None]:
adata

Let us assemble some information about mitochondrial genes, which are important for quality control.

Citing from "Simple Single Cell" workflows [(Lun, McCarthy & Marioni, 2017)](https://master.bioconductor.org/packages/release/workflows/html/simpleSingleCell.html#examining-gene-level-metrics):

> High proportions are indicative of poor-quality cells (Islam et al. 2014; Ilicic et al. 2016), possibly because of loss of cytoplasmic RNA from perforated cells. The reasoning is that mitochondria are larger than individual transcript molecules and less likely to escape through tears in the cell membrane.

Note you can also use the function `pp.calculate_qc_metrics` to compute the fraction of mitochondrial genes and additional measures.

In [None]:
# Boolean mask over genes: names starting with 'mt-' are treated as mitochondrial.
mito_genes = adata.var_names.str.startswith('mt-')

# Per-cell count totals. For a sparse X, `.sum(axis=1)` may return a 2-D
# (n_cells, 1) matrix, so flatten to a plain 1-D array before storing it in
# `.obs`; for a dense X this is a no-op.
total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
mito_counts = np.asarray(adata[:, mito_genes].X.sum(axis=1)).ravel()

# for each cell compute fraction of counts in mito genes vs. all genes
adata.obs['percent_mito'] = mito_counts / total_counts
# add the total counts per cell as observations-annotation to adata
adata.obs['n_counts'] = total_counts

A violin plot of the computed quality measures.

In [None]:
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],
             jitter=0.4, multi_panel=True)

Remove cells that have too many mitochondrial genes expressed or too many total counts.

In [None]:
sc.pl.scatter(adata, x='n_counts', y='percent_mito')  
sc.pl.scatter(adata, x='n_counts', y='n_genes')

Actually do the filtering.

In [None]:
# Keep only cells whose mitochondrial fraction is below 0.5.
# `.copy()` makes the result an independent AnnData instead of a view, so the
# in-place steps that follow do not operate on a view.
adata = adata[adata.obs['percent_mito'] < 0.5, :].copy()

throw out any other unwanted genes

In [None]:
# removing genes with annotations we are not interested in
banned = ['NA', 'mt-', "Malat1"] #, 'Rpl', 'Rps', 'Hist1h']
# NOTE(review): matching is by substring, so 'NA' removes every gene whose
# name contains "NA" anywhere - confirm that this is intended.
keptGenes = [geneName for geneName in adata.var.index
             if not any(b in geneName for b in banned)]

# `.copy()` materialises the gene subset as an independent AnnData rather
# than a view, so the in-place filtering that follows does not modify a view.
adata = adata[:, keptGenes].copy()

adata

In [None]:
# Spliced/unspliced proportions of the data as it stands now.
scv.pl.proportions(adata)

# Filter cells (counts, then genes) and finally genes.
minCount = 1000
cell_filters = [('min_counts', minCount), ('min_genes', 100)]
for option, threshold in cell_filters:
    sc.pp.filter_cells(adata, **{option: threshold})
sc.pp.filter_genes(adata, min_cells=2)

Total-count normalize (library-size correct) the data matrix $\mathbf{X}$ to `minCount` (here 1,000) counts per cell, so that counts become comparable among cells.

In [None]:
# Scale every cell to `minCount` total counts (1000, as set in the filtering cell).
sc.pp.normalize_per_cell(adata, counts_per_cell_after=minCount)

Logarithmize the data.

In [None]:
sc.pp.log1p(adata)

Set the `.raw` attribute of AnnData object to the logarithmized raw gene expression for later use in differential testing and visualizations of gene expression. This simply freezes the state of the AnnData object. While many people consider the normalized data matrix as the "relevant data" for visualization and differential testing, some would prefer to store the unnormalized data.

In [None]:
adata.raw = adata

Identify highly-variable genes.

In [None]:
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)

In [None]:
sc.pl.highly_variable_genes(adata)

Actually do the filtering.

In [None]:
#adata = adata[:, adata.var['highly_variable']]

Regress out effects of total counts per cell and the percentage of mitochondrial genes expressed. Scale the data to unit variance.

In [None]:
sc.pp.regress_out(adata, ['n_counts', 'percent_mito'])

Scale each gene to unit variance. Clip values exceeding standard deviation 10. 

In [None]:
sc.pp.scale(adata, max_value=10)

In [None]:
sc.pl.highest_expr_genes(adata, n_top=20)

## Principal component analysis

Reduce the dimensionality of the data by running principal component analysis (PCA), which reveals the main axes of variation and denoises the data.

In [None]:
sc.tl.pca(adata, svd_solver='arpack')

We can make a scatter plot in the PCA coordinates, but we will not use that later on.

In [None]:
# Colour the PCA scatter by every gene whose name contains 'Gata6'.
plotgene = [geneName for geneName in adata.var.index if 'Gata6' in geneName]
sc.pl.pca(adata, color_map='viridis', color=plotgene)

In [None]:
sc.pl.pca(adata, color = ['T', 'Gata6'], color_map = 'viridis')

Let us inspect the contribution of single PCs to the total variance in the data. This gives us information about how many PCs we should consider in order to compute the neighborhood relations of cells, e.g. used in the clustering function  `sc.tl.louvain()` or tSNE `sc.tl.tsne()`. In our experience, often, a rough estimate of the number of PCs does fine.

In [None]:
sc.pl.pca_variance_ratio(adata, log=True)

Save the result.

In [None]:
adata.write(results_file)

## Computing the neighborhood graph

Let us compute the neighborhood graph of cells using the PCA representation of the data matrix. You might simply use default values here. For the sake of reproducing Seurat's results, let's take the following values.

In [None]:
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

## Embedding the neighborhood graph

We advertise embedding the graph in 2 dimensions using UMAP ([McInnes et al., 2018](https://arxiv.org/abs/1802.03426)), see below. It is potentially more faithful to the global connectivity of the manifold than tSNE, i.e., it better preserves trajectories. On some occasions, you might still observe disconnected clusters and similar connectivity violations. They can usually be remedied by running:

```
tl.paga(adata)
pl.paga(adata, plot=False)  # remove `plot=False` if you want to see the coarse-grained graph
tl.umap(adata, init_pos='paga')
```

In [None]:
sc.tl.umap(adata)

In [None]:
sc.tl.tsne(adata)

We can define which characteristics of the data we would like to display. E.g. libraries, histone marks, time points etc. Then we can easily plot those to check for possible batch effects.

In [None]:
# The part of each cell name before ':' is used as the batch (library) label;
# its '-'-separated fields 3, 4 and 5 are taken as day, replicate and mark.
batches = [cell_name.split(':')[0] for cell_name in adata.obs_names]
library_fields = [library.split('-') for library in batches]
days = [fields[3] for fields in library_fields]
reps = [fields[4] for fields in library_fields]
marks = [fields[5] for fields in library_fields]
#marks = [sub.replace('K', 'k') for sub in marks]
#marks = [sub.replace('k427', 'k27') for sub in marks]

# Store the labels as per-cell annotations.
for column, labels in (('batch', batches), ('day', days),
                       ('replicates', reps), ('mark', marks)):
    adata.obs[column] = labels

In [None]:
#they are now added under 'obs'
adata

In [None]:
sc.pl.umap(adata, color=['n_genes','n_counts'], size = 20, color_map = 'viridis')

In [None]:
sc.pl.tsne(adata, color=['n_genes','n_counts'], size = 20, color_map = 'viridis')

In [None]:
sc.pl.umap(adata, color=['day', 'mark','batch'], legend_fontsize=0,size=20)

In [None]:
len(adata.obs['batch'].unique())

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='batch', palette=sns.color_palette("tab20", len(adata.obs['batch'].unique())), ax=ax,size=20)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='day',ax=ax,size=20, frameon=False,palette=sns.color_palette('Set2'), save = '_days.png')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='replicates',ax=ax,size=25,frameon=False,palette=['coral','navy','gold'], alpha=0.7, save = '_replicates.png')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='mark',ax=ax,palette=['skyblue','indigo'],frameon=False,
           size=25, alpha=0.7, save='_mark.png')

In [None]:
sc.pl.umap(adata, color = ['Dppa5a','Sox17', 'Sox7', 'T', 'Onecut2', 'Meox1','Cdh1','Hand1'], frameon=False, size = 50)

In [None]:
sc.pl.umap(adata, color=['day', 'mark'],size=15, frameon=False)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='batch',ax=ax, palette=sns.color_palette("nipy_spectral", len(adata.obs['batch'].unique())),size=20, frameon=False)


As we set the `.raw` attribute of `adata`, the previous plots showed the "raw" (normalized, logarithmized, but uncorrected) gene expression. You can also plot the scaled and corrected gene expression by explicitly stating that you don't want to use `.raw`.

In [None]:
sc.tl.embedding_density(adata, groupby='mark')
sc.pl.embedding_density(adata, groupby='mark')

In [None]:
# One UMAP per histone mark, highlighting only the cells carrying that mark.
mark_palette = ['cornflowerblue', 'indigo']
for mark_name in adata.obs['mark'].unique():
    sc.pl.umap(
        adata,
        color='mark',
        groups=[mark_name],
        size=5,
        palette=mark_palette,
        frameon=False,
    )

# OR reload here!

In [None]:
pathToData = '/Users/m.blotenburg/Documents/Projects/TCHIC/data/rep3_rep4/'

In [None]:
adata = sc.read(pathToData + 'dataframes/20210630_all_OUD5651_OUD5771_OUD5772_OUD6104_OUD5886_rep234_day34567_Scanpy.h5ad')

# Clustering the neighbourhood graph

As Seurat and many others, we recommend the Louvain graph-clustering method (community detection based on optimizing modularity). It has been proposed for single-cell data by [Levine et al. (2015)](https://doi.org/10.1016/j.cell.2015.05.047). Note that Louvain clustering directly clusters the neighborhood graph of cells, which we already computed in the previous section.

In [None]:
sc.tl.leiden(adata, resolution = 0.1, key_added='leiden_general') #3

In [None]:
sc.tl.leiden(adata, resolution = 3)

In [None]:
sc.tl.louvain(adata)

Plot the clusters, which agree quite well with the result of Seurat.

In [None]:
# Long colour list for cluster plots: four seaborn palettes back to back.
cols = []
for palette_name in ("Set3", "husl", "Spectral", "gnuplot2_r"):
    cols.extend(sns.color_palette(palette_name))

In [None]:
#rename louvain or leiden clusters to your new defined cell types
# Cluster 4 and 'NeuralTube' get placeholder names here and are merged below.
adata.rename_categories('leiden_general', ['Pluripotency','Mesoderm','Neural','Endoderm','4','Haemogenic','NeuralTube' ])

# Merge cluster '4' into 'Pluripotency' and 'NeuralTube' into 'Neural'.
# Whole labels are matched (not substrings), and the column is stored as a
# categorical again rather than as a plain list of strings.
merged_labels = {'4': 'Pluripotency', 'NeuralTube': 'Neural'}
adata.obs['leiden_general'] = (
    adata.obs['leiden_general']
    .astype(str)
    .map(lambda label: merged_labels.get(label, label))
    .astype('category')
)




In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='leiden_general',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=20, save = '_mainGroups.png')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=20)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='louvain',ax=ax, palette=sns.color_palette("tab20", len(adata.obs['louvain'].unique())), legend_loc = "on data", size=30)

In [None]:
sc.pl.umap(adata, color=['day','leiden'], palette=cols, #sns.color_palette("tab20", 21), 
           legend_loc = "on data", legend_fontsize=8,  legend_fontweight="normal",
           legend_fontoutline=2, size=10, frameon=False)

# Correlation plots

In [None]:
# Combined '<day>_<leiden cluster>' label per cell. Cast to 'category' like the
# other combined labels in this notebook, since it is used as a grouping key
# (dendrogram / correlation matrix) and should not depend on a plotting call
# to convert it.
adata.obs['dayleiden'] = (
    adata.obs['day'].astype(str) + '_' + adata.obs['leiden'].astype(str)
).astype('category')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='dayleiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False,# legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=20)

In [None]:
sc.tl.dendrogram(adata, groupby="dayleiden", cor_method = 'pearson')


In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'dayleiden', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)

In [None]:
adata.obs['leidenday'] = adata.obs['leiden'].astype(str) + '_' + adata.obs['day'].astype(str)

In [None]:
adata.obs['leidenday'] = adata.obs['leidenday'].astype('category')

In [None]:
sc.tl.dendrogram(adata, groupby="leidenday", cor_method = 'pearson')


In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'leidenday', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)



In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'mark', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)




In [None]:
adata.obs['day_mark'] = adata.obs['day'].astype(str) + '_' + adata.obs['mark'].astype(str)

In [None]:
adata.obs['day_mark'] = adata.obs['day_mark'].astype('category')

In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'day_mark', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)


In [None]:
# Categorical '<leiden cluster>_<mark>' label per cell.
leiden_labels = adata.obs['leiden'].astype(str)
mark_labels = adata.obs['mark'].astype(str)
adata.obs['leiden_mark'] = (leiden_labels + '_' + mark_labels).astype('category')

In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'leiden_mark', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)



In [None]:
# Categorical '<replicate>_<mark>' label per cell.
replicate_labels = adata.obs['replicates'].astype(str)
mark_labels = adata.obs['mark'].astype(str)
adata.obs['rep_mark'] = (replicate_labels + '_' + mark_labels).astype('category')

In [None]:
# Categorical '<day>_<replicate>' label per cell.
day_labels = adata.obs['day'].astype(str)
replicate_labels = adata.obs['replicates'].astype(str)
adata.obs['day_rep'] = (day_labels + '_' + replicate_labels).astype('category')

In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'rep_mark', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)




In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'day_rep', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)




In [None]:
# Categorical '<day>_<replicate>_<mark>' label per cell, built on 'day_rep'.
day_rep_labels = adata.obs['day_rep'].astype(str)
mark_labels = adata.obs['mark'].astype(str)
adata.obs['day_rep_mark'] = (day_rep_labels + '_' + mark_labels).astype('category')

In [None]:
sc.set_figure_params(scanpy=True, fontsize=10)
ax = sc.pl.correlation_matrix(adata, 'day_rep_mark', cmap='viridis',show_correlation_numbers=True)#  dendrogram=True)





In [None]:
adata.obs

# Define cluster identities based on known marker genes

Overview of louvain clusters, cell type annotation, known marker genes and marker genes as defined by Scanpy.
(120hAA)

Louvain Group | Cell type | Known marker genes | Scanpy marker genes
---|---|---|---
1 | Neurons | Sox1, Sox2 | Sox1, Sox2, Sox13, Ncam1, Crabp2, Nkx1-2, Gbx2, Hoxc8, Hoxb9, Hoxc9
2 | Anterior neural / Neurectoderm | Foxg1 | Sox1, Sox21, Tmsb4x, Epha4
3 | Branchial arches | Onecut2, Elavl3, Tmsb4x, Celsr3, Btbd17 | Onecut2, Elavl3, Tmsb4x, Hes6
4 | Early endoderm | Prdm1, Irx1 | Peg3, Prrx1, Prrx2, Pbx1, Meis2
5 | Endoderm | Foxa2, Cdh1, Sox17, Trh | -
6 | Early heart progenitors | Bmp4, Gata4, Gata6, Hand1, Hand2 | Bmp4, Runx1t1, Gata6, Msx1, Nrp1, Peg10
7 | Allantois (?) | Tbx4 | Mdm2, Klf9, Stx3, Cdkn1a, Ccng1
8 | Presomitic mesoderm | Dll1, Hes7, Snai1, Hes5, Dkk1, Lfng | Dll1, Hes7, Rspo3, Dll3, Tbx6
9 | Early somitic | Eya1, Pax3, Six1, Cer1 | Eya1, Pax3, Meox1, Cadm1, Nav1
10 | Somitogenesis wavefront | Aldh1a2, Notch1, Lfng | Aldh1a2, Notch1, Foxc1, Foxc2, Foxp1, Lef1
11 | Tailbud | T, Wnt3a, Cdx2, Fgf8 | T, Foxb1, Fgf17, Rspo3, Hoxc6, Hoxc8, Hes7
12a | PGCs | Sox2, Nanog | Nanog, Mt1, Mt2
12b | Node | Cdh1, Trh, Nodal, Sox17, Epcam | Cdh1, Trh
13 | Haemogenic | Sox7 | Car2, Kdr

In [None]:
# if your color map is being annoying, rerun this:
sc.set_figure_params(dpi=100, color_map = 'viridis')

In [None]:
adata.var.filter(regex=r'Myod', axis=0).index

In [None]:
sc.pl.umap(adata, color=['Pax3','Uncx','Myod1','Myf5','Fgf8','leiden'], use_raw=True, color_map = 'viridis')

In [None]:
sc.pl.umap(adata, color=['Rspo3', 'Mesp2', 'Ripply2', 'Cer1', 'Meox1', 'Pax3','Pax1',
                        'Irx3', 'Nes', 'Pax6', 'Dbx1', 'Uncx'], use_raw=True, color_map = 'viridis')

In [None]:
sc.pl.umap(adata, color=['Gata4', 'Gata6', 'Hand1', 'Hand2', 
                         'Bmp4', 'Wnt3a', 'Mixl1', 'Sox2ot',
                        'Dll1', 'Hes7', 'Snai1', 'Hes5', #presomitic
                         'Dkk1', 'Lfng', 'Aldh1a2', 'Notch1', #wavefront
                        'Eya1', 'Pax3', 'Six1', 'Cer1', #early somitic
                        'Cxcl12'], use_raw=True, color_map = 'viridis')

In [None]:
sc.pl.umap(adata, color=['Eomes', 'Gsc', 'Foxa2', 'Sox17', 
                         'Hand1', "Hand2", "Cdx2", "T",
                        "Kdr", "Sox7", 'Dppa5a', "Dazl",
                        "Lfng", "Aldh1a2", "Dll1", "Meox1"], use_raw=True, color_map = 'viridis')

In [None]:
histone = adata.var.filter(regex=r'Hist', axis=0).index

In [None]:
# Gene lists that are passed as `color=` to sc.pl.umap in the cells below.
# NOTE(review): 'Jarid2' appears in both `polycomb` and `k4` - confirm intended.
polycomb = ['Jarid2', "Eed","Ezh2", "Prc1", "Ring1", "Suz12", "Phc1","Phc2", "Suv39h2"] #Prc2
k4 = ['Setd5', 'Setd1a', 'Setd3', 'Setdb2', 'Setd7', 'Setd1b','Jarid2']

# Known marker genes, grouped by the cell type noted at the end of each line.
# NOTE(review): 'Gata4' is listed twice (heart and other), so it is plotted twice.
markergenes = ['Esrrb', 'Dppa2', 'Dppa4', 'Nanog', 'Dppa5a','Gsc', 'Klf9', 'Klf2', 'Utf1', 'Pim2', 
               'Dppa3', 'Dnmt3b', 'Dazl', #PGC
               'Cdh1', 'Trh', 'Sox17', 'Nodal', 'Epcam', 'Foxa2', 'Lefty1', #endoderm/PGC/node
               'Sox7','Kdr','Car2', #haemogenic
               'Bmp4', 'Gata4', 'Gata6', 'Hand1', 'Hand2', #heart
               'Dll1', 'Hes7', 'Uncx', 'Snai1', 'Hes5', 'Dkk1', #presomitic
               'Eya1', 'Pax3', 'Six1', 'Cer1', #early somite
               'Aldh1a2', 'Notch1', 'Lfng', #wavefront
               'Onecut2', 'Elavl3', 'Crabp1', #neural tube/branchial arches
               'Wnt3a', 'Fgf8', 'T', 'Cdx2', #posterior
               'Sox21', 'Nkx1-2', #neural SOX2 SOX1
               'Prdm1', 'Irx1', 'Eomes', 'Meox1','Tbx4', 'Sox3','H19', 'Xist','Gata2', 'Gata3','Gata4' #other   
              ]

# All genes in adata.var whose name matches the regex 'Hox'.
hox = adata.var.filter(regex=r'Hox', axis=0).index

In [None]:
sc.pl.umap(adata, color=histone, use_raw=True, color_map = 'viridis')

In [None]:
sc.pl.umap(adata, color=markergenes, use_raw=True, color_map = 'viridis')

In [None]:
sc.pl.umap(adata, color=hox, use_raw=True, color_map = 'viridis', size = 50, frameon=False)

In [None]:
sc.pl.umap(adata, color=k4, use_raw=True, color_map = 'viridis')

In [None]:
sc.pl.umap(adata, color=polycomb, use_raw=True, color_map = 'viridis')

Save the result.

In [None]:
adata.write(results_file)

You can also make a heatmap. ! this takes a bit of time to compute.

In [None]:
#sc.pl.heatmap(adata, var_names = adata.var_names[adata.var['highly_variable'] & (np.log(adata.var['means'])>-1)])

# Finding marker genes

Let us compute a ranking for the highly differential genes in each cluster. For this, by default, the `.raw` attribute of AnnData is used in case it has been initialized before. The simplest and fastest method to do so is the t-test.

In [None]:
sc.tl.rank_genes_groups(adata, 'leiden', method='t-test')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

In [None]:
sc.settings.verbosity = 2  # reduce the verbosity

The result of a [Wilcoxon rank-sum (Mann-Whitney-U)](https://de.wikipedia.org/wiki/Wilcoxon-Mann-Whitney-Test) test is very similar. We recommend using the latter in publications, see e.g., [Soneson & Robinson (2018)](https://doi.org/10.1038/nmeth.4612). You might also consider much more powerful differential testing packages like MAST, limma, DESeq2 and, for python, the recent diffxpy.

In [None]:
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

Save the result.

In [None]:
adata.write(results_file)

As an alternative, let us rank genes using logistic regression. For instance, this has been suggested by [Ntranos et al. (2018)](https://doi.org/10.1101/258566). The essential difference is that here, we use a multi-variate approach whereas conventional differential tests are uni-variate. [Clark et al. (2014)](https://doi.org/10.1186/1471-2105-15-79) has more details.

In [None]:
sc.tl.rank_genes_groups(adata, 'leiden', method='logreg')
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)

Reload the object that has been saved with the Wilcoxon Rank-Sum test result.

In [None]:
adata = sc.read(results_file)

Show the 20 top ranked genes per cluster (0, 1, ...) in a dataframe.
Columns are Leiden clusters.

In [None]:
pd.set_option('display.max_columns', None)
pd.DataFrame(adata.uns['rank_genes_groups']['names']).head(20)

In [None]:
#pd.DataFrame(adata.uns['rank_genes_groups']['names']).to_csv(pathToData + 'markergenes_leiden.csv')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='leiden', ax=ax, palette = sns.color_palette("hls",25),legend_loc='on data')

We can plot the top x differentially expressed genes for each cluster.

In [None]:
# Plot the top-ranked genes of one cluster on the UMAP.
cluster = '49'
number_genes = 25

ranked_names = pd.DataFrame(adata.uns['rank_genes_groups']['names'])
genes = ranked_names[cluster].head(number_genes)
sc.pl.umap(adata, size=50, frameon=False, color_map='viridis', color=genes)

In [None]:
adata.var.filter(regex=r'Shh', axis=0).index

In [None]:
sc.pl.umap(adata, color='Shh', frameon=False, size = 80)

Get a table with the scores and groups.

In [None]:
result = adata.uns['rank_genes_groups']
groups = result['names'].dtype.names
# Two columns per group: '<group>_n' (gene names) and '<group>_p' (p-values).
score_columns = {}
for group in groups:
    for key in ['names', 'pvals']:
        score_columns[group + '_' + key[:1]] = result[key][group]
pd.DataFrame(score_columns).head(5)

In [None]:
# Full table with a '<group>_n' (names) and '<group>_p' (p-values) column per group.
column_sources = [(group, key) for group in groups for key in ['names', 'pvals']]
rank_genes_groups = pd.DataFrame({
    group + '_' + key[:1]: result[key][group]
    for group, key in column_sources
})

rank_genes_groups.head()

In [None]:
#rank_genes_groups.to_csv(pathToData + 'rank_genes_groups_leidenclusters.csv')

Compare to a single cluster. 

In [None]:
sc.tl.rank_genes_groups(adata, 'leiden', groups=['0'], reference='1', method='wilcoxon')
sc.pl.rank_genes_groups(adata, groups=['0'], n_genes=20)

# Cluster annotation

Split giant umap into different groups:

In [None]:
# Separate main groups and annotate
# Each lineage is stored as an independent copy (not a view of `adata`), so
# that re-clustering and renaming a subset below works on its own data instead
# of implicitly converting a view.

adata_pluri = adata[adata.obs['leiden_general'].isin(['Pluripotency'])].copy()

adata_endo = adata[adata.obs['leiden_general'].isin(['Endoderm'])].copy()

adata_meso = adata[adata.obs['leiden_general'].isin(['Mesoderm'])].copy()

adata_neuro = adata[adata.obs['leiden_general'].isin(['Neural'])].copy()

adata_haemo = adata[adata.obs['leiden_general'].isin(['Haemogenic'])].copy()


## Check umap

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='leiden_general',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=20)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=20)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='louvain',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=20)

In [None]:
sc.pl.umap(adata, color=['day','leiden'], palette=cols, #sns.color_palette("tab20", 21), 
           legend_loc = "on data", legend_fontsize=8,  legend_fontweight="normal",
           legend_fontoutline=2, size=10, frameon=False)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=20)



## Haemogenic

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_haemo, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=80)



In [None]:
sc.tl.leiden(adata_haemo, resolution = 0.5, key_added='leiden_new') #3

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_haemo, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=100)

In [None]:
# Label the haemogenic sub-clusters 'h_0' ... 'h_6', then fold clusters
# 5 and 6 into cluster 0.
adata_haemo.rename_categories('leiden_new', [f'h_{i}' for i in range(7)])

haemo_merges = {'h_5': 'h_0', 'h_6': 'h_0'}
adata_haemo.obs['leiden_new'] = [
    haemo_merges.get(label, label) for label in adata_haemo.obs['leiden_new']
]



In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_haemo, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=100)

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_haemo, color='day',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, #legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=100)

## Endoderm

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_endo, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=80)

In [None]:
sc.tl.leiden(adata_endo, resolution = 0.5, key_added='leiden_new') #3

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_endo, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=100)

In [None]:
# Label the endoderm sub-clusters 'e_0' ... 'e_6'; no clusters are merged here.
endo_names = [f'e_{i}' for i in range(7)]
adata_endo.rename_categories('leiden_new', endo_names)



In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_endo, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=100)

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_endo, color='day',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, #legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=100)



In [None]:
## Mesoderm

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_meso, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50)

In [None]:
sc.tl.leiden(adata_meso, resolution = 1, key_added='leiden_new') #3

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_meso, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50)

In [None]:
# Label the mesoderm sub-clusters 'm_0' ... 'm_16', fold the clusters that
# belong together, then close the gaps left in the numbering.
adata_meso.rename_categories('leiden_new', [f'm_{i}' for i in range(17)])

# Step 1: clusters absorbed into another cluster.
meso_merges = {'m_14': 'm_0', 'm_11': 'm_4', 'm_16': 'm_12'}
# Step 2: shift the surviving high-numbered clusters down (applied once, after
# step 1, so the former 'm_16' ends up as 'm_11' together with 'm_12').
meso_renumber = {'m_12': 'm_11', 'm_13': 'm_12', 'm_15': 'm_13'}

meso_labels = []
for label in adata_meso.obs['leiden_new']:
    label = meso_merges.get(label, label)
    meso_labels.append(meso_renumber.get(label, label))
adata_meso.obs['leiden_new'] = meso_labels


In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_meso, color='leiden_new',ax=ax, palette=sns.color_palette("Paired", len(adata_meso.obs['leiden_new'].unique())),
           legend_fontsize=10,
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50)

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_meso, color='day',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, #legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=100)

## Neural / ectoderm

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_neuro, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50)

In [None]:
sc.tl.leiden(adata_neuro, resolution = 0.8, key_added='leiden_new') #3

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_neuro, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50)

In [None]:
# Merge neural sub-clusters: 9 and 10 join 8; 4 and 5 join 2.
# Labels are matched as whole strings rather than substrings, so a rule for a
# single digit can never rewrite part of a multi-digit cluster id
# (with substring replacement '14' would silently become '12', '19' -> '18').
neuro_merges = {'10': '8', '9': '8', '4': '2', '5': '2'}
adata_neuro.obs['leiden_new'] = [
    neuro_merges.get(label, label) for label in adata_neuro.obs['leiden_new']
]

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_neuro, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50)

In [None]:
#rename louvain or leiden clusters to your new defined cell types
adata_neuro.rename_categories('leiden_new', ['n_0','n_1','n_2','n_3','n_4','n_5','n_6','n_7','n_8'
                                    ])

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_neuro, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50,alpha=0.6)

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_neuro, color='day',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, #legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50, alpha=0.8)

In [None]:
## Pluripotency

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_pluri, color='leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=50, alpha=0.7)


In [None]:
sc.tl.leiden(adata_pluri, resolution = 1.2, key_added='leiden_new') #3

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_pluri, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=40, alpha=0.7)


In [None]:
# Merge pluripotent sub-clusters: 12 joins 2; 8 joins 4; 11 and 13 join 10;
# 14 joins 0.
# Labels are matched as whole strings rather than substrings, so e.g. a
# cluster '18' cannot be turned into '14' by the rule for '8' and then into
# '0' by the rule for '14'.
pluri_merges = {'12': '2', '8': '4', '13': '10', '11': '10', '14': '0'}
adata_pluri.obs['leiden_new'] = [
    pluri_merges.get(label, label) for label in adata_pluri.obs['leiden_new']
]

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_pluri, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=40, alpha=0.7)



In [None]:
#rename louvain or leiden clusters to your new defined cell types
adata_pluri.rename_categories('leiden_new', ['p_0','p_1','p_2','p_3','p_4','p_5','p_6','p_7','p_8','p_9','p_10','p_11'
                                    ])

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_pluri, color='leiden_new',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=40, alpha=0.7)



In [None]:
fig, ax = plt.subplots(figsize=(6,6))
sc.pl.umap(adata_pluri, color='day',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, #legend_loc = "on data", legend_fontweight="normal",
           legend_fontoutline=2, size=30, alpha=0.8)

## Back to full dataset

In [None]:
# Stack the per-lineage 'leiden_new' labels into a single Series indexed by cell.
lineage_subsets = [adata_pluri, adata_meso, adata_neuro, adata_endo, adata_haemo]
new_clusters = pd.concat([subset.obs['leiden_new'] for subset in lineage_subsets])
new_clusters

In [None]:
len(adata_pluri.obs), len(adata_meso.obs), len(adata_neuro.obs), len(adata_haemo.obs), len(adata_endo.obs)

In [None]:
len(adata_pluri.obs) + len(adata_meso.obs) + len(adata_neuro.obs) + len(adata_haemo.obs) + len(adata_endo.obs)

In [None]:
len(adata)

In [None]:
adata.obs['new_leiden'] = new_clusters

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='new_leiden',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal", legend_fontsize=10,
           legend_fontoutline=2, size=40, alpha=0.7)

In [None]:
# Translate the per-lineage sub-cluster ids into cell-type names.
# Every id is looked up as a whole label in a single pass, so 'm_1' cannot
# clobber 'm_10'-'m_13' and 'p_1' cannot clobber 'p_10'/'p_11' regardless of
# the order of the entries. Ids without an entry (currently 'p_10' and 'p_11')
# are kept unchanged.
cluster_to_celltype = {
    # endoderm
    'e_0': 'Visceral Endoderm',
    'e_1': 'ExE Endoderm',
    'e_2': 'Foregut',
    'e_3': 'Midgut',
    'e_4': 'Hindgut',
    'e_5': 'Gut',
    'e_6': 'Parietal Endoderm',
    # Neural
    'n_0': 'NMPs',
    'n_1': 'Neural Progenitors',
    'n_2': 'Neural Progenitors',
    'n_3': 'Floor Plate Neural Tube',
    'n_4': 'Floor Plate Neural Tube',
    'n_5': 'Early Motor Neurons',
    'n_6': 'Roof Plate Neural Tube',
    'n_7': 'Late Motor Neurons',
    'n_8': 'Neural Crest',
    # Haemogenic
    'h_0': 'Haematoendothelial Progenitors',
    'h_1': 'Endothelium',
    'h_2': 'Endothelium',
    'h_3': 'Erythrocytes',
    'h_4': 'Megakaryocytes',
    # Mesoderm
    'm_0': '(pre)Somitic/Wavefront',
    'm_1': 'LP/Intermediate Mesoderm',
    'm_2': 'Pharyngeal Mesoderm/SHF',
    'm_3': 'LP/Intermediate Mesoderm',
    'm_4': 'Dermomyotome',
    'm_5': 'Pharyngeal Mesoderm/SHF',
    'm_6': '(early) Somite',
    'm_7': 'Nascent Mesoderm',
    'm_8': 'First Heart Field',
    'm_9': 'Cardiac Mesoderm',
    'm_10': 'Sclerotome',
    'm_11': 'LP/Intermediate Mesoderm',
    'm_12': 'Myotome',
    'm_13': 'Cardiomyocytes',
    # Pluripotency ('p_10' and 'p_11' intentionally have no name yet)
    'p_0': 'Early PGCs',
    'p_1': 'Epiblast',
    'p_2': 'Caudal Mesoderm',
    'p_3': 'Mature PGCs',
    'p_4': 'Caudal Epiblast',
    'p_5': 'Caudal Epiblast',
    'p_6': 'Primitive Streak',
    'p_7': 'Anterior Primitive Streak',
    'p_8': 'Anterior Primitive Streak',
    'p_9': 'Reprogramming PGCs',
}

# Assigning the Series aligns it to `adata.obs` by cell name.
adata.obs['leiden_annot'] = new_clusters
if adata.obs['leiden_annot'].isna().any():
    # A cell missing from every lineage subset has no sub-cluster id to
    # translate; fail with a clear message instead of an AttributeError.
    raise ValueError("some cells in `adata` have no entry in `new_clusters`")

adata.obs['leiden_annot'] = [
    cluster_to_celltype.get(label, label) for label in adata.obs['leiden_annot']
]

In [None]:
cols2 = sns.color_palette("pastel")+sns.color_palette("Set2_r") #+ sns.color_palette("Set3")+sns.color_palette("tab10")+sns.color_palette("pastel")# + sns.color_palette("gnuplot_r")
cols = sns.color_palette("Set3")+sns.color_palette("Set2") + sns.color_palette("pastel")+sns.color_palette("tab10")+sns.color_palette("pastel")# + sns.color_palette("gnuplot_r")


In [None]:
fig, ax = plt.subplots(figsize=(13,13))
sc.pl.umap(adata, color='leiden_annot',ax=ax, palette=cols, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal", legend_fontsize=10, save = 'umap_all.png',
           legend_fontoutline=2, size=60, alpha=1)


In [None]:
# Collapse the detailed cell-type names in 'leiden_annot' into broader groups.
# Names are matched as whole labels in a single pass; names without an entry
# (e.g. 'NMPs', 'Epiblast', 'Caudal Mesoderm') are kept unchanged.
celltype_to_merged = {
    # endoderm
    'Visceral Endoderm': 'Endoderm',
    'ExE Endoderm': 'Endoderm',
    'Foregut': 'Endoderm',
    'Midgut': 'Endoderm',
    'Hindgut': 'Endoderm',
    'Gut': 'Endoderm',
    'Parietal Endoderm': 'Endoderm',
    # Neural
    'Floor Plate Neural Tube': 'Neural Tube',
    'Roof Plate Neural Tube': 'Neural Tube',
    'Early Motor Neurons': 'Motor Neurons',
    'Late Motor Neurons': 'Motor Neurons',
    # Haemogenic
    'Haematoendothelial Progenitors': 'Haemogenic',
    'Endothelium': 'Haemogenic',
    'Erythrocytes': 'Haemogenic',
    'Megakaryocytes': 'Haemogenic',
    # Mesoderm
    'LP/Intermediate Mesoderm': 'Mesoderm',
    'Pharyngeal Mesoderm/SHF': 'Mesoderm',
    'Nascent Mesoderm': 'Mesoderm',
    'Dermomyotome': 'Somite',
    '(early) Somite': 'Somite',
    'Sclerotome': 'Somite',
    'Myotome': 'Somite',
    'First Heart Field': 'Heart',
    'Cardiac Mesoderm': 'Heart',
    'Cardiomyocytes': 'Heart',
    # Pluripotency
    'Early PGCs': 'PGCs',
    'Mature PGCs': 'PGCs',
    'Reprogramming PGCs': 'PGCs',
    'Anterior Primitive Streak': 'Primitive Streak',
    'p_10': 'Epiblast',
    'p_11': 'Epiblast',
}

adata.obs['leiden_merge'] = [
    celltype_to_merged.get(label, label) for label in adata.obs['leiden_annot']
]

In [None]:
fig, ax = plt.subplots(figsize=(13,13))
sc.pl.umap(adata, color='leiden_merge',ax=ax, palette=cols2, #sns.color_palette("hls", len(adata.obs['leiden'].unique())),
           frameon=False, legend_loc = "on data", legend_fontweight="normal", legend_fontsize=12,
           legend_fontoutline=2, size=60, alpha=1, save = 'umap_merge.png')



# Various ways of displaying interesting (marker) genes

Reload the saved object, which contains the differential expression results computed by comparing each group against the rest.

In [None]:
# NOTE(review): this replaces the in-memory `adata` with the copy stored in
# `results_file`. Columns added to `adata.obs` earlier in this notebook
# (e.g. 'leiden_annot', 'leiden_merge') are only available afterwards if that
# file was written after they were created -- confirm before running the
# cells below that group by 'leiden_annot'.
adata = sc.read(results_file)

Let us also define a list of marker genes for later reference.

In [None]:
# Gene sets used by the expression plots below.
# NOTE: 'Jarid2' appears in both `polycomb` and `k4`.
polycomb = ['Jarid2', "Eed","Ezh2", "Prc1", "Ring1", "Suz12", "Phc1","Phc2", "Suv39h2"] #Prc2
k4 = ['Setd5', 'Setd1a', 'Setd3', 'Setdb2', 'Setd7', 'Setd1b','Jarid2']

# NOTE: 'Gata4' is listed twice (under "heart" and under "other"), so it will
# show up twice in any plot that takes this list as-is.
marker_genes = ['Esrrb', 'Dppa2', 'Dppa4', 'Nanog', 'Dppa5a','Gsc', 'Klf9', 'Klf2', 'Utf1', 'Pim2', 
               'Dppa3', 'Dnmt3b', 'Dazl', #PGC
               'Cdh1', 'Trh', 'Sox17', 'Nodal', 'Epcam', 'Foxa2', 'Lefty1', #endoderm/PGC/node
               'Sox7','Kdr','Car2', #haemogenic
               'Bmp4', 'Gata4', 'Gata6', 'Hand1', 'Hand2', #heart
               'Dll1', 'Hes7', 'Uncx', 'Snai1', 'Hes5', 'Dkk1', #presomitic
               'Eya1', 'Pax3', 'Six1', 'Cer1', #early somite
               'Aldh1a2', 'Notch1', 'Lfng', #wavefront
               'Onecut2', 'Elavl3', 'Crabp1', #neural tube/branchial arches
               'Wnt3a', 'Fgf8', 'T', 'Cdx2', #posterior
               'Sox21', 'Nkx1-2', #neural SOX2 SOX1
               'Prdm1', 'Irx1', 'Eomes', 'Meox1','Tbx4', 'Sox3','H19', 'Xist','Gata2', 'Gata3','Gata4' #other   
              ]

# All entries of `adata.var` whose index label matches the regex 'Hox'.
hox = adata.var.filter(regex=r'Hox', axis=0).index

If we want a more detailed view for a certain group, use `sc.pl.rank_genes_groups_violin`.

In [None]:
sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)

In [None]:
sc.pl.rank_genes_groups_violin(adata, groups='0', n_genes=8)

If you want to compare a certain gene across groups, use the following.

In [None]:
sc.pl.violin(adata, marker_genes, groupby='leiden', size = 1)

In [None]:
sc.pl.umap(adata, color = ['Suv39h1', 'Suv39h2', 'Ehmt1', 'Ehmt2', 'Prmt1', 'Phf21a', 'Otx2'], use_raw = True)

In [None]:
#sc.pl.umap(adata, color = (['celltype'] + marker_genes), use_raw = True)
sc.pl.umap(adata, color = hox, use_raw = True)

Now that we annotated the cell types, let us visualize the marker genes.

In [None]:
ax = sc.pl.dotplot(adata, marker_genes, groupby='leiden', title = '') # ,save='dotplot_markergenes.png')

In [None]:
ax = sc.pl.dotplot(adata, hox, groupby='leiden', title = '') # ,save='dotplot_markergenes.png')

In [None]:
genes = [ 'Eya1','Prrx2',                                               # early somite            - 'Six1'
         'Dll1','Hes7','Lfng','Snai1','Tbx6',                           # wavefront               - 'Notch1', 'Cer1', 'Aldh1a2'
         'Gata6','Bmp4','Tbx20', 'Meox1',                               # cardiac mesoderm
         'Ttn','Tnnt2',                                                 # cardiomyocytes          - 'Tnni1','Myh7','Ryr2','Slc8a1',
         'Six1','Pax3','Uncx',                                          # dermomyotome            - 'Pax9','Pax7','Sox9','Eya1',
         'Neurod4','Rassf4','Hes6',                                     # early motor neurons     - 'Btg2'
         'Kdr','Cdh5',                                                  # endothelium             - 'Mest','Vim','Nrp2',  
         'Ttr','Apob',                                                  # exe endoderm            - 'Nrk','Reln','Apoe','Mttp','Lgmn','Dpp4', 
         'Hand1','Hand2',                                               # FHF                     - 'Plagl1','Efnb1', 
         'Sox21','Pax7',                                                # floor plate             - 'Tns1','Fat4','Adgrv1','Lrig1','Ndnf',
         'Gata1','Hbb-bh1','Runx1',                                     # HSCs Erys               - 'Epb41', 'Vav1'
         'Onecut2','Robo3',                                             # late MN                 - 'Nefl','Slc32a1'         
         'Myod1','Myog',                                                # myotome                 - 'Myo16','Pax7',
         'Pax2','Sox10','Foxd3',                                        # neural crest?           - 'Tfap2a','Tspan18','Cldn6',
         'Sox21','Sox3',                                                # neural progenitors      - 'Tmsb4x','Epha4','Crabp2','Cenpa','Foxg1',
         'Nanog','Dppa5a','Esrrb','Dazl','Cdh1','Epcam',                # PGCs                    - 'Dppa2','Dppa4'
         'Epas1','Npl','Sox7',                                          # Parietal endoderm       - 'Lama1','Pdgfra','Car2'
         'Cxcl12','Sned1','Six2',                                       # SHF                     - 'Meis1','Ebf2','Nr2f2','Col26a1','Ptn'
         'Msx1','Wnt1','Lmx1a',                                         # Roof plate              - 'Zic1','Zic2','Npr3','Msx2','Adamts3',
         'T','Cdx2',                                                    # tailbud                 - 'Wnt5a'
         'Trh','Lhx1','Gsc','Foxa2','Sox17','Gata4',                    # visceral endoderm       - 'Fzd5', 'Gpc4'
         ]

In [None]:
order = ['Mature PGCs', 'Reprogramming PGCs', 'Early PGCs',
         'Epiblast','p_10','p_11',
         'Caudal Epiblast',
         'Anterior Primitive Streak','Primitive Streak',
         'Caudal Mesoderm', 'NMPs',
         '(pre)Somitic/Wavefront','(early) Somite', 'Dermomyotome', 'Sclerotome', 'Myotome',
         'Nascent Mesoderm','LP/Intermediate Mesoderm','Pharyngeal Mesoderm/SHF','Cardiac Mesoderm','First Heart Field','Cardiomyocytes',
         'Haematoendothelial Progenitors','Endothelium','Erythrocytes', 'Megakaryocytes',
         'Parietal Endoderm', 'ExE Endoderm', 'Visceral Endoderm', 'Gut', 'Foregut', 'Midgut', 'Hindgut',        
         'Neural Progenitors', 'Roof Plate Neural Tube','Floor Plate Neural Tube','Early Motor Neurons','Late Motor Neurons','Neural Crest' ]

In [None]:
genes = [
    'Nanog','Dppa5a','Esrrb',                # PGCs
    'Tex19.1','Dazl','Dppa3','Prdm14',       # Mature PGCs
    'Dnmt3b',                                # Reprogramming PGCs
    'Perp','Bhlhe41',                        # Epiblast
    'Pim2','Gbx2','Fst',                     # Caudal epiblast
    'Mixl1','Eomes','Gsc','Tdgf1','Fgf5',    # Primitive streak
    'Wnt3a',                                 # Caudal mesoderm
    'T','Cdx2','Fgf17',                      # NMP
    'Hes7','Lfng','Dll1','Aldh1a2',          # Presomitic/Wavefront                - 'Notch1', 'Tbx6','Dll1','Snai1','Tbx6','Aldh1a2',
    'Eya1','Prrx2',                          # (early) somite
    'Pax3','Uncx',                           # Dermomyotome
    'Pax9','Pax1',                           # Sclerotome
    'Pax7','Myod1','Myog',                   # Myotome
    'Mesp1','Mixl1',                        # Nascent mesoderm 
    'Pax8','Pax2',                           # LP/Intermediate mesoderm
    'Meox1','Sned1','Six2','Cxcl12','Gata6', # Pharyngeal mesoderm
    'Hand1','Hand2',                         # Cardiac mesoderm
    'Bmp4','Tbx20',                          # First Heart Field
    'Myh7','Ryr2','Tnnt2',                   # Cardiomyocytes                      - Ttn,
    'Etv2','Fev',                            # Hematoendothelial progenitors
    'Cdh5','Igf1','Sox7',                    # Endothelium
    'Hbb-bh1','Hba-a1','Alas2',              # Erythrocytes
    'Plek','Mpl','P2rx1',                    # Megakaryocytes
    'Sox17','Foxa2',                         # Endoderm
    'Epas1','Npl',                           # Parietal endoderm
    'Apob','Ttr',                            # ExE Endoderm
    'Lhx1','Trh',                            # Visceral endoderm
    'Rab15',                                 # Gut
    'Bmper','Cfc1',                          # Foregut
    'Prss12','Pga5',                         # Midgut
    'Irs4',                                  # Hindgut
    'Sox3',                                  # Neural progenitors
    'Wnt1','Lmx1a','Msx1','Zic1',            # Roof plate NT
    'Sox21',                                 # Floor plate NT
    'Neurod4','Rassf4',                      # Early MN
    'Onecut2','Robo3','Pax2',                # Late MN
    'Sox10','Foxd3'                          # Neural crest
    ]

In [None]:
ax = sc.pl.dotplot(adata, genes, groupby='leiden_annot', cmap="viridis",categories_order = order,
                   log=True  ,save=  'dotplot_markergenes.png')

There is also a very compact violin plot.

In [None]:
ax = sc.pl.stacked_violin(adata, genes, rotation=45,
                          groupby='leiden_annot', cmap="viridis",categories_order = order,
                   log=False  ,save=  'violinplot_markergenes.png')

And a tracksplot.

In [None]:
sc.pl.rank_genes_groups_tracksplot(adata, n_genes=6, key='rank_genes_groups')

## Finally: we can display our new annotated cell names onto the umap

Actually mark the cell types.

In [None]:
adata.obs['celltype'] = adata.obs['leiden_annot']
adata.obs

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='celltype', palette=cols, legend_fontoutline=2,
          # sns.color_palette('tab20',len(adata.obs['celltype'].unique())),
           title='Leiden clustering', ax=ax, frameon=False,legend_loc="on data", size = 20)

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='leiden_merge',ax=ax, palette=cols2, 
           legend_fontweight="normal",legend_fontoutline=2,
           legend_loc = "on data", title='Cell types - merged', 
           legend_fontsize=8, frameon=False, size = 40)#, save = '_rep2_OUD5651.png')

In [None]:
fig, ax = plt.subplots(figsize=(8,8))
sc.pl.umap(adata, color='day',ax=ax, 
           legend_fontweight="normal",legend_fontoutline=2,
            title='Days', 
           legend_fontsize=8, frameon=False, size = 20)#, save = '_rep2_OUD5651.png')

# Exporting

Before we save the final dataframe, we want to add some useful observations - e.g. actual cell names

In [None]:
# Attach per-cell names derived from the barcode table.
# The table is read with its first column as index and collapsed to a Series;
# `DataFrame.squeeze("columns")` replaces the `squeeze=True` argument that was
# removed from `read_csv` in pandas 2.0.
barcode_table = pd.read_csv(
    "/Users/m.blotenburg/Documents/Projects/Helena/data/VASAbarcodes.csv",
    sep='\t', header=None, index_col=0,
).squeeze("columns")
# Invert the table so that the file's second column becomes the lookup key.
barcodes = {y: x for x, y in barcode_table.to_dict().items()}

# Keep the first seven '-'-separated fields of each observation name.
adata.obs['cellnames'] = ['-'.join(ob.split('-')[0:7]) for ob in adata.obs.index]
# The part after the first ':' is looked up in the barcode table; an unknown
# value raises KeyError rather than silently producing NaN.
adata.obs['bc'] = [ob.split(':')[1] for ob in adata.obs['cellnames']]
adata.obs['bc'] = adata.obs['bc'].map(lambda x: barcodes[x])
adata.obs['cellname'] = adata.obs['batch'].astype(str) + '_' + adata.obs['bc'].astype(str)
adata.obs.index = adata.obs['cellname']
# `Index.rename` returns a new index, so the result has to be assigned back;
# otherwise the index keeps the name 'cellname', clashing with the column.
adata.obs.index = adata.obs.index.rename('index')

adata.obs.head(2)

We can save the cells together with their Louvain or Leiden cluster labels in a table, which we can export to CSV and use for e.g. BAM splitting.

In [None]:
adata.obs.index = adata.obs.index.rename('index')
adata.obs.head(2)

In [None]:
adata.obs.head(2)

In [None]:
cells_clusters = pd.DataFrame(adata.obs['leiden_merge'])
cells_clusters.head(2)
#cells_clusters.to_csv(pathToData + 'clusters.csv')

In [None]:
cells_clusters.to_csv(pathToData + 'dataframes/20210701_rep234_day34567_cellsClusters_celltypesMerged.csv', sep = '\t')

During the course of this analysis, the AnnData object accumulated the following annotations.

In [None]:
adata

In [None]:
pathToData

In [None]:
results_file

In [None]:
adata.write(results_file, compression='gzip')  # `compression='gzip'` saves disk space, but slows down writing and subsequent reading

Get a rough overview of the file using `h5ls`, which has many options - for more details see [here](https://github.com/theislab/scanpy_usage/blob/master/170505_seurat/info_h5ad.md). The file format might still be subject to further optimization in the future. All reading functions will remain backwards-compatible, though.

If you want to share this file with people who merely want to use it for visualization, a simple way to reduce the file size is by removing the dense scaled and corrected data matrix. The file still contains the raw data used in the visualizations.

In [None]:
# Drop the main data matrix, then write the smaller copy meant for sharing.
# NOTE(review): this clears `.X` on the in-memory `adata` as well, so run this
# cell last (or work on a copy) if `adata` is still needed afterwards.
adata.X = None
adata.write('./write/Scanpy120hAA.h5ad', compression='gzip')

If you want to export to "csv", you have the following options:

In [None]:
# Export single fields of the annotation of observations
# adata.obs[['n_counts', 'louvain_groups']].to_csv(
#     './write/pbmc3k_corrected_louvain_groups.csv')

# Export single columns of the multidimensional annotation
# adata.obsm.to_df()[['X_pca1', 'X_pca2']].to_csv(
#     './write/pbmc3k_corrected_X_pca.csv')

# Or export everything except the data using `.write_csvs`.
# Set `skip_data=False` if you also want to export the data.
# adata.write_csvs(results_file[:-5], )