# B1. Downstream analysis

- Authors: Marcos Malumbres & Agustín Sánchez-Belmonte
- Project: miR-203 controls developmental timing and early fate restriction during preimplantation embryogenesis
- Experiment: single cell RNAseq in early embryos (E3.5 and E4.5) in KO, KI and WT conditions.
- Part: B1. Downstream analysis

This notebook uses the output h5ad file from Part A.

It is very similar to Part B2, but it adds important analyses such as:

- Clustering
- Scoring of signatures
- Classification of cells
- Addition of MERVL data
- Velocity

All results are saved in the new data file `mir203_all.h5ad`, which was added as a
supplementary file in GEO.

### Content

0. Set up
1. Initial Exploratory Analysis
2. Markers and Gene Signature Scores
3. Clustering
4. Classification of cells
5. Add MERVL
6. miR203_KO vs. Control
7. miR203_KI vs. Control
8. Velocity and Pseudotime
9. Save modified .h5ad file


# 0. Set up

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams

In [None]:
# Settings
sc.settings.verbosity = 0
sc.logging.print_header()
# Configure the figures in ONE call. Each call to set_figure_params re-applies
# its defaults for every argument that is not passed (dpi=80, dpi_save=150),
# so a second call with only `figsize` would silently undo dpi/dpi_save.
sc.set_figure_params(dpi=120, dpi_save=300, color_map='viridis', figsize=[5, 4])

# Machine-specific locations: input data folder and folder for saved figures.
DATA = '/Users/mmalumbres/Library/CloudStorage/OneDrive-VHIO/BioInformatics/BioProjects/miR203 & early embryos/data/'
DESKTOP = '/Users/mmalumbres/Desktop/'
sc.settings.figdir = DESKTOP

In [None]:
# Silence every warning for the rest of the session. This also hides
# pandas/anndata warnings raised when subsets of `adata` are modified.
import warnings   
warnings.filterwarnings("ignore")

In [None]:
sns.__version__

In [None]:
# Lists and filters
# Six-colour palette: three blue shades followed by three orange shades.
pal1 = [
    "lightblue", "deepskyblue", "dodgerblue",
    "navajowhite", "darkorange", "orangered",
]

### Load all data

In [None]:
# Read the h5ad file from the DATA folder and make the
# observation (cell) names unique before any index-based operation.
h5ad_path = DATA + "231002_mir203.h5ad"
adata = sc.read(h5ad_path)
adata.obs_names_make_unique()
adata

In [None]:
adata.obs.head(2)

In [None]:
adata.obs = adata.obs.rename(columns={"Phenotype": "Sample"})

In [None]:
adata.obs.Sample.unique()

In [None]:
# Replace the underscore notation of the stages ("3_5", "4_5") with the
# dotted form ("3.5", "4.5") in every annotation column that contains it.
for column in ("Sample", "Experiment", "Stage"):
    for old_label, new_label in (("3_5", "3.5"), ("4_5", "4.5")):
        adata.obs[column] = adata.obs[column].str.replace(old_label, new_label)
adata.obs.head(2)

In [None]:
sc.pl.umap(adata, color=["Stage", "Treatment", "Sample"])

# 1. Initial Exploratory Analysis

In [None]:
# Violin plots of the `n_genes_by_counts` and `total_counts` columns, one panel each.
# NOTE(review): `vmax` is not a named parameter of sc.pl.violin, so it is passed on
# as an extra keyword - confirm it has the intended effect (or any effect at all).
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'], vmax=[8000, 10000], jitter=0.4, multi_panel=True)

In [None]:
fig, ax = plt.subplots()
sc.pl.violin(adata, ['total_counts'], rotation=90, jitter=0.4, ax=ax, show=False)
ax.set_ylim(10, 1000)
plt.show()

In [None]:
adata.obs.total_counts.min()

In [None]:
sc.pl.umap(adata, color=["n_genes", "Sample"])

In [None]:
adata.obs.Sample.value_counts()

In [None]:
sns.countplot(data=adata.obs, x="Sample")
plt.xticks(rotation=45)

In [None]:
sns.countplot(data=adata.obs, x="Stage")

In [None]:
sns.countplot(data=adata.obs, x="Treatment")

In [None]:
sns.__version__

In [None]:
#Plot distribution of mitochondrial and ribosomal genes
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_MT','pct_counts_RB'], groupby='Experiment',
             jitter=0.4, multi_panel=True, rotation=90)

In [None]:
# Plot mitochondrial genes expressed
sc.pl.scatter(adata, x='total_counts', y='pct_counts_MT', size=100)
# Plot total counts
sc.pl.scatter(adata, x='total_counts', y='pct_counts_RB', size=100)
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', size=100)

In [None]:
sc.pl.pca(adata, color=["Stage", 'Treatment', "Sample", ])

In [None]:
sc.pl.umap(adata, color=["Stage", 'Treatment', "Sample"])

In [None]:
sc.pl.umap(adata, color=['Sample'], s=30, #save='_KO.png', 
           palette={"E3.5": "DEEPSKYBLUE", "E3.5_KO": "DARKBLUE", "E3.5_dox": "BLUE",
                    "E4.5": "GREENYELLOW", "E4.5_KO": "DARKGREEN", "E4.5_dox": "LIMEGREEN"})

In [None]:
sc.pl.umap(adata, color=["Stage", 'Treatment', "Sample", ])

In [None]:
sc.pl.umap(adata, color=['total_counts', 'pct_counts_MT','pct_counts_RB'], palette="Set2", color_map= plt.cm.Purples)

# 2. Markers  

Initial exploration
- ICM/Epiblast markers: `['Pou5f1', 'Klf4', 'Nanog']`
- Hypoblast markers: `['Gata6', 'Gata4', 'Sox17']`  
- Trophectoderm markers: `['Cdx2', 'Gata3', 'Krt8']`


In [None]:
sc.pl.umap(adata, size=100,color=['Pou5f1', 'Klf4', 'Nanog'],palette="Set2", color_map= plt.cm.Purples,) #save = '_Epiblast_ICM.png'

In [None]:
sc.pl.umap(adata, size=100,color=['Cdx2', 'Gata3', 'Krt8'],palette="Set2", color_map= plt.cm.Purples,) #save = '_TE_mark.png'

In [None]:
sc.pl.umap(adata, size=100,color=['Gata6', 'Gata4', 'Sox17'],palette="Set2", color_map= plt.cm.Purples,) #save = '_Hipoblast.png'

# 3. Gene Signatures

Supplementary Table 1. Gene signatures used in this work.

In [None]:
signature_folder = "/Users/mmalumbres/Library/CloudStorage/OneDrive-VHIO/BioInformatics/BioProjects/miR203 & early embryos/resources/Signatures/"

#### 2-cell stage

In [None]:
# 16-gene 2-cell signature.
# Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5755687/
two_cell_16 = [
    "Zscan4c", "Zscan4e", "Spz1", "Naalad2",
    "Sp110", "Pramef6", "Fgf1", "Bex6",
    "Pramel7", "Kdm4d", "Zfp352", "Sytl2",
    "Oog4", "Hmgn3", "Hspa1b", "Foxa1",
]
print("Length two_cell_16:", len(two_cell_16))

#### 8-cell-like

In [None]:
# This signature is obtained from E2.5 WT (our unpublished data)
# Every value is read as a string, so missing entries show up as 'nan'.
tmp = pd.read_csv(signature_folder + "8_cell.tsv", sep="\t").astype(str)
# Keep the non-missing symbols of the "E2_5" column; str.capitalize gives
# first letter upper-case and the rest lower-case.
eight_cell = [gene.capitalize() for gene in tmp["E2_5"].tolist() if gene != 'nan']
print("Length eight_cell:", len(eight_cell))

#### EPI

In [None]:
# Gene list used further down to compute the 'EPI' score.
naive_pluri = [
    'Tfcp2l1', 'Tbx3', 'Prdm14', 'Nanog', 'Esrrb',
    'Klf4', 'Nr5a2', 'Pou5f1', 'Sox2', 'Nr0b1',
    'Tet2', 'Klf2', 'Fbxo15', 'Utf1', 'Upp1',
    'Zfp42', 'Tet1', 'Tdgf1', 'Tcf15', 'Dppa5a',
]
print("Length naive_pluri:", len(naive_pluri))

#### TE and prE

In [None]:
# paper Guo et al., Cell
# Marker lists used further down to compute the 'TE' and 'prE' scores.
TE_Guo = [
    "Cdx2", "Tspan8", "Dppa1",
    "Id2", "Krt8", "Gata3",
]
prE_Guo = [
    "Gata4", "Gata6", "Runx1",
    "Pdgfra", "Creb3l2",
]

print("Length TE_Guo:", len(TE_Guo))
print("Length prE_Guo:", len(prE_Guo))

### Score gene signatures

In [None]:
# Score every gene signature; each score is stored under the given score name.
signature_scores = {
    '2-cell-like': two_cell_16,  # 2-cell like
    '8-cell-like': eight_cell,   # 8-cell like
    'EPI': naive_pluri,          # Epiblast & ICM
    'TE': TE_Guo,                # Trophectoderm
    'prE': prE_Guo,              # Primitive endoderm
}
for score_name, gene_list in signature_scores.items():
    sc.tl.score_genes(adata, gene_list, score_name=score_name)

Select:
- `2-cell-like`: very specific of 2-cell/totipotent cells
- `8-cell-like`: 8-cell 50 genes
- `EPI`: Epiblast and ICM lineage
- `TE`: trophectoderm, very similar to TE_3
- `prE`: primitive endoderm

In [None]:
sc.pl.umap(adata, size=100, color=['2-cell-like', '8-cell-like', 'EPI', 'TE', "prE"],
           palette="Set2", color_map= plt.cm.Purples,)

## 3. Clustering

In [None]:
sc.tl.leiden(adata, resolution= 0.1)

In [None]:
sc.pl.umap(adata, color=["leiden", "Treatment", "Sample"])

In [None]:
adata.obs.leiden.dtype

In [None]:
# Descriptive label for each Leiden cluster, listed in the order of the
# existing `leiden` categories (one name per category).
new_cluster_names = [
    'E4.5_TE',
    'E4.5_PrE',
    'E4.5_Epi',
    'E3.5_ICM',
    'E4.5_preTE',
    'E3.5_Mixed',
    "E3.5_TE",
]
# Work on a separate column so the raw `leiden` labels stay available.
adata.obs["leiden_groups"] = adata.obs["leiden"]
adata.rename_categories('leiden_groups', new_cluster_names)

In [None]:
sc.pl.umap(adata, color='leiden_groups', legend_loc='on data', title='', frameon=True, ) 
sc.pl.umap(adata, color='leiden_groups',) #save='_ANNO_nolabels.png'

## 4. Classify based on developmental markers

### Classify using scaled data and highest score

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()

In [None]:
df = adata.obs[["2-cell-like", "8-cell-like", "EPI", "TE", "prE"]]

In [None]:
# Rescale the five signature scores with the MinMaxScaler instance and wrap
# the resulting array in a DataFrame that carries the score names as columns.
score_columns = ["2-cell-like", "8-cell-like", "EPI", "TE", "prE"]
scaled_values = scaler.fit_transform(df.to_numpy())
df_scaled = pd.DataFrame(scaled_values, columns=score_columns)

print("Scaled Dataset Using MinMaxScaler")
df_scaled.head()

In [None]:
df.describe()

In [None]:
df_scaled.describe()

In [None]:
df_scaled["2-cell-like"]

In [None]:
# Up-weight the scaled 2-cell-like score by a factor of 1.5.
# The column is overwritten, so running this cell again multiplies it again.
df_scaled["2-cell-like"] *= 1.5

In [None]:
# Copy every scaled score into adata.obs as "<score>_scaled". The values are
# assigned as plain lists, i.e. by position, not by index label.
for score in ("2-cell-like", "8-cell-like", "EPI", "TE", "prE"):
    adata.obs[score + "_scaled"] = df_scaled[score].tolist()

In [None]:
# Position (0-4) of the highest scaled score of each cell. The column order of
# df_scaled defines the code: 0=2-cell-like, 1=8-cell-like, 2=EPI, 3=TE, 4=prE.
# Convert to a NumPy array first so the result is always a plain ndarray: a
# pandas object would carry df_scaled's 0..n-1 index and be aligned by label
# (not by position) when it is later assigned to adata.obs.
scaled_score = np.argmax(df_scaled.to_numpy(), axis=1)
scaled_score

In [None]:
adata.obs["scaled_scores"] = scaled_score.astype("str")

In [None]:
adata.obs["scaled_scores"].value_counts()

In [None]:
sc.pl.umap(adata, color=["scaled_scores"], s=40, 
palette={"0": "DARKRED","1": "ORANGE","2": "YELLOWGREEN",
         "3": "SEAGREEN","4":"CORNFLOWERBLUE"})

In [None]:
adata.obs.scaled_scores.dtype

In [None]:
new_scores_names = ['2-cell-like', '8-cell-like', 'Epi-like', 'TE-like', 'prE-like']
# Translate each numeric code ("0".."4") into its label explicitly instead of
# renaming categories by position: this works whether or not `scaled_scores`
# has already been converted to a categorical column, and it stays correct
# when one of the five classes contains no cells.
code_to_label = {str(code): label for code, label in enumerate(new_scores_names)}
adata.obs["Subpop_scaled_scores"] = pd.Categorical(
    adata.obs["scaled_scores"].astype(str).map(code_to_label),
    categories=new_scores_names,
)

In [None]:
adata.obs.head(2)

In [None]:
sc.pl.umap(adata, color=["Subpop_scaled_scores"], s=40, 
palette={"2-cell-like": "DARKRED","8-cell-like": "ORANGE","Epi-like": "YELLOWGREEN",
         "TE-like": "SEAGREEN","prE-like":"CORNFLOWERBLUE"})

In [None]:
sc.pl.dotplot(adata, ['2-cell-like','8-cell-like','EPI','TE', "prE"], groupby="Subpop_scaled_scores", 
              standard_scale='var',) #save='.png' - categories_order = order_heat, 

# 5. Add MERVL
Compute MERVL sequences from the publication below using bash.

Original publication
- https://www.nature.com/articles/s41467-021-21808-x

In [None]:
# Calculation for MERVL in all cells
# Read from the DATA folder defined in the set-up, like the h5ad input file.
MERVL_all = pd.read_csv(DATA + 'MERVL_all.csv', index_col=0)
print(MERVL_all.shape)
# remove duplicated indexes in MERVL
MERVL_all = MERVL_all[~MERVL_all.index.duplicated(keep='first')]

# Calculation for MERVL in KO 3.5 cells
MERVL = pd.read_csv(DATA + 'MERVL.csv', index_col=0)  # Only KO 3.5 cells
MERVL = MERVL.rename(columns={"MERVL": "MERVL_E3.5"})
# Same de-duplication as above: a duplicated index on the right-hand side of
# the index merge below would add rows to adata.obs.
MERVL = MERVL[~MERVL.index.duplicated(keep='first')]

# Shapes after de-duplication
print(MERVL_all.shape)
print(MERVL.shape)

In [None]:
# Attach both MERVL tables to the cell annotations, matching on the index.
# The left merge keeps every row of adata.obs; unmatched cells get NaN.
for mervl_table in (MERVL_all, MERVL):
    adata.obs = adata.obs.merge(mervl_table, how='left', left_index=True, right_index=True)
adata.obs.head(2)

In [None]:
adata_na = adata.obs[adata.obs.MERVL.isna()]
adata_na.Experiment.value_counts()

In [None]:
adata_na = adata.obs[adata.obs["MERVL_E3.5"].isna()]
adata_na.Experiment.value_counts()

In [None]:
sc.pl.umap(adata, color=["MERVL", "MERVL_E3.5", "Treatment"], palette="Set2", color_map= plt.cm.Purples)
sc.pl.umap(adata, color=["MERVL", "MERVL_E3.5", "Treatment"])

In [None]:
sc.pl.umap(adata, color=["MERVL", "Treatment"], vmin=1, vmax=6, palette="Set2", color_map= plt.cm.Purples)

In [None]:
sc.pl.violin(adata, ["MERVL"], groupby="Sample", rotation=90, jitter=0.35) 

In [None]:
sc.pl.dotplot(adata, ["MERVL"], groupby="Sample")  #, standard_scale="var"

# 6. miR-203 KO versus Control

In [None]:
adata.obs.Treatment.unique()

In [None]:
# Keep every cell whose Treatment is not 'dox'.
# .copy() turns the subset into an independent object: slicing an AnnData
# returns a view, and the scVelo steps applied to adata_KO further down
# modify it in place.
adata_KO = adata[adata.obs.Treatment != 'dox'].copy()
adata_KO

### Check KO_E3.5

In [None]:
adata_KO_3_5 = adata[adata.obs.Sample == 'E3.5_KO']

In [None]:
sc.pl.umap(adata_KO_3_5, color=["leiden_groups", "Subpop_scaled_scores"], s=200)

In [None]:
sc.pl.dotplot(adata_KO_3_5, ['2-cell-like','8-cell-like','EPI','TE', "prE"], groupby="Subpop_scaled_scores", 
              standard_scale='var',) #save='.png' - categories_order = order_heat, 

## Re-UMAP KO 3.5 (NOT NEEDED)

In [None]:
adata_KO_3_5_reUMAP = adata_KO_3_5.copy()
adata_KO_3_5_reUMAP

In [None]:
# Compute PCA
sc.tl.pca(adata_KO_3_5_reUMAP, svd_solver='auto')

In [None]:
# Rebuild the neighbour graph and the UMAP on the E3.5 KO copy only, then
# colour the new embedding by the five signature scores.
# Original note: "6 AND 5/!5" - presumably other n_neighbors / n_pcs values
# that were tried; TODO confirm.
sc.pp.neighbors(adata_KO_3_5_reUMAP, n_neighbors=8, n_pcs=14) #6 AND 5/!5
sc.tl.umap(adata_KO_3_5_reUMAP)
sc.pl.umap(adata_KO_3_5_reUMAP, color=["2-cell-like", "8-cell-like", "EPI", "TE", "prE"], size=200,
          palette="Set2", color_map= plt.cm.Purples,) #, save='umapE3_5_KO_new.png'

In [None]:
sc.pl.umap(adata_KO_3_5, color=["Subpop_scaled_scores"], size=300) #, save='umapE3_5_KO_new.png'
sc.pl.umap(adata_KO_3_5_reUMAP, color=["Subpop_scaled_scores"], size=300) #, save='umapE3_5_KO_new.png'

# 7. miR-203 KI versus Control

In [None]:
adata_KI = adata[adata.obs.Treatment != 'KO']
adata_KI

# 8. Velocity and Pseudotime

In [None]:
import scvelo as scv
scv.set_figure_params()

### 8.1 Pseudotime in WT cells

In [None]:
# Control cells only. .copy() turns the subset into an independent object:
# slicing an AnnData returns a view, and the scVelo steps below modify
# adata_WT in place.
adata_WT = adata[adata.obs.Treatment == 'Control'].copy()

In [None]:
adata_WT.layers

In [None]:
scv.pl.proportions(adata_WT, groupby='scaled_scores')

In [None]:
# pre-process
scv.pp.filter_and_normalize(adata_WT)
scv.pp.moments(adata_WT)

In [None]:
#compute velocity
scv.tl.velocity(adata_WT, mode='stochastic')
scv.tl.velocity_graph(adata_WT)

In [None]:
scv.pl.velocity_embedding(adata_WT, basis='umap',frameon=False)

In [None]:
scv.pl.velocity_embedding_grid(adata_WT, basis='umap',frameon=False, color='Subpop_scaled_scores',title='',scale=0.25,
                               palette={"2-cell-like": "DARKRED","8-cell-like": "ORANGE","Epi-like": "YELLOWGREEN",
                               "TE-like": "SEAGREEN","prE-like":"CORNFLOWERBLUE", "Other":"GRAY"})

In [None]:
scv.tl.velocity_confidence(adata_WT)
keys = 'velocity_length','velocity_confidence'
scv.pl.scatter(adata_WT,c=keys,cmap='coolwarm',perc=[5,95])

In [None]:
scv.pl.velocity_graph(adata_WT, threshold=0.1, color='Subpop_scaled_scores')

In [None]:
adata_WT.obs.Subpop_scaled_scores.dtype

In [None]:
scv.pl.velocity_embedding_stream(adata_WT, basis='umap',color='Subpop_scaled_scores',frameon=False)

In [None]:
# Velocity pseudotime on the control cells, then plot it on the embedding.
# NOTE(review): root_key / end_key are hard-coded integers (45 and 1) and the
# same values are passed for the KO and the E3.5 KO subsets, which contain
# different cells - confirm they select the intended root/end in each subset.
scv.tl.velocity_pseudotime(adata_WT,root_key=45, end_key=1)
scv.pl.scatter(adata_WT, color='velocity_pseudotime', cmap='gnuplot', size=60)

In [None]:
# Expose the neighbour graphs stored in .obsp under .uns['neighbors'] as well,
# then run PAGA on the classified sub-populations.
for graph_key in ('distances', 'connectivities'):
    adata_WT.uns['neighbors'][graph_key] = adata_WT.obsp[graph_key]
scv.tl.paga(adata_WT, groups='Subpop_scaled_scores')

# Transposed transition-confidence table, shaded with a blue gradient.
df = scv.get_df(adata_WT, 'paga/transitions_confidence').T
df.style.background_gradient(cmap='Blues').format('{:.2g}')

In [None]:
scv.pl.paga(adata_WT, basis='umap', color='Subpop_scaled_scores', size=50, alpha=.3,
            min_edge_width=2,node_size_scale=1.5)

### 8.2 Pseudotime in KO data

In [None]:
adata_KO.layers

In [None]:
scv.pl.proportions(adata_KO, groupby='scaled_scores')

In [None]:
# pre-process
scv.pp.filter_and_normalize(adata_KO)
scv.pp.moments(adata_KO)

In [None]:
#compute velocity
scv.tl.velocity(adata_KO, mode='stochastic')
scv.tl.velocity_graph(adata_KO)

In [None]:
scv.pl.velocity_embedding(adata_KO, basis='umap',frameon=False)

In [None]:
scv.pl.velocity_embedding_grid(adata_KO, basis='umap',
                               frameon=False, color=['Subpop_scaled_scores'],title='',
                               scale=0.25,
                               #palette={"0": "DARKRED","1": "ORANGE","2": "YELLOWGREEN", "3": "SEAGREEN","4":"CORNFLOWERBLUE"}
                              )

In [None]:
# Colour for each `Subpop_scaled_scores` category. The keys must be the
# category labels ('Epi-like', 'TE-like', 'prE-like'), not the names of the
# score columns ('EPI', 'TE', 'prE').
palette = {"2-cell-like": "DARKRED", "8-cell-like": "ORANGE", "Epi-like": "YELLOWGREEN",
           "TE-like": "SEAGREEN", "prE-like": "CORNFLOWERBLUE", "Other": "GRAY"}

In [None]:
scv.pl.velocity_embedding_grid(adata_KO, basis='umap',frameon=False, color='Subpop_scaled_scores',title='',scale=0.25,
                               palette={"2-cell-like": "DARKRED","8-cell-like": "ORANGE","Epi-like": "YELLOWGREEN",
                               "TE-like": "SEAGREEN","prE-like":"CORNFLOWERBLUE", "Other":"GRAY"})

In [None]:
scv.tl.velocity_confidence(adata_KO)
keys = 'velocity_length','velocity_confidence'
scv.pl.scatter(adata_KO,c=keys,cmap='coolwarm',perc=[5,95])

In [None]:
scv.pl.velocity_graph(adata_KO,threshold=0.1,color='Subpop_scaled_scores')

In [None]:
adata_KO.obs.Subpop_scaled_scores.dtype

In [None]:
scv.pl.velocity_embedding_stream(adata_KO, basis='umap',color='Subpop_scaled_scores',frameon=False)

In [None]:
# Velocity pseudotime on the control + KO subset.
# NOTE(review): root_key=45 / end_key=1 are the same hard-coded integers used
# for the control-only subset, which contains different cells - confirm they
# select the intended root/end here.
scv.tl.velocity_pseudotime(adata_KO,root_key=45, end_key=1)

In [None]:
scv.pl.scatter(adata_KO, color='velocity_pseudotime', cmap='gnuplot', size=40)

In [None]:
# Expose the neighbour graphs stored in .obsp under .uns['neighbors'] as well,
# then run PAGA on the classified sub-populations.
for graph_key in ('distances', 'connectivities'):
    adata_KO.uns['neighbors'][graph_key] = adata_KO.obsp[graph_key]
scv.tl.paga(adata_KO, groups='Subpop_scaled_scores')

# Transposed transition-confidence table, shaded with a blue gradient.
df = scv.get_df(adata_KO, 'paga/transitions_confidence').T
df.style.background_gradient(cmap='Blues').format('{:.2g}')

In [None]:
# PAGA graph on the UMAP, same settings as the control plot.
# (`alpha` was misspelled `apha`, so the transparency was never applied.)
scv.pl.paga(adata_KO, basis='umap', color='Subpop_scaled_scores',
            size=50, alpha=.3, min_edge_width=2, node_size_scale=1.5)

### 8.3 Pseudotime in KO E3.5 cells

In [None]:
# E3.5 KO cells only. .copy() turns the subset into an independent object:
# slicing an AnnData returns a view, and the scVelo steps below modify
# adata_KO_3_5 in place.
adata_KO_3_5 = adata[adata.obs.Sample == 'E3.5_KO'].copy()

In [None]:
# Inspect the layers of the subset analysed in this section, as the WT and KO
# sections do for their own subsets.
adata_KO_3_5.layers

In [None]:
scv.pl.proportions(adata_KO_3_5, groupby='scaled_scores')

In [None]:
# pre-process
scv.pp.filter_and_normalize(adata_KO_3_5)
scv.pp.moments(adata_KO_3_5)

In [None]:
#compute velocity
scv.tl.velocity(adata_KO_3_5, mode='stochastic')
scv.tl.velocity_graph(adata_KO_3_5)

In [None]:
scv.pl.velocity_embedding(adata_KO_3_5, basis='umap',frameon=False)

In [None]:
scv.pl.velocity_embedding_grid(adata_KO_3_5, basis='umap',frameon=False, color='Subpop_scaled_scores',title='',scale=0.25,
                               #palette={"0": "DARKRED","1": "ORANGE","2": "YELLOWGREEN", "3": "SEAGREEN","4":"CORNFLOWERBLUE"}
                              )

In [None]:
# Colour for each `Subpop_scaled_scores` category. The keys must be the
# category labels ('Epi-like', 'TE-like', 'prE-like'), not the names of the
# score columns ('EPI', 'TE', 'prE').
palette = {"2-cell-like": "DARKRED", "8-cell-like": "ORANGE", "Epi-like": "YELLOWGREEN",
           "TE-like": "SEAGREEN", "prE-like": "CORNFLOWERBLUE", "Other": "GRAY"}

In [None]:
scv.pl.velocity_embedding_grid(adata_KO_3_5, basis='umap',frameon=False, color='Subpop_scaled_scores',title='',scale=0.25,
                               palette={"2-cell-like": "DARKRED","8-cell-like": "ORANGE","Epi-like": "YELLOWGREEN",
                               "TE-like": "SEAGREEN","prE-like":"CORNFLOWERBLUE", "Other":"GRAY"})

In [None]:
scv.tl.velocity_confidence(adata_KO_3_5)
keys = 'velocity_length','velocity_confidence'
scv.pl.scatter(adata_KO_3_5,c=keys,cmap='coolwarm',perc=[5,95])

In [None]:
scv.pl.velocity_graph(adata_KO_3_5,threshold=0.1,color='Subpop_scaled_scores')

In [None]:
adata_KO_3_5.obs.Subpop_scaled_scores.dtype

In [None]:
scv.pl.velocity_embedding_stream(adata_KO_3_5, basis='umap',color='Subpop_scaled_scores',frameon=False)

In [None]:
# Velocity pseudotime on the E3.5 KO subset, then plot it on the embedding.
# NOTE(review): root_key=45 / end_key=1 are the same hard-coded integers used
# for the other subsets, which contain different cells - confirm they select
# the intended root/end here.
scv.tl.velocity_pseudotime(adata_KO_3_5,root_key=45, end_key=1)
scv.pl.scatter(adata_KO_3_5, color='velocity_pseudotime', cmap='gnuplot', size=150)

In [None]:
# Expose the neighbour graphs stored in .obsp under .uns['neighbors'] as well,
# then run PAGA on the classified sub-populations.
for graph_key in ('distances', 'connectivities'):
    adata_KO_3_5.uns['neighbors'][graph_key] = adata_KO_3_5.obsp[graph_key]
scv.tl.paga(adata_KO_3_5, groups='Subpop_scaled_scores')

# Transposed transition-confidence table, shaded with a blue gradient.
df = scv.get_df(adata_KO_3_5, 'paga/transitions_confidence').T
df.style.background_gradient(cmap='Blues').format('{:.2g}')

In [None]:
# PAGA graph on the UMAP, same settings as the control plot.
# (`alpha` was misspelled `apha`, so the transparency was never applied.)
scv.pl.paga(adata_KO_3_5, basis='umap', color='Subpop_scaled_scores',
            size=50, alpha=.3, min_edge_width=2, node_size_scale=1.5)

In [None]:
adata

# 9. Save modified .h5ad file

This file has been uploaded as a supplementary file in GEO.

In [None]:
# Write the full `adata` object to an .h5ad file. The path is relative, so the
# file lands in the current working directory, not in the DATA folder.
# NOTE(review): the velocity / pseudotime steps were run on the subsets
# (adata_WT, adata_KO, adata_KO_3_5), not on `adata` - confirm whether those
# results are expected to be part of this file.
adata.write("231215_miR203_all.h5ad")