# C. Differential expression and functional analysis

- Authors: Marcos Malumbres & Agustín Sánchez-Belmonte
- Project: miR-203 controls developmental timing and early fate restriction during preimplantation embryogenesis
- Experiment: single cell RNAseq in early embryos (E3.5 and E4.5) in KO, KI and WT conditions.
- Part C: Differential expression and functional analysis

In this notebook, differential expression and functional analyses are performed by comparing subpopulations, treatments, conditions and other groupings. The input is the h5ad file produced in part B1.

# Set up

In [None]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
from matplotlib.pyplot import rc_context

In [None]:
import warnings   
warnings.filterwarnings("ignore")

In [None]:
# Settings
sc.settings.verbosity = 0
sc.logging.print_header()
# Single call on purpose: every call to set_figure_params re-applies its own
# defaults for dpi / dpi_save, so a second call carrying only `figsize` would
# silently undo the dpi=120 / dpi_save=300 requested here.
sc.set_figure_params(dpi=120, dpi_save=300, color_map='viridis', figsize=[5, 4])

# Input data folder and output folder (figures are saved to DESKTOP)
DATA = '/Users/mmalumbres/Library/CloudStorage/OneDrive-VHIO/BioInformatics/BioProjects/miR203 & early embryos/data/'
DESKTOP = '/Users/mmalumbres/Desktop/'
sc.settings.figdir = DESKTOP

## GSEApy

In [None]:
import gseapy as gspy
from gseapy import Msigdb
from gseapy import gseaplot
from gseapy import barplot, dotplot

In [None]:
gspy.__version__

In [None]:
msig = Msigdb()

In [None]:
# list msigdb version you wanna query
msig.list_dbver().tail(4)

In [None]:
# list categories given dbver.
msig.list_category(dbver="2023.2.Hs") 

In [None]:
# Hallmark and Reactome gene sets from MSigDB, mouse (Mm) and human (Hs).
# Category names must be spelled exactly as returned by
# msig.list_category(dbver=...):
#   mouse: mh.all, m2.cp.reactome
#   human: h.all,  c2.cp.reactome   (was '2.cp.reactome', which is not a category)
gmt_mh = msig.get_gmt(category='mh.all', dbver="2023.2.Mm")
gmt_mR = msig.get_gmt(category='m2.cp.reactome', dbver="2023.2.Mm")
gmt_h = msig.get_gmt(category='h.all', dbver="2023.2.Hs")
gmt_R = msig.get_gmt(category='c2.cp.reactome', dbver="2023.2.Hs")

In [None]:
# `gmt` was never defined; inspect the human hallmark collection loaded above
# (use gmt_mh instead to look at the mouse version of the same gene set).
print(gmt_h['HALLMARK_WNT_BETA_CATENIN_SIGNALING'])

## Read data

In [None]:
adata = sc.read(DATA + "231215_mir203_all.h5ad")      #231208_mir203_all.h5ad
adata

In [None]:
# Combined label "<leiden_groups>_<Treatment>" stored as a categorical column.
# The label is built from plain Python strings: with .astype("string") the
# pandas extension dtype is carried over to the categories, which is presumably
# why the .h5ad file could not be saved -- TODO confirm by writing the file.
adata.obs['leiden_groups_genotype'] = (
    adata.obs['leiden_groups'].astype(str) + '_' + adata.obs['Treatment'].astype(str)
).astype("category")

In [None]:
print(adata.obs.leiden_groups_genotype.dtype)
adata.obs.head(2)

In [None]:
sc.pl.umap(adata, color=["Subpop_scaled_scores", "leiden_groups"])

In [None]:
sc.pl.umap(adata, color=["leiden_groups_genotype"], save="_leiden_groups_genotype.png")

# Differential expression and functional analysis

### SCRIPTS

In [None]:
# Functions
def dedf_to_rnk_mouse_to_human(de_df, outdir, dataname, samplename):
    """Write an upper-cased GSEA ranking (.rnk) from a differential expression table.

    The "names" and "scores" columns of ``de_df`` are kept; gene names are
    converted to strings and upper-cased, and the name column is renamed to
    "#names". The table is saved tab-separated, without the index, as
    ``<outdir>/<samplename>.rnk`` and also returned. ``de_df`` is not modified.
    ``dataname`` is accepted but not used.
    """
    upper_names = de_df["names"].astype(str).str.upper()      # all to uppercase
    ranking = pd.DataFrame({"#names": upper_names, "scores": de_df["scores"]})
    ranking.to_csv(f"{outdir}/{samplename}.rnk", sep="\t", index=False)
    return ranking

def dedf_to_rnk(de_df, outdir, dataname, samplename):
    """Write a GSEA ranking (.rnk) from a differential expression table.

    The "names" and "scores" columns of ``de_df`` are kept; gene names are
    converted to strings with their case left untouched, and the name column
    is renamed to "#names". The table is saved tab-separated, without the
    index, as ``<outdir>/<samplename>.rnk`` and also returned. ``de_df`` is
    not modified. ``dataname`` is accepted but not used.
    """
    gene_names = de_df["names"].astype(str)                   # no need to uppercase
    ranking = pd.DataFrame({"#names": gene_names, "scores": de_df["scores"]})
    ranking.to_csv(f"{outdir}/{samplename}.rnk", sep="\t", index=False)
    return ranking


def rnk_to_geseapy(pre_res, rnk, gset, outdir, samplename):
    """Run GSEApy prerank on one ranking and append its results to ``pre_res``.

    Parameters
    ----------
    pre_res : pandas.DataFrame
        Table accumulating the results of previous comparisons.
    rnk
        Ranking handed to ``gspy.prerank`` (here the table returned by
        ``dedf_to_rnk`` / ``dedf_to_rnk_mouse_to_human``).
    gset : str
        Path of the gene-set file WITHOUT the ".gmt" extension.
    outdir : str
        Folder for the GSEApy output; the result table of this run is also
        written there as "GSEApy_results.tsv".
    samplename : str
        Label stored in the "Sample" column of the new rows.

    Returns
    -------
    pandas.DataFrame
        ``pre_res`` followed by the new rows. Only the columns present in both
        tables are kept (``join="inner"``) and the index is renumbered.
    """

    gene_set = gset + ".gmt"
    # `threads` replaces the deprecated `processes` keyword and matches the
    # prerank call used in the manual section of this notebook.
    prerank_res = gspy.prerank(rnk=rnk, gene_sets=gene_set, threads=4, permutation_num=1000,
                               outdir=outdir, graph_num=60, format='png', seed=6)
    new_rows = pd.DataFrame(prerank_res.res2d.sort_index())
    new_rows["Sample"] = samplename
    new_rows.to_csv(outdir + "/" + "GSEApy_results.tsv", sep="\t")

    return pd.concat([pre_res, new_rows], join="inner", ignore_index=True)

def add_classes(pre_res):
    """Add a "Class" column naming the gene-set collection of each "Term".

    The class is chosen from the prefix of the term; the first matching prefix
    in the list below wins and terms matching none are labelled "Others".
    ``pre_res`` is modified in place and also returned.
    """
    # (term prefix, class label), checked in this order.
    prefix_to_class = [
        ("REACTOME", "REACTOME"),
        ("WP", "WP"),
        ("KEGG", "KEGG"),
        ("PID", "PID"),          # was mislabelled "PIP"
        ("HALL", "HALLMARK"),
        ("BIO", "BIOCARTA"),
        ("ST", "ST"),
    ]
    terms = pre_res["Term"].str
    # na=False keeps the conditions strictly boolean when a term is missing.
    conditions = [terms.startswith(prefix, na=False) for prefix, _ in prefix_to_class]
    labels = [label for _, label in prefix_to_class]
    pre_res["Class"] = np.select(conditions, labels, default="Others")
    return pre_res
    
def run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res, gene_sets=None):
    """Run differential expression and GSEA for one ``groupA`` vs ``groupB`` comparison.

    Parameters
    ----------
    DESKTOP : str
        Base output folder; results go to ``<DESKTOP>/<dataname>_<groupA>_vs_<groupB>``.
    database
        Object passed to ``sc.tl.rank_genes_groups`` (adata or a subset of it).
    dataname : str
        Prefix used in the folder and file names.
    score : str
        Column of ``database.obs`` holding the group labels.
    groupA, groupB : str
        Group of interest and reference group (values of ``score``).
    method : str
        Statistical test for ``sc.tl.rank_genes_groups`` ('t-test' or 'wilcoxon').
    pre_res : pandas.DataFrame
        Table accumulating the GSEA results of previous comparisons.
    gene_sets : str, optional
        Path of the gene-set file without the ".gmt" extension. When omitted,
        the notebook-level variable ``gset`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        ``pre_res`` extended with the GSEA results of this comparison.
    """

    # Organize variables & folders
    sample = groupA + "_vs_" + groupB
    samplename = dataname + "_" + sample
    outdir = os.path.join(DESKTOP, samplename)
    # exist_ok avoids the check-then-create race; missing parents are created too
    os.makedirs(outdir, exist_ok=True)

    # Fall back to the notebook global only when no gene set was passed explicitly
    gmt_prefix = gset if gene_sets is None else gene_sets

    # Differential expression
    sc.tl.rank_genes_groups(database, score, groups=[groupA], reference=groupB, method=method,
                            use_raw=True, log_transformed=True, n_genes=-1) #adata.raw = adata
    de_df = sc.get.rank_genes_groups_df(database, group=groupA)
    de_df.to_csv(os.path.join(outdir, samplename + ".tsv"), sep='\t', index=False)

    with rc_context({'figure.figsize': (9, 4)}):
        sc.pl.rank_genes_groups(database, n_genes=50, save="_" + sample + ".png")

    # Save .rnk list and run GSEA
    # NOTE(review): gene names keep their original case here, whereas the manual
    # workflow upper-cases them (dedf_to_rnk_mouse_to_human) -- confirm that the
    # gene-set file matches the case of the gene names.
    rnk = dedf_to_rnk(de_df, outdir, dataname, samplename)
    pre_res = rnk_to_geseapy(pre_res, rnk, gmt_prefix, outdir, samplename)

    return pre_res

### Manual

In [None]:
adata.obs.Sample.unique()

In [None]:
# EDIT
database = adata            # adata (all samples) or subset of data
dataname = "ad"

# Column and groups
score = "Sample"            # column in adata.obs
groupA = "E3.5_KO"                      # value for column in adata.obs
groupB = "E3.5"                 # value for column in adata.obs

# Statistical algorithm
method = 'wilcoxon'         # 't-test' or 'wilcoxon'

# Select Pathways
GENESETS = "/Users/mmalumbres/Library/CloudStorage/OneDrive-VHIO/BioInformatics/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "h.c2.cp.v7.2.symbols_mix"
gene_set = gset + ".gmt"

In [None]:
# Organize variables & folders
sample = groupA + "_vs_" + groupB
samplename = dataname + "_" + sample
outdir = DESKTOP + "/" + samplename

# exist_ok replaces the check-then-create pair; missing parent folders are created too
os.makedirs(outdir, exist_ok=True)

In [None]:
# Differential expression
sc.tl.rank_genes_groups(database, score, groups=[groupA], reference=groupB, method=method, 
                        use_raw=True, log_transformed=True, n_genes=-1) #adata.raw = adata
de_df = sc.get.rank_genes_groups_df(database, group=groupA)
de_df.to_csv(outdir + "/" + samplename + ".tsv", sep='\t', index=False) 

In [None]:
# Upper-case the gene names before ranking (dedf_to_rnk_mouse_to_human)
rnk = dedf_to_rnk_mouse_to_human(de_df, outdir, dataname, samplename)
#rnk = dedf_to_rnk(de_df, outdir, dataname, samplename)    # keeps the original gene-name case: use with mouse gene_sets

In [None]:
pre_res = gspy.prerank(rnk=rnk,
                       gene_sets= gene_set,         # gmt_R, 'KEGG_2016', etc.
                       threads=4,
                       min_size=5,
                       max_size=1000,
                       permutation_num=1000,     # reduce number to speed up testing
                       outdir=outdir,            # or None
                       format='png',
                       seed=6,
                       #verbose=True, # see what's going on behind the scenes
                       )

In [None]:
pre_res.res2d.head(3)

In [None]:
terms = pre_res.res2d.Term
terms

In [None]:
axs = pre_res.plot(terms=terms[0]) # v1.0.5

In [None]:
gseaplot(rank_metric=pre_res.ranking, term=terms[0], ofname=DESKTOP + 'your.plot.png', **pre_res.results[terms[0]])

In [None]:
axs = pre_res.plot(terms=terms[0:6],
                   #legend_kws={'loc': (1.2, 0)}, # set the legend loc
                   show_ranking=True, # whether to show the second yaxis
                   figsize=(3,4)
                  )
# or use this to have more control on the plot
# from gseapy import gseaplot2
# terms = pre_res.res2d.Term[1:5]
# hits = [pre_res.results[t]['hits'] for t in terms]
# runes = [pre_res.results[t]['RES'] for t in terms]
# fig = gseaplot2(terms=terms, ress=runes, hits=hits,
#               rank_metric=gs_res.ranking,              # rank_metric=pre_res.ranking
#               legend_kws={'loc': (1.2, 0)},            # set the legend loc
#               figsize=(4,5))                           

In [None]:
# to save your figure, make sure that ``ofname`` is not None
ax = dotplot(pre_res.res2d,
             column="NOM p-val",                             #“Adjusted P-value”, “P-value”, “NOM p-val”, “FDR q-val”
             title='TEST',
             ofname=DESKTOP + "/mix_E4.5_KO_vs_WT.png",                                     # save to file
             cmap=plt.cm.viridis,
             top_term=12,
             size=6,                                          # adjust dot size
             figsize=(6,5), cutoff=0.25, show_ring=False)


In [None]:
from gseapy import dotplot
# to save your figure, make sure that ``ofname`` is not None
ax = dotplot(pre_res.res2d,
             column="FDR q-val",
             title='KEGG_2016',
             cmap=plt.cm.viridis,
             size=6, # adjust dot size
             figsize=(4,5), cutoff=0.25, show_ring=False)

In [None]:
# Reference for the plots in this section:
# https://gseapy.readthedocs.io/en/latest/gseapy_example.html

In [None]:
from gseapy import enrichment_map
# return two dataframe
nodes, edges = enrichment_map(pre_res.res2d)

In [None]:
import networkx as nx

In [None]:
# build graph
G = nx.from_pandas_edgelist(edges,
                            source='src_idx',
                            target='targ_idx',
                            edge_attr=['jaccard_coef', 'overlap_coef', 'overlap_genes'])

In [None]:
# Draw the enrichment map: nodes are gene-set terms, edges connect terms
# listed together in `edges`.
fig, ax = plt.subplots(figsize=(8, 8))

# init node coordinates
pos=nx.layout.spiral_layout(G)
#node_size = nx.get_node_attributes()
# draw nodes: colour taken from NES, size from Hits_ratio
# NOTE(review): the colour/size lists follow the row order of `nodes`; G was
# built from `edges` only, so confirm that its node order and node count
# match the rows of `nodes`.
nx.draw_networkx_nodes(G,
                       pos=pos,
                       cmap=plt.cm.RdYlBu,
                       node_color=list(nodes.NES),
                       node_size=list(nodes.Hits_ratio *1000))
# draw node labels (term names keyed by the index of `nodes`)
nx.draw_networkx_labels(G,
                        pos=pos,
                        labels=nodes.Term.to_dict())
# draw edges, line width proportional to the Jaccard coefficient (x10)
edge_weight = nx.get_edge_attributes(G, 'jaccard_coef').values()
nx.draw_networkx_edges(G,
                       pos=pos,
                       width=list(map(lambda x: x*10, edge_weight)),
                       edge_color='#CDDBD4')
plt.show()

### Run a single comparison

In [None]:
# EDIT
database = adata            # adata (all samples) or subset of data
dataname = "ad"

# Column and groups
score = "leiden_groups_genotype"            # column in adata.obs
groupA = "E4.5_Epi_KO"                      # value for column in adata.obs
groupB = "E4.5_Epi_Control"                 # value for column in adata.obs

# Statistical algorithm
method = 'wilcoxon'         # 't-test' or 'wilcoxon'

# Select Pathways
GENESETS = "/Users/mmalumbres/Library/CloudStorage/OneDrive-VHIO/BioInformatics/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "h.c2.cp.v7.2.symbols_mix"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empty.tsv", sep="\t", ) 

In [None]:
# RUN
pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res = add_classes(pre_res)
pre_res.to_excel(DESKTOP + "GSEApy_Combined_results.xlsx")

In [None]:
pre_res

### Run multiple comparisons

In [None]:
# EDIT
database = adata            # adata (all samples) or subset of data
dataname = "ad"

# Statistical algorithm
method = 'wilcoxon'         # 't-test' or 'wilcoxon'

# Select Pathways
GENESETS = "/Users/mmalumbres/Library/CloudStorage/OneDrive-VHIO/BioInformatics/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "h.c2.cp.v7.2.symbols_mix"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empty.tsv", sep="\t", ) 

#### by Sample

In [None]:
adata.obs.Sample.unique()

In [None]:
# Column of adata.obs holding the group labels
score = "Sample"

# Comparisons to run, as (groupA, groupB) = (group of interest, reference)
sample_comparisons = [
    ("E3.5_KO", "E3.5"),
    ("E3.5_dox", "E3.5"),
    ("E4.5_KO", "E4.5"),
    ("E4.5_dox", "E4.5"),
]

# Each run appends its GSEA results to pre_res
for groupA, groupB in sample_comparisons:
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

# Label the gene-set collection of every term and export the combined table
pre_res = add_classes(pre_res)
pre_res.to_excel(DESKTOP + "GSEApy_Combined_results.xlsx")

#### by Leiden_groups_genotype

In [None]:
sc.pl.umap(adata, color=["leiden_groups_genotype"])

In [None]:
adata.obs.leiden_groups_genotype.unique()

In [None]:
# Column of adata.obs holding the group labels
score = "leiden_groups_genotype"

# Comparisons to run, as (groupA, groupB) = (group of interest, reference)
cluster_comparisons = [
    ("Mix(E3.5)_KO", "TE(E3.5)_Control"),
    ("Mix(E3.5)_KO", "ICM(E3.5)_Control"),
    ("TE(E3.5)_dox", "TE(E3.5)_Control"),
    ("ICM(E3.5)_dox", "ICM(E3.5)_Control"),
    ("TE(E4.5)_KO", "TE(E4.5)_Control"),
    ("TE(E4.5)_dox", "TE(E4.5)_Control"),
    ("Epi(E4.5)_KO", "Epi(E4.5)_Control"),
    ("Epi(E4.5)_dox", "Epi(E4.5)_Control"),
    ("PrE(E4.5)_KO", "PrE(E4.5)_Control"),
    ("PrE(E4.5)_dox", "PrE(E4.5)_Control"),
]

# Each run appends its GSEA results to pre_res
for groupA, groupB in cluster_comparisons:
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

# Label the gene-set collection of every term and export the combined table
pre_res = add_classes(pre_res)
pre_res.to_excel(DESKTOP + "GSEApy_Combined_results.xlsx")

## Pathway visualization

In [None]:
# required for scattermaps_mm
sns.__version__

In [None]:
#import scattermaps_mm
from scattermaps_mm import scattermap

In [None]:
gsea_combined = pd.read_csv('/Users/asanchezb/Desktop/ko_vs_control_110523/GSEApy_Combined_results.tsv', sep='\t', index_col=0)
path = pd.read_csv('/Users/asanchezb/Desktop/KO_final_pathways.csv')
chr_path = pd.read_csv('/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM_NEW/CHROMATIN_PATHWAYS.csv')
list_paths = list(path.PATH)

In [None]:
#list_paths = list(chr_path.PATH)
gsea_combined_filter = gsea_combined.loc[list_paths]

In [None]:
gsea_combined_filter

In [None]:
gsea_combined_nes = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='nes')
gsea_combined_fdr = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='fdr')
gsea_combined_pval = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='pval')

In [None]:
gsea_combined_nes

In [None]:
sample_order =['ad_E2_5_plus_dox_vs_E2_5_minus_dox',
              'ad_ICM(E3_5)_plus_dox_vs_ICM(E3_5)_minus_dox',
              'ad_TE(E3_5)_plus_dox_vs_TE(E3_5)_minus_dox',
              'ad_TE(E4_5)_plus_dox_vs_TE(E4_5)_minus_dox',
              'ad_Epi(E4_5)_plus_dox_vs_Epi(E4_5)_minus_dox',
              'ad_PrE(E4_5)_plus_dox_vs_PrE(E4_5)_minus_dox']

In [None]:
sample_order =['ad_E3_5_ko_KO_vs_ICM(E3_5)_Control',
              'ad_E3_5_ko_KO_vs_TE(E3_5)_Control',
              'ad_Epi(E4_5)_KO_vs_Epi(E4_5)_Control',
              'ad_PrE(E4_5)_KO_vs_PrE(E4_5)_Control',
              'ad_TE(E4_5)_KO_vs_TE(E4_5)_Control']

In [None]:
gsea_combined_nes = gsea_combined_nes.reindex(index=sample_order)
gsea_combined_fdr = gsea_combined_fdr.reindex(index=sample_order)
gsea_combined_pval = gsea_combined_pval.reindex(index=sample_order)

In [None]:
gsea_combined_nes

In [None]:
# Transform to -log (natural logarithm): smaller FDR / p-values give larger
# values, which are used below as the marker sizes of the scatter maps.
size_fdr = -np.log(gsea_combined_fdr)
size2_fdr = size_fdr.replace(np.inf, 10)       # a value of 0 gives +inf: cap it at 10

size_pval = -np.log(gsea_combined_pval)
size2_pval = size_pval.replace(np.inf, 10)     # same cap for the p-values

## Plot NES and FDR

In [None]:
gsea_combined_nes.shape

In [None]:
# Put the pathways (columns) in the order given by list_paths.
gsea_combined_nes = gsea_combined_nes.reindex(columns=list_paths)
gsea_combined_fdr = gsea_combined_fdr.reindex(columns=list_paths)
gsea_combined_pval = gsea_combined_pval.reindex(columns=list_paths)

# The marker-size tables were computed before this reordering, so their
# columns are still in pivot_table order. Reorder them too, so that every
# marker size lines up with its NES value even if scattermap pairs the two
# tables by position.
size2_fdr = size2_fdr.reindex(columns=list_paths)
size2_pval = size2_pval.reindex(columns=list_paths)

In [None]:
plt.figure(figsize=(30,10))
ax = scattermap(gsea_combined_nes, cmap='coolwarm', marker_size=size2_fdr, factor=100, vmin=-1.5, vmax=1.5, square=True)
plt.tight_layout()
plt.savefig("/Users/asanchezb/Desktop/ko_vs_control_110523/plots/top_GSEA_FDR.png")

In [None]:
plt.figure(figsize=(30,10))
ax = scattermap(gsea_combined_nes, cmap='coolwarm', marker_size=size2_pval, factor=100, vmin=-1.5, vmax=1.5, square=True)
plt.tight_layout()
plt.savefig("/Users/asanchezb/Desktop/ko_vs_control_110523/plots/top_GSEA_pvalue.png")

# Selected

In [None]:
# Put the pathways (columns) in the order given by list_paths.
gsea_combined_nes = gsea_combined_nes.reindex(columns=list_paths)
gsea_combined_fdr = gsea_combined_fdr.reindex(columns=list_paths)
gsea_combined_pval = gsea_combined_pval.reindex(columns=list_paths)

# Keep the marker-size tables in the same column order as the NES table, so
# that sizes and colours line up even if scattermap pairs them by position.
size2_fdr = size2_fdr.reindex(columns=list_paths)
size2_pval = size2_pval.reindex(columns=list_paths)

In [None]:
plt.figure(figsize=(30,15))
ax = scattermap(gsea_combined_nes, cmap='coolwarm', marker_size=size2_pval, factor=15, vmin=-1.5, vmax=1.5, square=True)
plt.tight_layout()
plt.savefig("/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM/selected_GSEA_pval.png")

In [None]:
plt.figure(figsize=(30,15))
ax = scattermap(gsea_combined_nes, cmap='coolwarm', marker_size=size2_fdr, factor=15, vmin=-1.5, vmax=1.5, square=True)
plt.tight_layout()
plt.savefig("/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM/selected_GSEA_fdr.png")

# Add targets

In [None]:
gsea_combined_filter

In [None]:
targets = pd.read_csv('/Users/asanchezb/Desktop/new_sc_pgonz/TARGETS.csv')
list_targets = list(targets.x)

In [None]:
list_targets = [x.upper() for x in list_targets]

In [None]:
gsea_combined_filter.ledge_genes.iloc[0,]

In [None]:
# For every row, keep the leading-edge genes that are also in list_targets.
target_set = set(list_targets)      # built once instead of once per row
tar = [
    # sorted() makes the order of the matches reproducible between runs;
    # rows without a leading-edge string get an empty list instead of crashing.
    sorted(target_set.intersection(genes.split(';'))) if isinstance(genes, str) else []
    for genes in gsea_combined_filter.ledge_genes
]

gsea_combined_filter['target_genes'] = tar

In [None]:
gsea_combined_filter

In [None]:
gsea_combined_filter.to_csv('/Users/asanchezb/Desktop/ko_vs_control_110523/plots/gsea_combined_filter_target.tsv', sep='\t')

# GSEA NEW

In [None]:
gsea_new = pd.read_csv('/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM/h.c2.cp.v7.2.symbols_mix.gmt', sep='\t', header=None, index_col=0)

In [None]:
gsea_new = gsea_new.loc[list_paths]

In [None]:
pluri = pd.read_csv('/Users/asanchezb/Desktop/pluripotency.csv')
toti = pd.read_csv('/Users/asanchezb/Desktop/totipotency.csv')

In [None]:
targets = pd.read_csv('/Users/asanchezb/Desktop/new_sc_pgonz/TARGETS_2.csv')
#list_targets = list(targets.MIR-203_TARGETS)

In [None]:
targets['MIR-203_TARGETS'] = targets['MIR-203_TARGETS'].str.upper()

In [None]:
pluri['WONG_EMBRYONIC_STEM_CELL_CORE'] = pluri['WONG_EMBRYONIC_STEM_CELL_CORE'].str.upper()
toti['TOTIPOTENCY'] = toti['TOTIPOTENCY'].str.upper()

In [None]:
targets.T

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
gsea_new = pd.concat([gsea_new, targets.T])

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
# Two separate steps, as before, so the column alignment is unchanged.
gsea_new = pd.concat([gsea_new, pluri.T])
gsea_new = pd.concat([gsea_new, toti.T])

In [None]:
pd.DataFrame(gsea_new.loc['MIR-203_TARGETS'])

In [None]:
gsea_new.to_csv('/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM/gsea_new_pepe_100523.gmt', index=True, header= False, sep= '\t')

In [None]:
targets.T.to_csv('GSEA_MM/MIR_203.gmt', index=True, header= False, sep= '\t')

In [None]:
pwd

In [None]:
# EDIT
database = adata            # adata (all samples) or subset of data
dataname = "ad"

method = 'wilcoxon'         # 't-test' or 'wilcoxon'

# Gene sets: the custom .gmt file written above (path without the extension)
GENESETS = "/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM/"
gset = GENESETS + "gsea_new_pepe_100523"

# Empty table in which the GSEA results of all comparisons are accumulated
# NOTE(review): the file name ("pre_res_empy") differs from the
# "pre_res_empty.tsv" read in the earlier sections, and "Term" is loaded as the
# index here -- confirm both are intended.
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

# Column of adata.obs holding the group labels
score = "Phenotype_2"

# Comparisons to run, as (groupA, groupB) = (group of interest, reference)
ko_comparisons = [
    ("E3_5_ko_KO", "ICM(E3_5)_Control"),
    ("E3_5_ko_KO", "TE(E3_5)_Control"),
    ("PrE(E4_5)_KO", "PrE(E4_5)_Control"),
    ("TE(E4_5)_KO", "TE(E4_5)_Control"),
    ("Epi(E4_5)_KO", "Epi(E4_5)_Control"),
]

# Each run appends its GSEA results to pre_res
for groupA, groupB in ko_comparisons:
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

In [None]:
gsea_combined = pd.read_csv('/Users/asanchezb/Desktop/ko_vs_control_160523/GSEApy_Combined_results.tsv', sep='\t', index_col=0)

In [None]:
gsea_combined_filter = gsea_combined

In [None]:
path = pd.read_csv('/Users/asanchezb/Desktop/KO_final_pathways_2.csv')
list_paths = list(path.PATH)

In [None]:
gsea_combined_nes = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='nes')
gsea_combined_fdr = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='fdr')
gsea_combined_pval = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='pval')

In [None]:
sample_order =['ad_E2_5_plus_dox_vs_E2_5_minus_dox',
              'ad_ICM(E3_5)_plus_dox_vs_ICM(E3_5)_minus_dox',
              'ad_TE(E3_5)_plus_dox_vs_TE(E3_5)_minus_dox',
              'ad_TE(E4_5)_plus_dox_vs_TE(E4_5)_minus_dox',
              'ad_Epi(E4_5)_plus_dox_vs_Epi(E4_5)_minus_dox',
              'ad_PrE(E4_5)_plus_dox_vs_PrE(E4_5)_minus_dox']

In [None]:
gsea_combined_nes = gsea_combined_nes.reindex(index=sample_order)
gsea_combined_fdr = gsea_combined_fdr.reindex(index=sample_order)
gsea_combined_pval = gsea_combined_pval.reindex(index=sample_order)

In [None]:
# Transform to -log
size_fdr = -np.log(gsea_combined_fdr)
size2_fdr = size_fdr.replace(np.inf, 10)

size_pval = -np.log(gsea_combined_pval)
size2_pval = size_pval.replace(np.inf, 10)

In [None]:
# Put the pathways (columns) in the order given by list_paths.
gsea_combined_nes = gsea_combined_nes.reindex(columns=list_paths)
gsea_combined_fdr = gsea_combined_fdr.reindex(columns=list_paths)
gsea_combined_pval = gsea_combined_pval.reindex(columns=list_paths)

# The marker-size tables were computed before this reordering; reorder them
# too, so that sizes and colours line up even if scattermap pairs the two
# tables by position.
size2_fdr = size2_fdr.reindex(columns=list_paths)
size2_pval = size2_pval.reindex(columns=list_paths)

In [None]:
plt.figure(figsize=(30,15))
ax = scattermap(gsea_combined_nes, cmap='coolwarm', marker_size=size2_fdr, factor=40, vmin=-1.5, vmax=1.5, square=True)
plt.tight_layout()
plt.savefig("/Users/asanchezb/Desktop/ko_vs_control_160523/top_GSEA_FDR.png")

In [None]:
plt.figure(figsize=(30,15))
ax = scattermap(gsea_combined_nes, cmap='coolwarm', marker_size=size2_pval, factor=40, vmin=-1.5, vmax=1.5, square=True)
plt.tight_layout()
plt.savefig("/Users/asanchezb/Desktop/ko_vs_control_160523/top_GSEA_pvalue.png")

# MIR AND 2CELL GSEA

In [None]:
gsea_combined = pd.read_csv('/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM_NEW/MIR_AND_2CELL/GSEApy_Combined_results.tsv', sep='\t', index_col=0)

In [None]:
gsea_combined_filter = gsea_combined

In [None]:
gsea_combined_nes = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='nes')
gsea_combined_fdr = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='fdr')
gsea_combined_pval = gsea_combined_filter.pivot_table(index='Sample', columns="Term", values='pval')

In [None]:
sample_order =['ad_E2_5_plus_dox_vs_E2_5_minus_dox',
              'ad_ICM(E3_5)_plus_dox_vs_ICM(E3_5)_minus_dox',
              'ad_TE(E3_5)_plus_dox_vs_TE(E3_5)_minus_dox',
              'ad_TE(E4_5)_plus_dox_vs_TE(E4_5)_minus_dox',
              'ad_Epi(E4_5)_plus_dox_vs_Epi(E4_5)_minus_dox',
              'ad_PrE(E4_5)_plus_dox_vs_PrE(E4_5)_minus_dox']

In [None]:
gsea_combined_nes = gsea_combined_nes.reindex(index=sample_order)
gsea_combined_fdr = gsea_combined_fdr.reindex(index=sample_order)
gsea_combined_pval = gsea_combined_pval.reindex(index=sample_order)

In [None]:
# Transform to -log
size_fdr = -np.log(gsea_combined_fdr)
size2_fdr = size_fdr.replace(np.inf, 10)

size_pval = -np.log(gsea_combined_pval)
size2_pval = size_pval.replace(np.inf, 10)

In [None]:
plt.figure(figsize=(30,15))
ax = scattermap(gsea_combined_nes, cmap='coolwarm', marker_size=size2_fdr, factor=15, vmin=-1.5, vmax=1.5, square=True)
plt.tight_layout()
plt.savefig("/Users/asanchezb/Desktop/new_sc_pgonz/GSEA_MM_NEW/MIR_AND_2CELL/top_GSEA_FDR.png")

# 9. miR-203 targets

In [None]:
targets = pd.read_csv('/Users/asanchezb/Desktop/new_sc_pgonz/TARGETS_2.csv')
#list_targets = list(targets.MIR-203_TARGETS)

In [None]:
genes = ['Kat6a','Kat6b','Ep300','Arid1a','Arid2','Dr1','Smarcd1','Kmt2c','Atf2']

In [None]:
sc.pl.dotplot(adata, genes, groupby="Treatment")

In [None]:
sc.pl.dotplot(adata[adata.obs.Treatment != "KO"], genes, groupby="leiden_groups_genotype", standard_scale="var")

In [None]:
adata.obs.leiden_groups_genotype.dtype

In [None]:
order = ["Mix(E3.5)_KO", 'TE(E3.5)_KO', 'TE(E3.5)_Control', 'TE(E3.5)_dox', "ICM(E3.5)_KO", "ICM(E3.5)_Control", "ICM(E3.5)_dox",
         "pre_TE(E4.5)_KO", "pre_TE(E4.5)_Control", "pre_TE(E4.5)_dox", "TE(E4.5)_KO", "TE(E4.5)_Control", "TE(E4.5)_dox",
         "Epi(E4.5)_KO", "Epi(E4.5)_Control", "Epi(E4.5)_dox", "PrE(E4.5)_KO", "PrE(E4.5)_Control", "PrE(E4.5)_dox"] 

order_KO = ["Mix(E3.5)_KO", 'TE(E3.5)_KO', 'TE(E3.5)_Control', "ICM(E3.5)_KO", "ICM(E3.5)_Control", 
         "pre_TE(E4.5)_KO", "pre_TE(E4.5)_Control", "TE(E4.5)_KO", "TE(E4.5)_Control", 
         "Epi(E4.5)_KO", "Epi(E4.5)_Control", "PrE(E4.5)_KO", "PrE(E4.5)_Control"] 

order_KI = ['TE(E3.5)_Control', 'TE(E3.5)_dox', "ICM(E3.5)_Control", "ICM(E3.5)_dox",
         "pre_TE(E4.5)_Control", "pre_TE(E4.5)_dox", "TE(E4.5)_Control", "TE(E4.5)_dox",
         "Epi(E4.5)_Control", "Epi(E4.5)_dox", "PrE(E4.5)_Control", "PrE(E4.5)_dox"] 

In [None]:
# `adata_KI` is not defined anywhere in this notebook: build it here with the
# same filter as the dotplot above (all cells except the KO treatment), which
# leaves the Control and dox groups listed in order_KI.
adata_KI = adata[adata.obs.Treatment != "KO"]
sc.pl.dotplot(adata_KI, genes, groupby="leiden_groups_genotype", categories_order=order_KI, standard_scale="var")