In [None]:
import os
from pathlib import Path
import scanpy as sc
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import mudata

load_dotenv()

# Set the working directory to the project root given by PROJECT_PATH
# (read from the environment, which load_dotenv() may have populated).
project_path = os.getenv('PROJECT_PATH')
if not project_path:
    # os.chdir(None) would otherwise fail with an opaque TypeError.
    raise EnvironmentError(
        "PROJECT_PATH is not set; define it in the environment or in the .env file."
    )
os.chdir(project_path)

# Directory containing the SCENIC+ Snakemake outputs used throughout this notebook.
OUTPUT_PATH = Path('/mnt/windows/extradata') / 'meiotic_cells/scplus_pipeline_custom_cis/Snakemake'

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Read the MuData file "scplusmdata.h5mu" from the pipeline output directory.
scplus_mdata = mudata.read(OUTPUT_PATH / "scplusmdata.h5mu")

Direct and extended predicted TF-to-region-to-gene links. This dataframe also contains a ranking of each TF-region-gene triplet by importance (`triplet_rank`).

In [None]:
scplus_mdata.uns["direct_e_regulon_metadata"]

In [None]:
# Direct eRegulon metadata table stored under `uns` of the MuData object.
direct_e_regulons = scplus_mdata.uns["direct_e_regulon_metadata"]
# Alternative query over several genes, kept for reference:
#direct_e_regulons.loc[direct_e_regulons["Gene"].isin(["STRA8", "SPO11", "DMC1"])]
# Show the rows whose "Gene" column is SPO11.
direct_e_regulons.loc[direct_e_regulons["Gene"].isin(["SPO11"])]


In [None]:
scplus_mdata.uns["extended_e_regulon_metadata"]

In [None]:
# Extended eRegulon metadata table stored under `uns` of the MuData object.
extended_e_regulons = scplus_mdata.uns["extended_e_regulon_metadata"]
# Show the rows whose "Gene" column is STRA8, SPO11 or DMC1.
extended_e_regulons.loc[extended_e_regulons["Gene"].isin(["STRA8", "SPO11", "DMC1"])]
# Alternative single-gene query, kept for reference:
#extended_e_regulons.loc[extended_e_regulons["Gene"].isin(["SPO11"])]

## eRegulon specificity scores

In [None]:
from scenicplus.RSS import (regulon_specificity_scores, plot_rss)

In [None]:
# Regulon specificity scores grouped by the "scATAC_counts:celltype" column,
# computed over both the direct and extended gene-based AUC modalities.
# Later cells index the result by cell type (rows) and reduce over regulons (columns).
rss = regulon_specificity_scores(
    scplus_mudata = scplus_mdata,
    variable = "scATAC_counts:celltype",
    modalities = ["direct_gene_based_AUC", "extended_gene_based_AUC"]
)

In [None]:
scplus_mdata.obs["scATAC_counts:celltype"].value_counts()

In [None]:
# Cell types treated as meiotic in this analysis
meiotic_cells_types = ['oogonia_STRA8', 'oogonia_meiotic']

# Add up each regulon's specificity score over the meiotic cell types and
# order the regulons from the highest to the lowest total
meiotic_rss = rss.loc[meiotic_cells_types]
top_meiotic_regulons = meiotic_rss.sum(axis=0).sort_values(ascending=False)

print(f"Top 20 most specific regulons in meiotic cells {meiotic_cells_types}:")
print(top_meiotic_regulons.head(20))

In [None]:
# Look at its target genes
direct_e_regulons[direct_e_regulons['eRegulon_name'] == 'IKZF3_direct_+/+'].Gene.unique()

In [None]:
direct_e_regulons[direct_e_regulons['Gene'] == 'SYCP1']



In [None]:
extended_e_regulons[extended_e_regulons['Gene'] == 'SYCP1']


## Heatmap dotplot

In [None]:
# Rebuild each of the top-20 regulon names from its first three "_"-separated
# fields (any further fields are dropped); the names are split only once.
top20_name_parts = top_meiotic_regulons.head(20).index.str.split("_")
regulon_names = (
    top20_name_parts.str[0]
    + "_" + top20_name_parts.str[1]
    + "_" + top20_name_parts.str[2]
)

We can draw a heatmap where the color represents target gene enrichment and the dot size represents target region enrichment.

In [None]:
from scenicplus.plotting.dotplot import heatmap_dotplot
# Heatmap/dotplot of the direct eRegulons per "scATAC_counts:celltype" group:
# color is taken from the gene-based AUC modality and dot size from the
# region-based AUC modality. No subset_feature_names is passed here.
heatmap_dotplot(
    scplus_mudata = scplus_mdata,
    color_modality = "direct_gene_based_AUC",
    size_modality = "direct_region_based_AUC",
    group_variable = "scATAC_counts:celltype",
    eRegulon_metadata_key = "direct_e_regulon_metadata",
    color_feature_key = "Gene_signature_name",
    size_feature_key = "Region_signature_name",
    feature_name_key = "eRegulon_name",    
    sort_data_by = "direct_gene_based_AUC",        
    orientation = "horizontal",
    figsize = (16, 5)
)

In [None]:
from scenicplus.plotting.dotplot import heatmap_dotplot
# Same heatmap/dotplot call as the unrestricted one, but limited to the names in
# `regulon_names` via subset_feature_names, and without an explicit orientation.
heatmap_dotplot(
    scplus_mudata = scplus_mdata,
    color_modality = "direct_gene_based_AUC",
    size_modality = "direct_region_based_AUC",
    group_variable = "scATAC_counts:celltype",
    eRegulon_metadata_key = "direct_e_regulon_metadata",
    color_feature_key = "Gene_signature_name",
    size_feature_key = "Region_signature_name",
    feature_name_key = "eRegulon_name",    
    sort_data_by = "direct_gene_based_AUC",        
    subset_feature_names = regulon_names.tolist(),
    figsize = (16, 5)
)

## Cytoscape network


In [None]:
direct_e_regulons

In [None]:
# Write the TF / Region / Gene columns of the first 200 triplets, ordered by
# ascending triplet_rank, to a CSV file in the output directory.
top200_triplets = direct_e_regulons.sort_values(by="triplet_rank").head(200)
top200_triplets.loc[:, ["TF", "Region", "Gene"]].to_csv(
    OUTPUT_PATH / "direct_e_regulons_top200.csv",
    index=False,
)

## TF Enrichment

In [None]:
# Contrast regulon specificity in the meiotic cell types with every other cell type
meiotic_cells_types = ['oogonia_STRA8', 'oogonia_meiotic']  # includes oogonia_meiotic
other_cells = rss.index[~rss.index.isin(meiotic_cells_types)].tolist()

# Enrichment = mean RSS over the meiotic cell types minus mean RSS over the rest
meiotic_avg = rss.loc[meiotic_cells_types].mean()
other_avg = rss.loc[other_cells].mean()
enrichment = meiotic_avg.sub(other_avg)

# Most enriched regulons first
top_meiotic_regulons = enrichment.sort_values(ascending=False)
print(f"Top 20 most enriched regulons in meiotic cells {meiotic_cells_types}:")
print(top_meiotic_regulons.head(20))

In [None]:
plus_plus_regulons = direct_e_regulons[direct_e_regulons['eRegulon_name'] == "ZEB1_direct_+/+"].Gene.unique()
plus_minus_regulons = direct_e_regulons[direct_e_regulons['eRegulon_name'] == "ZEB1_direct_+/-"].Gene.unique()
plus_plus_regulons


In [None]:
plus_minus_regulons

In [None]:
set(plus_plus_regulons).intersection(set(plus_minus_regulons))


In [None]:
direct_e_regulons[(direct_e_regulons['TF'] == "ZEB1") & (direct_e_regulons['Gene'] == "MEIOB")]


## UMAP Plots

In [None]:
# Work on a copy of the RNA modality: the following cells normalize, log-transform
# and annotate `rna_data`, and without the copy those changes would be applied to
# the modality stored inside `scplus_mdata` (and compound on every re-run).
rna_data = scplus_mdata["scRNA_counts"].copy()
# Bring over the cell type annotation held on the ATAC modality.
rna_data.obs["celltype"] = scplus_mdata["scATAC_counts"].obs["celltype"]

In [None]:
# Scale every cell to a total of 1e4 counts, then log1p-transform.
# NOTE(review): neither return value is used, so both calls are expected to modify
# `rna_data` in place; re-running this cell would then transform the data again.
sc.pp.normalize_total(rna_data, target_sum=1e4)
sc.pp.log1p(rna_data)

In [None]:
# Rank genes for each "celltype" group using scanpy's default method
# (the t-test option is left commented out).
sc.tl.rank_genes_groups(rna_data, 'celltype')#, method='t-test')

# One column per group; head(5) keeps the first five ranked gene names of each
top_markers = pd.DataFrame(rna_data.uns['rank_genes_groups']['names']).head(5)
print(top_markers)

In [None]:
sc.pp.pca(rna_data)
sc.pp.neighbors(rna_data)
sc.tl.umap(rna_data)

In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
sc.pl.umap(rna_data, color="celltype", ax=ax, show=False)
plt.tight_layout()
plt.show()

In [None]:
# One UMAP panel per meiotic marker gene, each colored by that gene
meiotic_marker_genes = [
    "DMC1", "HORMAD1", "HORMAD2", "INCA1", "MEIOB", "PRDM9", "RAD51AP2",
    "SCML1", "SHCBP1L", "SMC1B", "SPATA22", "SPDYA", "SPO11", "SYCE2",
    "SYCE3", "SYCP1", "SYCP2", "SYCP3", "TEX12",
]
sc.pl.umap(rna_data, color=meiotic_marker_genes)


In [None]:
# Keep only cells annotated as oogonia_meiotic or oogonia_STRA8 (as an independent
# copy) and rank genes between these two cell types.
rna_data_meiotic = rna_data[rna_data.obs["celltype"].isin(["oogonia_meiotic", "oogonia_STRA8"])].copy()
sc.tl.rank_genes_groups(rna_data_meiotic, 'celltype')#, method='t-test')


In [None]:
top_markers = pd.DataFrame(rna_data_meiotic.uns['rank_genes_groups']['names']).head(5)
print(top_markers)

In [None]:
# Score every cell on the meiotic marker gene set, then color the UMAP by the
# resulting "score" column.
meiotic_marker_genes = [
    "DMC1", "HORMAD1", "HORMAD2", "INCA1", "MEIOB", "PRDM9", "RAD51AP2",
    "SCML1", "SHCBP1L", "SMC1B", "SPATA22", "SPDYA", "SPO11", "SYCE2",
    "SYCE3", "SYCP1", "SYCP2", "SYCP3", "TEX12",
]
sc.tl.score_genes(rna_data, meiotic_marker_genes)
sc.pl.umap(rna_data, color="score")

In [None]:
rna_data.obs.groupby("celltype")["score"].mean()

Set a cutoff at 0.6. Look at 2 cell types.

In [None]:
rna_data[rna_data.obs["score"] > 0.6].obs["celltype"].value_counts()

In [None]:
rna_data.obs["celltype"].value_counts()

In [None]:
rna_data.obs["score"].max()

In [None]:
# Show the cell(s) holding the highest meiotic score. Compare against the computed
# maximum rather than a pasted float literal, which stops matching as soon as the
# data or the scoring changes.
rna_data.obs[rna_data.obs["score"] == rna_data.obs["score"].max()]


## Correlation TF/region

In [None]:
# Correlate, across cells, the expression of one TF with the accessibility of one region.
from scipy import sparse
from scipy.stats import spearmanr, pearsonr


def _column_as_1d(matrix, column_index):
    """Return one column of a dense or scipy-sparse 2-D matrix as a flat 1-D array."""
    column = matrix[:, column_index]
    if sparse.issparse(column):
        column = column.toarray()
    return np.asarray(column).reshape(-1)


index_gene = rna_data.var_names.tolist().index("IKZF3")
index_region = scplus_mdata['scATAC_counts'].var_names.tolist().index("chr16:1989418-1989918")

# Both modalities go through the same helper, so the cell works whether each
# matrix is stored sparse or dense.
rna_vec = _column_as_1d(rna_data.X, index_gene)
atac_vec = _column_as_1d(scplus_mdata['scATAC_counts'].X, index_region)

# The correlations pair values by position, so both vectors must cover the same cells.
if rna_vec.shape != atac_vec.shape:
    raise ValueError(
        f"RNA and ATAC vectors differ in length: {rna_vec.shape} vs {atac_vec.shape}"
    )

print(spearmanr(rna_vec, atac_vec))
print(pearsonr(rna_vec, atac_vec))


## Create a cell type based on gene scoring

In [None]:
# Label cells as meiotic / non_meiotic by thresholding the gene score.
MEIOTIC_SCORE_CUTOFF = 0.6

# Build the labels as a Series indexed by cell name so that the assignment is
# aligned on cell names instead of relying on both obs tables sharing row order.
# Cells of scplus_mdata that are absent from rna_data end up as NaN.
meiotic_labels = pd.Series(
    np.where(rna_data.obs['score'] > MEIOTIC_SCORE_CUTOFF, "meiotic", "non_meiotic"),
    index=rna_data.obs.index,
)
scplus_mdata.obs["celltype_2"] = meiotic_labels
scplus_mdata.obs["celltype_2"].value_counts()

In [None]:
# Recompute the regulon specificity scores, this time grouped by the score-based
# "celltype_2" labels (meiotic / non_meiotic) instead of the annotated cell types.
rss_2 = regulon_specificity_scores(
    scplus_mudata = scplus_mdata,
    variable = "celltype_2",
    modalities = ["direct_gene_based_AUC", "extended_gene_based_AUC"]
)

In [None]:
rss_2

In [None]:
# Enrichment = RSS of the "meiotic" group minus RSS of the "non_meiotic" group
meiotic_avg = rss_2.loc["meiotic"]
other_avg = rss_2.loc["non_meiotic"]
enrichment = meiotic_avg.sub(other_avg)

# Most enriched regulons first
top_meiotic_regulons_2 = enrichment.sort_values(ascending=False)
print("Top 20 most enriched regulons in meiotic cells:")
print(top_meiotic_regulons_2.head(20))

In [None]:
meiotic_avg.sort_values(ascending=False).head(20)

## Intersection with previous top regulons

In [None]:
# Regulons present in the top 20 of both rankings
top20_score_based = top_meiotic_regulons_2.head(20).index
top20_previous = top_meiotic_regulons.head(20).index
intersection = top20_score_based.intersection(top20_previous)
print(len(intersection))
intersection

## Visualize regions from those eRegulons

In [None]:
from scenicplus.scenicplus_class import mudata_to_scenicplus
# Convert the MuData object into a SCENIC+ object, pointing it at the cisTarget
# and DEM result files.
# NOTE(review): these two paths are under /mnt/storage/outputs/.../scplus_pipeline,
# not under OUTPUT_PATH (.../scplus_pipeline_custom_cis) — confirm this is intended.
scplus_obj = mudata_to_scenicplus(
    mdata = scplus_mdata,
    path_to_cistarget_h5 = "/mnt/storage/outputs/meiotic_cells/scplus_pipeline/Snakemake/ctx_results.hdf5",
    path_to_dem_h5 = "/mnt/storage/outputs/meiotic_cells/scplus_pipeline/Snakemake/dem_results.hdf5"
)


In [None]:
scplus_obj.metadata_cell['celltype_2'] = scplus_mdata.obs["celltype_2"]

In [None]:
# Take the first six shared regulons and shorten each name to its first three
# "_"-separated fields.
regulons_to_plot = []
for full_name in intersection.tolist()[:6]:
    name_parts = full_name.split("_")
    regulons_to_plot.append(name_parts[0] + "_" + name_parts[1] + "_" + name_parts[2])
regulons_to_plot

In [None]:
# TF name = first "_"-separated field of each regulon name. Duplicates are removed
# with dict.fromkeys, which keeps first-seen order; going through set() made the
# order (and so the TF -> color assignment) vary from one kernel session to the next.
tfs_to_plot = list(dict.fromkeys(regulon.split("_")[0] for regulon in regulons_to_plot))
# One matplotlib cycle color ("C0", "C1", ...) per TF
color_map = {tf: f"C{i}" for i, tf in enumerate(tfs_to_plot)}
color_map

In [None]:
from scenicplus.networks import create_nx_tables, create_nx_graph
# Build the network tables for the selected eRegulons, adding differential gene
# expression with respect to "celltype_2"; differential region accessibility is
# left disabled (commented out).
nx_tables = create_nx_tables(scplus_obj, subset_eRegulons=regulons_to_plot,
                             add_differential_gene_expression=True, # add_differential_region_accessibility=True, 
                             differential_variable=["celltype_2"]
                             )


In [None]:
nx_tables['Node']['TF']

In [None]:
# Build the graph from the network tables using only the TF2R and R2G edge tables.
# TF2R edges and TF nodes are colored per TF through `color_map`; R2G edges and
# Gene nodes are styled from the table columns named in each 'variable' entry.
# Call with default styling, kept for reference:
#G, pos, edge_tables, node_tables = create_nx_graph(nx_tables)
G, pos, edge_tables, node_tables = create_nx_graph(nx_tables, 
                   use_edge_tables = ['TF2R','R2G'],
                   color_edge_by = {'TF2R': {'variable' : 'TF', 'category_color' : color_map},
                                    'R2G': {'variable' : 'importance_x_rho', 'continuous_color' : 'viridis', 'v_min': -1, 'v_max': 1}},
                   transparency_edge_by =  {'R2G': {'variable' : 'importance_R2G', 'min_alpha': 0.1, 'v_min': 0}},
                   width_edge_by = {'R2G': {'variable' : 'importance_R2G', 'max_size' :  1.5, 'min_size' : 1}},
                   shape_node_by = {
                        'TF': {'variable': 'fixed_shape', 'fixed_shape': 'ellipse'},
                        'Gene': {'variable': 'fixed_shape', 'fixed_shape': 'ellipse'},
                        'Region': {'variable': 'fixed_shape', 'fixed_shape': 'diamond'}
                   },

                   color_node_by = {'TF': {'variable': 'TF', 'category_color' : color_map},
                                   'Gene': {'variable': 'celltype_2_Log2FC_meiotic', 'continuous_color' : 'PiYG'}, # or use 'RdYlBu', 'RdGy', 'PiYG', 'PRGn', 'BrBG', 'RdYlGn', 'coolwarm', 'seismic'                                    
                                    },
                   transparency_node_by =  {'Gene': {'variable' : 'celltype_2_Log2FC_meiotic', 'min_alpha': 0.1}},
                   size_node_by = {'TF': {'variable': 'fixed_size', 'fixed_size': 30},
                                    'Gene': {'variable': 'fixed_size', 'fixed_size': 15},
                                    'Region': {'variable': 'fixed_size', 'fixed_size': 10}},
                   # Region labels get size 0.0, i.e. only TF and Gene labels are sized to show.
                   label_size_by = {'TF': {'variable': 'fixed_label_size', 'fixed_label_size': 20.0},
                                    'Gene': {'variable': 'fixed_label_size', 'fixed_label_size': 5.0},
                                    'Region': {'variable': 'fixed_label_size', 'fixed_label_size': 0.0}},
                   layout='kamada_kawai_layout',
                   scale_position_by=250)

In [None]:
from scenicplus.networks import plot_networkx
plot_networkx(G, pos)


In [None]:
#from scenicplus.networks import export_to_cytoscape
#export_to_cytoscape(G, pos, out_file = OUTPUT_PATH / "meiotic_regulons_network.cyjs")


In [None]:
# Normalize each to [0,1] range
def normalize_series(series):
    """Min-max scale a numeric pandas Series to the [0, 1] range.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        Series with the same index where the minimum maps to 0 and the maximum
        to 1. When the values have no spread (constant or empty input) every
        entry is returned as 0.0 instead of the NaN produced by dividing 0 by 0.
    """
    lowest = series.min()
    spread = series.max() - lowest
    # `not spread > 0` also covers the NaN spread of an empty or all-NaN series.
    if not spread > 0:
        return series * 0.0
    return (series - lowest) / spread

normalized_regulons_1 = normalize_series(top_meiotic_regulons)
normalized_regulons_2 = normalize_series(top_meiotic_regulons_2)


In [None]:
# Put both normalized rankings side by side (aligned on regulon name) as columns
# 'Score3' and 'Score4', then save the table.
normalized_regulons = pd.concat(
    [normalized_regulons_1, normalized_regulons_2],
    axis=1,
    keys=['Score3', 'Score4'],
)
normalized_regulons.to_csv(
    "/mnt/windows/extradata/meiotic_cells/top_meiotic_regulons_ensemble_2.csv",
    index=True,
)

In [None]:
test = pd.read_csv("/mnt/windows/extradata/meiotic_cells/top_meiotic_regulons_ensemble_1.csv", index_col=0)
test.head()


In [None]:
regulons = test.drop(columns = "Score2")

In [None]:
# Name the single remaining column "Score", order the regulons from highest to
# lowest score, and save the result.
regulons.columns = ["Score"]
regulons = regulons.sort_values("Score", ascending=False)
regulons.to_csv(
    "/mnt/windows/extradata/meiotic_cells/top_meiotic_regulons_ensemble.csv",
    index=True,
)

In [None]:
test = pd.read_csv("/mnt/windows/extradata/meiotic_cells/top_meiotic_regulons_ensemble.csv", index_col=0)
test.head()

In [None]:
def regulons_to_tf_scores(regulons):
    """Average regulon scores per transcription factor.

    Parameters
    ----------
    regulons : pandas.Series
        Scores indexed by regulon name; the TF is taken to be the part of the
        name before the first underscore.

    Returns
    -------
    pandas.DataFrame
        One row per TF (index named 'TF') with a single 'Score' column holding
        the mean score of that TF's regulons.
    """
    tf_names = regulons.index.str.split('_').str[0]
    scores = regulons.to_frame(name='Score').assign(TF=tf_names)
    return scores.groupby('TF').mean()

# Collapse each normalized regulon ranking to one mean score per TF
normalized_tfs_1 = regulons_to_tf_scores(normalized_regulons_1)
normalized_tfs_2 = regulons_to_tf_scores(normalized_regulons_2)

# Side-by-side table of the two per-TF scores, aligned on TF name
normalized_tfs = pd.concat([normalized_tfs_1, normalized_tfs_2], axis=1)
normalized_tfs.columns = ['TopTFS1', 'TopTFS2']
# Export currently disabled:
#normalized_tfs.to_csv(Path(os.getenv('OUTPUT_PATH')) / "meiotic_cells/top_meiotic_tfs_ensemble.csv", index=True)


In [None]:
# Load the previously saved score table and append this notebook's normalized
# regulon scores as extra columns (rows aligned on the index).
# NOTE(review): despite the `_tfs` names, the file name and `normalized_regulons`
# suggest these tables are indexed by regulon, not by TF — confirm.
combined_tfs_previous = pd.read_csv("/mnt/windows/extradata/meiotic_cells/top_meiotic_regulons_ensemble_1.csv", index_col=0)
combined_tfs = pd.concat([combined_tfs_previous, normalized_regulons], axis=1)
# Export currently disabled:
#combined_tfs.to_csv(Path(os.getenv('OUTPUT_PATH')) / "meiotic_cells/top_meiotic_tfs_ensemble.csv", index=True)


In [None]:
combined_tfs.fillna(0, inplace=True)


In [None]:
# Row-wise mean of the individual score columns. 'AvgScore' itself is excluded so
# that re-running this cell does not fold the previous average back into the mean.
score_columns = combined_tfs.columns.drop('AvgScore', errors='ignore')
combined_tfs['AvgScore'] = combined_tfs[score_columns].mean(axis=1)
# Highest average first
combined_tfs = combined_tfs.sort_values(by='AvgScore', ascending=False)


In [None]:
#combined_tfs.to_csv("/mnt/windows/extradata/meiotic_cells/top_meiotic_regulons_ensemble.csv", index=True)

In [None]:
test = pd.read_csv("/mnt/windows/extradata/meiotic_cells/top_meiotic_regulons_ensemble.csv", index_col=0)


In [None]:
test.head(20)

In [None]:
# Bar chart of the 20 highest-scoring eRegulons, saved as SVG and displayed
top20_regulons = regulons.head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top20_regulons.index, top20_regulons['Score'])
plt.xticks(rotation=90)
ax.set_ylabel('Score')
ax.set_xlabel('eRegulons')
ax.set_title('eRegulon score')
fig.tight_layout()
fig.savefig('/mnt/storage/outputs/garcia_ATAC/outputs/top20_eRegulons.svg', format='svg')
plt.show()

In [None]:
# Average the scores per TF (the TF is the part of the regulon name before the
# first "_") and order the TFs from highest to lowest score.
tf_labels = test.index.str.split('_').str[0]
tfs_df = (
    test.assign(TF=tf_labels)
    .groupby('TF')
    .mean()
    .sort_values("Score", ascending=False)
)
tfs_df

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the 20 highest-scoring TFs
top20_tfs = tfs_df.head(20)

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(top20_tfs.index, top20_tfs['Score'])
plt.xticks(rotation=90)
ax.set_ylabel('Score')
ax.set_xlabel('TFs')
ax.set_title('TF Score')
fig.tight_layout()
plt.show()

In [None]:
tfs_df.to_csv("/mnt/windows/extradata/meiotic_cells/top_meiotic_TFs.csv", index=True)

In [None]:
test = pd.read_csv("/mnt/windows/extradata/meiotic_cells/top_meiotic_TFs.csv", index_col=0)
test.head()