# Environment Setup and Data Loading

In [13]:
# Standard library
import importlib
import os
import random
import sys

# Third-party
import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

# NOTE: machine-specific absolute path; adjust it when running elsewhere.
PROJECT_DIR = "D:/Github/SRF_Linda_RNA"
WORKING_DIR = os.path.join(PROJECT_DIR, "combine_data")
os.chdir(WORKING_DIR)
sys.path.insert(0, WORKING_DIR)

# Project-local helper module. It is imported only after WORKING_DIR has been
# put on sys.path, so the import no longer depends on the directory the kernel
# happened to be started in (Restart Kernel -> Run All works from anywhere).
import functions_degs  # noqa: E402

# Re-execute the module so edits made to it since the first import in this
# kernel session are picked up when the cell is re-run.
importlib.reload(functions_degs)

# Seed both NumPy's global RNG and Python's `random` module
random_seed = 0
np.random.seed(random_seed)
random.seed(random_seed)

In [14]:
# Input / output locations for this run

# Switches selecting which results folder is read:
#   FIX_TRESHOLD = True   -> "results_from_raw"
#   FIX_TRESHOLD = False  -> "results_from_raw_percentile_threshold", with the
#                            extra "doublets_removed" sub-folder when
#                            REMOVE_DOUBLETS is True.
# REMOVE_DOUBLETS is only consulted when FIX_TRESHOLD is False.
REMOVE_DOUBLETS = False  # alternative: True
FIX_TRESHOLD = True      # alternative: False

if FIX_TRESHOLD:
    _results_subdirs = ("results_from_raw",)
elif REMOVE_DOUBLETS:
    _results_subdirs = ("results_from_raw_percentile_threshold", "doublets_removed")
else:
    _results_subdirs = ("results_from_raw_percentile_threshold",)
BASE_RESULTS_DIR = os.path.join(WORKING_DIR, *_results_subdirs)

# The annotated AnnData file is read from the selected results folder
INPUT_DIR = BASE_RESULTS_DIR
adata_path = os.path.join(INPUT_DIR, 'annotation_final.h5ad')

# adata.obs column used as the grouping key in every DGE call below
DEG_BY = "mapmycells_first_layer"

# Optional tag identifying a custom analysis; set to None to disable it.
CUSTOM_ANALYSIS =  "FC_0_25"

PARENT_OUTPUT_DIR = os.path.join(INPUT_DIR, "DEGs_mapmycells_L1")
if CUSTOM_ANALYSIS is not None:
    # NOTE: the tag is appended directly to the folder name, without any
    # separator, giving e.g. "DEGs_mapmycells_L1FC_0_25".
    PARENT_OUTPUT_DIR += CUSTOM_ANALYSIS

PLOT_OUTPUT_DIR = os.path.join(PARENT_OUTPUT_DIR, 'plots')
DGE_OUTPUT_DIR = os.path.join(PARENT_OUTPUT_DIR, 'dge_res')
BIOMARKER_OUTPUT_DIR = os.path.join(PARENT_OUTPUT_DIR, 'biomarkers')

# Make sure every output folder exists before anything is written to it
for _output_dir in (PLOT_OUTPUT_DIR, DGE_OUTPUT_DIR, BIOMARKER_OUTPUT_DIR):
    os.makedirs(_output_dir, exist_ok=True)

print(f"Input directory: {INPUT_DIR}")
print(f"Plot output directory (Overall/Genotype): {PLOT_OUTPUT_DIR}")
print(f"DGE output directory (Overall/Genotype): {DGE_OUTPUT_DIR}")
print(f"Biomarker output directory: {BIOMARKER_OUTPUT_DIR}")

# scanpy: write figures to the plot folder and apply the figure defaults
sc.settings.figdir = PLOT_OUTPUT_DIR
sc.settings.set_figure_params(dpi=150, facecolor='white')

Input directory: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw
Plot output directory (Overall/Genotype): D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\plots
DGE output directory (Overall/Genotype): D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\dge_res
Biomarker output directory: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\biomarkers


In [15]:
# Read the .h5ad file at `adata_path` into `adata`
loading_notice = f"\nLoading dataset from {adata_path}"
print(loading_notice)
adata = sc.read_h5ad(adata_path)


Loading dataset from D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\annotation_final.h5ad


# Data Preprocessing

In [16]:
# Print the number of cells in every DEG_BY category.
# `display.max_rows` is lifted only inside the `with` block so the full table
# is shown; option_context restores the previous value on exit, even if the
# print raises (the manual save/restore pattern would leave it changed).
print(f"Original {DEG_BY} clusters:")
with pd.option_context('display.max_rows', None):
    print(adata.obs[DEG_BY].value_counts())

Original mapmycells_first_layer clusters:
mapmycells_first_layer
IT-ET Glut        13439
DG-IMN Glut        3882
OPC-Oligo          2650
NP-CT-L6b Glut     2188
CTX-MGE GABA       1396
Astro-Epen         1085
Immune             1016
CTX-CGE GABA        859
Vascular            407
OB-IMN GABA         265
MB GABA             110
CNU-LGE GABA         99
OB-CR Glut           90
MB Glut              75
CNU-HYa Glut         70
CNU-HYa GABA         64
MY Glut              53
P GABA               47
LSX GABA             43
OEC                  40
CNU-MGE GABA         36
MY GABA              24
HY GABA              20
MH-LH Glut           17
HY Glut              14
TH Glut              13
CB Glut               9
P Glut                5
CB GABA               4
Pineal Glut           3
MB Dopa               2
MB-HB Sero            1
Name: count, dtype: int64


In [17]:
# Print an overview of the loaded AnnData object: its summary, the names of
# its layers, and whether a `.raw` attribute is attached.
print("\nAnnData object summary:")
print(adata)
print("\nAvailable layers:", list(adata.layers.keys()))

# Use one explicit None check for both the message and the branch, rather than
# relying on the truthiness of the raw object.
raw_available = adata.raw is not None
print("Raw data available:", raw_available)
if raw_available:
    print("Raw data shape:", adata.raw.X.shape)


AnnData object summary:
AnnData object with n_obs × n_vars = 28026 × 26870
    obs: 'sample', 'condition', 'genotype', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden_0.4', 'ISO_majority_voting', 'ISO_conf_score', 'DG_majority_voting', 'DG_conf_score', 'mapmycells_first_layer', 'mapmycells_second_layer', 'cell_type', 'highlight'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'DG_majority_voting_colors', 'ISO_majority_voting_colors', 'cell_type_colors', 'condition_colors', 'genotype_colors', 'highlight_colors', 'hvg', 'leiden_0.4', 'leiden_0.4_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

Available layers: []
Raw data available: True
Raw data shape: (28026, 33685)


In [18]:
# Create the 'for_DEGs' layer: total-count normalized (target_sum=1e4) and
# log1p-transformed values computed from adata.raw.X, restricted to the genes
# present in adata.var_names.
# The computation assumes adata.raw.X holds raw counts; this is checked below.
print("\nCreating 'for_DEGs' layer...")
if adata.raw is not None and adata.raw.X is not None:
    # Sanity-check the raw-counts assumption on the first cells: normalizing
    # and log-transforming values that are already normalized/logged would
    # silently distort the DGE input. Only a small slice is densified.
    counts_sample = adata.raw.X[:100]
    if hasattr(counts_sample, "toarray"):  # sparse matrix -> dense array
        counts_sample = counts_sample.toarray()
    counts_sample = np.asarray(counts_sample)
    if counts_sample.size and not np.all(np.mod(counts_sample, 1) == 0):
        print("Warning: adata.raw.X contains non-integer values; it may already be "
              "normalized/log-transformed rather than raw counts.")

    # Create a temporary AnnData with the raw matrix so that normalization and
    # log1p do not modify `adata` itself
    adata_for_dge = ad.AnnData(adata.raw.X.copy()) # type: ignore
    adata_for_dge.obs_names = adata.obs_names # type: ignore
    adata_for_dge.var_names = adata.raw.var_names # type: ignore

    print("Normalizing total counts (target_sum=1e4)...")
    sc.pp.normalize_total(adata_for_dge, target_sum=1e4)

    print("Applying log1p transformation...")
    sc.pp.log1p(adata_for_dge)

    # Normalization above used all genes in .raw; now keep only the genes of
    # the main object, in the main object's order
    adata_for_dge = adata_for_dge[:, adata.var_names].copy() # type: ignore

    print("Storing result in adata.layers['for_DEGs']...")
    adata.layers['for_DEGs'] = adata_for_dge.X.copy() # type: ignore
    print("'for_DEGs' layer created with shape:", adata.layers['for_DEGs'].shape)
else:
    # No layer is created; the DGE cells below then pass layer=None
    print("Warning: adata.raw.X not found. Cannot create 'for_DEGs' layer from raw counts.")
    print("DGE analysis will proceed using adata.X, which might be scaled.")


Creating 'for_DEGs' layer...
Normalizing total counts (target_sum=1e4)...
Applying log1p transformation...
Storing result in adata.layers['for_DEGs']...
'for_DEGs' layer created with shape: (28026, 26870)


In [19]:
# List the distinct values of the metadata columns used by the DGE comparisons
cell_metadata = adata.obs
print("\nMetadata check:")
print("condition:", list(cell_metadata["condition"].unique()))
print("genotype:", list(cell_metadata["genotype"].unique()))
print(f"{DEG_BY}:", list(cell_metadata[DEG_BY].unique()))


Metadata check:
condition: ['Control', 'Mutant']
genotype: ['Emx1', 'Nestin']
mapmycells_first_layer: ['Astro-Epen', 'IT-ET Glut', 'Immune', 'DG-IMN Glut', 'OPC-Oligo', 'MY Glut', 'CTX-MGE GABA', 'NP-CT-L6b Glut', 'OB-CR Glut', 'CTX-CGE GABA', 'OB-IMN GABA', 'MB GABA', 'CNU-LGE GABA', 'LSX GABA', 'CNU-HYa GABA', 'Vascular', 'P Glut', 'CNU-HYa Glut', 'MY GABA', 'MB Glut', 'P GABA', 'MH-LH Glut', 'CNU-MGE GABA', 'CB Glut', 'CB GABA', 'HY GABA', 'TH Glut', 'MB Dopa', 'OEC', 'MB-HB Sero', 'HY Glut', 'Pineal Glut']


# Differential Gene Expression Analysis

In [20]:
# Overall DGE: Mutant vs Control, run per DEG_BY group.
# The 'for_DEGs' layer is used when it exists; otherwise layer=None is passed.
print(f"\nRunning Overall DGE (grouped by {DEG_BY})...")
if 'for_DEGs' in adata.layers:
    overall_dge_layer = 'for_DEGs'
else:
    overall_dge_layer = None
dge_results = functions_degs.run_overall_dge(
    adata,
    grouping_key=DEG_BY,
    dge_output_dir=DGE_OUTPUT_DIR,
    plot_output_dir=PLOT_OUTPUT_DIR,
    layer=overall_dge_layer,
)


Running Overall DGE (grouped by mapmycells_first_layer)...

Starting both genomes Differential Gene Expression analysis (Mutant vs Control)...
  Running DGE for group: Astro-Epen (using key 'mapmycells_first_layer')
    Initial genes: 26870
    Filtering genes: Requiring expression in >= 109 cells (max of 3 absolute and 10.0% of 1085)...
    Genes after filtering: 5810
      Calculating mean expression for 5810 genes...
      Using data from layer: for_DEGs
Added mean expression columns.
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\dge_res\both_geno_cond_comp\dge_both_genomes_Astro-Epen_mut_vs_ctrl.csv
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\dge_res\both_geno_cond_comp\dge_both_genomes_Astro-Epen_mut_vs_ctrl.xlsx
    DEBUG: Attempting to create list_output_dir: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\dge_res\both_geno_con

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


---
  Generating volcano plot for: Both genomes - Immune (Counts: Control=459, Mutant=557) -> dge_conditions_comparison_Immune_volcano.png
---
  Generating volcano plot for: Both genomes - DG-IMN Glut (Counts: Control=2516, Mutant=1366) -> dge_conditions_comparison_DG-IMN_Glut_volcano.png
---
  Generating volcano plot for: Both genomes - OPC-Oligo (Counts: Control=1267, Mutant=1383) -> dge_conditions_comparison_OPC-Oligo_volcano.png
---
  Skipping volcano plot for Both genomes - MY Glut: Not enough cells in one or both groups (< 50). Control=34, Mutant=19.
  Generating volcano plot for: Both genomes - CTX-MGE GABA (Counts: Control=478, Mutant=918) -> dge_conditions_comparison_CTX-MGE_GABA_volcano.png
---
  Generating volcano plot for: Both genomes - NP-CT-L6b Glut (Counts: Control=757, Mutant=1431) -> dge_conditions_comparison_NP-CT-L6b_Glut_volcano.png
---
  Skipping volcano plot for Both genomes - OB-CR Glut: Not enough cells in one or both groups (< 50). Control=45, Mutant=45.
  Gen

In [21]:
# Genotype-specific DGE: Mutant vs Control within each genotype, per DEG_BY group.
# The 'for_DEGs' layer is used when it exists; otherwise layer=None is passed.
print(f"\nRunning Genotype-Specific DGE (grouped by {DEG_BY})...")
if 'for_DEGs' in adata.layers:
    genotype_dge_layer = 'for_DEGs'
else:
    genotype_dge_layer = None
dge_by_genotype = functions_degs.run_genotype_specific_dge(
    adata,
    grouping_key=DEG_BY,
    dge_output_dir=DGE_OUTPUT_DIR,
    plot_output_dir=PLOT_OUTPUT_DIR,
    layer=genotype_dge_layer,
)


Running Genotype-Specific DGE (grouped by mapmycells_first_layer)...

Starting Genotype-Specific DGE analysis...
Created 'genotype_condition' column with categories: ['Emx1_Control', 'Emx1_Mutant', 'Nestin_Control', 'Nestin_Mutant']

Processing group: Astro-Epen (using key 'mapmycells_first_layer')
  Running DGE for Emx1 genotype...
    Filtering genes for Emx1 (Initial: 26870)
    Filtering genes: Requiring expression in >= 42 cells (max of 3 absolute and 10.0% of 414)...
    Genes after filtering: 5181
      Calculating mean expression for 5181 genes (Emx1, group Astro-Epen)...
      Using data from layer: for_DEGs
      Added mean expression columns (Emx1, group Astro-Epen).
    Finished DGE for Emx1, group Astro-Epen. Found 5181 ranked genes.
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\dge_res\geno_spec_cond_comp\dge_Emx1_Astro-Epen_mut_vs_ctrl.csv
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_fro

In [22]:
# Genotype comparison DGE: Nestin vs Emx1 within each condition, per DEG_BY group.
# The 'for_DEGs' layer is used when it exists; otherwise layer=None is passed.
print(f"\nRunning Genotype Comparison DGE (grouped by {DEG_BY})...")
if 'for_DEGs' in adata.layers:
    comparison_dge_layer = 'for_DEGs'
else:
    comparison_dge_layer = None
dge_genotype_within_condition = functions_degs.run_genotype_comparison_dge(
    adata,
    grouping_key=DEG_BY,
    dge_output_dir=DGE_OUTPUT_DIR,
    plot_output_dir=PLOT_OUTPUT_DIR,
    layer=comparison_dge_layer,
)


Running Genotype Comparison DGE (grouped by mapmycells_first_layer)...

Starting DGE analysis: Genotype comparison within conditions...

Processing group: Astro-Epen (using key 'mapmycells_first_layer')
  Running DGE for Control condition (Nestin vs Emx1)...
    Initial genes for Control condition: 26870
    Filtering genes for Control: Requiring expression in >= 67 cells (max of 3 absolute and 10.0% of 665)...
    Genes after filtering for Control: 6278
      Calculating mean expression for 6278 genes (Control condition, group Astro-Epen)...
      Using data from layer: for_DEGs
      Added mean expression columns (Control condition, group Astro-Epen).
    Finished DGE for Control, group Astro-Epen. Found 6278 ranked genes.
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\dge_res\cond_spec_geno_comp\dge_Control_cond_Astro-Epen_Nestin_vs_Emx1.csv
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


---
  Generating volcano plot for: Control Cond - Immune (Counts: Emx1_Control=72, Nestin_Control=387) -> dge_Control_cond_Nestin_vs_Emx1_Immune_volcano.png
---
  Generating volcano plot for: Control Cond - DG-IMN Glut (Counts: Emx1_Control=861, Nestin_Control=1655) -> dge_Control_cond_Nestin_vs_Emx1_DG-IMN_Glut_volcano.png
---
  Generating volcano plot for: Control Cond - OPC-Oligo (Counts: Emx1_Control=173, Nestin_Control=1094) -> dge_Control_cond_Nestin_vs_Emx1_OPC-Oligo_volcano.png
---
  Skipping volcano plot for Control Cond - MY Glut: Not enough cells in one or both groups (< 50). Emx1_Control=10, Nestin_Control=24.
  Generating volcano plot for: Control Cond - CTX-MGE GABA (Counts: Emx1_Control=216, Nestin_Control=262) -> dge_Control_cond_Nestin_vs_Emx1_CTX-MGE_GABA_volcano.png
---
  Generating volcano plot for: Control Cond - NP-CT-L6b Glut (Counts: Emx1_Control=212, Nestin_Control=545) -> dge_Control_cond_Nestin_vs_Emx1_NP-CT-L6b_Glut_volcano.png
---
  Skipping volcano plot fo

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


---
  Skipping volcano plot for Mutant Cond - Immune: Not enough cells in one or both groups (< 50). Emx1_Mutant=42, Nestin_Mutant=515.
  Generating volcano plot for: Mutant Cond - DG-IMN Glut (Counts: Emx1_Mutant=407, Nestin_Mutant=959) -> dge_Mutant_cond_Nestin_vs_Emx1_DG-IMN_Glut_volcano.png
---
  Generating volcano plot for: Mutant Cond - OPC-Oligo (Counts: Emx1_Mutant=226, Nestin_Mutant=1157) -> dge_Mutant_cond_Nestin_vs_Emx1_OPC-Oligo_volcano.png
---
  Skipping volcano plot for Mutant Cond - MY Glut: Not enough cells in one or both groups (< 50). Emx1_Mutant=10, Nestin_Mutant=9.
  Generating volcano plot for: Mutant Cond - CTX-MGE GABA (Counts: Emx1_Mutant=409, Nestin_Mutant=509) -> dge_Mutant_cond_Nestin_vs_Emx1_CTX-MGE_GABA_volcano.png
---
  Generating volcano plot for: Mutant Cond - NP-CT-L6b Glut (Counts: Emx1_Mutant=456, Nestin_Mutant=975) -> dge_Mutant_cond_Nestin_vs_Emx1_NP-CT-L6b_Glut_volcano.png
---
  Skipping volcano plot for Mutant Cond - OB-CR Glut: Not enough cells i

In [23]:
# Cell type comparison DGE (each DEG_BY group vs the rest), i.e. marker genes.
# The 'for_DEGs' layer is used when it exists; otherwise layer=None is passed.
# Both tables and plots are written to BIOMARKER_OUTPUT_DIR.
print(f"\nRunning Cell Type Comparison DGE (Markers for {DEG_BY})...")
dge_cluster_markers = functions_degs.run_cluster_comparison_dge(
    adata,
    grouping_key=DEG_BY,
    dge_output_dir=BIOMARKER_OUTPUT_DIR,
    plot_output_dir=BIOMARKER_OUTPUT_DIR,
    layer='for_DEGs' if 'for_DEGs' in adata.layers else None,
    method='wilcoxon' # or 't-test'
)

# Backward-compatible alias. The previous name mentioned the *second* layer
# although the grouping key in this notebook is DEG_BY
# ("mapmycells_first_layer"); prefer `dge_cluster_markers` in new code.
dge_mapmycells_second_layer_markers = dge_cluster_markers


Running Cell Type Comparison DGE (Markers for mapmycells_first_layer)...

Starting Cluster Comparison DGE analysis (Markers for 'mapmycells_first_layer') into D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\biomarkers...
  Initial genes: 26870
  Filtering genes globally: Requiring expression in >= 3 cells...
  Genes after global filtering: 26870
  Proceeding with DGE for 30 groups. Removed groups: ['MB Dopa', 'MB-HB Sero']
  Running sc.tl.rank_genes_groups (groupby='mapmycells_first_layer', method='wilcoxon')...


  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group

  rank_genes_groups finished.
    Calculating mean expression for Cluster Astro-Epen vs Rest...
      Using data from layer: for_DEGs
      Added mean expression columns for Cluster Astro-Epen.
    Top 50 Cluster marker results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\biomarkers\dge_cluster_comparison_Astro-Epen_vs_Rest_top50.csv
    Top 50 Cluster marker results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\biomarkers\dge_cluster_comparison_Astro-Epen_vs_Rest_top50.xlsx
    DEBUG: Attempting to create list_output_dir: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\biomarkers\sig_deg_lists\Astro-Epen'
    DEBUG: Successfully created/confirmed directory: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\biomarkers\sig_deg_lists\Astro-Epen'
    DEBUG: Directory 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1F

In [24]:
# Closing summary: where the DGE tables and the biomarker results were written
summary_lines = [
    "\nAll analysis and output generation complete.",
    f"Overall/Genotype DGE outputs saved in: {DGE_OUTPUT_DIR}",
    f"Biomarker DGE outputs saved in: {BIOMARKER_OUTPUT_DIR}",
]
print("\n".join(summary_lines))


All analysis and output generation complete.
Overall/Genotype DGE outputs saved in: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\dge_res
Biomarker DGE outputs saved in: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_mapmycells_L1FC_0_25\biomarkers
