# Environment Setup and Data Loading

In [1]:
# --- Imports: standard library first, then third-party ---
import importlib
import os
import random
import sys

import anndata as ad
import numpy as np
import pandas as pd
import scanpy as sc

# --- Project paths ---
# The default is the original hard-coded location; set the SRF_LINDA_RNA_DIR environment
# variable to run this notebook from a different checkout without editing the cell.
PROJECT_DIR = os.environ.get("SRF_LINDA_RNA_DIR", "D:/Github/SRF_Linda_RNA")
WORKING_DIR = os.path.join(PROJECT_DIR, "combine_data")

# WORKING_DIR must be on sys.path *before* the local helper module is imported; otherwise a
# fresh kernel started from another directory cannot resolve `functions_degs`
# (presumably functions_degs.py lives in WORKING_DIR -- confirm).
sys.path.insert(0, WORKING_DIR)
import functions_degs  # noqa: E402  (local module, needs the sys.path entry above)

# Relative paths used later are resolved against WORKING_DIR.
os.chdir(WORKING_DIR)

# Re-import so edits made to the helper module since the kernel started are picked up.
importlib.reload(functions_degs)

# Set seeds for all random number generators
random_seed = 0
np.random.seed(random_seed)
random.seed(random_seed)

In [3]:
# Set up directories

# Column of adata.obs used to group cells in every DGE run below. It also names the output
# folder, so results for different groupings cannot overwrite each other.
DEG_BY = "cell_type_L2"

# REMOVE_DOUBLETS = True
REMOVE_DOUBLETS = False

FIX_TRESHOLD = True
# FIX_TRESHOLD = False

# Select the results folder of the upstream run to read from.
# NOTE: REMOVE_DOUBLETS only influences the path when FIX_TRESHOLD is False.
if FIX_TRESHOLD:
    BASE_RESULTS_DIR = os.path.join(WORKING_DIR, "results_from_raw")
elif REMOVE_DOUBLETS:
    BASE_RESULTS_DIR = os.path.join(WORKING_DIR, "results_from_raw_percentile_threshold", "doublets_removed")
else:
    BASE_RESULTS_DIR = os.path.join(WORKING_DIR, "results_from_raw_percentile_threshold")

INPUT_DIR = BASE_RESULTS_DIR
adata_path = os.path.join(INPUT_DIR, 'annotation_final.h5ad')

# Optional suffix that separates a custom analysis variant from the default outputs.
# CUSTOM_ANALYSIS = None
CUSTOM_ANALYSIS = "FC_0_25"

# Folder name is derived from DEG_BY (evaluates to "DEGs_cell_type_L2" for the current setting).
PARENT_OUTPUT_DIR = os.path.join(INPUT_DIR, f"DEGs_{DEG_BY}")

if CUSTOM_ANALYSIS is not None:
    # The suffix is appended without a separator (e.g. "DEGs_cell_type_L2FC_0_25");
    # this is kept unchanged so that already existing result folders are reused.
    PARENT_OUTPUT_DIR = PARENT_OUTPUT_DIR + CUSTOM_ANALYSIS

PLOT_OUTPUT_DIR = os.path.join(PARENT_OUTPUT_DIR, 'plots')
DGE_OUTPUT_DIR = os.path.join(PARENT_OUTPUT_DIR, 'dge_res')
BIOMARKER_OUTPUT_DIR = os.path.join(PARENT_OUTPUT_DIR, 'biomarkers')

# Create output directories if they don't exist
for output_dir in (PLOT_OUTPUT_DIR, DGE_OUTPUT_DIR, BIOMARKER_OUTPUT_DIR):
    os.makedirs(output_dir, exist_ok=True)

print(f"Input directory: {INPUT_DIR}")
print(f"Plot output directory (Overall/Genotype): {PLOT_OUTPUT_DIR}")
print(f"DGE output directory (Overall/Genotype): {DGE_OUTPUT_DIR}")
print(f"Biomarker output directory: {BIOMARKER_OUTPUT_DIR}")

# Configure scanpy settings
sc.settings.figdir = PLOT_OUTPUT_DIR
sc.settings.set_figure_params(dpi=150, facecolor='white')

Input directory: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw
Plot output directory (Overall/Genotype): D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\plots
DGE output directory (Overall/Genotype): D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\dge_res
Biomarker output directory: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers


In [4]:
# Read the annotated AnnData object from the path configured above.
print(f"\nLoading dataset from {adata_path}")
adata = sc.read_h5ad(filename=adata_path)


Loading dataset from D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\annotation_final.h5ad


In [13]:
# Bare expression: the notebook displays the AnnData summary as the cell output.
adata

AnnData object with n_obs × n_vars = 28026 × 26870
    obs: 'sample', 'condition', 'genotype', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden_0.4', 'ISO_majority_voting', 'ISO_conf_score', 'DG_majority_voting', 'DG_conf_score', 'mapmycells_first_layer', 'mapmycells_second_layer', 'cell_type_L1', 'cell_type_L2', 'cell_type_L2_new', 'highlight'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'DG_majority_voting_colors', 'ISO_majority_voting_colors', 'cell_type_L1_colors', 'cell_type_L2_colors', 'cell_type_L2_new_colors', 'condition_colors', 'genotype_colors', 'highlight_colors', 'hvg', 'leiden_0.4', 'leiden_0.4_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [14]:
# List the distinct levels of the two experimental design columns.
for design_column in ("genotype", "condition"):
    print(adata.obs[design_column].unique())

['Emx1', 'Nestin']
Categories (2, object): ['Emx1', 'Nestin']
['Control', 'Mutant']
Categories (2, object): ['Control', 'Mutant']


# Data Preprocessing

In [4]:
# Check original clusters.
# The row limit is lifted only inside the `with` block; pandas restores the previous
# 'display.max_rows' value on exit, even if printing raises.
with pd.option_context('display.max_rows', None):
    print(f"Original {DEG_BY} clusters:")
    print(adata.obs[DEG_BY].value_counts())

Original cell_type_L2 clusters:
cell_type_L2
ENT              7589
Unknown          4100
Mature GC        2134
CA1              2056
MOL              1660
Microglia        1236
CA3              1099
Astrocytes        980
PPP               862
OPC               729
CGE-derived       603
MGE-derived       538
Immature GC       503
PV                481
SST               444
Subiculum         425
ProS              367
Lamp5             287
Sncg              271
Endothelial       231
VIP               228
NFOL              149
VLMC              144
Neuroblast        142
CA2               117
ABC               116
Pericytes         113
VIP (DG)           89
Lamp5 (DG)         85
Car3               73
Cajal-Retzius      65
nIPC               63
Mossy cells        47
Name: count, dtype: int64


In [5]:
# Check adata structure: summary, available layers and presence of the .raw attribute.
print("\nAnnData object summary:")
print(adata)
print("\nAvailable layers:", list(adata.layers.keys()))

# Use an explicit None check (same test as in the 'for_DEGs' cell) instead of relying on
# the truthiness of the raw object.
raw_is_available = adata.raw is not None
print("Raw data available:", raw_is_available)
if raw_is_available:
    print("Raw data shape:", adata.raw.X.shape)


AnnData object summary:
AnnData object with n_obs × n_vars = 28026 × 26870
    obs: 'sample', 'condition', 'genotype', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden_0.4', 'ISO_majority_voting', 'ISO_conf_score', 'DG_majority_voting', 'DG_conf_score', 'mapmycells_first_layer', 'mapmycells_second_layer', 'cell_type_L1', 'cell_type_L2', 'highlight'
    var: 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'mean', 'std'
    uns: 'DG_majority_voting_colors', 'ISO_majority_voting_colors', 'cell_type_L1_colors', 'cell_type_L2_colors', 'condition_colors', 'genotype_colors', 'highlight_colors', 'hvg', 'leiden_0.4', 'leiden_0.4_colors', 'log1p', 'neighbors', 'pca', 'sample_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

Available layers: []
Raw data available: True
Raw data shape: (

In [6]:
# Create the 'for_DEGs' layer: Normalized and log1p transformed counts
# Assuming raw counts are in adata.raw.X
print("\nCreating 'for_DEGs' layer...")
if adata.raw is not None and adata.raw.X is not None:
    # Cheap sanity check of the assumption above on the first few cells only: raw counts
    # are non-negative integers. If adata.raw.X was already normalized/log-transformed,
    # the steps below would transform it a second time, so warn (but do not abort).
    RAW_CHECK_ROWS = 100
    raw_preview = adata.raw.X[:RAW_CHECK_ROWS]
    if hasattr(raw_preview, "toarray"):  # sparse matrix -> dense preview
        raw_preview = raw_preview.toarray()
    raw_preview = np.asarray(raw_preview)
    looks_like_counts = bool(
        np.all(raw_preview >= 0) and np.all(np.mod(raw_preview, 1) == 0)
    )
    if raw_preview.size and not looks_like_counts:
        print("Warning: adata.raw.X does not look like raw integer counts; "
              "normalizing and log-transforming it again may distort the 'for_DEGs' layer.")

    # Create a temporary AnnData with raw counts to perform normalization and log1p
    adata_for_dge = ad.AnnData(adata.raw.X.copy()) # type: ignore
    adata_for_dge.obs_names = adata.obs_names # type: ignore
    adata_for_dge.var_names = adata.raw.var_names # type: ignore

    # Normalization is done on all genes of adata.raw, before subsetting to adata.var_names.
    print("Normalizing total counts (target_sum=1e4)...")
    sc.pp.normalize_total(adata_for_dge, target_sum=1e4)

    print("Applying log1p transformation...")
    sc.pp.log1p(adata_for_dge)

    # Ensure the gene order matches the main adata object
    adata_for_dge = adata_for_dge[:, adata.var_names].copy() # type: ignore

    print("Storing result in adata.layers['for_DEGs']...")
    adata.layers['for_DEGs'] = adata_for_dge.X.copy() # type: ignore
    print("'for_DEGs' layer created with shape:", adata.layers['for_DEGs'].shape)
else:
    print("Warning: adata.raw.X not found. Cannot create 'for_DEGs' layer from raw counts.")
    print("DGE analysis will proceed using adata.X, which might be scaled.")


Creating 'for_DEGs' layer...
Normalizing total counts (target_sum=1e4)...
Applying log1p transformation...
Storing result in adata.layers['for_DEGs']...
'for_DEGs' layer created with shape: (28026, 26870)


In [7]:
# Print the distinct values of each metadata column that the DGE runs rely on.
print("\nMetadata check:")
for obs_column in ("condition", "genotype", DEG_BY):
    print(f"{obs_column}:", list(adata.obs[obs_column].unique()))


Metadata check:
condition: ['Control', 'Mutant']
genotype: ['Emx1', 'Nestin']
cell_type_L2: ['Astrocytes', 'Unknown', 'CA3', 'ENT', 'PPP', 'CA1', 'MGE-derived', 'Mature GC', 'MOL', 'ProS', 'Microglia', 'Subiculum', 'CGE-derived', 'VIP', 'CA2', 'Sncg', 'OPC', 'ABC', 'NFOL', 'Lamp5', 'SST', 'PV', 'Immature GC', 'VLMC', 'Endothelial', 'Lamp5 (DG)', 'nIPC', 'Pericytes', 'Car3', 'VIP (DG)', 'Mossy cells', 'Neuroblast', 'Cajal-Retzius']


# Differential Gene Expression Analysis

In [8]:
# Overall DGE (Mutant vs Control), one comparison per group of DEG_BY.
# The 'for_DEGs' layer is passed when it exists; otherwise layer=None is passed.
print(f"\nRunning Overall DGE (grouped by {DEG_BY})...")
overall_dge_layer = 'for_DEGs' if 'for_DEGs' in adata.layers else None
dge_results = functions_degs.run_overall_dge(
    adata,
    grouping_key=DEG_BY,
    dge_output_dir=DGE_OUTPUT_DIR,
    plot_output_dir=PLOT_OUTPUT_DIR,
    layer=overall_dge_layer,
)


Running Overall DGE (grouped by cell_type_L2)...

Starting both genomes Differential Gene Expression analysis (Mutant vs Control)...
  Running DGE for group: Astrocytes (using key 'cell_type_L2')
    Initial genes: 26870
    Filtering genes: Requiring expression in >= 98 cells (max of 3 absolute and 10.0% of 980)...
    Genes after filtering: 5556
      Calculating mean expression for 5556 genes...
      Using data from layer: for_DEGs
Added mean expression columns.
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\dge_res\both_geno_cond_comp\dge_both_genomes_Astrocytes_mut_vs_ctrl.csv
    DEBUG: Attempting to create list_output_dir: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\dge_res\both_geno_cond_comp\sig_deg_lists\Astrocytes'
    DEBUG: Successfully created/confirmed directory: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\dge_res\both_geno_cond_comp\sig_deg_

In [9]:
# Genotype-Specific DGE (Mutant vs Control within each genotype), grouped by DEG_BY.
# The 'for_DEGs' layer is passed when it exists; otherwise layer=None is passed.
print(f"\nRunning Genotype-Specific DGE (grouped by {DEG_BY})...")
genotype_specific_kwargs = dict(
    grouping_key=DEG_BY,
    dge_output_dir=DGE_OUTPUT_DIR,
    plot_output_dir=PLOT_OUTPUT_DIR,
    layer='for_DEGs' if 'for_DEGs' in adata.layers else None,
)
dge_by_genotype = functions_degs.run_genotype_specific_dge(adata, **genotype_specific_kwargs)


Running Genotype-Specific DGE (grouped by cell_type_L2)...

Starting Genotype-Specific DGE analysis...
Created 'genotype_condition' column with categories: ['Emx1_Control', 'Emx1_Mutant', 'Nestin_Control', 'Nestin_Mutant']

Processing group: Astrocytes (using key 'cell_type_L2')
  Running DGE for Emx1 genotype...
    Filtering genes for Emx1 (Initial: 26870)
    Filtering genes: Requiring expression in >= 36 cells (max of 3 absolute and 10.0% of 357)...
    Genes after filtering: 5053
      Calculating mean expression for 5053 genes (Emx1, group Astrocytes)...
      Using data from layer: for_DEGs
      Added mean expression columns (Emx1, group Astrocytes).
    Finished DGE for Emx1, group Astrocytes. Found 5053 ranked genes.
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\dge_res\geno_spec_cond_comp\dge_Emx1_Astrocytes_mut_vs_ctrl.csv
    DEBUG: Attempting to create list_output_dir: 'D:\Github\SRF_Linda_RNA\combine_data\results

In [10]:
# Genotype Comparison DGE (Nestin vs Emx1 within each condition), grouped by DEG_BY.
# The 'for_DEGs' layer is passed when it exists; otherwise layer=None is passed.
print(f"\nRunning Genotype Comparison DGE (grouped by {DEG_BY})...")
if 'for_DEGs' in adata.layers:
    genotype_comparison_layer = 'for_DEGs'
else:
    genotype_comparison_layer = None
dge_genotype_within_condition = functions_degs.run_genotype_comparison_dge(
    adata,
    grouping_key=DEG_BY,
    dge_output_dir=DGE_OUTPUT_DIR,
    plot_output_dir=PLOT_OUTPUT_DIR,
    layer=genotype_comparison_layer,
)


Running Genotype Comparison DGE (grouped by cell_type_L2)...

Starting DGE analysis: Genotype comparison within conditions...

Processing group: Astrocytes (using key 'cell_type_L2')
  Running DGE for Control condition (Nestin vs Emx1)...
    Initial genes for Control condition: 26870
    Filtering genes for Control: Requiring expression in >= 61 cells (max of 3 absolute and 10.0% of 609)...
    Genes after filtering for Control: 5966
      Calculating mean expression for 5966 genes (Control condition, group Astrocytes)...
      Using data from layer: for_DEGs
      Added mean expression columns (Control condition, group Astrocytes).
    Finished DGE for Control, group Astrocytes. Found 5966 ranked genes.
    DGE results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\dge_res\cond_spec_geno_comp\dge_Control_cond_Astrocytes_Nestin_vs_Emx1.csv
    DEBUG: Attempting to create list_output_dir: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\D

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


---
  Generating volcano plot for: Mutant Cond - PPP (Counts: Emx1_Mutant=323, Nestin_Mutant=385) -> dge_Mutant_cond_Nestin_vs_Emx1_PPP_volcano.png
---
  Generating volcano plot for: Mutant Cond - CA1 (Counts: Emx1_Mutant=493, Nestin_Mutant=788) -> dge_Mutant_cond_Nestin_vs_Emx1_CA1_volcano.png
---
  Generating volcano plot for: Mutant Cond - MGE-derived (Counts: Emx1_Mutant=112, Nestin_Mutant=226) -> dge_Mutant_cond_Nestin_vs_Emx1_MGE-derived_volcano.png
---
  Generating volcano plot for: Mutant Cond - Mature GC (Counts: Emx1_Mutant=268, Nestin_Mutant=540) -> dge_Mutant_cond_Nestin_vs_Emx1_Mature_GC_volcano.png
---
  Generating volcano plot for: Mutant Cond - MOL (Counts: Emx1_Mutant=91, Nestin_Mutant=668) -> dge_Mutant_cond_Nestin_vs_Emx1_MOL_volcano.png
---
  Generating volcano plot for: Mutant Cond - ProS (Counts: Emx1_Mutant=122, Nestin_Mutant=119) -> dge_Mutant_cond_Nestin_vs_Emx1_ProS_volcano.png
---
  Skipping volcano plot for Mutant Cond - Microglia: Not enough cells in one or

In [11]:
# Cell Type Comparison DGE (each cell type vs rest), i.e. marker genes per DEG_BY group.
# Tables and plots both go to the biomarker folder. The 'for_DEGs' layer is passed when it
# exists; otherwise layer=None is passed.
print(f"\nRunning Cell Type Comparison DGE (Markers for {DEG_BY})...")
marker_test_method = 'wilcoxon'  # or 't-test'
marker_layer = 'for_DEGs' if 'for_DEGs' in adata.layers else None
dge_markers = functions_degs.run_cluster_comparison_dge(
    adata,
    grouping_key=DEG_BY,
    dge_output_dir=BIOMARKER_OUTPUT_DIR,
    plot_output_dir=BIOMARKER_OUTPUT_DIR,
    layer=marker_layer,
    method=marker_test_method,
)


Running Cell Type Comparison DGE (Markers for cell_type_L2)...

Starting Cluster Comparison DGE analysis (Markers for 'cell_type_L2') into D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers...
  Initial genes: 26870
  Filtering genes globally: Requiring expression in >= 3 cells...
  Genes after global filtering: 26870
  Proceeding with DGE for 33 groups. Removed groups: []
  Running sc.tl.rank_genes_groups (groupby='cell_type_L2', method='wilcoxon')...


  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group_name, "scores"] = scores[global_indices]
  self.stats[group_name, "pvals"] = pvals[global_indices]
  self.stats[group_name, "pvals_adj"] = pvals_adj[global_indices]
  self.stats[group_name, "logfoldchanges"] = np.log2(
  self.stats[group_name, "names"] = self.var_names[global_indices]
  self.stats[group

  rank_genes_groups finished.
    Calculating mean expression for Cluster ABC vs Rest...
      Using data from layer: for_DEGs
      Added mean expression columns for Cluster ABC.
    Top 50 Cluster marker results saved to D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers\dge_cluster_comparison_ABC_vs_Rest_top50.csv
    DEBUG: Attempting to create list_output_dir: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers\sig_deg_lists\ABC'
    DEBUG: Successfully created/confirmed directory: 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers\sig_deg_lists\ABC'
    DEBUG: Directory 'D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers\sig_deg_lists\ABC' is writable.
    Saved 782 significant UP genes with stats to D:\Github\SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers\sig_deg_lists\ABC\Cluster_ABC_vs_Rest

In [12]:
# Final status message listing where the result files were written.
completion_lines = (
    "\nAll analysis and output generation complete.",
    f"Overall/Genotype DGE outputs saved in: {DGE_OUTPUT_DIR}",
    f"Biomarker DGE outputs saved in: {BIOMARKER_OUTPUT_DIR}",
)
print("\n".join(completion_lines))


All analysis and output generation complete.
Overall/Genotype DGE outputs saved in: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\dge_res
Biomarker DGE outputs saved in: D:/Github/SRF_Linda_RNA\combine_data\results_from_raw\DEGs_cell_type_L2FC_0_25\biomarkers
