In [55]:
import pandas as pd
import numpy as np
import os

# Project root against which every relative path in this notebook is resolved.
# Defaults to the original cluster scratch location; set the SRF_CUTANDTAG_DIR
# environment variable to run the notebook from a different checkout/machine
# without editing the source.
wd_dir = os.environ.get(
    'SRF_CUTANDTAG_DIR',
    '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG',
)
os.chdir(wd_dir)

In [56]:
def categorize_deseq_genes(file_path, log2fc_threshold=0, padj_threshold=0.05, keep_untested=False):
    """
    Categorize genes from DESeq2 results into up-regulated, down-regulated, and not dysregulated lists.

    A gene is "significant" when its adjusted p-value is strictly below
    ``padj_threshold``. Significant genes are up-regulated when their log2 fold
    change is above ``log2fc_threshold`` and down-regulated when it is below
    ``-log2fc_threshold``. Significant genes whose fold change lies inside
    ``[-log2fc_threshold, log2fc_threshold]`` belong to none of the three lists.

    Parameters:
    -----------
    file_path : str or file-like
        Path to (or open handle on) the CSV file containing DESeq2 results.
        It must provide the columns 'gene', 'log2FoldChange' and 'padj'.
    log2fc_threshold : float
        Absolute log2 fold change threshold (default: 0)
    padj_threshold : float
        Adjusted p-value threshold (default: 0.05)
    keep_untested : bool
        When False (default), genes whose 'padj' is missing are left out of the
        "not dysregulated" list: a missing adjusted p-value means the gene was
        not tested, so there is no evidence about it either way. Pass True to
        count those genes as not dysregulated.

    Returns:
    --------
    tuple
        Lists of upregulated, downregulated, and not dysregulated gene names

    Raises:
    -------
    KeyError
        If one of the required columns is absent from the CSV file.
    """
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Fail early with an explicit message instead of a bare pandas KeyError
    required_columns = ('gene', 'log2FoldChange', 'padj')
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"DESeq2 results are missing required column(s): {missing_columns}")

    # Create masks for different categories.
    # Comparisons against NaN are always False, so `significant` already
    # excludes genes without an adjusted p-value.
    tested = df['padj'].notna()
    significant = df['padj'] < padj_threshold
    upregulated = (df['log2FoldChange'] > log2fc_threshold) & significant
    downregulated = (df['log2FoldChange'] < -log2fc_threshold) & significant
    # A plain `~significant` would also be True for NaN padj; restrict it to
    # genes that were actually tested unless the caller opts back in.
    not_dysregulated = ~significant if keep_untested else (tested & ~significant)

    # Create lists of gene names for each category
    up_genes = df.loc[upregulated, 'gene'].tolist()
    down_genes = df.loc[downregulated, 'gene'].tolist()
    unchanged_genes = df.loc[not_dysregulated, 'gene'].tolist()

    # Print summary statistics
    print(f"Total genes analyzed: {len(df)}")
    print(f"Upregulated genes: {len(up_genes)}")
    print(f"Downregulated genes: {len(down_genes)}")
    print(f"Not dysregulated genes: {len(unchanged_genes)}")
    n_untested = int((~tested).sum())
    if n_untested and not keep_untested:
        print(f"Genes without adjusted p-value (excluded): {n_untested}")

    return up_genes, down_genes, unchanged_genes

In [57]:
# Cut-offs applied to the neuron differential-expression table (tune as required)
LOG2FC_THRESHOLD = 0.5
PADJ_THRESHOLD = 0.05

file_path = "custom_pipeline/DATA/DEA_NEU.csv"

# Split the DESeq2 table into the three gene categories
upregulated, downregulated, not_dysregulated = categorize_deseq_genes(
    file_path, log2fc_threshold=LOG2FC_THRESHOLD, padj_threshold=PADJ_THRESHOLD
)

# One row per category: (preview label, gene list, destination CSV)
neuron_gene_sets = [
    ("Upregulated:", upregulated, 'custom_pipeline/DATA/neuron_upregulated_genes.csv'),
    ("Downregulated:", downregulated, 'custom_pipeline/DATA/neuron_downregulated_genes.csv'),
    ("Not dysregulated:", not_dysregulated, 'custom_pipeline/DATA/neuron_not_dysregulated_genes.csv'),
]

# Preview the first five genes and the size of each category
print("\nFirst few genes from each category:")
for label, genes, out_path in neuron_gene_sets:
    print(label, genes[:5], len(genes))

# Write every category to its own single-column CSV file
for label, genes, out_path in neuron_gene_sets:
    pd.DataFrame(genes, columns=['gene']).to_csv(out_path, index=False)

print("\nGene lists have been saved to CSV files in custom_pipeline/DATA/")

Total genes analyzed: 12945
Upregulated genes: 234
Downregulated genes: 317
Not dysregulated genes: 10830

First few genes from each category:
Upregulated: ['Cox6a2', 'Gm6142', 'Mapkapk5', 'Gucy2g', 'Gm20900'] 234
Downregulated: ['Etohd2', 'Ankrd63', 'Vwc2l', 'Six3', 'Gpc6'] 317
Not dysregulated: ['H2afy2', 'Lurap1l', 'Tcea1', 'Igdcc4', 'Kdm6a'] 10830

Gene lists have been saved to CSV files in custom_pipeline/DATA/


In [58]:
# Cut-offs applied to the NSC differential-expression table (tune as required)
LOG2FC_THRESHOLD = 0.5
PADJ_THRESHOLD = 0.05

file_path = "custom_pipeline/DATA/DEA_NSC.csv"

# Split the DESeq2 table into the three gene categories
upregulated, downregulated, not_dysregulated = categorize_deseq_genes(
    file_path, log2fc_threshold=LOG2FC_THRESHOLD, padj_threshold=PADJ_THRESHOLD
)

# One row per category: (preview label, gene list, destination CSV)
nsc_gene_sets = [
    ("Upregulated:", upregulated, 'custom_pipeline/DATA/nsc_upregulated_genes.csv'),
    ("Downregulated:", downregulated, 'custom_pipeline/DATA/nsc_downregulated_genes.csv'),
    ("Not dysregulated:", not_dysregulated, 'custom_pipeline/DATA/nsc_not_dysregulated_genes.csv'),
]

# Preview the first five genes and the size of each category
print("\nFirst few genes from each category:")
for label, genes, out_path in nsc_gene_sets:
    print(label, genes[:5], len(genes))

# Write every category to its own single-column CSV file
for label, genes, out_path in nsc_gene_sets:
    pd.DataFrame(genes, columns=['gene']).to_csv(out_path, index=False)

print("\nGene lists have been saved to CSV files in custom_pipeline/DATA/")

Total genes analyzed: 14245
Upregulated genes: 2743
Downregulated genes: 2587
Not dysregulated genes: 5337

First few genes from each category:
Upregulated: ['Gm36501', 'Gm17494', '4921511C10Rik', 'Gm9316', 'Gm48606'] 2743
Downregulated: ['Wipf3', 'Paqr9', 'Srpk3', 'Bbs5', 'Rsf1os2'] 2587
Not dysregulated: ['Mir5125', 'Trim68', 'Frrs1', 'Notch4', 'Ano6'] 5337

Gene lists have been saved to CSV files in custom_pipeline/DATA/
