In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from matplotlib_venn import venn2, venn3
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

# Input directories holding one sub-folder of enrichment CSVs per cell type,
# and the directory that receives every table and figure written below.
UP_DIR = Path('GO_enrichment_results')
DOWN_DIR = Path('GO_enrichment_results_downregulated')
OUTPUT_DIR = Path('GO_comparison_results')
OUTPUT_DIR.mkdir(exist_ok=True)

# TOP_N caps how many shared pathways are drawn per bar-chart panel;
# PADJ_THRESHOLD is the adjusted p-value cut-off applied when loading results.
TOP_N = 15
PADJ_THRESHOLD = 0.05

# Banner: rule, title, rule - emitted as three lines by a single call.
print(
    "=" * 70,
    "GO ENRICHMENT COMPARISON: UPREGULATED vs DOWNREGULATED",
    "=" * 70,
    sep="\n",
)

# ============================================================================
# 1. LOAD CELL TYPES
# ============================================================================

print("\n[1/5] Loading cell types...")

import scanpy as sc
adata = sc.read('annotated.h5ad')

# Cell-type labels drive every later loop and are turned into directory names
# with str.replace(), so they must be real strings. Drop unannotated cells
# (NaN labels) and cast to str; order of first appearance is preserved.
cell_types = adata.obs['Cell_Type'].dropna().astype(str).unique().tolist()

print(f"Found {len(cell_types)} cell types")

# ============================================================================
# 2. LOAD ENRICHMENT RESULTS
# ============================================================================

print("\n[2/5] Loading enrichment results...")


def _load_significant_terms(results_dir, cell_type, label):
    """Load one cell type's enrichment table and keep only significant rows.

    Parameters
    ----------
    results_dir : Path
        Root directory containing ``<cell_type>/<cell_type>_GO_enrichment.csv``
        (spaces in the cell type name are replaced by underscores).
    cell_type : str
        Cell type label as it appears in ``cell_types``.
    label : str
        Short direction tag ('UP' / 'DOWN') used only in the progress message.

    Returns
    -------
    pandas.DataFrame
        Rows with ``Adjusted P-value`` below ``PADJ_THRESHOLD``; an empty,
        column-less frame when the CSV does not exist.

    Raises
    ------
    KeyError
        If the CSV exists but has no ``Adjusted P-value`` column.
    """
    ct_clean = cell_type.replace(' ', '_')
    csv_path = results_dir / ct_clean / f'{ct_clean}_GO_enrichment.csv'

    if not csv_path.exists():
        print(f"  ‚úó {label} - {cell_type}: Not found")
        return pd.DataFrame()

    df = pd.read_csv(csv_path)
    if 'Adjusted P-value' not in df.columns:
        # Same exception type as the plain column lookup, but names the file.
        raise KeyError(f"'Adjusted P-value' column missing in {csv_path}")

    df_sig = df[df['Adjusted P-value'] < PADJ_THRESHOLD].copy()
    print(f"  ‚úì {label} - {cell_type}: {len(df_sig)} significant terms")
    return df_sig


upregulated_results = {}
downregulated_results = {}

# Every cell type gets an entry in both dicts (possibly an empty frame), so
# later sections can index by cell type without checking for membership.
for cell_type in cell_types:
    upregulated_results[cell_type] = _load_significant_terms(UP_DIR, cell_type, 'UP')
    downregulated_results[cell_type] = _load_significant_terms(DOWN_DIR, cell_type, 'DOWN')

# ============================================================================
# 3. OVERALL COMPARISON (ALL CELL TYPES COMBINED)
# ============================================================================

print("\n[3/5] Analyzing overall patterns...")

# Tag each non-empty per-cell-type table with its cell type and direction so
# the tables can be stacked into one long frame per direction.
all_up_terms = [
    upregulated_results[ct].assign(Cell_Type=ct, Direction='Upregulated')
    for ct in cell_types
    if len(upregulated_results[ct]) > 0
]
all_down_terms = [
    downregulated_results[ct].assign(Cell_Type=ct, Direction='Downregulated')
    for ct in cell_types
    if len(downregulated_results[ct]) > 0
]

# Stack the tagged tables; fall back to an empty frame when nothing was loaded.
all_up_df = pd.concat(all_up_terms, ignore_index=True) if all_up_terms else pd.DataFrame()
all_down_df = pd.concat(all_down_terms, ignore_index=True) if all_down_terms else pd.DataFrame()

# Distinct terms and per-term row counts for each direction.
if len(all_up_df) > 0:
    up_unique_terms = set(all_up_df['Term'].unique())
    up_term_counts = all_up_df['Term'].value_counts()
else:
    up_unique_terms = set()
    up_term_counts = pd.Series()

if len(all_down_df) > 0:
    down_unique_terms = set(all_down_df['Term'].unique())
    down_term_counts = all_down_df['Term'].value_counts()
else:
    down_unique_terms = set()
    down_term_counts = pd.Series()

# Partition the term universe: both directions, up only, down only.
common_terms = up_unique_terms.intersection(down_unique_terms)
up_only_terms = up_unique_terms.difference(down_unique_terms)
down_only_terms = down_unique_terms.difference(up_unique_terms)

overall_report_lines = [
    f"\n  Overall Statistics:",
    f"  ‚Ä¢ Total upregulated pathway enrichments: {len(all_up_df)}",
    f"  ‚Ä¢ Total downregulated pathway enrichments: {len(all_down_df)}",
    f"  ‚Ä¢ Unique terms in upregulated: {len(up_unique_terms)}",
    f"  ‚Ä¢ Unique terms in downregulated: {len(down_unique_terms)}",
    f"  ‚Ä¢ Terms appearing in BOTH up and down: {len(common_terms)}",
    f"  ‚Ä¢ Terms only upregulated: {len(up_only_terms)}",
    f"  ‚Ä¢ Terms only downregulated: {len(down_only_terms)}",
]
for report_line in overall_report_lines:
    print(report_line)

# Persist the headline numbers; the shared-term count is identical in both
# direction columns by construction.
n_shared = len(common_terms)
overall_summary = pd.DataFrame({
    'Category': ['Total Pathway Enrichments', 'Unique Terms', 'Shared Terms', 'Direction-Specific Terms'],
    'Upregulated': [len(all_up_df), len(up_unique_terms), n_shared, len(up_only_terms)],
    'Downregulated': [len(all_down_df), len(down_unique_terms), n_shared, len(down_only_terms)],
})
overall_summary.to_csv(OUTPUT_DIR / 'overall_comparison_summary.csv', index=False)

# ============================================================================
# 4. PER CELL TYPE COMPARISON
# ============================================================================

print("\n[4/5] Analyzing per cell type patterns...")

celltype_comparisons = []

for cell_type in cell_types:
    up_hits = upregulated_results[cell_type]
    down_hits = downregulated_results[cell_type]

    # Empty frames may have no 'Term' column at all, hence the length guard.
    up_set = set(up_hits['Term'].unique()) if len(up_hits) > 0 else set()
    down_set = set(down_hits['Term'].unique()) if len(down_hits) > 0 else set()

    # One summary row per cell type: raw row counts, distinct-term counts,
    # and the three-way split into shared / up-only / down-only terms.
    summary_row = dict(
        Cell_Type=cell_type,
        Up_Total=len(up_hits),
        Down_Total=len(down_hits),
        Up_Unique_Terms=len(up_set),
        Down_Unique_Terms=len(down_set),
        Shared_Terms=len(up_set.intersection(down_set)),
        Up_Only_Terms=len(up_set.difference(down_set)),
        Down_Only_Terms=len(down_set.difference(up_set)),
    )
    celltype_comparisons.append(summary_row)

# Cell types with the most terms shared between directions come first.
celltype_comparison_df = pd.DataFrame(celltype_comparisons)
celltype_comparison_df = celltype_comparison_df.sort_values('Shared_Terms', ascending=False)
celltype_comparison_df.to_csv(OUTPUT_DIR / 'celltype_comparison_summary.csv', index=False)

print(f"  ‚úì Analyzed {len(cell_types)} cell types")

# ============================================================================
# 5. IDENTIFY PATHWAYS SHARED ACROSS MULTIPLE CELL TYPES
# ============================================================================

print("\n[5/5] Identifying pathways shared across multiple cell types...")


def _shared_across_cell_types(all_df, direction, min_cell_types=2):
    """Summarise terms enriched in at least ``min_cell_types`` cell types.

    Parameters
    ----------
    all_df : pandas.DataFrame
        Long table with at least 'Term' and 'Cell_Type' columns, or an empty
        frame when nothing was loaded for this direction.
    direction : str
        Value written to the 'Direction' column.
    min_cell_types : int, default 2
        Minimum number of distinct cell types a term must appear in.

    Returns
    -------
    pandas.DataFrame
        Columns 'Term', 'Num_Cell_Types', 'Direction', 'Cell_Types', sorted by
        'Num_Cell_Types' descending; a column-less empty frame when ``all_df``
        is empty.
    """
    if len(all_df) == 0:
        return pd.DataFrame()

    # One pass over the table: the distinct cell types for every term, in
    # order of first appearance. Counting distinct cell types (rather than
    # rows) keeps 'Num_Cell_Types' correct even if a term is listed more than
    # once within a single cell type's table.
    cell_types_by_term = all_df.groupby('Term')['Cell_Type'].unique()
    n_by_term = cell_types_by_term.map(len)

    # Stable sort so that ties keep the alphabetical order from groupby.
    shared = n_by_term[n_by_term >= min_cell_types].sort_values(ascending=False, kind='stable')

    return pd.DataFrame({
        'Term': shared.index,
        'Num_Cell_Types': shared.values,
        'Direction': direction,
        'Cell_Types': ['; '.join(cell_types_by_term[term]) for term in shared.index],
    })


up_shared_df = _shared_across_cell_types(all_up_df, 'Upregulated')
down_shared_df = _shared_across_cell_types(all_down_df, 'Downregulated')

# Combine and save. Only non-empty frames are concatenated: an empty,
# column-less frame contributes nothing and concat warns about such entries
# in recent pandas versions.
shared_frames = [frame for frame in (up_shared_df, down_shared_df) if len(frame) > 0]
if shared_frames:
    shared_pathways = pd.concat(shared_frames, ignore_index=True)
    shared_pathways = shared_pathways.sort_values('Num_Cell_Types', ascending=False)
    shared_pathways.to_csv(OUTPUT_DIR / 'shared_pathways_across_celltypes.csv', index=False)

    print(f"\n  Shared Pathway Statistics:")
    print(f"  ‚Ä¢ Upregulated pathways in ‚â•2 cell types: {len(up_shared_df)}")
    print(f"  ‚Ä¢ Downregulated pathways in ‚â•2 cell types: {len(down_shared_df)}")

    # Each frame is already sorted, so row 0 is the most widely shared term.
    if len(up_shared_df) > 0:
        print(f"  ‚Ä¢ Most common upregulated pathway: {up_shared_df.iloc[0]['Term']}")
        print(f"    (in {up_shared_df.iloc[0]['Num_Cell_Types']} cell types)")

    if len(down_shared_df) > 0:
        print(f"  ‚Ä¢ Most common downregulated pathway: {down_shared_df.iloc[0]['Term']}")
        print(f"    (in {down_shared_df.iloc[0]['Num_Cell_Types']} cell types)")

# Save terms appearing in both directions
if len(common_terms) > 0:
    conflicting_terms = []
    # Iterate in sorted order: a set of strings has no reproducible iteration
    # order between interpreter runs, which would otherwise shuffle the rows
    # of the saved CSV. key=str keeps the sort safe for non-string entries.
    for term in sorted(common_terms, key=str):
        up_celltypes = all_up_df[all_up_df['Term'] == term]['Cell_Type'].unique()
        down_celltypes = all_down_df[all_down_df['Term'] == term]['Cell_Type'].unique()

        conflicting_terms.append({
            'Term': term,
            'Num_Up_CellTypes': len(up_celltypes),
            'Num_Down_CellTypes': len(down_celltypes),
            'Up_CellTypes': '; '.join(up_celltypes),
            'Down_CellTypes': '; '.join(down_celltypes)
        })

    conflicting_df = pd.DataFrame(conflicting_terms)
    # Stable sort: terms with the same up-count stay in alphabetical order.
    conflicting_df = conflicting_df.sort_values('Num_Up_CellTypes', ascending=False, kind='stable')
    conflicting_df.to_csv(OUTPUT_DIR / 'conflicting_pathways_up_and_down.csv', index=False)

    print(f"\n  ‚ö†Ô∏è  Found {len(common_terms)} pathways appearing in BOTH directions")
    print(f"      (these may indicate cell-type specific responses)")

# ============================================================================
# 6. CREATE COMPARISON VISUALIZATIONS
# ============================================================================

# NOTE(review): the earlier progress messages read "[1/5]" ... "[5/5]" while
# this one reads "[6/6]"; the step totals are inconsistent.
print("\n[6/6] Creating comparison visualizations...")

fig_dir = OUTPUT_DIR / 'figures'
fig_dir.mkdir(exist_ok=True)

# --- Plot 1: Venn diagram of overall term overlap ---
# The figure is created only when there is something to draw; creating it
# before the guard left an open, never-closed figure whenever either
# direction had no terms.
if len(up_unique_terms) > 0 and len(down_unique_terms) > 0:
    fig, ax = plt.subplots(figsize=(8, 6))
    venn2([up_unique_terms, down_unique_terms],
          set_labels=('Upregulated', 'Downregulated'),
          set_colors=('#FF6B6B', '#4ECDC4'),
          alpha=0.7,
          ax=ax)
    ax.set_title('Pathway Term Overlap\n(All Cell Types Combined)', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.savefig(fig_dir / 'overall_venn_diagram.png', dpi=300, bbox_inches='tight')
    plt.close()
    print("  ‚úì Venn diagram saved")

# --- Plot 2: Top shared pathways (multiple cell types) ---
if len(up_shared_df) > 0 or len(down_shared_df) > 0:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))

    # Left panel: upregulated, right panel: downregulated. Both panels are
    # drawn the same way and differ only in data, wording and colour.
    panels = (
        (ax1, up_shared_df, 'Upregulated', '#FF6B6B'),
        (ax2, down_shared_df, 'Downregulated', '#4ECDC4'),
    )

    for panel_ax, shared_df, direction, bar_color in panels:
        if len(shared_df) == 0:
            # Nothing to plot for this direction: show a placeholder message.
            panel_ax.text(0.5, 0.5, f'No shared {direction.lower()} pathways', ha='center', va='center')
            panel_ax.set_title(f'{direction} Pathways', fontsize=12, fontweight='bold')
            continue

        top_terms = shared_df.head(TOP_N)
        positions = range(len(top_terms))

        # Long term names are cut to 60 characters plus an ellipsis.
        tick_labels = []
        for term_name in top_terms['Term']:
            if len(term_name) > 60:
                tick_labels.append(term_name[:60] + '...')
            else:
                tick_labels.append(term_name)

        panel_ax.barh(positions, top_terms['Num_Cell_Types'], color=bar_color, alpha=0.7)
        panel_ax.set_yticks(positions)
        panel_ax.set_yticklabels(tick_labels, fontsize=9)
        panel_ax.set_xlabel('Number of Cell Types', fontsize=11)
        panel_ax.set_title(
            f'Top {TOP_N} {direction} Pathways\n(Shared Across Multiple Cell Types)',
            fontsize=12,
            fontweight='bold',
        )
        panel_ax.grid(True, alpha=0.3, axis='x')
        # Largest bar at the top of the panel.
        panel_ax.invert_yaxis()

    fig.tight_layout()
    fig.savefig(fig_dir / 'shared_pathways_comparison.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    print("  ‚úì Shared pathways comparison saved")

# --- Plot 3: Per cell type comparison heatmap ---
# Rows of the heatmap are the three term-count categories, columns are the
# cell types, so the per-cell-type table is indexed by cell type and transposed.
heatmap_columns = ['Up_Unique_Terms', 'Down_Unique_Terms', 'Shared_Terms']
comparison_matrix = celltype_comparison_df.set_index('Cell_Type')[heatmap_columns].copy()

fig, ax = plt.subplots(figsize=(12, 10))

sns.heatmap(
    comparison_matrix.T,
    annot=True,
    fmt='g',
    cmap='YlOrRd',
    cbar_kws={'label': 'Number of Terms'},
    ax=ax,
)

ax.set_title('Pathway Term Counts per Cell Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Cell Type', fontsize=11)
ax.set_ylabel('Category', fontsize=11)

# Slanted cell-type labels, upright category labels.
plt.xticks(rotation=45, ha='right', fontsize=9)
plt.yticks(rotation=0, fontsize=10)

fig.tight_layout()
fig.savefig(fig_dir / 'celltype_comparison_heatmap.png', dpi=300, bbox_inches='tight')
plt.close(fig)
print("  ‚úì Cell type comparison heatmap saved")

# --- Plot 4: Stacked bar chart of pathway distribution ---
fig, ax = plt.subplots(figsize=(14, 8))

celltype_comparison_df_sorted = celltype_comparison_df.sort_values('Up_Unique_Terms', ascending=False)

x = range(len(celltype_comparison_df_sorted))
width = 0.35

# The three segments are mutually exclusive term sets. The bar labelled
# 'Upregulated Only' must use Up_Only_Terms: Up_Unique_Terms already contains
# the shared terms, so stacking Shared_Terms on top of it counted them twice.
up_only_counts = celltype_comparison_df_sorted['Up_Only_Terms'].to_numpy()
shared_counts = celltype_comparison_df_sorted['Shared_Terms'].to_numpy()
down_only_counts = celltype_comparison_df_sorted['Down_Only_Terms'].to_numpy()

ax.bar(x, up_only_counts, width,
       label='Upregulated Only', color='#FF6B6B', alpha=0.8)
ax.bar(x, shared_counts, width,
       bottom=up_only_counts,
       label='Shared', color='#95E1D3', alpha=0.8)
ax.bar(x, down_only_counts, width,
       bottom=up_only_counts + shared_counts,
       label='Downregulated Only', color='#4ECDC4', alpha=0.8)

ax.set_xticks(x)
ax.set_xticklabels(celltype_comparison_df_sorted['Cell_Type'], rotation=45, ha='right', fontsize=9)
ax.set_ylabel('Number of Pathway Terms', fontsize=11)
ax.set_title('Distribution of Pathway Terms Across Cell Types', fontsize=12, fontweight='bold')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig(fig_dir / 'pathway_distribution_stacked.png', dpi=300, bbox_inches='tight')
plt.close()
print("  ‚úì Stacked distribution chart saved")

print(f"\n  ‚Üí All figures saved to: {fig_dir}")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

# Closing report: headline counts followed by where the outputs were written.
# Each entry is printed on its own line, in order.
final_report_lines = [
    "\n" + "=" * 70,
    "COMPARISON ANALYSIS COMPLETE!",
    "=" * 70,
    f"\nüìä Key Findings:",
    f"  ‚Ä¢ Unique upregulated pathways: {len(up_only_terms)}",
    f"  ‚Ä¢ Unique downregulated pathways: {len(down_only_terms)}",
    f"  ‚Ä¢ Pathways in both directions: {len(common_terms)}",
    f"  ‚Ä¢ Cell types analyzed: {len(cell_types)}",
    f"\nüìÅ Output Location: {OUTPUT_DIR}",
    f"  ‚Ä¢ Summary tables: 4 CSV files",
    f"  ‚Ä¢ Figures: {fig_dir}",
    "\n‚úÖ Check the 'shared_pathways_across_celltypes.csv' for pathways",
    "   enriched in multiple cell types!",
    "=" * 70,
]

for report_line in final_report_lines:
    print(report_line)

GO ENRICHMENT COMPARISON: UPREGULATED vs DOWNREGULATED

[1/5] Loading cell types...
Found 21 cell types

[2/5] Loading enrichment results...
  ✗ UP - CD4+ T Cells: Not found
  ✗ DOWN - CD4+ T Cells: Not found
  ✓ UP - CMS3: 115 significant terms
  ✓ DOWN - CMS3: 27 significant terms
  ✓ UP - Tip-like ECs: 34 significant terms
  ✓ DOWN - Tip-like ECs: 10 significant terms
  ✗ UP - CD8+ T cells: Not found
  ✗ DOWN - CD8+ T cells: Not found
  ✗ UP - B Cells: Not found
  ✗ DOWN - B Cells: Not found
  ✓ UP - Spp1+: 53 significant terms
  ✓ DOWN - Spp1+: 0 significant terms
  ✓ UP - Mast cells: 8 significant terms
  ✓ DOWN - Mast cells: 0 significant terms
  ✓ UP - Stromal 2: 29 significant terms
  ✓ DOWN - Stromal 2: 2 significant terms
  ✗ UP - CMS2: Not found
  ✓ DOWN - CMS2: 4 significant terms
  ✗ UP - Regulatory T Cells: Not found
  ✗ DOWN - Regulatory T Cells: Not found
  ✓ UP - Pericytes: 0 significant terms
  ‚úì DOWN - Pericytes: 11 signifi