In [1]:
import pandas as pd
import glob
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Experiment identifier; it selects which analysis directory is used below.
EXPERIMENT = "align2_005"

# Make that directory the working directory so the relative paths used by
# later cells (e.g. "integrated/") resolve inside it.
os.chdir(
    f'/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG/iterative_alternative/analyze_mecp2_cpg_enrichment_{EXPERIMENT}'
)

# Integrate

In [2]:
# Load the integrated results.
# results_dir keeps its trailing separator (later cells reuse it as-is);
# os.path.join copes with that and yields "integrated/<file>" instead of the
# "integrated//<file>" produced by f-string concatenation.
results_dir = "integrated/"
df = pd.read_csv(os.path.join(results_dir, "mecp2_enriched_genes.csv"))

In [3]:
# Sanity check of the loaded table: print (rows, columns), then display the
# first rows as the cell output.
print(df.shape)
df.head()

(12004, 12)


Unnamed: 0,gene,chr,cpg_start,cpg_end,mecp2_enrichment,exo_signal,endo_signal,binding_type,distance_to_gene,baseMean,log2FoldChange,padj
0,0610009B22Rik,chr11,51688626,51689166,0.927207,2775.264257,2993.145475,both,292,,,
1,0610010F05Rik,chr11,23633135,23633766,2.544609,633.626135,249.007293,both,127,,,
2,0610010K14Rik,chr11,70237525,70238413,1.138274,432.391788,379.866033,both,499,,,
3,0610011F06Rik,chr17,25875044,25875794,1.247181,675.105061,541.304633,both,420,,,
4,0610012G03Rik,chr16,31948044,31948757,2.163532,77.034992,35.606118,both,263,1483.241219,0.327138,0.018344


In [4]:
# Keep only the rows with a non-missing baseMean.
# notna() is the direct form of `isna() == False` and selects the same rows.
df = df[df['baseMean'].notna()]
print(df.shape)
df.head()

(1553, 12)


Unnamed: 0,gene,chr,cpg_start,cpg_end,mecp2_enrichment,exo_signal,endo_signal,binding_type,distance_to_gene,baseMean,log2FoldChange,padj
4,0610012G03Rik,chr16,31948044,31948757,2.163532,77.034992,35.606118,both,263,1483.241219,0.327138,0.018344
14,1110008P14Rik,chr2,32381502,32382189,0.87185,156.051024,178.988456,both,251,1256.617818,0.326418,0.010881
22,1110065P20Rik,chr4,124850233,124851474,0.532049,96.266398,180.935138,both,278,716.898761,0.496807,0.001177
62,1700025G04Rik,chr1,152089616,152090328,1.892529,359.393629,189.901261,both,203,2187.147601,-0.277303,0.013851
101,1810010H24Rik,chr11,107027745,107028594,0.513738,152.05672,295.981148,both,1849,255.436932,0.930793,4.7e-05


In [5]:
# Tally how many rows carry each binding_type value.
df['binding_type'].value_counts()

binding_type
both        1486
exo_only      67
Name: count, dtype: int64

In [6]:
# Keep only the rows whose binding_type is 'both'.
# .copy() makes the result an independent frame, so adding a column to `df`
# afterwards does not trigger pandas' SettingWithCopyWarning.
df = df[df['binding_type'] == 'both'].copy()

In [7]:
# Define gene categories based on RNA-seq data
def categorize_gene(row, fc_threshold=0.5, padj_threshold=0.05):
    """Label one row as up-, down- or non-deregulated.

    Parameters
    ----------
    row : mapping-like
        Must provide ``'log2FoldChange'`` and ``'padj'`` by key lookup
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).
    fc_threshold : float, default 0.5
        Absolute log2 fold change a row must reach (inclusive) to count as
        deregulated.
    padj_threshold : float, default 0.05
        Largest adjusted p-value (inclusive) still treated as significant.

    Returns
    -------
    str
        ``'up-regulated'``, ``'down-regulated'`` or ``'non-deregulated'``.
        Rows with a missing fold change or adjusted p-value are always
        ``'non-deregulated'``.
    """
    unchanged = 'non-deregulated'

    # Without both statistics there is nothing to base a call on.
    if pd.isna(row['log2FoldChange']) or pd.isna(row['padj']):
        return unchanged

    # Not significant: the size of the fold change is irrelevant.
    if row['padj'] > padj_threshold:
        return unchanged

    log2_fc = row['log2FoldChange']
    if log2_fc >= fc_threshold:
        return 'up-regulated'
    if log2_fc <= -fc_threshold:
        return 'down-regulated'

    # Significant, but the change is smaller than the threshold.
    return unchanged

In [8]:
# Add category column: categorize_gene is applied to every row (axis=1) with
# its default thresholds, and the returned label is stored in 'category'.
df['category'] = df.apply(categorize_gene, axis=1)


In [9]:
# Number of rows assigned to each expression category.
df['category'].value_counts()

category
non-deregulated    1156
down-regulated      207
up-regulated        123
Name: count, dtype: int64

In [10]:
# Density plot of mecp2_enrichment per expression category, written to a PDF,
# followed by a printed per-category summary.

# Categories and their line colours are defined once so the plot loop and the
# summary loop below always cover the same categories in the same order.
category_colors = {
    'non-deregulated': 'blue',
    'up-regulated': 'orange',
    'down-regulated': 'green',
}

# Explicit figure/axes instead of pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(12, 8))

# Plot density for each category
for category, color in category_colors.items():
    subset = df[df['category'] == category]
    # An empty category has nothing to estimate a density from.
    if len(subset) > 0:
        sns.kdeplot(data=subset['mecp2_enrichment'],
                    label=category,
                    color=color,
                    ax=ax)

ax.set_title('Mecp2 Enrichment Distribution by Gene Category')
ax.set_xlabel('Enrichment (Exo/Endo)')
ax.set_ylabel('Density')
# Fixed view window; enrichment values above 8 fall outside the plotted range.
ax.set_xlim(0, 8)
ax.set_ylim(0, 1.2)
ax.legend()

# Add some statistics
for category in category_colors:
    subset = df[df['category'] == category]
    print(f"\n{category}:")
    print(f"Number of genes: {len(subset)}")
    print(f"Mean enrichment: {subset['mecp2_enrichment'].mean():.2f}")
    print(f"Median enrichment: {subset['mecp2_enrichment'].median():.2f}")

# Save the plot. os.path.join avoids the doubled separator that f-string
# concatenation produces when results_dir already ends with "/".
fig.savefig(os.path.join(results_dir, "mecp2_enrichment_by_expression.pdf"),
            bbox_inches='tight', dpi=300)
# Close the figure so it is not rendered inline and its memory is released.
plt.close(fig)


non-deregulated:
Number of genes: 1156
Mean enrichment: 2.30
Median enrichment: 1.69

up-regulated:
Number of genes: 123
Mean enrichment: 2.22
Median enrichment: 1.74

down-regulated:
Number of genes: 207
Mean enrichment: 2.37
Median enrichment: 1.54
