In [24]:
import pandas as pd
import glob
import numpy as np
import os
import matplotlib.pyplot as plt

# Experiment tag; it selects which analysis output directory is used below.
EXPERIMENT = "align2_005"

# Make the experiment's analysis directory the working directory so that the
# relative paths used in later cells resolve against it.
analysis_dir = f'/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG/iterative_alternative/analyze_mecp2_cpg_enrichment_{EXPERIMENT}'
os.chdir(analysis_dir)

# Section: results of the parallel run (chunk files under mecp2_cpg_enrichment_parallel/)


In [25]:
# Differential expression analysis (DEA) results table, read relative to the
# current working directory.
dea_csv = '../../DATA/DEA_NSC.csv'
dea_nsc = pd.read_csv(dea_csv)

# Report (rows, columns), then preview the first rows as the cell output.
print(dea_nsc.shape)
dea_nsc.head(n=5)

(14245, 7)


Unnamed: 0,gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,Mir5125,128.824113,7.3e-05,0.189117,0.000383,0.999694,0.999694
1,Trim68,98.639563,0.000133,0.215486,0.000617,0.999508,0.999614
2,Frrs1,442.94612,9.7e-05,0.109825,0.000883,0.999295,0.999556
3,Notch4,60.968185,0.000246,0.2663,0.000925,0.999262,0.999556
4,Ano6,5982.860518,7.4e-05,0.070708,0.001043,0.999168,0.999503


In [26]:
# Keep only rows whose `padj` is below 0.05.
# Rows with a missing padj compare as False and are dropped as well.
padj_below_cutoff = dea_nsc['padj'].lt(0.05)
dea_nsc = dea_nsc.loc[padj_below_cutoff]
dea_nsc.shape

(8908, 7)

In [27]:
# Expression cutoff: the 0.02 quantile of baseMean among the rows currently
# in dea_nsc. NOTE: dea_nsc is reassigned by the filtering cells around this
# one, so the value depends on which of them have run when this cell executes.
low_expression_quantile = 0.02
expression_threshold = dea_nsc['baseMean'].quantile(low_expression_quantile)
print(expression_threshold)

60.457748287200005


In [28]:
# Drop the lowest-expressed rows: keep baseMean strictly above the threshold.
above_threshold = dea_nsc['baseMean'].gt(expression_threshold)
dea_nsc = dea_nsc.loc[above_threshold]
dea_nsc.shape

(8729, 7)

In [29]:
# Persist the filtered DEA table (the row index is not written).
# The path contains no placeholders, so a plain string literal is used.
filtered_dea_path = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG/iterative_alternative/DATA/DEA_NSC_filtered.csv'
dea_nsc.to_csv(filtered_dea_path, index=False)

In [30]:
# CpG islands BED file: tab-separated; the column names are supplied here.
bed_columns = ['chr', 'start', 'end', 'id', 'cpg_label', 'cpg_count']
cpg_islands = pd.read_csv('../../DATA/cpg_islands.bed', sep='\t', names=bed_columns)

# Strip the literal text "CpG:" from cpg_label (plain substring match, no regex).
cpg_label_clean = cpg_islands['cpg_label'].str.replace('CpG:', '', regex=False)
cpg_islands['cpg_label'] = cpg_label_clean

# Report (rows, columns), then preview the first rows as the cell output.
print(cpg_islands.shape)
cpg_islands.head()

(17017, 6)


Unnamed: 0,chr,start,end,id,cpg_label,cpg_count
0,chr1,3531624,3531843,611,,27
1,chr1,3670619,3671074,613,,34
2,chr1,3671654,3672156,613,,45
3,chr1,4491701,4493673,619,,165
4,chr1,4496947,4497608,619,,47


In [31]:
# Collect the per-chunk result CSVs found under mecp2_cpg_enrichment_parallel/.
# glob returns paths in arbitrary order; the combined table is re-sorted by
# ['chr', 'start'] after loading, so the listing order is not relied upon.
chunk_pattern = 'mecp2_cpg_enrichment_parallel/chunk_*.csv'
chunk_files = glob.glob(chunk_pattern)
chunk_files

['mecp2_cpg_enrichment_parallel/chunk_2.csv',
 'mecp2_cpg_enrichment_parallel/chunk_3.csv',
 'mecp2_cpg_enrichment_parallel/chunk_8.csv',
 'mecp2_cpg_enrichment_parallel/chunk_0.csv',
 'mecp2_cpg_enrichment_parallel/chunk_4.csv',
 'mecp2_cpg_enrichment_parallel/chunk_1.csv',
 'mecp2_cpg_enrichment_parallel/chunk_5.csv',
 'mecp2_cpg_enrichment_parallel/chunk_7.csv',
 'mecp2_cpg_enrichment_parallel/chunk_6.csv',
 'mecp2_cpg_enrichment_parallel/chunk_9.csv']

In [32]:
# Load every chunk CSV and stack them into one frame with a fresh 0..n-1 index.
chunk_frames = [pd.read_csv(chunk_path) for chunk_path in chunk_files]
df_parallel = pd.concat(chunk_frames, ignore_index=True)

# Order rows by the 'chr' column, then by start coordinate; the index labels
# assigned during concatenation are kept.
df_parallel = df_parallel.sort_values(by=['chr', 'start'])

print(f"Total regions analyzed: {len(df_parallel)}")

Total regions analyzed: 15032


In [33]:
# Preview the first five rows of the combined, sorted table.
df_parallel.head(n=5)

Unnamed: 0,chr,start,end,exo_signal,endo_signal,enrichment,pvalue,binding_type,peak_width_exo,peak_width_endo,significant
4777,chr1,3531624,3531843,0.0,17.214411,0.0,1.0,endo_only,0.0,416.0,False
4778,chr1,3670619,3671074,21.44108,119.642541,0.17921,0.060298,both,462.666667,1554.0,False
4779,chr1,3671654,3672156,114.589139,119.642541,0.957762,0.060298,both,671.333333,1554.0,False
4780,chr1,4496947,4497608,10.886892,20.508762,0.530841,1.0,both,551.0,393.0,False
4781,chr1,4571641,4572075,36.039438,12.499547,2.88326,0.353367,both,382.75,345.0,False


In [34]:
# Restrict to chr1-chr19 plus chrX and chrY; rows on any other sequence name
# are dropped.
numbered_chromosomes = [f'chr{number}' for number in range(1, 20)]
kept_chromosomes = numbered_chromosomes + ['chrX', 'chrY']
on_kept_chromosome = df_parallel['chr'].isin(kept_chromosomes)
df_parallel = df_parallel[on_kept_chromosome]
df_parallel.shape

(15028, 11)

In [35]:
# Mark every remaining region as significant.
# NOTE(review): this replaces the `significant` values loaded from the chunk
# files (the preview of df_parallel shows False entries) -- confirm that
# overriding them is intended.
# `assign` builds a new frame instead of writing into the filtered one.
df_parallel = df_parallel.assign(significant=True)

In [36]:
# Save the combined table alongside the chunk files (row index not written).
combined_csv = 'mecp2_cpg_enrichment_parallel/mecp2_cpg_enrichment_parallel.csv'
df_parallel.to_csv(combined_csv, index=False)

In [37]:
# Disabled filter: would keep only regions where exo_signal or endo_signal exceeds 4.0.
# df_parallel = df_parallel[(df_parallel['exo_signal'] > 4.0) | (df_parallel['endo_signal'] > 4.0)]

In [38]:
# Select the regions whose binding_type is exactly "both".
is_both = df_parallel['binding_type'].eq("both")
df_both = df_parallel[is_both]
df_both.shape

(13209, 11)

In [39]:
# Rank the "both" regions from highest to lowest enrichment, then preview
# the top five rows.
df_both = df_both.sort_values(by='enrichment', ascending=False)
df_both.head(n=5)

Unnamed: 0,chr,start,end,exo_signal,endo_signal,enrichment,pvalue,binding_type,peak_width_exo,peak_width_endo,significant
12733,chr7,123369426,123369630,2976.203129,8.75061,340.113789,0.039143,both,1106.2,433.0,True
14571,chr9,119339620,119340000,1647.147421,13.874465,118.717907,0.039143,both,1054.4,441.0,True
8955,chr11,103117220,103117987,1387.366007,15.728703,88.206001,0.027168,both,996.4,436.0,True
10394,chr3,93555092,93555367,362.668182,4.184631,86.666707,0.039143,both,1041.0,264.0,True
1201,chr14,118611620,118611930,731.500263,8.522529,85.831356,0.03435,both,1171.166667,369.0,True


In [40]:
# Preview the first five rows of df_both again (same call as in the cell above).
df_both.head(n=5)

Unnamed: 0,chr,start,end,exo_signal,endo_signal,enrichment,pvalue,binding_type,peak_width_exo,peak_width_endo,significant
12733,chr7,123369426,123369630,2976.203129,8.75061,340.113789,0.039143,both,1106.2,433.0,True
14571,chr9,119339620,119340000,1647.147421,13.874465,118.717907,0.039143,both,1054.4,441.0,True
8955,chr11,103117220,103117987,1387.366007,15.728703,88.206001,0.027168,both,996.4,436.0,True
10394,chr3,93555092,93555367,362.668182,4.184631,86.666707,0.039143,both,1041.0,264.0,True
1201,chr14,118611620,118611930,731.500263,8.522529,85.831356,0.03435,both,1171.166667,369.0,True


In [41]:
# Save the "both" regions table (row index not written).
both_csv = 'mecp2_cpg_enrichment_parallel/mecp2_cpg_enrichment_both.csv'
df_both.to_csv(both_csv, index=False)