In [27]:
import pandas as pd
import pybedtools
import os

# Not integrated

## Align2

### analyze_mecp2_cpg_enrichment_align2_005

In [38]:
# Switch into the output directory of this analysis run; every relative file
# name used below (and in the following cells) resolves against it.
os.chdir('/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_MeCP2_CUTandTAG/iterative_alternative/analyze_mecp2_cpg_enrichment_align2_005/NSC/mecp2_cpg_enrichment_parallel')

# Load the enrichment table (its rows are reported as peaks below).
df = pd.read_csv('mecp2_cpg_enrichment_parallel.csv')

# Partition the table by the value of its `binding_type` column. A label that
# does not occur in the data maps to an empty frame.
binding_types_df = {
    label: df.loc[df['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

print("analyze_mecp2_cpg_enrichment:")
print(f"Total number of peaks: {len(df)}")

# Write one CSV per binding type into the current directory and report its size.
for binding_type in binding_types_df:
    df_subset = binding_types_df[binding_type]
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} peaks ({len(df_subset)} rows) to {output_file}")

analyze_mecp2_cpg_enrichment:
Total number of peaks: 13713
Saved both peaks (9361 rows) to mecp2_cpg_enrichment_parallel_both.csv
Saved exo_only peaks (3500 rows) to mecp2_cpg_enrichment_parallel_exo_only.csv
Saved endo_only peaks (852 rows) to mecp2_cpg_enrichment_parallel_endo_only.csv


In [39]:
# Export the peak coordinates as a 3-column BED interval set.
peaks_bed = pybedtools.BedTool.from_dataframe(df[['chr', 'start', 'end']])

# CpG island annotation. The path is relative, so it resolves against the
# working directory selected by the os.chdir() call made earlier in the notebook.
cpg_islands = pybedtools.BedTool('../../../../DATA/cpg_islands.bed')

# Find overlaps between peaks and CpG islands.
# wa=True / wb=True correspond to bedtools -wa -wb: each output row is the
# original peak record followed by the original CpG island record. The overlap
# length is NOT reported (that would be -wo), which is why only 3 + 6 column
# names are supplied. A peak overlapping several islands yields one row per
# island.
# NOTE(review): assumes cpg_islands.bed has exactly six columns - confirm.
overlaps = peaks_bed.intersect(cpg_islands, wa=True, wb=True).to_dataframe(
    names=['peak_chr', 'peak_start', 'peak_end',
           'cpg_chr', 'cpg_start', 'cpg_end', 'cpg_id', 'cpg_label', 'cpg_number'])

# Keep only the first run of digits of the CpG count column (drops any
# non-numeric prefix such as "CpG:"). The pattern is a raw string so that "\d"
# is not parsed as an (invalid) string escape; expand=False returns a Series.
# astype(int) still raises if a value contains no digits at all.
overlaps['cpg_number'] = (
    overlaps['cpg_number']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Inner-join the overlaps back onto the full peak table so that every
# peak/island pair carries all peak columns; peaks without any overlap drop out.
df_cpg = df.merge(
    overlaps[['peak_chr', 'peak_start', 'peak_end', 'cpg_start', 'cpg_end', 'cpg_number']],
    left_on=['chr', 'start', 'end'],
    right_on=['peak_chr', 'peak_start', 'peak_end']
).drop(['peak_chr', 'peak_start', 'peak_end'], axis=1)

# Split the CpG-overlapping rows by binding_type.
binding_types_df = {
    label: df_cpg[df_cpg['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

# df_cpg has one row per peak/island pair, so count distinct peak coordinates
# rather than rows when reporting the number of peaks.
n_cpg_peaks = len(df_cpg[['chr', 'start', 'end']].drop_duplicates())

print("CpG islands analysis:")
print(f"Total number of peaks overlapping CpG islands: {n_cpg_peaks}")

# Save each subset with a _CpG suffix (row counts are peak/island pairs).
for binding_type, df_subset in binding_types_df.items():
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}_CpG.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} CpG peaks ({len(df_subset)} rows) to {output_file}")

CpG islands analysis:
Total number of peaks overlapping CpG islands: 13713
Saved both CpG peaks (9361 rows) to mecp2_cpg_enrichment_parallel_both_CpG.csv
Saved exo_only CpG peaks (3500 rows) to mecp2_cpg_enrichment_parallel_exo_only_CpG.csv
Saved endo_only CpG peaks (852 rows) to mecp2_cpg_enrichment_parallel_endo_only_CpG.csv


In [40]:
# Preview the first rows of the merged peak / CpG-island table (df_cpg).
df_cpg.head()

Unnamed: 0,chr,start,end,exo_signal,endo_signal,enrichment,pvalue,binding_type,peak_width_exo,peak_width_endo,significant,cpg_start,cpg_end,cpg_number
0,chr1,3670619,3671074,14.040878,0.0,inf,1.0,exo_only,681.0,0.0,False,3670619,3671074,34
1,chr1,4571641,4572075,50.997745,0.0,inf,1.0,exo_only,447.333333,0.0,False,4571641,4572075,44
2,chr1,4689184,4689397,77.713614,0.0,inf,1.0,exo_only,2094.0,0.0,False,4689184,4689397,24
3,chr1,4785376,4785814,135.213609,0.0,inf,1.0,exo_only,808.2,0.0,False,4785376,4785814,49
4,chr1,4807559,4808103,205.420074,0.0,inf,1.0,exo_only,949.8,0.0,False,4807559,4808103,73


### analyze_mecp2_cpg_enrichment_align2_005_consistent_peaks

In [41]:
# Switch into the output directory of this analysis run; every relative file
# name used below (and in the following cells) resolves against it.
os.chdir('/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_MeCP2_CUTandTAG/iterative_alternative/analyze_mecp2_cpg_enrichment_align2_005_consistent_peaks/NSC/mecp2_cpg_enrichment_parallel')

# Load the enrichment table (its rows are reported as peaks below).
df = pd.read_csv('mecp2_cpg_enrichment_parallel.csv')

# Partition the table by the value of its `binding_type` column. A label that
# does not occur in the data maps to an empty frame.
binding_types_df = {
    label: df.loc[df['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

print("analyze_mecp2_cpg_enrichment_only_consistent_peaks")
print(f"Total number of peaks: {len(df)}")

# Write one CSV per binding type into the current directory and report its size.
for binding_type in binding_types_df:
    df_subset = binding_types_df[binding_type]
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} peaks ({len(df_subset)} rows) to {output_file}")

analyze_mecp2_cpg_enrichment_only_consistent_peaks
Total number of peaks: 13626
Saved both peaks (10348 rows) to mecp2_cpg_enrichment_parallel_both.csv
Saved exo_only peaks (2361 rows) to mecp2_cpg_enrichment_parallel_exo_only.csv
Saved endo_only peaks (917 rows) to mecp2_cpg_enrichment_parallel_endo_only.csv


In [42]:
# Export the peak coordinates as a 3-column BED interval set.
peaks_bed = pybedtools.BedTool.from_dataframe(df[['chr', 'start', 'end']])

# CpG island annotation. The path is relative, so it resolves against the
# working directory selected by the os.chdir() call made earlier in the notebook.
cpg_islands = pybedtools.BedTool('../../../../DATA/cpg_islands.bed')

# Find overlaps between peaks and CpG islands.
# wa=True / wb=True correspond to bedtools -wa -wb: each output row is the
# original peak record followed by the original CpG island record. The overlap
# length is NOT reported (that would be -wo), which is why only 3 + 6 column
# names are supplied. A peak overlapping several islands yields one row per
# island.
# NOTE(review): assumes cpg_islands.bed has exactly six columns - confirm.
overlaps = peaks_bed.intersect(cpg_islands, wa=True, wb=True).to_dataframe(
    names=['peak_chr', 'peak_start', 'peak_end',
           'cpg_chr', 'cpg_start', 'cpg_end', 'cpg_id', 'cpg_label', 'cpg_number'])

# Keep only the first run of digits of the CpG count column (drops any
# non-numeric prefix such as "CpG:"). The pattern is a raw string so that "\d"
# is not parsed as an (invalid) string escape; expand=False returns a Series.
# astype(int) still raises if a value contains no digits at all.
overlaps['cpg_number'] = (
    overlaps['cpg_number']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Inner-join the overlaps back onto the full peak table so that every
# peak/island pair carries all peak columns; peaks without any overlap drop out.
df_cpg = df.merge(
    overlaps[['peak_chr', 'peak_start', 'peak_end', 'cpg_start', 'cpg_end', 'cpg_number']],
    left_on=['chr', 'start', 'end'],
    right_on=['peak_chr', 'peak_start', 'peak_end']
).drop(['peak_chr', 'peak_start', 'peak_end'], axis=1)

# Split the CpG-overlapping rows by binding_type.
binding_types_df = {
    label: df_cpg[df_cpg['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

# df_cpg has one row per peak/island pair, so count distinct peak coordinates
# rather than rows when reporting the number of peaks.
n_cpg_peaks = len(df_cpg[['chr', 'start', 'end']].drop_duplicates())

print("CpG islands analysis:")
print(f"Total number of peaks overlapping CpG islands: {n_cpg_peaks}")

# Save each subset with a _CpG suffix (row counts are peak/island pairs).
for binding_type, df_subset in binding_types_df.items():
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}_CpG.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} CpG peaks ({len(df_subset)} rows) to {output_file}")

CpG islands analysis:
Total number of peaks overlapping CpG islands: 13626
Saved both CpG peaks (10348 rows) to mecp2_cpg_enrichment_parallel_both_CpG.csv
Saved exo_only CpG peaks (2361 rows) to mecp2_cpg_enrichment_parallel_exo_only_CpG.csv
Saved endo_only CpG peaks (917 rows) to mecp2_cpg_enrichment_parallel_endo_only_CpG.csv


## Align1

### analyze_mecp2_cpg_enrichment_align1_005

In [43]:
# Switch into the output directory of this analysis run; every relative file
# name used below (and in the following cells) resolves against it.
os.chdir('/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_MeCP2_CUTandTAG/iterative_alternative/analyze_mecp2_cpg_enrichment_align1_005/NSC/mecp2_cpg_enrichment_parallel')

# Load the enrichment table (its rows are reported as peaks below).
df = pd.read_csv('mecp2_cpg_enrichment_parallel.csv')

# Partition the table by the value of its `binding_type` column. A label that
# does not occur in the data maps to an empty frame.
binding_types_df = {
    label: df.loc[df['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

print("analyze_mecp2_cpg_enrichment:")
print(f"Total number of peaks: {len(df)}")

# Write one CSV per binding type into the current directory and report its size.
for binding_type in binding_types_df:
    df_subset = binding_types_df[binding_type]
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} peaks ({len(df_subset)} rows) to {output_file}")

analyze_mecp2_cpg_enrichment:
Total number of peaks: 14649
Saved both peaks (12425 rows) to mecp2_cpg_enrichment_parallel_both.csv
Saved exo_only peaks (1123 rows) to mecp2_cpg_enrichment_parallel_exo_only.csv
Saved endo_only peaks (1101 rows) to mecp2_cpg_enrichment_parallel_endo_only.csv


In [44]:
# Export the peak coordinates as a 3-column BED interval set.
peaks_bed = pybedtools.BedTool.from_dataframe(df[['chr', 'start', 'end']])

# CpG island annotation. The path is relative, so it resolves against the
# working directory selected by the os.chdir() call made earlier in the notebook.
cpg_islands = pybedtools.BedTool('../../../../DATA/cpg_islands.bed')

# Find overlaps between peaks and CpG islands.
# wa=True / wb=True correspond to bedtools -wa -wb: each output row is the
# original peak record followed by the original CpG island record. The overlap
# length is NOT reported (that would be -wo), which is why only 3 + 6 column
# names are supplied. A peak overlapping several islands yields one row per
# island.
# NOTE(review): assumes cpg_islands.bed has exactly six columns - confirm.
overlaps = peaks_bed.intersect(cpg_islands, wa=True, wb=True).to_dataframe(
    names=['peak_chr', 'peak_start', 'peak_end',
           'cpg_chr', 'cpg_start', 'cpg_end', 'cpg_id', 'cpg_label', 'cpg_number'])

# Keep only the first run of digits of the CpG count column (drops any
# non-numeric prefix such as "CpG:"). The pattern is a raw string so that "\d"
# is not parsed as an (invalid) string escape; expand=False returns a Series.
# astype(int) still raises if a value contains no digits at all.
overlaps['cpg_number'] = (
    overlaps['cpg_number']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Inner-join the overlaps back onto the full peak table so that every
# peak/island pair carries all peak columns; peaks without any overlap drop out.
df_cpg = df.merge(
    overlaps[['peak_chr', 'peak_start', 'peak_end', 'cpg_start', 'cpg_end', 'cpg_number']],
    left_on=['chr', 'start', 'end'],
    right_on=['peak_chr', 'peak_start', 'peak_end']
).drop(['peak_chr', 'peak_start', 'peak_end'], axis=1)

# Split the CpG-overlapping rows by binding_type.
binding_types_df = {
    label: df_cpg[df_cpg['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

# df_cpg has one row per peak/island pair, so count distinct peak coordinates
# rather than rows when reporting the number of peaks.
n_cpg_peaks = len(df_cpg[['chr', 'start', 'end']].drop_duplicates())

print("CpG islands analysis:")
print(f"Total number of peaks overlapping CpG islands: {n_cpg_peaks}")

# Save each subset with a _CpG suffix (row counts are peak/island pairs).
for binding_type, df_subset in binding_types_df.items():
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}_CpG.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} CpG peaks ({len(df_subset)} rows) to {output_file}")

CpG islands analysis:
Total number of peaks overlapping CpG islands: 14649
Saved both CpG peaks (12425 rows) to mecp2_cpg_enrichment_parallel_both_CpG.csv
Saved exo_only CpG peaks (1123 rows) to mecp2_cpg_enrichment_parallel_exo_only_CpG.csv
Saved endo_only CpG peaks (1101 rows) to mecp2_cpg_enrichment_parallel_endo_only_CpG.csv


### analyze_mecp2_cpg_enrichment_align1_005_consistent_peaks

In [45]:
# Switch into the output directory of this analysis run; every relative file
# name used below (and in the following cells) resolves against it.
os.chdir('/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_MeCP2_CUTandTAG/iterative_alternative/analyze_mecp2_cpg_enrichment_align1_005_consistent_peaks/NSC/mecp2_cpg_enrichment_parallel')

# Load the enrichment table (its rows are reported as peaks below).
df = pd.read_csv('mecp2_cpg_enrichment_parallel.csv')

# Partition the table by the value of its `binding_type` column. A label that
# does not occur in the data maps to an empty frame.
binding_types_df = {
    label: df.loc[df['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

print("analyze_mecp2_cpg_enrichment_only_consistent_peaks")
print(f"Total number of peaks: {len(df)}")

# Write one CSV per binding type into the current directory and report its size.
for binding_type in binding_types_df:
    df_subset = binding_types_df[binding_type]
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} peaks ({len(df_subset)} rows) to {output_file}")

analyze_mecp2_cpg_enrichment_only_consistent_peaks
Total number of peaks: 14467
Saved both peaks (12337 rows) to mecp2_cpg_enrichment_parallel_both.csv
Saved exo_only peaks (1343 rows) to mecp2_cpg_enrichment_parallel_exo_only.csv
Saved endo_only peaks (787 rows) to mecp2_cpg_enrichment_parallel_endo_only.csv


In [46]:
# Export the peak coordinates as a 3-column BED interval set.
peaks_bed = pybedtools.BedTool.from_dataframe(df[['chr', 'start', 'end']])

# CpG island annotation. The path is relative, so it resolves against the
# working directory selected by the os.chdir() call made earlier in the notebook.
cpg_islands = pybedtools.BedTool('../../../../DATA/cpg_islands.bed')

# Find overlaps between peaks and CpG islands.
# wa=True / wb=True correspond to bedtools -wa -wb: each output row is the
# original peak record followed by the original CpG island record. The overlap
# length is NOT reported (that would be -wo), which is why only 3 + 6 column
# names are supplied. A peak overlapping several islands yields one row per
# island.
# NOTE(review): assumes cpg_islands.bed has exactly six columns - confirm.
overlaps = peaks_bed.intersect(cpg_islands, wa=True, wb=True).to_dataframe(
    names=['peak_chr', 'peak_start', 'peak_end',
           'cpg_chr', 'cpg_start', 'cpg_end', 'cpg_id', 'cpg_label', 'cpg_number'])

# Keep only the first run of digits of the CpG count column (drops any
# non-numeric prefix such as "CpG:"). The pattern is a raw string so that "\d"
# is not parsed as an (invalid) string escape; expand=False returns a Series.
# astype(int) still raises if a value contains no digits at all.
overlaps['cpg_number'] = (
    overlaps['cpg_number']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)

# Inner-join the overlaps back onto the full peak table so that every
# peak/island pair carries all peak columns; peaks without any overlap drop out.
df_cpg = df.merge(
    overlaps[['peak_chr', 'peak_start', 'peak_end', 'cpg_start', 'cpg_end', 'cpg_number']],
    left_on=['chr', 'start', 'end'],
    right_on=['peak_chr', 'peak_start', 'peak_end']
).drop(['peak_chr', 'peak_start', 'peak_end'], axis=1)

# Split the CpG-overlapping rows by binding_type.
binding_types_df = {
    label: df_cpg[df_cpg['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

# df_cpg has one row per peak/island pair, so count distinct peak coordinates
# rather than rows when reporting the number of peaks.
n_cpg_peaks = len(df_cpg[['chr', 'start', 'end']].drop_duplicates())

print("CpG islands analysis:")
print(f"Total number of peaks overlapping CpG islands: {n_cpg_peaks}")

# Save each subset with a _CpG suffix (row counts are peak/island pairs).
for binding_type, df_subset in binding_types_df.items():
    output_file = f"mecp2_cpg_enrichment_parallel_{binding_type}_CpG.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} CpG peaks ({len(df_subset)} rows) to {output_file}")

CpG islands analysis:
Total number of peaks overlapping CpG islands: 14467
Saved both CpG peaks (12337 rows) to mecp2_cpg_enrichment_parallel_both_CpG.csv
Saved exo_only CpG peaks (1343 rows) to mecp2_cpg_enrichment_parallel_exo_only_CpG.csv
Saved endo_only CpG peaks (787 rows) to mecp2_cpg_enrichment_parallel_endo_only_CpG.csv


# Integrated

In [7]:
# Switch into the "integrated" output directory of the align2 consistent-peaks
# run; the relative file names below resolve against it.
os.chdir('/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_MeCP2_CUTandTAG/iterative_alternative/analyze_mecp2_cpg_enrichment_align2_005_consistent_peaks/NSC/integrated')

# Load the enriched-genes table.
df = pd.read_csv('mecp2_enriched_genes.csv')

# Partition the table by the value of its `binding_type` column. A label that
# does not occur in the data maps to an empty frame.
# NOTE(review): the recorded run wrote 0 rows for exo_only and endo_only -
# confirm which binding_type labels this table actually contains.
binding_types_df = {
    label: df.loc[df['binding_type'] == label]
    for label in ('both', 'exo_only', 'endo_only')
}

# Write one CSV per binding type into the current directory and report its size.
for binding_type in binding_types_df:
    df_subset = binding_types_df[binding_type]
    output_file = f"mecp2_enriched_genes_{binding_type}.csv"
    df_subset.to_csv(output_file, index=False)
    print(f"Saved {binding_type} peaks ({len(df_subset)} rows) to {output_file}")

Saved both peaks (1607 rows) to mecp2_enriched_genes_both.csv
Saved exo_only peaks (0 rows) to mecp2_enriched_genes_exo_only.csv
Saved endo_only peaks (0 rows) to mecp2_enriched_genes_endo_only.csv
