In [None]:
import pybedtools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from dotenv import load_dotenv

# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

# Fail early with a readable message instead of the TypeError that
# Path(None) would raise when DATA_PATH is missing.
_data_root = os.getenv("DATA_PATH")
if _data_root is None:
    raise RuntimeError(
        "DATA_PATH is not set; define it in the environment or in the .env file"
    )

# Root folder for the inputs used by this notebook.
DATA_PATH = Path(_data_root)/'garcia_ATAC'


In [None]:
# Location of the consensus regions BED file (absolute path).
scenic_consensus_peak_paths = '/mnt/windows/extradata/meiotic_cells/atac_preprocessing/consensus_regions.bed'

# Tab-separated, no header row; "#" introduces a comment.
peaks = pd.read_csv(
    scenic_consensus_peak_paths,
    sep='\t',
    header=None,
    comment="#",
)

# The file must have exactly these five columns, otherwise the assignment raises.
consensus_columns = ['chrom', 'start', 'end', 'name', 'score']
peaks.columns = consensus_columns

# BedTool view of the same table for interval operations.
peaks_bed = pybedtools.BedTool.from_dataframe(peaks)


In [None]:
# Show the BedTool built from the consensus peak table.
peaks_bed

In [None]:
# Number of intervals in the peak set.
n_peaks = len(peaks_bed)
print(f'total_peaks: {n_peaks}')

# Sum of the individual interval lengths (overlapping intervals, if any, are not merged).
covered_bases = sum(interval.length for interval in peaks_bed)
print(f'total_coverage: {covered_bases}')
    

In [None]:
# Show the consensus peak DataFrame.
peaks

In [None]:

# Feature label -> BED file of that annotation, resolved against DATA_PATH.
feature_beds = {
    label: DATA_PATH / relative_path
    for label, relative_path in (
        ('promoters', 'feature_annotation/Hs_EPDnew_006_hg38_900up400down.bed'),
        ('CpG_islands', 'feature_annotation/2023-12-29_CpGislands_export.bed'),
        ('imprints', 'feature_annotation/human_imprintome_hg38_ICRs_coordinates.bed'),
        ('TEs', 'feature_annotation/2024-01-06_RepeatMasker_UCSC_Export.bed'),
    )
}


In [None]:
# Load the promoter annotation as a BedTool for the exploratory intersect below.
feature = pybedtools.BedTool(feature_beds['promoters'])


In [None]:
# Intersect the peaks with the promoter track. wa/wb keep the full peak record
# and the full promoter record for every overlapping pair.
overlaps = peaks_bed.intersect(feature, wa=True, wb=True)
overlaps

In [None]:
# Peek at the first overlap records.
overlaps.head()

In [None]:
# Number of records in the peak set and in the promoter track.
len(peaks_bed), len(feature)


In [None]:
def analyze_scenic_accessibility(atac_peaks, feature_name, feature_bed, genome_size=2.9e9):
    """Summarise base-level overlap between ATAC peaks and one feature track.

    Both interval sets are sorted and merged before counting, so a base that is
    covered by several overlapping records (for example neighbouring promoter
    windows) contributes once to every total instead of once per record.

    Parameters
    ----------
    atac_peaks : pybedtools.BedTool
        Accessible regions.
    feature_name : str
        Label stored in the ``feature_type`` column of the result.
    feature_bed : str or path-like
        BED file holding the features; handed to ``pybedtools.BedTool``.
    genome_size : float, optional
        Genome length in bases used as background for the enrichment ratio.
        Defaults to 2.9e9.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``feature_type``,
        ``percent_coverage_of_features``, ``enrichment_over_genome``,
        ``total_peak_bases``, ``total_feature_bases`` and ``overlapping_bases``.
        The two ratio columns are NaN when their denominator is zero.
    """
    # Collapse overlapping / duplicated intervals on both sides. Without this,
    # -wo reports one row per (peak, feature) pair and shared bases are summed
    # several times, which can push the coverage above 100 %.
    merged_peaks = atac_peaks.sort().merge()
    merged_features = pybedtools.BedTool(feature_bed).sort().merge()

    # wo=True appends the number of overlapping bases as the last column.
    overlaps = merged_peaks.intersect(merged_features, wo=True)

    # Calculate base coverage
    total_peak_bases = sum(peak.length for peak in merged_peaks)
    total_feature_bases = sum(f.length for f in merged_features)
    bases_overlapping = sum(int(o.fields[-1]) for o in overlaps)

    nan = float('nan')

    # What proportion of feature bases are accessible?
    if total_feature_bases:
        percent_coverage = (bases_overlapping / total_feature_bases) * 100
    else:
        percent_coverage = nan

    # How enriched is accessibility at these features?
    # (fraction of peak bases inside features) / (fraction of genome inside features)
    if total_peak_bases and total_feature_bases:
        enrichment = (bases_overlapping / total_peak_bases) / (total_feature_bases / genome_size)
    else:
        enrichment = nan

    return pd.DataFrame([{
        'feature_type': feature_name,
        'percent_coverage_of_features': percent_coverage,
        'enrichment_over_genome': enrichment,
        # Raw counts for reference
        'total_peak_bases': total_peak_bases,
        'total_feature_bases': total_feature_bases,
        'overlapping_bases': bases_overlapping,
    }])

In [None]:
# Run the accessibility summary once per annotation track.
all_results = []
for name, bed in feature_beds.items():
    print(name)  # progress marker: some tracks take a while to intersect
    all_results.append(analyze_scenic_accessibility(peaks_bed, name, bed))

# Every per-feature frame holds a single row labelled 0; ignore_index gives the
# combined table a unique 0..n-1 index instead of repeating the label 0.
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df


In [None]:
# Resolve the output root from the environment and fail with a readable message
# instead of the TypeError that Path(None) would raise when it is missing.
output_root = os.getenv("OUTPUT_PATH")
if output_root is None:
    raise RuntimeError(
        "OUTPUT_PATH is not set; define it in the environment or in the .env file"
    )

feature_accessibility_csv = Path(output_root)/'garcia_ATAC/outputs/feature_accessibility.csv'

# to_csv does not create missing folders, so make sure the target directory exists.
feature_accessibility_csv.parent.mkdir(parents=True, exist_ok=True)

all_results_df.to_csv(feature_accessibility_csv, index=False)

In [None]:
def plot_accessibility_patterns(results_df):
    """Draw two side-by-side bar charts summarising accessibility per feature type.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain the columns ``feature_type``,
        ``percent_coverage_of_features`` and ``enrichment_over_genome``.

    Returns
    -------
    matplotlib.figure.Figure
        The figure holding both panels, so the caller can keep or save it.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 5))

    # Plot percentage of ATAC coverage at each feature
    sns.barplot(data=results_df,
                x='feature_type',
                y='percent_coverage_of_features',
                ax=ax1)
    ax1.set_title('% feature coverage')
    ax1.set_ylabel('% of feature bases')

    # Plot enrichment
    sns.barplot(data=results_df,
                x='feature_type',
                y='enrichment_over_genome',
                ax=ax2)
    ax2.set_title('Coverage enrichment over genome background')
    ax2.set_ylabel('Fold enrichment')

    # Rotate x-axis labels if they're too long
    for ax in [ax1, ax2]:
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    # Hand the figure back: callers assign the result of this function.
    return fig

In [None]:
# Draw the coverage and enrichment bar charts for all feature types.
fig = plot_accessibility_patterns(all_results_df)

# Get detailed overlaps for promoters

In [None]:
# Get detailed overlaps for promoters
promoter_overlaps = peaks_bed.intersect(
    pybedtools.BedTool(feature_beds['promoters']),
    wo=True
)

# With wo=True every record is laid out as
#   <all peak columns> <all promoter columns> <overlap in bp>
# The promoter name is the 4th column of the promoter BED (the per-replicate
# cell further down reads it as fields[3]), so it sits three positions after
# the first promoter column. Counting from the end (fields[-2]) only lands on
# the name when the promoter file has exactly four columns.
promoter_name_idx = peaks_bed.field_count() + 3

# Convert to DataFrame for easier analysis. Naming the columns explicitly keeps
# the frame well-formed even when there are no overlaps at all.
overlaps_df = pd.DataFrame(
    [
        {
            'peak_chrom': o.fields[0],
            'peak_start': int(o.fields[1]),
            'peak_end': int(o.fields[2]),
            'promoter_name': o.fields[promoter_name_idx],
            'overlap_size': int(o.fields[-1]),
        }
        for o in promoter_overlaps
    ],
    columns=['peak_chrom', 'peak_start', 'peak_end', 'promoter_name', 'overlap_size'],
)

# Group by promoter and sum overlap sizes
promoter_coverage = (overlaps_df
    .groupby('promoter_name')
    .agg({
        'overlap_size': 'sum',
        'peak_chrom': 'first'  # Keep chromosome for reference
    })
    .sort_values('overlap_size', ascending=False)
)

# Display top 20 promoters with highest ATAC coverage
print("Top 20 promoters by ATAC coverage:")
print(promoter_coverage.head(20))

# Create a bar plot of top promoters
plt.figure(figsize=(12, 6))
sns.barplot(data=promoter_coverage.head(20).reset_index(),
            x='promoter_name',
            y='overlap_size')
plt.xticks(rotation=45, ha='right')
plt.title('Top 20 Promoters by ATAC Coverage')
plt.ylabel('Total bases covered by ATAC peaks')
plt.tight_layout()
plt.show()

In [None]:
# The promoter annotation is identical for every replicate, so load it once.
promoters_bed = pybedtools.BedTool(feature_beds['promoters'])

# Per-replicate names are used below on purpose: reusing `peaks`, `peaks_bed`,
# `promoter_overlaps` or `overlaps_df` would overwrite the consensus-peak
# objects defined by earlier cells and change what those cells compute when
# they are re-run.
all_overlaps = []
for i in range(1,4):
    peak_path = f'/mnt/windows/extradata/meiotic_cells/24047-05-0{i}/atac_peaks.bed'
    sample_peaks = pd.read_csv(peak_path, sep='\t', header=None, comment = "#")

    # The file must have exactly three columns, otherwise this assignment raises.
    sample_peaks.columns = ['chrom', 'start', 'end']

    # Keep only records whose sequence name starts with "chr".
    sample_peaks = sample_peaks.loc[sample_peaks.chrom.str.startswith('chr')]
    sample_peaks_bed = pybedtools.BedTool.from_dataframe(sample_peaks)

    # Promoters are the A file here; wao=True also reports promoters without any
    # overlapping peak, with an overlap of 0 in the last column.
    sample_overlaps = promoters_bed.intersect(sample_peaks_bed, wao=True)

    # Convert to DataFrame for easier analysis
    sample_overlaps_df = pd.DataFrame([
        {
            'promoter_name': o.fields[3],
            'overlap_size': int(o.fields[-1]),
        }
        for o in sample_overlaps
    ])

    # Total overlapping bases per promoter for this replicate.
    all_overlaps.append(sample_overlaps_df.groupby("promoter_name").sum())

# Stack the replicates; promoter names repeat in the index, once per replicate.
all_overlaps_df = pd.concat(all_overlaps)


In [None]:

# Peek at the per-promoter overlap totals.
all_overlaps_df.head()

In [None]:
# NOTE(review): identical to the preceding cell; one of the two can be removed.
all_overlaps_df.head()

In [None]:
# Histogram of per-promoter overlap totals, drawn on an explicit figure/axes pair.
hist_fig, hist_ax = plt.subplots(figsize=(12, 6))
all_overlaps_df['overlap_size'].hist(ax=hist_ax)

hist_ax.set_title('Histogram of Overlap Sizes')
hist_ax.set_xlabel('Overlap Size')
hist_ax.set_ylabel('Count')

hist_fig.tight_layout()
# Save before showing, while the figure is still open.
hist_fig.savefig('/mnt/storage/outputs/garcia_ATAC/outputs/TSS_overlap_hist_2.svg', format='svg')
plt.show()