In [None]:
import pybedtools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables via python-dotenv; DATA_PATH is read right below.
load_dotenv()

# Fail early with a message that names the missing variable. Without this
# guard an unset DATA_PATH surfaces as an opaque TypeError from Path(None).
_data_root = os.getenv("DATA_PATH")
if not _data_root:
    raise RuntimeError(
        "Environment variable DATA_PATH is not set (or is empty); "
        "define it in the environment or in a .env file before running this notebook."
    )

# Root directory for this analysis' inputs.
DATA_PATH = Path(_data_root) / 'garcia_ATAC'


In [None]:
# Locations of the two input peak BED files.
# NOTE(review): these are absolute, machine-specific paths, unlike the
# DATA_PATH-relative annotation files used later — consider making them configurable.
oogonia_meiotic_peak_paths = '/mnt/windows/extradata/meiotic_cells/atac_preprocessing/oogonia_meiotic.bed'
oogonia_STRA8_peak_paths = '/mnt/windows/extradata/meiotic_cells/atac_preprocessing/oogonia_STRA8.bed'

# Wrap each file in a BedTool so it can be concatenated and intersected below.
peaks_bed_meiotic = pybedtools.BedTool(oogonia_meiotic_peak_paths)
peaks_bed_STRA8 = pybedtools.BedTool(oogonia_STRA8_peak_paths)


In [None]:
# Combine the meiotic and STRA8 peak sets into a single BedTool.
# NOTE(review): pybedtools' cat() is believed to post-merge overlapping
# intervals by default (postmerge=True) — confirm against the docs, since that
# determines whether the counts below refer to merged or raw peaks.
peaks_bed = peaks_bed_meiotic.cat(peaks_bed_STRA8)

In [None]:
# Quick look at the start of the combined peak set.
peaks_bed.head()

In [None]:
# Size of the combined peak set: number of intervals, then their summed length.
peak_count = len(peaks_bed)
print(f'total_peaks: {peak_count}')

covered_bases = sum(interval.length for interval in peaks_bed)
print(f'total_coverage: {covered_bases}')
    

In [None]:

# Annotation BED files, given relative to DATA_PATH and keyed by the label
# used for each feature type in the results table.
_feature_files = {
    'promoters': 'feature_annotation/Hs_EPDnew_006_hg38_900up400down.bed',
    'TEs': 'feature_annotation/2024-01-06_RepeatMasker_UCSC_Export.bed',
}

# Resolve every relative entry against the data root.
feature_beds = {label: DATA_PATH / rel_path for label, rel_path in _feature_files.items()}


In [None]:
# Load the promoter annotation as a BedTool for the exploratory cells that follow.
feature = pybedtools.BedTool(feature_beds['promoters'])


In [None]:
# Peaks from the combined set that intersect the promoter annotation.
# NOTE(review): u=True presumably maps to bedtools `intersect -u`, i.e. each
# peak is reported once if it overlaps any feature — confirm against the docs.
overlaps = peaks_bed.intersect(feature, u = True)
overlaps

In [None]:
# Number of intervals in the promoter-overlap result.
len(overlaps)

In [None]:
# NOTE(review): this repeats the previous cell's count; one of the two cells can be deleted.
len(overlaps)

In [None]:
# Sizes of the two inputs to the intersection: combined peaks and promoter intervals.
len(peaks_bed), len(feature)


In [None]:
def _count_and_bases(intervals):
    """Return ``(n_intervals, summed_length)`` for an iterable of intervals, in one pass."""
    n_intervals = 0
    total_bases = 0
    for interval in intervals:
        n_intervals += 1
        total_bases += interval.length
    return n_intervals, total_bases


def analyze_overlap(atac_peaks, feature_name, feature_bed):
    """Summarise how many ATAC peaks overlap a given feature annotation.

    Parameters
    ----------
    atac_peaks
        Peak set to test. It must provide ``intersect(other, u=True)`` and be
        iterable over intervals exposing ``.length`` (a ``pybedtools.BedTool``
        in this notebook).
    feature_name
        Label stored in the ``feature_type`` column of the result.
    feature_bed
        Annotation to intersect against; handed to ``pybedtools.BedTool``.
        Path-like values are converted to plain strings first.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the columns ``feature_type``,
        ``percent_of_peaks_intersecting_at_least_once``, ``total_DARs_peaks``,
        ``total_features``, ``total_peak_bases`` and ``total_feature_bases``.
        The percentage is NaN when ``atac_peaks`` contains no intervals.

    Notes
    -----
    ``total_peak_bases`` and ``total_feature_bases`` are plain sums of
    per-interval lengths, so intervals that overlap each other within one set
    are counted more than once.
    """
    # feature_beds in this notebook holds pathlib.Path values; give BedTool a
    # plain filename string. Non-path inputs are passed through untouched.
    if isinstance(feature_bed, os.PathLike):
        feature_bed = os.fspath(feature_bed)

    feature = pybedtools.BedTool(feature_bed)
    overlaps = atac_peaks.intersect(feature, u=True)

    # One pass per interval set yields both the count and the summed length,
    # instead of iterating each (potentially large) file twice.
    total_peaks, total_peak_bases = _count_and_bases(atac_peaks)
    total_features, total_feature_bases = _count_and_bases(feature)
    total_overlaps = len(overlaps)

    # An empty peak set has no defined overlap percentage; report NaN rather
    # than raising ZeroDivisionError.
    if total_peaks:
        percent_overlapping = (total_overlaps / total_peaks) * 100
    else:
        percent_overlapping = float('nan')

    summary = {
        'feature_type': feature_name,
        'percent_of_peaks_intersecting_at_least_once': percent_overlapping,
        'total_DARs_peaks': total_peaks,
        'total_features': total_features,
        'total_peak_bases': total_peak_bases,
        'total_feature_bases': total_feature_bases,
    }
    return pd.DataFrame([summary])

In [None]:
# Run the overlap summary once per annotation set; the printed name marks progress.
all_results = []
for name, bed in feature_beds.items():
    print(name)
    all_results.append(analyze_overlap(peaks_bed, name, bed))

# Every per-feature frame has a single row labelled 0, so a plain concat would
# leave duplicate index labels; ignore_index=True renumbers the rows 0..n-1.
all_results_df = pd.concat(all_results, ignore_index=True)
all_results_df
