In [1]:
import math
import os

import pyBigWig

In [2]:
# Folder handed to generate_peaks_with_score (and to the disabled
# convert_bw_to_bed call) further down in this notebook.
ata_sec_file_folder = "data/GSE248049/ata-sec/"


In [3]:
def convert_bw_to_bed(file_path):
    """Dump every ``.bw`` file in a folder to a tab-separated ``.bed`` text file.

    For each file directly inside ``file_path`` whose name ends with ``.bw``,
    a sibling file is written whose name has ``_bio_rep_1`` removed and the
    extension replaced by ``.bed``. Every interval reported by the BigWig
    file becomes one output line: ``chrom<TAB>start<TAB>end<TAB>score``.

    Args:
        file_path: Directory that holds the ``.bw`` files; the ``.bed``
            outputs are written into the same directory.
    """
    ata_folder_path = file_path

    # Only the extension is checked; no other name-based filtering happens.
    bw_files = [f for f in os.listdir(ata_folder_path) if f.endswith('.bw')]
    print(bw_files)

    for bw_file in bw_files:
        bw_file_path = os.path.join(ata_folder_path, bw_file)

        # Output name: drop "_bio_rep_1", then swap the extension for .bed.
        bed_file = os.path.splitext(bw_file.replace('_bio_rep_1', ''))[0] + '.bed'
        bed_file_path = os.path.join(ata_folder_path, bed_file)

        bw = pyBigWig.open(bw_file_path)
        try:
            with open(bed_file_path, "w") as outfile:
                for chrom, length in bw.chroms().items():
                    print(f"Processing {chrom} ({length} bp)")

                    # intervals() may hand back None for a chromosome that
                    # has no data; treat that as "nothing to write" instead
                    # of crashing on iteration.
                    intervals = bw.intervals(chrom) or ()

                    for start, end, score in intervals:
                        outfile.write(f"{chrom}\t{start}\t{end}\t{score}\n")
        finally:
            # Release the BigWig handle even if reading or writing failed.
            bw.close()

        print(f"Results saved to {bed_file_path}")
        

In [4]:
# convert_bw_to_bed(ata_sec_file_folder)

In [7]:
def _mean_signal(signal_values):
    """Average per-base signal over a region, counting missing bases as zero.

    Returns ``None`` when the region is empty or contains no usable value
    (every entry is ``None`` or NaN).
    """
    if not signal_values:
        return None
    # Missing bases may show up as NaN (or None); summing NaN would poison
    # the whole average, so drop them before summing.
    covered = [v for v in signal_values if v is not None and not math.isnan(v)]
    if not covered:
        return None
    # Denominator is the full region length, so uncovered bases act as 0.
    return sum(covered) / len(signal_values)


def generate_peaks_with_score(file_path):
    """Attach an average BigWig signal score to every peak of every sample.

    For each ``<name>.bw`` file directly inside ``file_path`` (with
    ``_bio_rep_1`` removed from ``<name>``), the tab-separated peak file
    ``<name>_peaks.bed`` in the same folder is read, and
    ``<name>_peaks_peaks_with_scores.bed`` is written with one
    ``chrom<TAB>start<TAB>end<TAB>avg_score`` line per scored peak.

    Leading lines whose 2nd/3rd columns are not integers are treated as
    header lines and skipped; blank lines are ignored. Peaks whose region
    has no usable signal are omitted from the output.

    Args:
        file_path: Directory holding the ``.bw`` files and their
            ``*_peaks.bed`` companions; outputs are written there as well.

    Raises:
        FileNotFoundError: If a ``*_peaks.bed`` companion file is missing.
        ValueError, IndexError: If a malformed line appears after the first
            valid peak record.
    """
    ata_folder_path = file_path

    # Only the extension is checked; no other name-based filtering happens.
    bw_files = [f for f in os.listdir(ata_folder_path) if f.endswith('.bw')]

    for bw_file in bw_files:
        bw_file_path = os.path.join(ata_folder_path, bw_file)

        # Peak file name: drop "_bio_rep_1", then replace ".bw" by "_peaks.bed".
        bed_file = os.path.splitext(bw_file.replace('_bio_rep_1', ''))[0] + '_peaks.bed'
        bed_file_path = os.path.join(ata_folder_path, bed_file)

        # NOTE: this yields "<name>_peaks_peaks_with_scores.bed" (doubled
        # "_peaks"). The name is kept as-is because a later cell of this
        # notebook lists files carrying exactly this suffix.
        output_file = os.path.splitext(bed_file)[0] + '_peaks_with_scores.bed'
        output_file_path = os.path.join(ata_folder_path, output_file)

        bw = pyBigWig.open(bw_file_path)
        try:
            with open(bed_file_path, "r") as infile, open(output_file_path, "w") as outfile:
                seen_record = False
                for line in infile:
                    cols = line.strip().split("\t")
                    if cols == [""]:
                        continue  # blank line

                    try:
                        chrom, start, end = cols[0], int(cols[1]), int(cols[2])
                    except (IndexError, ValueError):
                        if seen_record:
                            # Malformed data in the middle of the file is a
                            # real error; do not hide it.
                            raise
                        # Anything unparsable before the first record is a
                        # header (track/browser/column names): skip it.
                        continue
                    seen_record = True

                    avg_score = _mean_signal(bw.values(chrom, start, end))
                    if avg_score is not None:
                        outfile.write(f"{chrom}\t{start}\t{end}\t{avg_score}\n")
        finally:
            # Release the BigWig handle even if the peak file is missing or bad.
            bw.close()

        print(f"Results saved to {output_file}")

In [8]:
# Score the peaks for each .bw file found in the configured folder; the
# function prints one "Results saved to ..." line per processed file.
generate_peaks_with_score(ata_sec_file_folder)

Results saved to 12hr_mock_infected_ATAC_sec_peaks_peaks_with_scores.bed
Results saved to 24hr_mock_infected_ATAC_sec_peaks_peaks_with_scores.bed
Results saved to 12hr_mva_infected_ATAC_sec_peaks_peaks_with_scores.bed
Results saved to 24hr_mva_infected_ATAC_sec_peaks_peaks_with_scores.bed
Results saved to 18hr_mva_infected_ATAC_sec_peaks_peaks_with_scores.bed
Results saved to 18hr_mock_infected_ATAC_sec_peaks_peaks_with_scores.bed


In [3]:
# Show every entry name found in the access_files folder (no filtering applied).
folder_path = "data/GSE248049/access_files/"
files = os.listdir(folder_path)

print(files)

['24hr_mva_infected_ATAC_sec_peaks_peaks_with_scores.bed', '24hr_mock_infected_ATAC_sec_peaks_peaks_with_scores.bed', '12hr_mock_infected_ATAC_sec_peaks_peaks_with_scores.bed', '18hr_mva_infected_ATAC_sec_peaks_peaks_with_scores.bed', '12hr_mva_infected_ATAC_sec_peaks_peaks_with_scores.bed', '18hr_mock_infected_ATAC_sec_peaks_peaks_with_scores.bed']
