In [1]:
import pandas as pd
import os

In [23]:
def map_chromatin_peaks_to_gene(file_path):
    for chr_folder_name in os.listdir(file_path):
        chr_path = os.path.join(file_path, chr_folder_name)

        if not os.path.isdir(chr_path) or not chr_folder_name.startswith('chr'):
            continue

        print(f"Processing folder: {chr_folder_name}")

        for hour_name in os.listdir(chr_path):
            hour_path = os.path.join(chr_path, hour_name)
            if not os.path.isdir(hour_path):
                continue

            for cond_name in os.listdir(hour_path):
                cond_path = os.path.join(hour_path, cond_name)
                if not os.path.isdir(cond_path):
                    continue

                # Filenames
                gene_file = f"structure_{hour_name}_{cond_name}_gene_info.csv"
                peak_file = f"structure_{hour_name}_{cond_name}_chromatin_peak_info.csv"
                gene_path = os.path.join(cond_path, gene_file)
                peak_path = os.path.join(cond_path, peak_file)

                if not (os.path.exists(gene_path) and os.path.exists(peak_path)):
                    print(f"Skipping: Missing file in {cond_path}")
                    continue

                # Load data
                gene_df = pd.read_csv(gene_path)
                peak_df = pd.read_csv(peak_path)

                # Clean column names
                gene_df = gene_df.rename(columns={"start": "gene_start", "end": "gene_end"})
                peak_df = peak_df.rename(columns={"start": "peak_start", "end": "peak_end"})

                # Prepare output
                mapped_rows = []

                for index, gene in gene_df.iterrows():
                    g_chr = gene['chromosome']
                    g_start = gene['gene_start']
                    g_end = gene['gene_end']

                    overlapping_peaks = peak_df[
                        (peak_df['chromosome'] == g_chr) &
                        (peak_df['peak_start'] <= g_end) &
                        (peak_df['peak_end'] >= g_start)
                    ]

                    for _, peak in overlapping_peaks.iterrows():
                        row = {
                            'gene_name': gene['gene_name'],
                            'gene_id': gene['gene_id'],
                            'gene_start': g_start,
                            'gene_end': g_end,
                        }
                        # Add all peak info
                        row.update(peak.to_dict())
                        mapped_rows.append(row)

                if mapped_rows:
                    out_df = pd.DataFrame(mapped_rows)
                    output_file = f"structure_{hour_name}_{cond_name}_gene_peak_mapped.csv"
                    out_path = os.path.join(cond_path, output_file)
                    out_df.to_csv(out_path, index=False)
                    print(f"Saved: {output_file}")
                else:
                    print(f"No peak-gene overlap found in {cond_path}")
    


In [24]:
structure_file_path = 'data/green_monkey/all_structure_files'

map_chromatin_peaks_to_gene(structure_file_path)

Processing folder: chr23
Saved: structure_24hrs_untr_gene_peak_mapped.csv
Saved: structure_24hrs_vacv_gene_peak_mapped.csv
Saved: structure_12hrs_untr_gene_peak_mapped.csv
Saved: structure_12hrs_vacv_gene_peak_mapped.csv
Saved: structure_18hrs_untr_gene_peak_mapped.csv
Saved: structure_18hrs_vacv_gene_peak_mapped.csv
Processing folder: chr24
Saved: structure_24hrs_untr_gene_peak_mapped.csv
Saved: structure_24hrs_vacv_gene_peak_mapped.csv
Skipping: Missing file in data/green_monkey/all_structure_files/chr24/12hrs/untr
Saved: structure_12hrs_vacv_gene_peak_mapped.csv
Saved: structure_18hrs_untr_gene_peak_mapped.csv
Saved: structure_18hrs_vacv_gene_peak_mapped.csv
Processing folder: chr12
Saved: structure_24hrs_untr_gene_peak_mapped.csv
Saved: structure_24hrs_vacv_gene_peak_mapped.csv
Saved: structure_12hrs_untr_gene_peak_mapped.csv
Saved: structure_12hrs_vacv_gene_peak_mapped.csv
Saved: structure_18hrs_untr_gene_peak_mapped.csv
Saved: structure_18hrs_vacv_gene_peak_mapped.csv
Processing 

Saved: structure_12hrs_vacv_gene_peak_mapped.csv
Skipping: Missing file in data/green_monkey/all_structure_files/chr9/18hrs/untr
Saved: structure_18hrs_vacv_gene_peak_mapped.csv
Processing folder: chr8
Skipping: Missing file in data/green_monkey/all_structure_files/chr8/24hrs/untr
Saved: structure_24hrs_vacv_gene_peak_mapped.csv
Saved: structure_12hrs_untr_gene_peak_mapped.csv
Saved: structure_12hrs_vacv_gene_peak_mapped.csv
Skipping: Missing file in data/green_monkey/all_structure_files/chr8/18hrs/untr
Saved: structure_18hrs_vacv_gene_peak_mapped.csv
Processing folder: chr6
Saved: structure_24hrs_untr_gene_peak_mapped.csv
Saved: structure_24hrs_vacv_gene_peak_mapped.csv
Saved: structure_12hrs_untr_gene_peak_mapped.csv
Saved: structure_12hrs_vacv_gene_peak_mapped.csv
Saved: structure_18hrs_untr_gene_peak_mapped.csv
Saved: structure_18hrs_vacv_gene_peak_mapped.csv
Processing folder: chr1
Saved: structure_24hrs_untr_gene_peak_mapped.csv
Saved: structure_24hrs_vacv_gene_peak_mapped.csv
Sa