In [1]:
import pandas as pd
import os
import numpy as np
import math
import ast

In [2]:
def calculatePosition(pointA, pointB, percent):
    # Calculate the differences in coordinates
    delta_x = pointB[0] - pointA[0]
    delta_y = pointB[1] - pointA[1]
    delta_z = pointB[2] - pointA[2]

    # Calculate % of the differences
    dif_x = (percent / 100) * delta_x
    dif_y = (percent / 100) * delta_y
    dif_z = (percent / 100) * delta_z

    # Calculate the coordinates of the point at % along the line
    x_pos = pointA[0] + dif_x
    y_pos = pointA[1] + dif_y
    z_pos = pointA[2] + dif_z
    
    return [x_pos, y_pos, z_pos]

In [3]:
def data_processing(df, df_acc, resolution):   
    
    #deep copy
    acc_df = df_acc.copy(deep=True)
    
    #values to store
    start_ids = []
    end_ids = []
    start_pos = []
    start_x = []
    start_y = []
    start_z = []
    end_pos = []
    end_x = []
    end_y = []
    end_z = []
    start_percent = []
    end_percent = []
    middle = []
    middle_percent = []
    middle_pos = []
    middle_x = []
    middle_y = []
    middle_z = []
    
    #iterate over the gene data frame
    for index, row in acc_df.iterrows():
        #get the start and end position
        start = row['start']
        end = row['end']
        
        #find the mid position
        mid = int((start + end) / 2)
        middle.append(mid)

        # Calculate the ID range that encompasses the start and end
        start_id = (start / resolution)
        end_id = (end / resolution)

        start_id_int = int(start_id)
        end_id_int = int(end_id) + 1

        start_ids.append(start_id_int)
        end_ids.append(end_id_int)
        
        # find the percentage of where the mid point is located
        m_percent = ((mid - (start_id_int * resolution)) / ((end_id_int * resolution) - (start_id_int * resolution))) * 100
        s_percent = m_percent = ((start - (start_id_int * resolution)) / ((end_id_int * resolution) - (start_id_int * resolution))) * 100
        e_percent = m_percent = ((end - (start_id_int * resolution)) / ((end_id_int * resolution) - (start_id_int * resolution))) * 100
        
        middle_percent.append(m_percent)
        start_percent.append(s_percent)
        end_percent.append(e_percent)

        #getting the coordinates of the start and end beads that encompasses the gene
        pointA = 0
        pointB = 0
        
        # Check if start_id and end_id are within the bounds of df_structure
        if (start_id_int <= len(df) - 1) and (end_id_int <= len(df) - 1):
            
            start_row = df[df['id'] == start_id_int]
            end_row = df[df['id'] == end_id_int]

            # Extract x, y, z values if rows are found
            if not start_row.empty:
                pointA = [start_row.iloc[0]['x'], start_row.iloc[0]['y'], start_row.iloc[0]['z']]

            if not end_row.empty:
                pointB = [end_row.iloc[0]['x'], end_row.iloc[0]['y'], end_row.iloc[0]['z']]

        else:                                
            pointA = None
            pointB = None
            
#         print(pointA, pointB)
            
        #calcualate the coords of the mid point of the gene 
        # using parametric equation
        # for two points P1, P2
        # The parametric equation of the line passing through these two points can be written as:
        # P(t) = P1 + t. (P2- P1) where t is the percentage value (0-1)
        if pointA is not None and pointB is not None:
            
#             print('not none')
            sp = calculatePosition(pointA, pointB, s_percent)
            mp = calculatePosition(pointA, pointB, m_percent)
            ep = calculatePosition(pointA, pointB, e_percent)        
            
            start_pos.append(sp)
            middle_pos.append(mp)
            end_pos.append(ep)
            
            start_x.append(sp[0])
            start_y.append(sp[1])
            start_z.append(sp[2])
            end_x.append(ep[0])
            end_y.append(ep[1])
            end_z.append(ep[2])
            middle_x.append(mp[0])
            middle_y.append(mp[1])
            middle_z.append(mp[2])
            
        else:
            start_pos.append(None)
            end_pos.append(None)
            middle_pos.append(None)
            start_x.append(None)
            start_y.append(None)
            start_z.append(None)
            end_x.append(None)
            end_y.append(None)
            end_z.append(None)
            middle_x.append(None)
            middle_y.append(None)
            middle_z.append(None)
            


    acc_df['structure_start_id'] = start_ids
    acc_df['structure_end_id'] = end_ids
    acc_df['start_pos'] = start_pos
    acc_df['start_x'] = start_x
    acc_df['start_y'] = start_y
    acc_df['start_z'] = start_z
    acc_df['end_pos'] = end_pos
    acc_df['end_x'] = end_x
    acc_df['end_y'] = end_y
    acc_df['end_z'] = end_z
    acc_df['middle'] = middle
    acc_df['middle_percent'] = middle_percent
    acc_df['middle_pos'] = middle_pos
    acc_df['middle_x'] = middle_x
    acc_df['middle_y'] = middle_y
    acc_df['middle_z'] = middle_z
                
    print("calculation done")       
    return acc_df


In [27]:
def accessibility_data_coord_extract(atom_file_path, acc_file_path, res):     
    chr_folder_path = os.path.join(atom_file_path)
    for chr_folder_name in os.listdir(chr_folder_path):
        chr_path = os.path.join(chr_folder_path, chr_folder_name)

        # Check if the current item is a directory starting with 'chr'
        if os.path.isdir(chr_path) and chr_folder_name.startswith('chr'):  
            print(f"Processing folder: {chr_folder_name}")
            
            #iterate over each time point
            for hour_name in os.listdir(chr_path):
#                 print(hour_name)
                hour_path = os.path.join(chr_path, hour_name)
                
                if not os.path.isdir(hour_path):
                    continue
                
                # each condition subfolder
                for cond_name in os.listdir(hour_path):
#                     print(cond_name)
                    cond_path = os.path.join(hour_path, cond_name)
                    
                    if not os.path.isdir(cond_path):
                        continue
                    
            
                    # List all CSV files in the folder, considering only those already containing '_with_id0'
                    csv_files = [f for f in os.listdir(cond_path) if f.endswith('.csv') and '_with_id0' in f]

                    chr_name = ''.join(filter(str.isdigit, chr_folder_name))  
            
                    #removing the 0 at the first
                    chr_name = chr_name.lstrip('0')
            
#                     print("chr_name", chr_name)
                
                    # get the acc file
                    acc_file = os.path.join(acc_file_path, f"{hour_name}_{cond_name}_narrowPeak.csv")
                    acc_df = pd.read_csv(acc_file)
                    # Filter by current chromosome
                    acc_df = acc_df[acc_df['chromosome'] == chr_folder_name]
                    
                    if acc_df.empty:
                        print(f"No peaks for {chr_folder_name} in {hour_name}_{cond_name}, skipping.")
                        continue
                    
                    for csv_file in csv_files:
                        csv_file_path = os.path.join(cond_path, csv_file)
                        df = pd.read_csv(csv_file_path)
                        
                        acc_df_with_start_end = data_processing(df, acc_df, res)
                        filtered_acc_with_position = acc_df_with_start_end.dropna(subset=['middle_pos'])
                        
                        # Generate new output filename with "_with_id0"
                        base_name = csv_file.replace('_with_id0.csv', '')  # Remove ".csv"
                        new_file_name = f"{base_name}_chromatin_peak_info.csv"
                        save_file_path = os.path.join(cond_path, new_file_name)

                        filtered_acc_with_position.to_csv(save_file_path, index=False)

                        print(f"data saved {save_file_path}")

            

In [28]:
structure_file_path = 'data/green_monkey/all_structure_files'
acc_file_path = 'data/green_monkey/ata-sec'
resolution = 100000

accessibility_data_coord_extract(structure_file_path, acc_file_path, 100000)

Processing folder: chr23
calculation done
data saved data/green_monkey/all_structure_files/chr23/24hrs/untr/structure_24hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr23/24hrs/vacv/structure_24hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr23/12hrs/untr/structure_12hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr23/12hrs/vacv/structure_12hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr23/18hrs/untr/structure_18hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr23/18hrs/vacv/structure_18hrs_vacv_chromatin_peak_info.csv
Processing folder: chr24
calculation done
data saved data/green_monkey/all_structure_files/chr24/24hrs/untr/structure_24hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_stru

calculation done
data saved data/green_monkey/all_structure_files/chr2/12hrs/vacv/structure_12hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr2/18hrs/untr/structure_18hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr2/18hrs/vacv/structure_18hrs_vacv_chromatin_peak_info.csv
Processing folder: chr5
calculation done
data saved data/green_monkey/all_structure_files/chr5/24hrs/untr/structure_24hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr5/24hrs/vacv/structure_24hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr5/12hrs/untr/structure_12hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr5/12hrs/vacv/structure_12hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr5/18hrs/untr/struc

calculation done
data saved data/green_monkey/all_structure_files/chr21/24hrs/vacv/structure_24hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr21/12hrs/untr/structure_12hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr21/12hrs/vacv/structure_12hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr21/18hrs/untr/structure_18hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr21/18hrs/vacv/structure_18hrs_vacv_chromatin_peak_info.csv
Processing folder: chr19
calculation done
data saved data/green_monkey/all_structure_files/chr19/24hrs/untr/structure_24hrs_untr_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr19/24hrs/vacv/structure_24hrs_vacv_chromatin_peak_info.csv
calculation done
data saved data/green_monkey/all_structure_files/chr19/12hrs/u