In [1]:
import pandas as pd
import os
import numpy as np
import math
import ast


In [2]:
# --- Run configuration -------------------------------------------------------
# Time point and bead resolution select the sub-folder that holds the structures.
# `resolution` stays a string because it is used as a path component.
time_hr = '48hr'
resolution = '250000'

# Base name (without the .csv extension) of the per-chromosome structure file.
structure_file_name = 'structure-with-id0'

# --- Structure folders, one per condition --------------------------------------
before_path = 'data/MRC5/processed/MRC5/mock'
after_path = 'data/MRC5/processed/MRC5/229E'

# --- Accessibility tracks (ATAC-seq RPKM in consensus peaks), one per condition -
before_accessible_file_path = 'data/MRC5/accessibility_raw/MRC5_229E_48hr_Mock_ATACseq_merged_reads_in_consensus_peaks_RPKM.bed'
after_accessible_file_path = 'data/MRC5/accessibility_raw/MRC5_229E_48hr_229E_ATACseq_merged_reads_in_consensus_peaks_RPKM.bed'

In [3]:
def calculatePosition(pointA, pointB, percent):
    """Return the point lying `percent` % of the way along the segment A -> B.

    Parameters
    ----------
    pointA, pointB : indexable
        Coordinates of the segment end points; only indices 0, 1 and 2 are read.
    percent : number
        Position along the segment, where 0 gives pointA and 100 gives pointB.
        Values outside 0-100 extrapolate along the same line.

    Returns
    -------
    list
        ``[x, y, z]`` of the interpolated point.
    """
    # Parametric form P(t) = A + t * (B - A), with t expressed as a fraction.
    fraction = percent / 100
    return [
        pointA[axis] + fraction * (pointB[axis] - pointA[axis])
        for axis in range(3)
    ]

In [4]:
def data_processing(df, df_acc, resolution):
    """Attach interpolated 3-D coordinates to every accessibility peak.

    For each peak the genomic start and end are mapped to bead ids
    (``int(start / resolution)`` and ``int(end / resolution) + 1``).  The
    start, midpoint and end of the peak are then placed on the straight line
    between those two beads, proportionally to their genomic offset, using
    ``calculatePosition``.

    Parameters
    ----------
    df : pandas.DataFrame
        Structure table with columns ``id``, ``x``, ``y`` and ``z``.
    df_acc : pandas.DataFrame
        Peak table with columns ``start`` and ``end``.  It is not modified.
    resolution : int
        Number of base pairs covered by one bead.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_acc`` with the added columns ``start_id``, ``end_id``,
        ``start_pos``/``start_x``/``start_y``/``start_z``,
        ``end_pos``/``end_x``/``end_y``/``end_z``, ``middle``,
        ``middle_percent`` and ``middle_pos``/``middle_x``/``middle_y``/``middle_z``.
        Position columns hold ``None`` when either bead is unavailable.

    Notes
    -----
    Fixes relative to the previous version:

    * ``middle_percent`` and the ``middle_*`` coordinates were overwritten by
      the end-of-peak percentage through a chained assignment; they are now
      computed from the midpoint.
    * A bead id inside the accepted range but absent from ``df`` used to leave
      a placeholder ``0`` that crashed the interpolation; such peaks now get
      ``None`` positions like out-of-range peaks.
    """
    # Work on a copy so the caller's (possibly sliced) frame is never modified.
    acc_df = df_acc.copy(deep=True)

    # Build the id -> [x, y, z] lookup once instead of scanning `df` for every peak.
    # keep='first' matches taking the first row when an id is duplicated.
    unique_beads = df.drop_duplicates(subset='id', keep='first')
    bead_coords = {
        bead_id: [x, y, z]
        for bead_id, x, y, z in zip(
            unique_beads['id'], unique_beads['x'], unique_beads['y'], unique_beads['z']
        )
    }
    # Ids above this bound are treated as outside the structure.
    last_bead_id = len(df) - 1

    start_ids, end_ids = [], []
    middle, middle_percent = [], []
    start_pos, middle_pos, end_pos = [], [], []

    for start, end in zip(acc_df['start'], acc_df['end']):
        mid = int((start + end) / 2)
        middle.append(mid)

        # Bead range that encloses the peak: [start_id_int, end_id_int].
        start_id_int = int(start / resolution)
        end_id_int = int(end / resolution) + 1
        start_ids.append(start_id_int)
        end_ids.append(end_id_int)

        # Offsets of start / midpoint / end inside that bead range, in percent.
        span_start = start_id_int * resolution
        span_length = (end_id_int * resolution) - span_start
        s_percent = ((start - span_start) / span_length) * 100
        m_percent = ((mid - span_start) / span_length) * 100
        e_percent = ((end - span_start) / span_length) * 100
        middle_percent.append(m_percent)

        # Coordinates of the two enclosing beads; None when either is unavailable.
        pointA = pointB = None
        if start_id_int <= last_bead_id and end_id_int <= last_bead_id:
            pointA = bead_coords.get(start_id_int)
            pointB = bead_coords.get(end_id_int)

        if pointA is not None and pointB is not None:
            start_pos.append(calculatePosition(pointA, pointB, s_percent))
            middle_pos.append(calculatePosition(pointA, pointB, m_percent))
            end_pos.append(calculatePosition(pointA, pointB, e_percent))
        else:
            start_pos.append(None)
            middle_pos.append(None)
            end_pos.append(None)

    def _axis(points, axis):
        # Pull one coordinate out of each [x, y, z] triple, keeping None for missing points.
        return [None if point is None else point[axis] for point in points]

    # Column order is kept identical to the previous implementation.
    acc_df['start_id'] = start_ids
    acc_df['end_id'] = end_ids
    acc_df['start_pos'] = start_pos
    acc_df['start_x'] = _axis(start_pos, 0)
    acc_df['start_y'] = _axis(start_pos, 1)
    acc_df['start_z'] = _axis(start_pos, 2)
    acc_df['end_pos'] = end_pos
    acc_df['end_x'] = _axis(end_pos, 0)
    acc_df['end_y'] = _axis(end_pos, 1)
    acc_df['end_z'] = _axis(end_pos, 2)
    acc_df['middle'] = middle
    acc_df['middle_percent'] = middle_percent
    acc_df['middle_pos'] = middle_pos
    acc_df['middle_x'] = _axis(middle_pos, 0)
    acc_df['middle_y'] = _axis(middle_pos, 1)
    acc_df['middle_z'] = _axis(middle_pos, 2)

    print("calculation done")
    return acc_df


In [5]:
def _read_accessibility_table(acc_file_path):
    """Load a 4-column accessibility track as a DataFrame (chrname, start, end, value).

    The delimiter (tab or single space) is detected from the first line, and the
    first row is treated as a header only when every field in it parses as a string.
    """
    with open(acc_file_path, 'r') as file:
        first_line = file.readline().strip()

    if '\t' in first_line:
        print('tab')
        delimiter = '\t'
    elif ' ' in first_line:
        print('space')
        delimiter = ' '
    else:
        raise ValueError("Unsupported delimiter. The file must be tab or space separated.")

    # Peek at the first row only: a data row contains numeric fields, a header row does not.
    first_row = pd.read_csv(acc_file_path, sep=delimiter, header=None, nrows=1).iloc[0]
    has_header = first_row.apply(lambda x: isinstance(x, str)).all()

    acc = pd.read_csv(acc_file_path, sep=delimiter, header=0 if has_header else None)
    # The file is expected to have exactly four columns; pandas raises otherwise.
    acc.columns = ['chrname', 'start', 'end', 'value']
    return acc


def accessibility_data_coord_extract(acc_file_path, atom_file_path, hour, resolution, structure_file_name, res):
    """Write per-chromosome accessibility peaks annotated with 3-D coordinates.

    For every ``chr*`` directory under ``atom_file_path/hour/resolution`` that
    contains ``<structure_file_name>.csv``, the peaks of the matching chromosome
    are passed to ``data_processing`` together with the structure, rows without
    a ``middle_pos`` are dropped, and the result is saved as
    ``accessibility-peaks-only.csv`` inside that directory.

    Parameters
    ----------
    acc_file_path : str
        Tab- or space-separated track with four columns (chrname, start, end, value).
    atom_file_path : str
        Root folder of the processed structures.
    hour : str
        Time-point sub-folder name.
    resolution : str
        Resolution sub-folder name (used only to build the path).
    structure_file_name : str
        Structure file name without the ``.csv`` extension.
    res : int
        Numeric bead resolution in base pairs, forwarded to ``data_processing``.

    Raises
    ------
    ValueError
        If the first line of the track contains neither a tab nor a space.

    Notes
    -----
    Fixes relative to the previous version:

    * The directory test is applied to each entry rather than to the parent
      folder, so stray files whose name starts with ``chr`` are skipped.
    * Folders without digits in their name (for example ``chrX``) are matched
      against ``chrX`` rather than the bare string ``chr``.  Folders with digits
      are handled exactly as before (digits only, leading zeros removed).
    """
    acc = _read_accessibility_table(acc_file_path)

    chr_folder_path = os.path.join(atom_file_path, hour, resolution)
    # Sorted so the processing order (and the log) is the same on every run.
    for chr_folder_name in sorted(os.listdir(chr_folder_path)):

        print("chr folder name ==== ", chr_folder_name)

        chr_dir = os.path.join(chr_folder_path, chr_folder_name)
        if not (os.path.isdir(chr_dir) and chr_folder_name.startswith('chr')):
            continue

        # Chromosome label: digits without leading zeros ('chr01' -> '1'),
        # or the text after 'chr' when the name has no digits ('chrX' -> 'X').
        digits = ''.join(filter(str.isdigit, chr_folder_name))
        chr_name = digits.lstrip('0') if digits else chr_folder_name[len('chr'):]

        print("chr_name===", chr_name)

        filtered_acc = acc[acc['chrname'] == f"chr{chr_name}"]

        print(f"data shape = {filtered_acc.shape}")

        structure_csv_path = os.path.join(chr_dir, f"{structure_file_name}.csv")
        if not os.path.exists(structure_csv_path):
            continue

        df = pd.read_csv(structure_csv_path)

        acc_with_position = data_processing(df, filtered_acc, res)

        # Peaks whose enclosing beads are unavailable have no middle_pos.
        filtered_acc_with_position = acc_with_position.dropna(subset=['middle_pos'])

        print("acc df length", len(acc_with_position))
        print("acc df length without NAN middle_pos", len(filtered_acc_with_position))

        output_csv_path = os.path.join(chr_dir, 'accessibility-peaks-only.csv')
        filtered_acc_with_position.to_csv(output_csv_path, index=False)

        print(f"data saved {output_csv_path}")

In [6]:
# Run the extraction for both conditions (mock first, then 229E).
# The numeric bead size is derived from the `resolution` setting so the folder name
# and the value used for the bead-id arithmetic cannot drift apart.
for acc_file, structure_root in (
    (before_accessible_file_path, before_path),
    (after_accessible_file_path, after_path),
):
    accessibility_data_coord_extract(
        acc_file, structure_root, time_hr, resolution, structure_file_name, int(resolution)
    )

tab
chr folder name ====  .DS_Store
chr folder name ====  chr10
chr_name=== 10
data shape = (6927, 4)
calculation done
acc df length 6927
acc df length without NAN middle_pos 6927
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr10/accessibility-peaks-only.csv
chr folder name ====  chr17
chr_name=== 17
data shape = (6488, 4)
calculation done
acc df length 6488
acc df length without NAN middle_pos 6486
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr17/accessibility-peaks-only.csv
chr folder name ====  chr21
chr_name=== 21
data shape = (1826, 4)
calculation done
acc df length 1826
acc df length without NAN middle_pos 1826
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr21/accessibility-peaks-only.csv
chr folder name ====  chr19
chr_name=== 19
data shape = (4442, 4)
calculation done
acc df length 4442
acc df length without NAN middle_pos 4442
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr19/accessibility-peaks-only.csv
tab
chr folder name ====  .DS_Store


In [11]:
# NOTE(review): `df` is not assigned at notebook level by any cell shown above (it only
# exists as a parameter/local inside the functions), and the execution count jumps from
# In [6] to In [11]. This cell presumably relies on state from cells that are no longer
# in the notebook - confirm, and load the frame explicitly so Restart & Run All works.
df.head()

Unnamed: 0,0,1,2,3
0,chr1,0,18,0.0
1,chr1,18,23,0.03223
2,chr1,23,26,0.09669
3,chr1,26,28,0.12892
4,chr1,28,102,0.19338
