In [1]:
import pandas as pd
import os
import math
import numpy as np
import ast

The center of mass is the mass-weighted average position of a system. For $n$ atoms at positions $(x_i, y_i, z_i)$ with masses $m_i$ ($i = 1, \dots, n$), the center of mass is:

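$$
\begin{aligned}
\mathrm{com}_x &= \frac{m_1 x_1 + m_2 x_2 + \dots + m_n x_n}{m_1 + m_2 + \dots + m_n} \\
\mathrm{com}_y &= \frac{m_1 y_1 + m_2 y_2 + \dots + m_n y_n}{m_1 + m_2 + \dots + m_n} \\
\mathrm{com}_z &= \frac{m_1 z_1 + m_2 z_2 + \dots + m_n z_n}{m_1 + m_2 + \dots + m_n}
\end{aligned}
$$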

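When all atoms have the same mass, the masses cancel and each coordinate reduces to the arithmetic mean, which is the form used in this notebook:

$$
\begin{aligned}
\mathrm{com}_x &= \frac{x_1 + x_2 + \dots + x_n}{n} \\
\mathrm{com}_y &= \frac{y_1 + y_2 + \dots + y_n}{n} \\
\mathrm{com}_z &= \frac{z_1 + z_2 + \dots + z_n}{n}
\end{aligned}
$$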

In [2]:
# Function to calculate center of mass
def calculate_center_of_mass(df, mass_column=None):
    """Return the center of mass of the points in ``df`` as ``[x, y, z]``.

    Args:
        df: DataFrame with numeric ``x``, ``y`` and ``z`` columns, one row
            per point.
        mass_column: Optional name of a column in ``df`` holding the mass of
            each point. When omitted (the default) every point is treated as
            having the same mass, so the result is the plain column-wise mean.

    Returns:
        A list of three floats: the x, y and z coordinates of the center of
        mass.

    Raises:
        KeyError: If a required column is missing from ``df``.
        ZeroDivisionError: If ``mass_column`` is given and the masses sum to
            zero.
    """
    coords = df[['x', 'y', 'z']]

    if mass_column is None:
        # Equal masses cancel out of the weighted formula, leaving the mean.
        return coords.mean(axis=0).tolist()

    # Mass-weighted average: sum(m_i * r_i) / sum(m_i) for each coordinate.
    weighted = np.average(
        coords.to_numpy(dtype=float),
        axis=0,
        weights=df[mass_column].to_numpy(dtype=float),
    )
    return weighted.tolist()

In [3]:
def calculate_distance(com, gene):
    """Return the Euclidean distance between two points.

    Args:
        com: Coordinates of the first point (any array-like of numbers).
        gene: Coordinates of the second point, same length as ``com``.

    Returns:
        The straight-line distance between the two points as a float.
    """
    offset = np.array(com) - np.array(gene)
    return math.sqrt(np.sum(offset ** 2))

In [4]:
def generate_gene_df_with_distance(gene, com):
    """Annotate every gene row with the center of mass and its distance to it.

    Two columns are added to (or overwritten in) ``gene``:

    * ``center_of_mass``: the ``com`` value, repeated on every row.
    * ``distance_from_com``: the distance between ``com`` and the row's
      ``middle_pos``, as computed by ``calculate_distance``.

    Args:
        gene: DataFrame with a ``middle_pos`` column. Each value is either
            the text form of a coordinate list (as produced when a list
            column is written to CSV and read back) or an already-parsed
            sequence of numbers.
        com: Center-of-mass coordinates, same length as each ``middle_pos``.

    Returns:
        The same ``gene`` object, modified in place.

    Raises:
        KeyError: If ``gene`` has no ``middle_pos`` column.
        ValueError, SyntaxError: If a ``middle_pos`` string is not a valid
            Python literal.
    """
    def _parse_position(value):
        # Only text needs parsing; lists/tuples/arrays are used as they are.
        return ast.literal_eval(value) if isinstance(value, str) else value

    distance_from_com = [
        calculate_distance(com, _parse_position(middle_pos))
        for middle_pos in gene['middle_pos']
    ]

    # Every row carries the same center of mass so the CSV is self-describing.
    gene['center_of_mass'] = [com] * len(gene)
    gene['distance_from_com'] = distance_from_com

    return gene
    

In [5]:
# Base directories of the two datasets processed by get_center_of_mass; each
# is expected to contain <time_hr>/<resolution>/chr*/ sub-folders.
# NOTE(review): 'mock' vs '229E' presumably denote the control and the
# infected sample — confirm.
before_path = 'data/MRC5/processed/MRC5/mock'
after_path = 'data/MRC5/processed/MRC5/229E'
# Sub-folder names joined under each base directory, in this order.
time_hr = '48hr'
resolution = '250000'
# Name of the per-chromosome structure CSV, without the '.csv' extension.
structure_file_name = 'structure'

In [6]:
# Iterate through files in the section directory
def get_center_of_mass(atom_file_path, hour, resolution, structure_file_name):
    """Add center-of-mass distances to each chromosome's ``gene-info.csv``.

    Scans ``<atom_file_path>/<hour>/<resolution>`` for sub-directories whose
    names start with ``chr``. For every such directory that contains
    ``<structure_file_name>.csv``, the center of mass of that structure is
    computed, ``gene-info.csv`` from the same directory is read, annotated by
    ``generate_gene_df_with_distance`` and written back over the original
    file. Directories without the structure file are skipped silently.

    Args:
        atom_file_path: Base directory of the dataset.
        hour: Name of the time-point sub-folder under ``atom_file_path``.
        resolution: Name of the resolution sub-folder under ``hour``.
        structure_file_name: Structure CSV file name without ``.csv``.

    Returns:
        None. The results are written to disk and one line is printed per
        saved file.

    Raises:
        FileNotFoundError: If the resolution directory does not exist, or if
            a chromosome directory has a structure file but no
            ``gene-info.csv``.
    """
    chr_folder_path = os.path.join(atom_file_path, hour, resolution)

    for chr_folder_name in os.listdir(chr_folder_path):
        chr_dir = os.path.join(chr_folder_path, chr_folder_name)

        # Test the listed entry itself: the parent directory is always a
        # directory, so checking it would let plain files named chr* through.
        if not (chr_folder_name.startswith('chr') and os.path.isdir(chr_dir)):
            continue

        csv_file_path = os.path.join(chr_dir, f"{structure_file_name}.csv")
        if not os.path.exists(csv_file_path):
            continue

        com = calculate_center_of_mass(pd.read_csv(csv_file_path))

        # gene-info.csv is both the input and the output of this step.
        gene_file_path = os.path.join(chr_dir, 'gene-info.csv')
        gene_data = pd.read_csv(gene_file_path)
        gene_data_dis = generate_gene_df_with_distance(gene_data, com)
        gene_data_dis.to_csv(gene_file_path, index=False)

        print(f"data saved {gene_file_path}")
    

            
# Annotate both datasets with the same time point, resolution and structure file.
for _dataset_path in (before_path, after_path):
    get_center_of_mass(_dataset_path, time_hr, resolution, structure_file_name)

data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr10/gene-info.csv
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr17/gene-info.csv
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr21/gene-info.csv
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr19/gene-info.csv
data saved data/MRC5/processed/MRC5/229E/48hr/250000/chr10/gene-info.csv
data saved data/MRC5/processed/MRC5/229E/48hr/250000/chr17/gene-info.csv
data saved data/MRC5/processed/MRC5/229E/48hr/250000/chr21/gene-info.csv
data saved data/MRC5/processed/MRC5/229E/48hr/250000/chr19/gene-info.csv


In [48]:
# NOTE(review): in the cells shown above `com` only exists as a local variable
# or parameter inside functions, and this cell's execution count (48) does not
# follow the previous cell (6). It presumably relies on leftover kernel state
# and would raise NameError after Restart & Run All — confirm or remove.
com

In [49]:
# NOTE(review): `gened` is not defined in any cell shown above; it presumably
# comes from a deleted or out-of-order cell (a gene-info DataFrame, judging by
# the output below) — confirm, and define it explicitly before this preview.
gened.head()

Unnamed: 0,seqid,start,end,gene_name,id,name,start_id,end_id,start_pos,end_pos,middle,middle_percent,middle_pos,center_of_mass,distance_from_com
0,NC_023642.1,102388,109337,gene-ATHL1,340,ATHL1,1,2,"[-5.295, -0.685, 17.387]","[-4.645, -0.659, 16.701]",105862,5.862,"[-5.2568969999999995, -0.68347588, 17.34678668]","[-7.176905391658189, -3.050302136317396, 16.99...",3.067652
1,NC_023642.1,112164,113988,gene-IFITM5,378,IFITM5,1,2,"[-5.295, -0.685, 17.387]","[-4.645, -0.659, 16.701]",113076,13.076,"[-5.210006, -0.6816002400000001, 17.29729864]","[-7.176905391658189, -3.050302136317396, 16.99...",3.093458
2,NC_023642.1,124603,127671,gene-LOC103241759,394,LOC103241759,1,2,"[-5.295, -0.685, 17.387]","[-4.645, -0.659, 16.701]",126137,26.137,"[-5.1251095, -0.67820438, 17.20770018]","[-7.176905391658189, -3.050302136317396, 16.99...",3.143408
3,NC_023642.1,128309,129543,gene-IFITM1,407,IFITM1,1,2,"[-5.295, -0.685, 17.387]","[-4.645, -0.659, 16.701]",128926,28.926,"[-5.106981, -0.67747924, 17.18856764]","[-7.176905391658189, -3.050302136317396, 16.99...",3.154597
4,NC_023642.1,129666,136899,gene-LOC103241765,417,LOC103241765,1,2,"[-5.295, -0.685, 17.387]","[-4.645, -0.659, 16.701]",133282,33.282,"[-5.078666999999999, -0.6763466800000001, 17.1...","[-7.176905391658189, -3.050302136317396, 16.99...",3.172434
