In [1]:
import pandas as pd
import os

In [2]:
class GFFProcessor:
    """Load a GFF annotation file and derive gene and chromosome tables from it.

    Intended call order: ``load_gff_file()`` -> ``process_gff_data()`` ->
    ``create_gene_df()`` and/or ``create_chromosome_df()``.

    Attributes:
        gff_file: Path (or file-like object) handed to ``pandas.read_csv``.
        df: Raw GFF table with named columns, set by ``load_gff_file``.
        df_with_id_chromosome: One row per GFF record with the columns
            ``type, seqid, start, end, id, chromosome``; set by
            ``process_gff_data``.
        gene_df: Rows of type ``gene`` with columns
            ``seqid, start, end, id, name``; set by ``create_gene_df``.
        chromosome_df: Rows that carry a ``chromosome`` attribute; set by
            ``create_chromosome_df``.
    """

    # Names of the nine standard GFF columns, in file order.
    _GFF_COLUMNS = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']

    def __init__(self, gff_file):
        self.gff_file = gff_file
        self.df = None
        self.df_with_id_chromosome = None
        self.gene_df = None
        self.chromosome_df = None

    def load_gff_file(self):
        """Read the tab-separated GFF file into ``self.df`` and name its columns.

        Raises:
            ValueError: If the file has neither 9 nor 10 columns.
        """
        # NOTE(review): comment='#' makes pandas cut a line at the first '#',
        # so an attribute value containing '#' would be truncated.
        self.df = pd.read_csv(self.gff_file, sep='\t', header=None, comment='#')

        num_cols = len(self.df.columns)
        if num_cols == 9:
            self.df.columns = list(self._GFF_COLUMNS)
        elif num_cols == 10:
            # Some files carry one extra trailing column; keep it as 'add_col'.
            self.df.columns = list(self._GFF_COLUMNS) + ['add_col']
        else:
            raise ValueError("Column number < 9 or > 10 in GFF file")

    def parse_attributes(self, attr_str):
        """Parse a GFF attribute string (``key=value;key=value``) into a dict.

        Args:
            attr_str: The content of the ``attributes`` column. Non-string
                values (e.g. NaN for an empty cell) yield an empty dict.

        Returns:
            dict mapping stripped keys to stripped values. Items without an
            ``=`` are ignored; only the first ``=`` separates key from value,
            so values that themselves contain ``=`` are kept intact.
        """
        attributes = {}
        if not isinstance(attr_str, str):
            # An empty attributes cell is read by pandas as NaN (a float).
            return attributes

        for item in attr_str.strip().split(';'):
            key, separator, value = item.partition('=')
            if separator:
                attributes[key.strip()] = value.strip()

        return attributes

    def _id_and_chromosome(self, attr_str):
        """Return ``(id, chromosome)`` extracted from one attribute string."""
        attr_dict = self.parse_attributes(attr_str)
        if 'ID' not in attr_dict:
            return '', None

        raw_id = attr_dict['ID']
        if ':' in raw_id:
            # Keep only the part before the first ':'. The 'chromosome'
            # attribute is only consulted for IDs of this form.
            return raw_id.split(':')[0].strip(), attr_dict.get('chromosome')
        return raw_id, None

    def process_gff_data(self):
        """Build ``self.df_with_id_chromosome`` from the loaded GFF table.

        The result has a fresh 0..n-1 index and the columns
        ``type, seqid, start, end, id, chromosome``. ``id`` is ``''`` when the
        record has no ``ID`` attribute; ``chromosome`` is ``None`` unless the
        ID contains ``':'`` and a ``chromosome`` attribute is present.
        """
        # Parse the attribute column once, column-wise, instead of iterating
        # over every row with iterrows().
        parsed = [self._id_and_chromosome(attr) for attr in self.df['attributes']]

        self.df_with_id_chromosome = pd.DataFrame({
            'type': self.df['type'].tolist(),
            'seqid': self.df['seqid'].tolist(),
            'start': self.df['start'].tolist(),
            'end': self.df['end'].tolist(),
            'id': [record_id for record_id, _ in parsed],
            'chromosome': [chromosome for _, chromosome in parsed],
        })

    def create_gene_df(self):
        """Build ``self.gene_df`` from the records whose type is ``'gene'``.

        Adds a ``name`` column holding the ID with a leading ``gene-`` removed
        and resets the index to 0..n-1.
        """
        features = self.df_with_id_chromosome
        # .copy() so that adding a column never writes into a view of the
        # source table (avoids SettingWithCopyWarning).
        gene_df = features.loc[features['type'] == 'gene', ['seqid', 'start', 'end', 'id']].copy()

        # Strip only the leading "gene-" prefix; a "gene-" that appears later
        # inside the identifier is part of the name and is kept.
        gene_df['name'] = gene_df['id'].str.replace(r'^gene-', '', regex=True)

        self.gene_df = gene_df.reset_index(drop=True)

    def create_chromosome_df(self):
        """Build ``self.chromosome_df`` from records with a chromosome value."""
        features = self.df_with_id_chromosome
        self.chromosome_df = features[features['chromosome'].notna()].reset_index(drop=True)

    def display_gene_df(self):
        """Print the first rows of the gene table, or a notice if it is unset."""
        if self.gene_df is not None:
            print(self.gene_df.head())
        else:
            print("Gene DataFrame has not been created yet.")


In [3]:
# Output locations for the two derived tables.
gene_save_path = 'data/MRC5/gencode_raw/gene-chromosome/GCF_009914755.1_T2T-CHM13v2.0_genomic.named_genes.csv'
chr_save_path = 'data/MRC5/gencode_raw/gene-chromosome/GCF_009914755.1_T2T-CHM13v2.0_genomic.named_chromosomes.csv'

# Build the processor for the annotation file to analyse.
gff_processor = GFFProcessor('data/MRC5/gencode_raw/GCF_009914755.1_T2T-CHM13v2.0_genomic.named.gff')

# Run the stages in order: load the file, extract id/chromosome attributes,
# then derive the gene table and the chromosome table.
for pipeline_step in (
    gff_processor.load_gff_file,
    gff_processor.process_gff_data,
    gff_processor.create_gene_df,
    gff_processor.create_chromosome_df,
):
    pipeline_step()

# Write both tables to CSV without the index column.
gff_processor.gene_df.to_csv(gene_save_path, index=False)
gff_processor.chromosome_df.to_csv(chr_save_path, index=False)


In [4]:
# Preview the first rows of the per-record table built by process_gff_data().
gff_processor.df_with_id_chromosome.head()

Unnamed: 0,type,seqid,start,end,id,chromosome
0,region,chr1,1,248387328,NC_060925.1,1.0
1,gene,chr1,7506,138480,gene-LOC127239154,
2,lnc_RNA,chr1,7506,138480,rna-NR_182074.1,
3,exon,chr1,138321,138480,exon-NR_182074.1-1,
4,exon,chr1,129906,130010,exon-NR_182074.1-2,


In [5]:
# Display the gene table built by create_gene_df() (seqid, start, end, id, name).
gff_processor.gene_df

Unnamed: 0,seqid,start,end,id,name
0,chr1,7506,138480,gene-LOC127239154,LOC127239154
1,chr1,20892,23710,gene-LOC124905685,LOC124905685
2,chr1,52979,54612,gene-LOC101928626,LOC101928626
3,chr1,111940,112877,gene-OR4F16,OR4F16
4,chr1,152269,205171,gene-LOC100288069,LOC100288069
...,...,...,...,...,...
41504,chrY,62159112,62204700,gene-SPRY3-2,SPRY3-2
41505,chrY,62306325,62368760,gene-VAMP7-2,VAMP7-2
41506,chrY,62422543,62435805,gene-IL9R-2,IL9R-2
41507,chrY,62439553,62441822,gene-WASIR1-2,WASIR1-2


In [6]:
# Preview the records that carry a chromosome value (built by create_chromosome_df()).
gff_processor.chromosome_df.head()

Unnamed: 0,type,seqid,start,end,id,chromosome
0,region,chr1,1,248387328,NC_060925.1,1
1,region,chr2,1,242696752,NC_060926.1,2
2,region,chr3,1,201105948,NC_060927.1,3
3,region,chr4,1,193574945,NC_060928.1,4
4,region,chr5,1,182045439,NC_060929.1,5


In [7]:
# Root folders of the two datasets; each is passed as `atom_file_path` to
# bead_file_read_gene_coord_extract() further down.
before_path = 'data/MRC5/processed/MRC5/mock'
after_path = 'data/MRC5/processed/MRC5/229E'
# Sub-folder names joined onto the root: <root>/<time_hr>/<resolution>/<chr folder>.
time_hr = '48hr'
# Kept as a string because it is used as a path component; the numeric
# resolution is passed separately when the extraction function is called.
resolution = '250000'
# Base name (without ".csv") of the structure file read in each chr folder.
structure_file_name = 'structure-with-id0'

In [8]:
def calculatePosition(pointA, pointB, percent):
    """Return the 3-D point lying `percent` % of the way from pointA to pointB.

    Args:
        pointA: Indexable with at least three numeric entries (x, y, z).
        pointB: Indexable with at least three numeric entries (x, y, z).
        percent: Position along the segment, where 0 gives pointA and 100
            gives pointB. Values outside 0-100 extrapolate along the line.

    Returns:
        A list ``[x, y, z]``.
    """
    # Per-axis offset from A to B.
    deltas = [pointB[axis] - pointA[axis] for axis in range(3)]

    # Fraction of the segment to travel.
    fraction = percent / 100

    # P = A + fraction * (B - A), evaluated on each axis.
    return [pointA[axis] + fraction * deltas[axis] for axis in range(3)]


In [9]:
def gene_coord_extraction(df, df_gene, resolution):
    """Estimate 3-D coordinates for the start, middle and end of each gene.

    For every gene, the two beads that bracket it are looked up and the
    start / middle / end positions are linearly interpolated between them
    with ``calculatePosition``.

    Args:
        df: Bead table with the columns ``id``, ``x``, ``y`` and ``z``.
        df_gene: Gene table with the columns ``start`` and ``end``. It is
            not modified; a deep copy is extended and returned.
        resolution: Number of base pairs per bead (numeric).

    Returns:
        A copy of ``df_gene`` with these columns appended, in this order:
        ``start_id, end_id, start_pos, start_x, start_y, start_z, end_pos,
        end_x, end_y, end_z, middle, middle_x, middle_y, middle_z,
        middle_percent, middle_pos, start_percent, end_percent``.
        The ``*_pos`` columns hold ``[x, y, z]`` lists. All coordinate
        columns are ``None``/NaN for a gene whose bracketing beads are out
        of range or absent from ``df``.
    """
    gene_df = df_gene.copy(deep=True)

    bead_count = len(df)

    # id -> [x, y, z], keeping the first row for a repeated id. Built once so
    # each gene needs two dictionary look-ups instead of two table scans.
    bead_coords = {}
    for bead_id, x, y, z in zip(df['id'], df['x'], df['y'], df['z']):
        bead_coords.setdefault(bead_id, [x, y, z])

    # Output columns in the order they are appended to the result.
    column_names = (
        'start_id', 'end_id',
        'start_pos', 'start_x', 'start_y', 'start_z',
        'end_pos', 'end_x', 'end_y', 'end_z',
        'middle', 'middle_x', 'middle_y', 'middle_z',
        'middle_percent', 'middle_pos',
        'start_percent', 'end_percent',
    )
    columns = {name: [] for name in column_names}

    for start, end in zip(gene_df['start'], gene_df['end']):
        # Genomic midpoint of the gene.
        mid = int((start + end) / 2)

        # Bead holding the gene start, and the bead after the one holding
        # the gene end; together they bracket the whole gene.
        start_id_int = int(start / resolution)
        end_id_int = int(end / resolution) + 1

        # Position of start / middle / end as a percentage of the genomic
        # span between the two bracketing beads. Each value gets its own
        # variable so the midpoint is not overwritten by the end position.
        span_origin = start_id_int * resolution
        span_length = (end_id_int * resolution) - span_origin
        s_percent = ((start - span_origin) / span_length) * 100
        m_percent = ((mid - span_origin) / span_length) * 100
        e_percent = ((end - span_origin) / span_length) * 100

        # Look up the bracketing beads. The range check assumes bead ids
        # run from 0 to len(df) - 1 -- TODO confirm against the input files.
        pointA = None
        pointB = None
        if (start_id_int <= bead_count - 1) and (end_id_int <= bead_count - 1):
            pointA = bead_coords.get(start_id_int)
            pointB = bead_coords.get(end_id_int)

        # Parametric line through the two beads: P(t) = P1 + t * (P2 - P1).
        if pointA is not None and pointB is not None:
            sp = calculatePosition(pointA, pointB, s_percent)
            mp = calculatePosition(pointA, pointB, m_percent)
            ep = calculatePosition(pointA, pointB, e_percent)
        else:
            sp = mp = ep = None

        columns['start_id'].append(start_id_int)
        columns['end_id'].append(end_id_int)
        columns['middle'].append(mid)
        columns['start_percent'].append(s_percent)
        columns['middle_percent'].append(m_percent)
        columns['end_percent'].append(e_percent)

        for prefix, position in (('start', sp), ('middle', mp), ('end', ep)):
            columns[f'{prefix}_pos'].append(position)
            px, py, pz = position if position is not None else (None, None, None)
            columns[f'{prefix}_x'].append(px)
            columns[f'{prefix}_y'].append(py)
            columns[f'{prefix}_z'].append(pz)

    for name in column_names:
        gene_df[name] = columns[name]

    print("calculation done")
    return gene_df


# gene_df_with_start_end_pos = gene_coord_extraction(bead_df, gff_processor.gene_df, 100000)

# gene_df_with_start_end_pos

In [10]:
def bead_file_read_gene_coord_extract(atom_file_path, hour, resolution, structure_file_name, res):
    """Compute gene coordinates for every ``chr*`` folder of one dataset.

    Walks ``<atom_file_path>/<hour>/<resolution>/``. For each sub-directory
    whose name starts with ``chr`` it reads ``<structure_file_name>.csv``,
    runs ``gene_coord_extraction`` on the genes whose ``seqid`` equals the
    folder name, drops genes without a ``middle_pos`` and writes the result
    to ``gene-info.csv`` in the same folder.

    Relies on the module-level ``gff_processor`` (its ``gene_df`` and
    ``chromosome_df``) and on ``gene_coord_extraction``.

    Args:
        atom_file_path: Root folder of the dataset.
        hour: Time-point sub-folder name.
        resolution: Resolution sub-folder name (converted with ``str``).
        structure_file_name: Base name of the structure CSV, without ".csv".
        res: Numeric resolution passed on to ``gene_coord_extraction``.
    """
    chr_folder_path = os.path.join(atom_file_path, str(hour), str(resolution))
    for chr_folder_name in os.listdir(chr_folder_path):
        print(chr_folder_name)

        # Only real sub-directories named chr* are processed. The directory
        # test is made on the entry itself, not on its parent folder.
        chr_dir = os.path.join(chr_folder_path, chr_folder_name)
        if not (chr_folder_name.startswith('chr') and os.path.isdir(chr_dir)):
            continue

        structure_csv_path = os.path.join(chr_dir, f"{structure_file_name}.csv")

        # Digits of the folder name without leading zeros ("chr07" -> "7").
        # A name without digits (e.g. "chrX") gives an empty string.
        chr_name = ''.join(filter(str.isdigit, chr_folder_name)).lstrip('0')
        print("chr_name", chr_name)

        # Looked up for logging only; gene selection below uses the folder name.
        chromosome_df = gff_processor.chromosome_df
        chr_id = chromosome_df.loc[chromosome_df['chromosome'] == chr_name, 'id'].tolist()
        print("chr_id", chr_id)

        # Assumes the GFF seqid equals the folder name (e.g. "chr10").
        # TODO: select by chr_id instead when seqids are accession numbers.
        chr_gene_df = gff_processor.gene_df[gff_processor.gene_df['seqid'] == chr_folder_name]

        if not os.path.exists(structure_csv_path):
            print(f"structure file not found, skipped {structure_csv_path}")
            continue

        df = pd.read_csv(structure_csv_path)

        gene_df_with_start_end_pos = gene_coord_extraction(df, chr_gene_df, res)

        # Keep only genes for which coordinates could be computed.
        filtered_df = gene_df_with_start_end_pos.dropna(subset=['middle_pos'])

        print("gene df length", len(gene_df_with_start_end_pos))
        print("gene df length without NAN middle_pos", len(filtered_df))

        gene_info_path = os.path.join(chr_dir, 'gene-info.csv')
        filtered_df.to_csv(gene_info_path, index=False)

        print(f"data saved {gene_info_path}")
        print(f"{chr_folder_name} okay")
            
    

            
# Process the mock dataset first, then the 229E dataset, with identical settings.
for dataset_root in (before_path, after_path):
    bead_file_read_gene_coord_extract(dataset_root, time_hr, resolution, structure_file_name, 250000)

.DS_Store
chr10
chr_name 10
chr_id ['NC_060934.1']
calculation done
gene df length 1592
gene df length without NAN middle_pos 1592
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr10/gene-info.csv
chr10 okay
chr17
chr_name 17
chr_id ['NC_060941.1']
calculation done
gene df length 1944
gene df length without NAN middle_pos 1943
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr17/gene-info.csv
chr17 okay
chr21
chr_name 21
chr_id ['NC_060945.1']
calculation done
gene df length 775
gene df length without NAN middle_pos 775
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr21/gene-info.csv
chr21 okay
chr19
chr_name 19
chr_id ['NC_060943.1']
calculation done
gene df length 2029
gene df length without NAN middle_pos 2029
data saved data/MRC5/processed/MRC5/mock/48hr/250000/chr19/gene-info.csv
chr19 okay
.DS_Store
chr10
chr_name 10
chr_id ['NC_060934.1']
calculation done
gene df length 1592
gene df length without NAN middle_pos 1592
data saved data/MRC5/processed/MRC5/229E/