In [24]:
import pandas as pd
import os

In [25]:
class GFFProcessor:
    def __init__(self, gff_file, seqId_to_chr_map_file = None):
        self.gff_file = gff_file
        self.seqId_to_chr_map_file = seqId_to_chr_map_file
        self.df = None
        self.chr_map_df = None
        self.df_with_id_chromosome = None
        self.gene_df = None
        self.chromosome_df = None
    
    def load_gff_and_seq_to_chr_map_file(self):
        # Read the GFF file into a pandas DataFrame
        self.df = pd.read_csv(self.gff_file, sep='\t', header=None, comment='#')
        
        # Determine the number of columns in the DataFrame
        num_cols = len(self.df.columns)

        # Define the column names based on the number of columns in the file
        if num_cols == 9:
            self.df.columns = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']
        elif num_cols == 10:
            self.df.columns = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes', 'add_col']
        else:
            raise ValueError("Column number < 9 or > 10 in GFF file")
            
        if self.seqId_to_chr_map_file is not None:
            self.chr_map_df = pd.read_csv(self.seqId_to_chr_map_file)
#             print(self.chr_map_df.head())
        
        
    def parse_attributes(self, attr_str):
        # Function to parse GFF attributes into a dictionary
        attributes = {}
        items = attr_str.strip().split(';')
        for item in items:
            key_value = item.split('=')
            if len(key_value) == 2:  # Ensure valid key-value pair
                key = key_value[0].strip()
                value = key_value[1].strip()
                attributes[key] = value
                        
        return attributes
    
    def process_gff_data(self):
        # Initialize lists to store gene data
        genes = []
        seqids = []
        starts = []
        ends = []
        ids = []
        chromosomes = []

        # Iterate through each row in the DataFrame
        for index, row in self.df.iterrows():
            attributes = row['attributes']

            # Parse attributes to get gene_id
            attr_dict = self.parse_attributes(attributes)
            if 'ID' in attr_dict:
                if ':' in attr_dict['ID']:
#                     print(attr_dict['ID'])
                    gene_id = attr_dict['ID'].split(':')[0].strip()
                    chromosome = attr_dict['chromosome'] if 'chromosome' in attr_dict else None
                else:
                    gene_id = attr_dict['ID']
                    chromosome = None
            else:
                gene_id = ''
                chromosome = None

            # Append data to lists
            genes.append(row['type'])
            seqids.append(row['seqid'])
            starts.append(row['start'])
            ends.append(row['end'])
            ids.append(gene_id)
            chromosomes.append(chromosome)

        # Create a new DataFrame with gene information
        self.df_with_id_chromosome = pd.DataFrame({
            'type': genes,
            'seqid': seqids,
            'start': starts,
            'end': ends,
            'id': ids,
            'chromosome': chromosomes
        })
    
#     def create_gene_df(self):
#         # Filter DataFrame to include genes only
#         self.gene_df = self.df_with_id_chromosome[self.df_with_id_chromosome['type'] == 'gene'][['seqid', 'start', 'end', 'id']]

#         # Rename the index column to 'id'
# #         self.gene_df.loc[:, 'id'] = self.gene_df.index

#         # Create a column without "gene-"
#         self.gene_df['name'] = self.gene_df['id'].str.replace('gene-', '')

#         # Reset the index
#         self.gene_df.reset_index(drop=True, inplace=True)
        
#     def create_chromosome_df(self):
#         # filter df to include chromosomes only
#         self.chromosome_df = self.df_with_id_chromosome[self.df_with_id_chromosome['chromosome'].notna()]
        
#         self.chromosome_df.reset_index(drop=True, inplace=True)

    def create_gene_df(self):
        # Filter DataFrame to include genes only
        self.gene_df = self.df_with_id_chromosome[self.df_with_id_chromosome['type'] == 'gene'][['seqid', 'start', 'end', 'id']]

        # Create a column without "gene-"
        self.gene_df['name'] = self.gene_df['id'].str.replace('gene-', '')

        # If chr_map_df is provided, update seqid and store the old seqid
        if self.chr_map_df is not None:
            # Select only the necessary columns from chr_map_df
            chr_map_subset = self.chr_map_df[['Contig', 'Chromosome']]

            # Merge with chr_map_subset to get the Chromosome values
            self.gene_df = self.gene_df.merge(
                chr_map_subset, 
                how='left', 
                left_on='seqid', 
                right_on='Contig'
            )

            # Store the old seqid in seqid_old
            self.gene_df['seqid_old'] = self.gene_df['seqid']

            # Update seqid with Chromosome values
            self.gene_df['seqid'] = self.gene_df['Chromosome']

            # Drop the temporary columns from the merge
            self.gene_df.drop(columns=['Contig', 'Chromosome'], inplace=True)

        # Reset the index
        self.gene_df.reset_index(drop=True, inplace=True)

    def create_chromosome_df(self):
        # Filter df to include chromosomes only
        self.chromosome_df = self.df_with_id_chromosome[self.df_with_id_chromosome['chromosome'].notna()]

        # If chr_map_df is provided, update seqid and store the old seqid
        if self.chr_map_df is not None:
            chr_map_subset = self.chr_map_df[['Contig', 'Chromosome']]
        
            # Merge with chr_map_subset to get the Chromosome values
            self.chromosome_df = self.chromosome_df.merge(
                chr_map_subset, 
                how='left', 
                left_on='seqid', 
                right_on='Contig'
            )

            # Store the old seqid in seqid_old
            self.chromosome_df['seqid_old'] = self.chromosome_df['seqid']

            # Update seqid with Chromosome values
            self.chromosome_df['seqid'] = self.chromosome_df['Chromosome']

            # Drop the temporary columns from the merge
            self.chromosome_df.drop(columns=['Contig', 'Chromosome'], inplace=True)

        # Reset the index
        self.chromosome_df.reset_index(drop=True, inplace=True)
    
    def display_gene_df(self):
        # Display the gene DataFrame
        if self.gene_df is not None:
            print(self.gene_df.head())
        else:
            print("Gene DataFrame has not been created yet.")
            
    def display_chromosome_df(self):
        # Display the gene DataFrame
        if self.chromosome_df is not None:
            print(self.chromosome_df.head())
        else:
            print("Chromosome DataFrame has not been created yet.")


In [27]:
# Initialize the GFFProcessor instance with your GFF file path
gff_processor = GFFProcessor('data/GSE248049/annotation_files/Chlorocebus_sabaeus_genome.gff', 
                            'data/GSE248049/annotation_files/Chlorocebus_sabeus_mva.genome.map.csv')

# Load and process the GFF file
gff_processor.load_gff_and_seq_to_chr_map_file()
# extract important attributes
gff_processor.process_gff_data()
#create gene df
gff_processor.create_gene_df()
#create chromosome df
gff_processor.create_chromosome_df()

# gff_processor.display_chromosome_df()
# gff_processor.display_gene_df()

# Save the processed DataFrame to CSV
gene_save_path = 'data/GSE248049/annotation_files/Chlorocebus_sabaeus_genome_genes.csv'
gff_processor.gene_df.to_csv(gene_save_path, index=False)


# Save the processed DataFrame to CSV
chr_save_path = 'data/GSE248049/annotation_files/Chlorocebus_sabaeus_genome_chromosomes.csv'
gff_processor.chromosome_df.to_csv(chr_save_path, index=False)


  self.df = pd.read_csv(self.gff_file, sep='\t', header=None, comment='#')


In [28]:
gff_processor.df_with_id_chromosome.head()

Unnamed: 0,type,seqid,start,end,id,chromosome
0,region,NC_023642.1,1,126035930,NC_023642.1,1.0
1,gene,NC_023642.1,2628,4911,gene-SCGB1C1,
2,mRNA,NC_023642.1,2628,4911,rna-XM_007993457.1,
3,exon,NC_023642.1,2628,3883,exon-XM_007993457.1-1,
4,exon,NC_023642.1,4414,4911,exon-XM_007993457.1-2,


In [30]:
gff_processor.gene_df.head()

Unnamed: 0,seqid,start,end,id,name,seqid_old
0,chr1,2628,4911,gene-SCGB1C1,SCGB1C1,NC_023642.1
1,chr1,4916,10379,gene-ODF3,ODF3,NC_023642.1
2,chr1,14423,18444,gene-BET1L,BET1L,NC_023642.1
3,chr1,18597,25398,gene-RIC8A,RIC8A,NC_023642.1
4,chr1,25740,49471,gene-SIRT3,SIRT3,NC_023642.1


In [31]:
gff_processor.chromosome_df.head()

Unnamed: 0,type,seqid,start,end,id,chromosome,seqid_old
0,region,chr1,1,126035930,NC_023642.1,1,NC_023642.1
1,region,chr10,1,128595539,NC_023651.1,10,NC_023651.1
2,region,chr11,1,128539186,NC_023652.1,11,NC_023652.1
3,region,chr12,1,108555830,NC_023653.1,12,NC_023653.1
4,region,chr13,1,98384682,NC_023654.1,13,NC_023654.1


In [33]:
def calculatePosition(pointA, pointB, percent):
    # Calculate the differences in coordinates
    delta_x = pointB[0] - pointA[0]
    delta_y = pointB[1] - pointA[1]
    delta_z = pointB[2] - pointA[2]

    # Calculate % of the differences
    dif_x = (percent / 100) * delta_x
    dif_y = (percent / 100) * delta_y
    dif_z = (percent / 100) * delta_z

    # Calculate the coordinates of the point at % along the line
    x_pos = pointA[0] + dif_x
    y_pos = pointA[1] + dif_y
    z_pos = pointA[2] + dif_z
    
    return [x_pos, y_pos, z_pos]


In [34]:
def gene_coord_extraction(df, df_gene, resolution):   
    
    #deep copy
    gene_df = df_gene.copy(deep=True)
    
    #values to store
    start_ids = []
    end_ids = []
    start_pos = []
    start_x = []
    start_y = []
    start_z = []
    end_pos = []
    end_x = []
    end_y = []
    end_z = []
    start_percent = []
    end_percent = []
    middle = []
    middle_percent = []
    middle_pos = []
    middle_x = []
    middle_y = []
    middle_z = []
    
    #iterate over the gene data frame
    for index, row in gene_df.iterrows():
        #get the start and end position
        start = row['start']
        end = row['end']
        
        #find the mid position
        mid = int((start + end) / 2)
        middle.append(mid)

        # Calculate the ID range that encompasses the start and end
        start_id = (start / resolution)
        end_id = (end / resolution)

        start_id_int = int(start_id)
        end_id_int = int(end_id) + 1

        start_ids.append(start_id_int)
        end_ids.append(end_id_int)
        
        # find the percentage of where the mid point is located
        m_percent = ((mid - (start_id_int * resolution)) / ((end_id_int * resolution) - (start_id_int * resolution))) * 100
        s_percent = m_percent = ((start - (start_id_int * resolution)) / ((end_id_int * resolution) - (start_id_int * resolution))) * 100
        e_percent = m_percent = ((end - (start_id_int * resolution)) / ((end_id_int * resolution) - (start_id_int * resolution))) * 100
        
        middle_percent.append(m_percent)
        start_percent.append(s_percent)
        end_percent.append(e_percent)

#         print(start_id, end_id, start_id_int,end_id_int, len(df))

        #getting the coordinates of the start and end beads that encompasses the gene
        pointA = 0
        pointB = 0
        
        # Check if start_id and end_id are within the bounds of df_structure
        if (start_id_int <= len(df) - 1) and (end_id_int <= len(df) - 1):
            
#                         print(start_id_int, end_id_int, len(df))
            start_row = df[df['id'] == start_id_int]
            end_row = df[df['id'] == end_id_int]
#             print(start_id_int, end_id_int, len(df))
            # Extract x, y, z values if rows are found
            if not start_row.empty:
                pointA = [start_row.iloc[0]['x'], start_row.iloc[0]['y'], start_row.iloc[0]['z']]
#                 start_pos.append(pointA)
            if not end_row.empty:
                pointB = [end_row.iloc[0]['x'], end_row.iloc[0]['y'], end_row.iloc[0]['z']]
#                 end_pos.append(pointB)
        else:
            
#             start_pos.append(None)
#             end_pos.append(None)
                                
            pointA = None
            pointB = None
            
#         print(pointA, pointB)
            
        #calcualate the coords of the mid point of the gene 
        # using parametric equation
        # for two points P1, P2
        # The parametric equation of the line passing through these two points can be written as:
        # P(t) = P1 + t. (P2- P1) where t is the percentage value (0-1)
        if pointA is not None and pointB is not None:
#             print('not none')
            sp = calculatePosition(pointA, pointB, s_percent)
            mp = calculatePosition(pointA, pointB, m_percent)
            ep = calculatePosition(pointA, pointB, e_percent)        
            
            start_pos.append(sp)
            middle_pos.append(mp)
            end_pos.append(ep)
            start_x.append(sp[0])
            start_y.append(sp[1])
            start_z.append(sp[2])
            end_x.append(ep[0])
            end_y.append(ep[1])
            end_z.append(ep[2])
            middle_x.append(mp[0])
            middle_y.append(mp[1])
            middle_z.append(mp[2])
            
        else:
            middle_pos.append(None)
            start_pos.append(None)
            end_pos.append(None)
            start_x.append(None)
            start_y.append(None)
            start_z.append(None)
            end_x.append(None)
            end_y.append(None)
            end_z.append(None)
            middle_x.append(None)
            middle_y.append(None)
            middle_z.append(None)
            


    gene_df['start_id'] = start_ids
    gene_df['end_id'] = end_ids
    gene_df['start_pos'] = start_pos
    gene_df['start_x'] = start_x
    gene_df['start_y'] = start_y
    gene_df['start_z'] = start_z
    gene_df['end_pos'] = end_pos
    gene_df['end_x'] = end_x
    gene_df['end_y'] = end_y
    gene_df['end_z'] = end_z
    gene_df['middle'] = middle
    gene_df['middle_x'] = middle_x
    gene_df['middle_y'] = middle_y
    gene_df['middle_z'] = middle_z
    gene_df['middle_percent'] = middle_percent
    gene_df['middle_pos'] = middle_pos
                
    print("calculation done")       
    return gene_df


# gene_df_with_start_end_pos = gene_coord_extraction(bead_df, gff_processor.gene_df, 100000)

# gene_df_with_start_end_pos

In [32]:
structure_file_path = 'data/GSE248049/all_structure_files'

resolution = '100000'

In [44]:
# Iterate through files in the section directory
def bead_file_read_gene_coord_extract(atom_file_path, res):
    
    chr_folder_path = os.path.join(atom_file_path)
    for chr_folder_name in os.listdir(chr_folder_path):
        chr_path = os.path.join(chr_folder_path, chr_folder_name)
#         print(chr_folder_name)

        # Check if the current item is a directory starting with 'chr'
        if os.path.isdir(chr_path) and chr_folder_name.startswith('chr'):  
            print(f"Processing folder: {chr_folder_name}")
            
            # List all CSV files in the folder, considering only those already containing '_with_id0'
            csv_files = [f for f in os.listdir(chr_path) if f.endswith('.csv') and '_with_id0' in f]
            
            chr_name = ''.join(filter(str.isdigit, chr_folder_name))  
            
            #removing the 0 at the first
            chr_name = chr_name.lstrip('0')
            
            print("chr_name", chr_name)

            chr_gene_df = gff_processor.gene_df[gff_processor.gene_df['seqid'] == chr_folder_name]
            
            for csv_file in csv_files:
                csv_file_path = os.path.join(chr_path, csv_file)

                df = pd.read_csv(csv_file_path)
                
#                 print(res)
                gene_df_with_start_end_pos = gene_coord_extraction(df, chr_gene_df, res)
                
                #rows_with_nan = gene_df_with_start_end_pos[pd.isna(gene_df_with_start_end_pos['middle_pos'])]
                
                
                filtered_df = gene_df_with_start_end_pos.dropna(subset=['middle_pos'])
                
#                 print("gene df length", len(gene_df_with_start_end_pos))
#                 print("gene df length without NAN middle_pos", len(filtered_df))

                # Generate new output filename with "_with_id0"
                base_name = csv_file.replace('_with_id0.csv', '')  # Remove ".csv"
                new_file_name = f"{base_name}_gene_info.csv"
                save_file_path = os.path.join(chr_path, new_file_name)

                filtered_df.to_csv(save_file_path, index=False)
                
                print(f"data saved {save_file_path}")

            
    

            
bead_file_read_gene_coord_extract(structure_file_path, 100000)
# bead_file_read_gene_coord_extract(after_path, time_hr, resolution, structure_file_name, 250000)

Processing folder: chr23
chr_name 23
calculation done
data saved data/GSE248049/all_structure_files/chr23/structure_24hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr23/structure_18hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr23/structure_12hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr23/structure_24hrs_vacv_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr23/structure_12hrs_vacv_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr23/structure_18hrs_vacv_gene_info.csv
Processing folder: chr24
chr_name 24
calculation done
data saved data/GSE248049/all_structure_files/chr24/structure_24hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr24/structure_18hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr24/structure_24hrs_vacv_gene

data saved data/GSE248049/all_structure_files/chr18/structure_18hrs_vacv_gene_info.csv
Processing folder: chr27
chr_name 27
calculation done
data saved data/GSE248049/all_structure_files/chr27/structure_24hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr27/structure_18hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr27/structure_12hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr27/structure_24hrs_vacv_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr27/structure_12hrs_vacv_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr27/structure_18hrs_vacv_gene_info.csv
Processing folder: chr20
chr_name 20
calculation done
data saved data/GSE248049/all_structure_files/chr20/structure_24hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr20/structure_18hrs_untr_gene_info.csv
calcula

data saved data/GSE248049/all_structure_files/chr9/structure_18hrs_vacv_gene_info.csv
Processing folder: chr8
chr_name 8
calculation done
data saved data/GSE248049/all_structure_files/chr8/structure_12hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr8/structure_24hrs_vacv_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr8/structure_12hrs_vacv_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr8/structure_18hrs_vacv_gene_info.csv
Processing folder: chr6
chr_name 6
calculation done
data saved data/GSE248049/all_structure_files/chr6/structure_24hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr6/structure_18hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr6/structure_12hrs_untr_gene_info.csv
calculation done
data saved data/GSE248049/all_structure_files/chr6/structure_24hrs_vacv_gene_info.csv
calculation done
dat