In [49]:
import pandas as pd
import pybedtools
from pybedtools import BedTool
import sys
import os

# Project root directory. The notebook switches into it so that the relative
# 'custom_pipeline/...' paths used by later cells resolve against it.
# NOTE(review): this is a machine-specific absolute path; os.chdir raises on
# any host where the directory does not exist.
wd_dir = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG'
os.chdir(wd_dir)

# Record the working directory as reported by the OS after the chdir
current_dir = os.getcwd()

In [89]:
def create_bed_from_peaks(peaks_df):
    """
    Build a three-column BED table (chrom, start, end) from peak identifiers.

    Every value in the 'gene' column is expected to look like
    'chrom:start-end'. The helper columns 'chrom', 'coords', 'start' and
    'end' are written onto ``peaks_df`` itself (the caller's frame is
    modified), and a copy of the three BED columns is returned.
    """
    # 'chr1:100-200' -> 'chr1' / '100-200'; only the first ':' separates
    locus_parts = peaks_df['gene'].str.split(':', n=1, expand=True)
    peaks_df[['chrom', 'coords']] = locus_parts

    # '100-200' -> '100' / '200'
    boundary_parts = peaks_df['coords'].str.split('-', expand=True)
    peaks_df[['start', 'end']] = boundary_parts

    # The split produces strings; cast both boundaries to integers
    for boundary in ('start', 'end'):
        peaks_df[boundary] = peaks_df[boundary].astype(int)

    # Hand back an independent frame holding only the BED columns
    return peaks_df.loc[:, ['chrom', 'start', 'end']].copy()

def parse_gtf_attributes(attribute_str):
    """
    Parse a GTF attribute string (column 9) into a dict of key -> value.

    The input looks like ``gene_id "X"; gene_name "Y"; level 2;``.
    Attributes are separated by '; ', each attribute is a key followed by
    a space and a value, and surrounding double quotes are removed from
    the value. Entries without a value are skipped. When a key occurs more
    than once, the last occurrence wins.

    Returns an empty dict for an empty string.
    """
    attrs = {}

    # GTF terminates every attribute, including the last one, with ';'.
    # Drop that terminator up front; otherwise the final value keeps a
    # trailing '";' (quoted) or ';' (unquoted) after quote stripping.
    cleaned = attribute_str.strip().rstrip(';')

    for attr in cleaned.split('; '):
        attr = attr.strip()
        if not attr:
            continue
        key, separator, value = attr.partition(' ')
        if not separator:
            # A bare token with no value carries no information; skip it
            continue
        # Tolerate padding between key and value before removing the quotes
        attrs[key] = value.strip().strip('"')
    return attrs

def create_gene_bed_from_gtf(gtf_file):
    """
    Collect the 'gene' records of a GTF file into a four-column BedTool.

    Each kept record becomes (chrom, start, end, gene_name), where start is
    shifted from the GTF's 1-based coordinate to BED's 0-based one and
    gene_name falls back to 'Unknown' when the attribute is missing.
    Comment lines, lines with fewer than nine tab-separated fields and
    records whose feature type is not 'gene' are ignored.
    """
    bed_columns = ['chrom', 'start', 'end', 'gene_name']
    gene_rows = []

    with open(gtf_file, 'r') as handle:
        for raw_line in handle:
            # Header / comment lines start with '#'
            if raw_line.startswith('#'):
                continue

            columns = raw_line.strip().split('\t')
            is_gene_record = len(columns) >= 9 and columns[2] == 'gene'
            if not is_gene_record:
                continue

            # GTF start is 1-based inclusive; BED start is 0-based
            bed_start = int(columns[3]) - 1
            bed_end = int(columns[4])
            symbol = parse_gtf_attributes(columns[8]).get('gene_name', 'Unknown')

            gene_rows.append([columns[0], bed_start, bed_end, symbol])

    return BedTool.from_dataframe(pd.DataFrame(gene_rows, columns=bed_columns))

def get_gene_symbols(peaks_df, gtf_file):
    """
    Label every peak with the symbols of the genes it overlaps.

    Peaks (from the 'gene' column of ``peaks_df``) are intersected with the
    gene intervals read from ``gtf_file``. The result is stored in a new
    'gene_symbol' column: overlapping gene names joined by ';' in the order
    they are first seen, or 'Intergenic' when no gene overlaps the peak.

    ``peaks_df`` is modified in place (coordinate helper columns plus
    'gene_symbol') and is also the return value.
    """
    # Interval sets for the peaks and for the annotated genes
    peaks_bed = BedTool.from_dataframe(create_bed_from_peaks(peaks_df))
    genes_bed = create_gene_bed_from_gtf(gtf_file)

    # One output row per (peak, gene) overlap: 3 peak fields then 4 gene fields
    overlaps = peaks_bed.intersect(genes_bed, wa=True, wb=True)

    peak_to_gene = {}
    for hit in overlaps:
        # Rebuild the 'chrom:start-end' identifier used in the 'gene' column
        locus = f"{hit[0]}:{hit[1]}-{hit[2]}"
        symbol = hit[6]  # fourth gene field = gene_name

        recorded = peak_to_gene.get(locus)
        if recorded is None:
            peak_to_gene[locus] = symbol
        elif symbol not in recorded.split(';'):
            # Append only symbols not already listed for this peak
            peak_to_gene[locus] = f"{recorded};{symbol}"

    # Peaks missing from the mapping overlapped no gene
    peaks_df['gene_symbol'] = peaks_df['gene'].map(peak_to_gene).fillna('Intergenic')
    return peaks_df

def process_files(input_file, output_file, gtf_file):
    """
    Annotate one peak CSV with gene symbols and write the result to disk.

    Parameters:
        input_file: path of the CSV to read; it must provide the 'gene'
            column consumed by get_gene_symbols.
        output_file: path the annotated CSV is written to (without index).
        gtf_file: path of the GTF annotation passed to get_gene_symbols.

    Returns:
        The annotated DataFrame that was written to ``output_file``.

    Any exception raised while reading, annotating or writing is reported
    with a printed message and the function then calls ``sys.exit(1)``,
    i.e. callers see SystemExit rather than the original exception.
    """
    try:
        # Read input data
        peaks_df = pd.read_csv(input_file)

        # Add gene symbols
        result_df = get_gene_symbols(peaks_df, gtf_file)

        # to_csv only reads the frame, so write it directly instead of
        # duplicating the whole table in memory first
        result_df.to_csv(output_file, index=False)
        print(f"Successfully processed {input_file} and saved results to {output_file}")

        return result_df

    except Exception as e:
        print(f"Error processing files: {str(e)}")
        sys.exit(1)

In [91]:
# File paths (relative to the current working directory)
input_files = ['custom_pipeline/results/Neuron_peak_analysis.csv', 'custom_pipeline/results/NSC_peak_analysis.csv']
gtf_file = 'custom_pipeline/DATA/gencode.vM10.annotation.gtf'

# Annotate each input file and keep the resulting frames in input order
result_dfs = []
for input_file in input_files:
    # Insert '_annotated' in front of the extension. Splitting the extension
    # off (rather than str.replace on '.csv') means the output path always
    # differs from the input path, so an input whose name lacks '.csv' is
    # never overwritten, and a '.csv' elsewhere in the path is left alone.
    input_root, input_ext = os.path.splitext(input_file)
    output_file = f"{input_root}_annotated{input_ext}"
    result_df = process_files(input_file, output_file, gtf_file)
    result_dfs.append(result_df)

Successfully processed custom_pipeline/results/Neuron_peak_analysis.csv and saved results to custom_pipeline/results/Neuron_peak_analysis_annotated.csv
Successfully processed custom_pipeline/results/NSC_peak_analysis.csv and saved results to custom_pipeline/results/NSC_peak_analysis_annotated.csv


In [92]:
# Preview the first rows of the first annotated table (built from input_files[0])
result_dfs[0].head()

Unnamed: 0,gene,Endogenous_Promoter,Exogenous_Promoter,chrom,coords,start,end,gene_symbol
0,chr1:3514840-3515094,True,True,chr1,3514840-3515094,3514840,3515094,Xkr4
1,chr1:3671504-3671967,True,True,chr1,3671504-3671967,3671504,3671967,Intergenic
2,chr1:4785444-4786146,True,True,chr1,4785444-4786146,4785444,4786146,Mrpl15
3,chr1:4857475-4857822,True,True,chr1,4857475-4857822,4857475,4857822,Tcea1;Gm37988
4,chr1:5015421-5015642,True,True,chr1,5015421-5015642,5015421,5015642,Rgs20
