In [1]:
import pandas as pd
import subprocess
import os
from pathlib import Path
import pysam
import numpy as np
import sys

In [2]:
# Results directory of the pipeline. Every relative path used further down
# ('aligned/...', 'consensus_peaks/...', the output CSVs) is resolved against
# this directory because the process changes into it here.
# NOTE(review): this is a hard-coded absolute path; the notebook will fail at
# os.chdir on any machine where it does not exist.
wd_dir = '/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_CUTandTAG/custom_pipeline/results'
os.chdir(wd_dir)

# Get the current working directory
current_dir = os.getcwd()

In [8]:
def ensure_bam_index(bam_file):
    """
    Ensure BAM file is indexed. Create index if it doesn't exist.

    An index is considered present when a file exists at any of the locations
    htslib searches for it: ``<bam>.bai``, ``<bam>.csi``, or the same two
    names with the BAM's own suffix replaced (``<stem>.bai``, ``<stem>.csi``).
    Only when none of them exists is ``pysam.index`` invoked.

    Parameters:
    - bam_file: path to BAM file (str or os.PathLike)

    Returns:
    - None

    Side effects:
    - May create an index file next to the BAM file.
    - Prints an error message and calls sys.exit(1) if indexing fails.
    """
    # Normalise so that pathlib.Path inputs work with string concatenation
    bam_path = os.fspath(bam_file)
    stem = os.path.splitext(bam_path)[0]
    index_candidates = (
        bam_path + '.bai',
        bam_path + '.csi',
        stem + '.bai',
        stem + '.csi',
    )
    if any(os.path.exists(candidate) for candidate in index_candidates):
        return

    print(f"Creating index for {bam_file}")
    try:
        pysam.index(bam_path)
    except Exception as e:
        print(f"Error creating index for {bam_file}: {str(e)}")
        sys.exit(1)

def calculate_coverage(bam_file, chrom, start, end, quality_threshold=15):
    """
    Calculate mean coverage for a genomic region from a BAM file.

    Coverage is taken from ``pysam.AlignmentFile.count_coverage``, which
    returns one per-position count array per nucleotide; the arrays are summed
    per position and then averaged over the region.

    Parameters:
    - bam_file: path to the BAM file
    - chrom: chromosome name
    - start: region start position (0-based)
    - end: region end position
    - quality_threshold: minimum base quality forwarded to ``count_coverage``.
      The default of 15 is pysam's own default, so existing callers get the
      same numbers as before; pass 0 to count every aligned base.

    Returns:
    - float: mean coverage across the region. 0.0 is returned (after printing
      a warning) when the region is empty/inverted or when any error occurs,
      so a 0.0 result does not by itself prove the region is uncovered.
    """
    # An empty or inverted interval has no positions to average over; report it
    # up front instead of opening the BAM file.
    if start is not None and end is not None and end <= start:
        print(f"Warning: Error calculating coverage for {chrom}:{start}-{end} in {bam_file}")
        print("Error: empty or inverted region (end <= start)")
        return 0.0

    try:
        # Ensure BAM is indexed (random access by region requires an index)
        ensure_bam_index(bam_file)

        with pysam.AlignmentFile(bam_file, "rb") as bam:
            # One count array per nucleotide for the requested region
            coverage_arrays = bam.count_coverage(
                chrom, start, end, quality_threshold=quality_threshold
            )

            # Sum all nucleotide counts per position
            total_coverage = np.sum(coverage_arrays, axis=0)

            # Mean depth across the region, as a plain Python float
            return float(np.mean(total_coverage))
    except Exception as e:
        # Deliberate best-effort: a failing region/BAM must not abort the run
        print(f"Warning: Error calculating coverage for {chrom}:{start}-{end} in {bam_file}")
        print(f"Error: {str(e)}")
        return 0.0

def get_sample_bams(tissue, condition):
    """
    Look up the replicate BAM files belonging to one tissue/condition pair.

    Parameters:
    - tissue: 'Neuron' or 'NSC'
    - condition: 'Endo' or 'Exo'

    Returns:
    - list: paths to BAM files for the specified condition

    The paths are relative, so they are checked against the current working
    directory. If any listed file is missing, an error is printed and
    sys.exit(1) is called. An unknown tissue or condition raises KeyError.
    """
    # Update these paths to match your actual BAM file locations
    neuron_replicates = {
        'Endo': ["aligned/NeuM2.bam", "aligned/NeuM3.bam"],
        'Exo': ["aligned/NeuV1.bam", "aligned/NeuV2.bam", "aligned/NeuV3.bam"],
    }
    nsc_replicates = {
        'Endo': ["aligned/NSCM1.bam", "aligned/NSCM2.bam", "aligned/NSCM3.bam"],
        'Exo': ["aligned/NSCv1.bam", "aligned/NSCv2.bam", "aligned/NSCv3.bam"],
    }
    replicates_by_tissue = {'Neuron': neuron_replicates, 'NSC': nsc_replicates}

    selected = replicates_by_tissue[tissue][condition]

    # Stop at the first path that is not on disk
    first_missing = next(
        (path for path in selected if not os.path.exists(path)),
        None,
    )
    if first_missing is not None:
        print(f"Error: BAM file not found: {first_missing}")
        sys.exit(1)

    return selected

def calculate_mean_coverage(peak_file, tissue, condition):
    """
    Calculate mean coverage across replicates for all peaks in a condition.

    For every peak record in the BED file, the mean coverage of the region is
    computed in each replicate BAM (via calculate_coverage) and the replicate
    values are averaged.

    Parameters:
    - peak_file: path to BED file containing peaks
    - tissue: tissue type ('Neuron' or 'NSC')
    - condition: condition type ('Endo' or 'Exo')

    Returns:
    - dict: mapping of region coordinates ("chrom:start-end") to mean coverage

    Notes:
    - Blank lines and BED header lines ('#', 'track', 'browser') are skipped.
    - A record with fewer than three columns or non-integer coordinates
      raises ValueError.
    - Prints an error and calls sys.exit(1) if the peak file is missing
      (get_sample_bams does the same for missing BAM files).
    """
    coverages = {}
    bam_files = get_sample_bams(tissue, condition)

    print(f"Processing {condition} peaks for {tissue} using {len(bam_files)} BAM files...")

    # Check if peak file exists
    if not os.path.exists(peak_file):
        print(f"Error: Peak file not found: {peak_file}")
        sys.exit(1)

    with open(peak_file) as f:
        for line in f:
            fields = line.split()

            # Skip blank lines and BED header/comment lines; they carry no
            # coordinates and would otherwise break the unpacking below.
            if not fields or fields[0].startswith('#') or fields[0] in ('track', 'browser'):
                continue

            # Parse peak coordinates
            chrom, start, end = fields[:3]
            start, end = int(start), int(end)
            region_key = f"{chrom}:{start}-{end}"

            # Calculate coverage for each replicate
            replicate_coverages = [
                calculate_coverage(bam_file, chrom, start, end)
                for bam_file in bam_files
            ]

            # Store mean coverage across replicates
            coverages[region_key] = np.mean(replicate_coverages)

    return coverages

def _read_peak_regions(peak_file):
    """Return the "chrom:start-end" key of every peak record in a BED file."""
    regions = []
    with open(peak_file) as f:
        for line in f:
            fields = line.split()
            # Blank lines and BED header/comment lines hold no coordinates
            if not fields or fields[0].startswith('#') or fields[0] in ('track', 'browser'):
                continue
            chrom, start, end = fields[:3]
            regions.append(f"{chrom}:{start}-{end}")
    return regions


def process_peaks(tissue):
    """
    Process peaks for a given tissue type and generate summary statistics.
    Focus only on the consensus peaks that were already generated.

    Three BED files are read from ``consensus_peaks/`` (relative to the
    current working directory): ``<tissue>_shared_peaks.bed``,
    ``<tissue>_Endo_specific.bed`` and ``<tissue>_Exo_specific.bed``. Each
    region becomes one row flagged with the condition(s) it was found in.

    Parameters:
    - tissue: tissue type ('Neuron' or 'NSC')

    Returns:
    - pandas.DataFrame with columns 'gene' (region key "chrom:start-end"),
      'Endogenous_Promoter' (bool) and 'Exogenous_Promoter' (bool). The frame
      has these columns even when all three files contain no peaks.

    Notes:
    - Prints an error and calls sys.exit(1) if any peak file is missing.
    - Blank lines and BED header lines ('#', 'track', 'browser') are skipped;
      a record with fewer than three columns raises ValueError.
    - Nothing is written to disk here; the caller decides where to save.
    """
    print(f"\nProcessing {tissue} peaks...")

    # Define input files for specific peaks (already generated by final_list.sh)
    shared_peaks = f"consensus_peaks/{tissue}_shared_peaks.bed"
    endo_specific = f"consensus_peaks/{tissue}_Endo_specific.bed"
    exo_specific = f"consensus_peaks/{tissue}_Exo_specific.bed"

    # (file, present in Endogenous, present in Exogenous)
    peak_sets = [
        (shared_peaks, True, True),
        (endo_specific, True, False),
        (exo_specific, False, True),
    ]

    # Check if files exist before reading any of them
    for peak_file, _, _ in peak_sets:
        if not os.path.exists(peak_file):
            print(f"Error: Peak file not found: {peak_file}")
            sys.exit(1)

    results = [
        {
            'gene': region,
            'Endogenous_Promoter': in_endo,
            'Exogenous_Promoter': in_exo,
        }
        for peak_file, in_endo, in_exo in peak_sets
        for region in _read_peak_regions(peak_file)
    ]

    # Explicit columns and bool dtypes keep the summary below working even
    # when no peaks were read at all.
    df = pd.DataFrame(
        results,
        columns=['gene', 'Endogenous_Promoter', 'Exogenous_Promoter'],
    ).astype({'Endogenous_Promoter': bool, 'Exogenous_Promoter': bool})

    # Print summary statistics
    print(f"\nResults for {tissue}:")
    print(f"Total regions analyzed: {len(df)}")
    print(f"Regions with Endogenous peaks: {df['Endogenous_Promoter'].sum()}")
    print(f"Regions with Exogenous peaks: {df['Exogenous_Promoter'].sum()}")
    print(f"Regions with both: {(df['Endogenous_Promoter'] & df['Exogenous_Promoter']).sum()}")
    return df

In [9]:
# Build one peak-presence table per tissue from the consensus peak BED files
# (consensus_peaks/<tissue>_*.bed); process_peaks also prints the summary counts.
df_neuron = process_peaks("Neuron")
df_nsc = process_peaks("NSC")



Processing Neuron peaks...

Results for Neuron:
Total regions analyzed: 94013
Regions with Endogenous peaks: 57049
Regions with Exogenous peaks: 60631
Regions with both: 23667

Processing NSC peaks...

Results for NSC:
Total regions analyzed: 25357
Regions with Endogenous peaks: 19037
Regions with Exogenous peaks: 11914
Regions with both: 5594


In [10]:
# Preview the first rows of the Neuron table (shared peaks are appended first)
df_neuron.head()

Unnamed: 0,gene,Endogenous_Promoter,Exogenous_Promoter
0,chr1:3514840-3515094,True,True
1,chr1:3671504-3671967,True,True
2,chr1:4785444-4786146,True,True
3,chr1:4857475-4857822,True,True
4,chr1:5015421-5015642,True,True


In [11]:
# Preview the first rows of the NSC table (shared peaks are appended first)
df_nsc.head()

Unnamed: 0,gene,Endogenous_Promoter,Exogenous_Promoter
0,chr1:7088533-7089020,True,True
1,chr1:7397417-7398361,True,True
2,chr1:9564436-9564866,True,True
3,chr1:9658927-9659271,True,True
4,chr1:9748279-9748842,True,True


In [None]:
# Write both tables as CSV into the current working directory; index=False
# leaves out the row-number column so only gene and the two flags are stored.
df_neuron.to_csv("neuron_peak_analysis.csv", index=False)
df_nsc.to_csv("nsc_peak_analysis.csv", index=False)
