In [None]:
from matplotlib import rcParams
# Notebook-wide matplotlib defaults, applied in a single update call.
rcParams.update({
    'pdf.fonttype': 42,          # enables correct plotting of text in saved PDFs
    'figure.figsize': (12, 12),  # default figure size
})
import pandas as pd

In [None]:
# Display the SciPy version installed in this kernel (shown as the cell output).
import scipy
scipy.__version__

'1.11.4'

In [None]:
def csv_to_bed_df(csv_file, bed_file):
    """Convert a tab-separated region table into a 3-column BED file.

    Parameters
    ----------
    csv_file : str or path-like
        Tab-separated input file with a header row that contains at least the
        columns ``chrom``, ``start`` and ``end``. Any other columns are ignored.
    bed_file : str or path-like
        Destination path. Written tab-separated, without header or index.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the input.
    """
    df = pd.read_csv(csv_file, sep='\t', index_col=None)

    # rename() needs a mapping (or a function) for `columns`; a plain list is
    # treated as a callable and raises TypeError. Renaming on the selection
    # result (not in place) also avoids mutating a slice of `df`.
    bed_df = df[['chrom', 'start', 'end']].rename(columns={'chrom': 'chromosome'})

    # Save to BED format (tab-separated, no header, no index)
    bed_df.to_csv(bed_file, sep='\t', header=False, index=False)
    print(f"Converted {csv_file} to {bed_file}")
def parse_index(index):
    """Split a region string ``'chrom:start-end'`` into ``(chrom, start, end)``.

    The chromosome is everything before the LAST ``':'``, so contig names that
    themselves contain colons (e.g. HLA-style names) still parse.

    Parameters
    ----------
    index : str
        Region identifier such as ``'chr1:100-200'``.

    Returns
    -------
    tuple
        ``(chrom, start, end)`` with ``start`` and ``end`` converted to ``int``.

    Raises
    ------
    ValueError
        If the string has no ``':'``, the coordinate part does not consist of
        exactly two ``'-'``-separated fields, or a coordinate is not an integer.
    """
    # rsplit with maxsplit=1 keeps any earlier ':' inside the contig name.
    chrom, positions = index.rsplit(':', 1)
    start, end = positions.split('-')
    return chrom, int(start), int(end)
def index_to_bed(index, bed_file):
    """Write region identifiers to a 3-column BED file.

    Parameters
    ----------
    index : iterable of str
        Region identifiers of the form ``'chrom:start-end'`` (for example a
        pandas Index); each one is parsed with ``parse_index``.
    bed_file : str or path-like
        Destination path. Written tab-separated, without header or index.

    Raises
    ------
    ValueError
        Propagated from ``parse_index`` when an identifier is malformed.
    """
    # Parse the identifiers to extract chromosome, start, and end positions
    parsed_data = [parse_index(idx) for idx in index]
    bed_df = pd.DataFrame(parsed_data, columns=['chromosome', 'start', 'end'])

    # Save to BED format (tab-separated)
    bed_df.to_csv(bed_file, sep='\t', header=False, index=False)
    # The message used to interpolate `csv_file`, which is not defined in this
    # function and raised NameError after the file had already been written.
    print(f"Converted {len(bed_df)} regions to {bed_file}")

In [None]:
import snapatac2 as snap
from collections import defaultdict
import numpy as np
import os

def parse_index(index):
    """Parse ``'chrom:start-end'`` into a ``(chrom, start, end)`` tuple.

    Splitting happens on the last ``':'`` only, so a contig name containing
    colons is kept intact as the chromosome field.

    Parameters
    ----------
    index : str
        Region identifier such as ``'chr1:100-200'``.

    Returns
    -------
    tuple
        ``(chrom, start, end)``; ``start`` and ``end`` are ``int``.

    Raises
    ------
    ValueError
        If there is no ``':'``, the coordinates are not exactly two
        ``'-'``-separated fields, or a coordinate is not an integer.
    """
    # maxsplit=1 from the right: only the final ':' separates name and range.
    chrom, positions = index.rsplit(':', 1)
    start, end = positions.split('-')
    return chrom, int(start), int(end)

def csv_to_bed(csv_file, bed_file):
    """Turn a CSV indexed by ``'chrom:start-end'`` strings into a BED file.

    The first CSV column is used as the index; every index label is parsed
    with ``parse_index`` and the resulting (chromosome, start, end) rows are
    written to ``bed_file`` tab-separated, without header or index.
    """
    # Only the index of the table is needed; the data columns are ignored.
    region_labels = pd.read_csv(csv_file, index_col=0).index
    rows = list(map(parse_index, region_labels))

    pd.DataFrame(rows, columns=['chromosome', 'start', 'end']).to_csv(
        bed_file, sep='\t', header=False, index=False
    )
    print(f"Converted {csv_file} to {bed_file}")

# Aggregate per-region differential statistics (DAR tables) to gene level.
#
# For every DAR*.csv under DAR/csv/<l>/ a region->gene network is built with
# snapatac2 (2000 bp upstream / 500 bp downstream), and for each gene the
# linked regions' log2 fold changes and p-values are combined. The result is
# written to DAR/csv/<l>/DAG/ with 'DAR' replaced by 'DAG_Overlap' in the
# file name. Files whose output already exists are skipped.
from collections import defaultdict

from scipy.stats import norm

# Floor applied to p-values before the log10 / Z-score transforms.
MIN_P_VALUE = 1e-50

# Per-gene columns produced below, in output order.
GENE_SCORE_COLUMNS = [
    "lfc_sum", "region_count", "z_sum", "score_sum", "average_lfc", "combined_pval"
]

for l in ['l1', 'l2']:
    for file in os.listdir(f"DAR/csv/{l}/"):
        if not (file.endswith('.csv') and file.startswith('DAR')):
            continue
        input_csv = f"DAR/csv/{l}/{file}"

        # Processing overlap file (as in DAG_overlap)
        file_save = file.replace('DAR', 'DAG_Overlap')
        output_csv = f"DAR/csv/{l}/DAG/{file_save}"

        # An existing output acts as a cache: skip before reading the input.
        if os.path.exists(output_csv):
            continue

        df = pd.read_csv(input_csv)

        network = snap.tl.init_network_from_annotation(
            regions=df['Feature_name'].tolist(),
            anno_file=snap.genome.GRCh38,
            upstream=2000,
            downstream=500,
            # only_coding=True
        )

        # region name -> (log2 fold change, p-value), built once so each edge
        # is a dict lookup instead of a scan over the whole table. Only the
        # first row per region is kept, matching the former `.values[0]`.
        first_hits = df.drop_duplicates(subset='Feature_name', keep='first')
        region_stats = dict(zip(
            first_hits['Feature_name'],
            zip(first_hits['log2(fold_change)'], first_hits['p-value']),
        ))

        # Initialize the gene score dictionary
        gene_scores = defaultdict(lambda: {
            "lfc_sum": 0, "region_count": 0, "z_sum": 0, "score_sum": 0
        })

        for edge in network.edge_list():
            region = network[edge[0]].id
            gene = network[edge[1]].id

            if region not in region_stats:
                continue
            log2_fold_change, p_value = region_stats[region]
            if p_value < MIN_P_VALUE:  # Cap extremely low p-values to avoid numerical issues
                p_value = MIN_P_VALUE

            # Convert p-value to Z-score for Stouffer's method.
            # isf(p) is the inverse survival function; the former
            # ppf(1 - p) evaluated to inf for small p because 1 - p rounds
            # to exactly 1.0 in floating point (already for p ~ 1e-17).
            z_score = norm.isf(p_value)

            weighted_score = log2_fold_change * (-np.log10(p_value))

            # Accumulate scores
            gene_scores[gene]["score_sum"] += weighted_score
            gene_scores[gene]["lfc_sum"] += log2_fold_change
            gene_scores[gene]["z_sum"] += z_score
            gene_scores[gene]["region_count"] += 1

        # Process the gene_scores to aggregate LFC and p-values
        for gene, data in gene_scores.items():
            n_regions = data["region_count"]

            # Average LFC as an unweighted mean
            data["average_lfc"] = data["lfc_sum"] / n_regions if n_regions > 0 else 0

            # Combine p-values using Stouffer's method. With a single region
            # sqrt(1) == 1, so this reduces to that region's own (capped)
            # p-value. sf(z) replaces 1 - cdf(z), which loses all precision
            # for large z.
            if n_regions > 0:
                data["combined_pval"] = norm.sf(data["z_sum"] / np.sqrt(n_regions))
            else:
                data["combined_pval"] = 1

        # Convert gene_scores to a DataFrame for easier analysis. reindex()
        # keeps the same column order and guarantees the columns exist even
        # when no gene was linked to any region (empty table instead of a
        # KeyError on 'score_sum').
        gene_scores_df = (
            pd.DataFrame.from_dict(gene_scores, orient='index')
            .reindex(columns=GENE_SCORE_COLUMNS)
        )
        gene_scores_df['average_score'] = gene_scores_df['score_sum'] / gene_scores_df['region_count']

        gene_scores_df = gene_scores_df.reset_index().rename(columns={'index': 'Gene'})

        # Make sure the DAG/ output folder exists before writing.
        os.makedirs(os.path.dirname(output_csv), exist_ok=True)
        gene_scores_df.to_csv(output_csv)
