In [None]:
import os
import tempfile
from pathlib import Path
# import umap
import matplotlib.pyplot as plt
import numpy as np
import pooch
import pandas as pd
import scanpy as sc
import scvi
import torch
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text
rcParams['figure.figsize'] = (12,12)

import muon as mu
import numpy as np
import pandas as pd
import scanpy as sc

from matplotlib import pyplot as plt
import seaborn as sns
import snapatac2 as snap

In [8]:
import pandas as pd
import snapatac2 as snap
from collections import defaultdict
import numpy as np
import os
from scipy.stats import combine_pvalues

def parse_index(index):
    chrom, positions = index.split(':')
    start, end = positions.split('-')
    return chrom, int(start), int(end)

def csv_to_bed(csv_file, bed_file):
    # Read the CSV file
    df = pd.read_csv(csv_file, index_col=0)
    
    # Parse the index to extract chromosome, start, and end positions
    parsed_data = [parse_index(idx) for idx in df.index]
    bed_df = pd.DataFrame(parsed_data, columns=['chromosome', 'start', 'end'])
    
    # Save to BED format (tab-separated)
    bed_df.to_csv(bed_file, sep='\t', header=False, index=False)
    print(f"Converted {csv_file} to {bed_file}")

for l in ['l1', 'l2']:
    for file in os.listdir(f"DAR/csv/{l}/"):
        if file.endswith('.csv') and file.startswith('DAR'):
            input_csv = f"DAR/csv/{l}/{file}"

            # Save the ranked list to a CSV file
            file_save = file.replace('DAR', 'DAG_distal')
            output_csv = f"DAR/csv/{l}/DAG/{file_save}"

            df = pd.read_csv(input_csv)
            if not os.path.exists(output_csv):
                
                # Initialize the network using your existing setup
                network = snap.tl.init_network_from_annotation(
                    regions=df['Feature_name'].tolist(),
                    anno_file=snap.genome.GRCh38,
                    # upstream=5000,
                    # downstream=100,
                    # only_coding=True
                )

                # Initialize a dictionary to store gene scores
                gene_scores = defaultdict(lambda: {
                    "score_sum": 0, "region_count": 0,
                    "lfc_sum": 0, "lfc_weight_sum": 0,
                    "pval_list": []
                })

                for edge in network.edge_list():
                    region = network[edge[0]].id
                    gene = network[edge[1]].id
                    
                    if region in df['Feature_name'].values:
                        region_data = df[df['Feature_name'] == region]
                        log2_fold_change = region_data['log2(fold_change)'].values[0]
                        p_value = region_data['p-value'].values[0]
                        if p_value < 1e-50:
                            p_value = 1e-50 
                        
                        # Calculate weighted score for each region
                        weighted_score = log2_fold_change * (-np.log10(p_value))
                        
                        # Accumulate scores
                        gene_scores[gene]["score_sum"] += weighted_score
                        gene_scores[gene]["region_count"] += 1
                        
                        # Accumulate weighted log2 fold changes
                        gene_scores[gene]["lfc_sum"] += log2_fold_change * (-np.log10(p_value))
                        gene_scores[gene]["lfc_weight_sum"] += (-np.log10(p_value))
                        
                        # Collect p-values for later combination
                        gene_scores[gene]["pval_list"].append(p_value)

                # Process the gene_scores to aggregate LFC and p-values
                for gene, data in gene_scores.items():
                    # Compute the average LFC, weighted by the significance of each region
                    if data["lfc_weight_sum"] != 0:
                        data["average_lfc"] = data["lfc_sum"] / data["lfc_weight_sum"]
                    else:
                        data["average_lfc"] = 0
                    
                    # Combine p-values using Fisher's method
                    if len(data["pval_list"]) > 1:
                        _, combined_pval = combine_pvalues(data["pval_list"], method='fisher')
                        data["combined_pval"] = combined_pval
                    else:
                        data["combined_pval"] = data["pval_list"][0] if data["pval_list"] else 1

                # Convert the dictionary to a DataFrame for ranking
                gene_df = pd.DataFrame.from_dict(gene_scores, orient='index')
                gene_df['average_score'] = gene_df['score_sum'] / gene_df['region_count']
                gene_df = gene_df.sort_values(by="average_score", ascending=False)

                gene_df.to_csv(output_csv)

            # Processing overlap file (as in DAG_overlap)
            file_save = file.replace('DAR', 'DAG_overlap')
            output_csv = f"DAR/csv/{l}/DAG/{file_save}"

            if not os.path.exists(output_csv):
                
                network = snap.tl.init_network_from_annotation(
                    regions=df['Feature_name'].tolist(),
                    anno_file=snap.genome.GRCh38,
                    upstream=5000,
                    downstream=100,
                    # only_coding=True
                )

                # Initialize a dictionary to store gene scores
                gene_scores = defaultdict(lambda: {
                    "score_sum": 0, "region_count": 0,
                    "lfc_sum": 0, "lfc_weight_sum": 0,
                    "pval_list": []
                })

                for edge in network.edge_list():
                    region = network[edge[0]].id
                    gene = network[edge[1]].id
                    
                    if region in df['Feature_name'].values:
                        region_data = df[df['Feature_name'] == region]
                        log2_fold_change = region_data['log2(fold_change)'].values[0]
                        p_value = region_data['p-value'].values[0]
                        if p_value < 1e-50:
                            p_value = 1e-50 
                        
                        # Calculate weighted score for each region
                        weighted_score = log2_fold_change * (-np.log10(p_value))
                        
                        # Accumulate scores
                        gene_scores[gene]["score_sum"] += weighted_score
                        gene_scores[gene]["region_count"] += 1
                        
                        # Accumulate weighted log2 fold changes
                        gene_scores[gene]["lfc_sum"] += log2_fold_change * (-np.log10(p_value))
                        gene_scores[gene]["lfc_weight_sum"] += (-np.log10(p_value))
                        
                        # Collect p-values for later combination
                        gene_scores[gene]["pval_list"].append(p_value)

                # Process the gene_scores to aggregate LFC and p-values
                for gene, data in gene_scores.items():
                    # Compute the average LFC, weighted by the significance of each region
                    if data["lfc_weight_sum"] != 0:
                        data["average_lfc"] = data["lfc_sum"] / data["lfc_weight_sum"]
                    else:
                        data["average_lfc"] = 0
                    
                    # Combine p-values using Fisher's method
                    if len(data["pval_list"]) > 1:
                        _, combined_pval = combine_pvalues(data["pval_list"], method='fisher')
                        data["combined_pval"] = combined_pval
                    else:
                        data["combined_pval"] = data["pval_list"][0] if data["pval_list"] else 1

                # Convert the dictionary to a DataFrame for ranking
                gene_df = pd.DataFrame.from_dict(gene_scores, orient='index')
                gene_df['average_score'] = gene_df['score_sum'] / gene_df['region_count']
                gene_df = gene_df.sort_values(by="average_score", ascending=False)

                gene_df.to_csv(output_csv)

  weighted_score = log2_fold_change * (-np.log10(p_value))
  gene_scores[gene]["lfc_sum"] += log2_fold_change * (-np.log10(p_value))
  gene_scores[gene]["lfc_weight_sum"] += (-np.log10(p_value))
  data["average_lfc"] = data["lfc_sum"] / data["lfc_weight_sum"]
  statistic = -2 * np.sum(np.log(pvalues))
  weighted_score = log2_fold_change * (-np.log10(p_value))
  gene_scores[gene]["lfc_sum"] += log2_fold_change * (-np.log10(p_value))
  gene_scores[gene]["lfc_weight_sum"] += (-np.log10(p_value))
  data["average_lfc"] = data["lfc_sum"] / data["lfc_weight_sum"]
  statistic = -2 * np.sum(np.log(pvalues))
  weighted_score = log2_fold_change * (-np.log10(p_value))
  gene_scores[gene]["lfc_sum"] += log2_fold_change * (-np.log10(p_value))
  gene_scores[gene]["lfc_weight_sum"] += (-np.log10(p_value))
  data["average_lfc"] = data["lfc_sum"] / data["lfc_weight_sum"]
  statistic = -2 * np.sum(np.log(pvalues))
  weighted_score = log2_fold_change * (-np.log10(p_value))
  gene_scores[gene]["lfc_sum"] +