In [1]:
# Standard library and third-party imports used throughout the notebook.
import os
import sys
import pandas as pd
# NOTE(review): wildcard import — no pycisTopic name is referenced in the cells
# visible here; consider importing only the names that are actually needed.
from pycisTopic import *

# Load environment variables (presumably from a local .env file, via python-dotenv)
# before they are read below.
from dotenv import load_dotenv
load_dotenv()
# Put the directory named by PROJECT_FUNCTIONS_PATH first on the import path so the
# project helper module imported below can be found.
# NOTE(review): os.getenv returns None when the variable is unset; the import of
# grn_helpers would then presumably fail — confirm the variable is always defined.
sys.path.insert(0, os.getenv('PROJECT_FUNCTIONS_PATH'))

# Project-local helper; must be imported after the sys.path insertion above.
from grn_helpers import set_output_folders

In [2]:
# Project base directory, read from the BASE_PATH environment variable (None when unset).
root_dir = os.environ.get('BASE_PATH')

In [5]:
# Neuron set to process. The keys available in `cells_dict` (defined in a later cell)
# are "L2-3_CUX2", "all_ex" and "all_ex_all_ages".
neurons_set = "L2-3_CUX2"

# Presumably the number of CPU cores for parallel steps — not used in the cells shown here.
n_cpus = 32

In [6]:
# Resolve the working directories for the selected neuron set.
# Names listed by the original author (their meaning is not evident from this cell — TODO confirm):
# all_excitatory ex_neurons ex_neurons_combined ex_progenitors all_inhibitory
out_dir, in_dir, root_dir, tmp_dir, data_folder = set_output_folders(root_dir, neurons_set)

# Load the consensus peak BED file (tab-separated, no header row) and label its six columns.
bed_column_names = ['chrom', 'start', 'end', 'peak_id', 'score', 'strand']
consensus_bed_path = os.path.join(out_dir, 'consensus_peak_calling/consensus_regions.bed')
consensus_regions = pd.read_csv(consensus_bed_path, sep='\t', header=None)
consensus_regions.columns = bed_column_names

# Map each neuron-set name to the cell types it contains; both "all_ex" variants
# use the same five cell types (each key gets its own list copy).
excitatory_cell_types = ['L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev']
cells_dict = {
    "all_ex": list(excitatory_cell_types),
    "all_ex_all_ages": list(excitatory_cell_types),
    "L2-3_CUX2": ['L2-3_CUX2'],
}

# Cell types of interest for the current run.
cell_types = cells_dict[neurons_set]

root_dir: /group/testa/michal.kubacki/herring_minimal
out_dir: /group/testa/michal.kubacki/herring_minimal/L2-3_CUX2
in_dir: /group/testa/michal.kubacki/herring_minimal/data
tmp_dir: /group/testa/michal.kubacki/herring_minimal/tmp


In [8]:
# Preview the first five consensus regions as a sanity check on the parsed columns.
consensus_regions.head(n=5)

Unnamed: 0,chrom,start,end,peak_id,score,strand
0,chr1,762659,763159,"L2-3_CUX2_peak_4a,L2-3_CUX2_peak_4b",8.048006,.
1,chr1,875527,876027,"L2-3_CUX2_peak_6a,L2-3_CUX2_peak_6b",2.090391,.
2,chr1,878463,878963,L2-3_CUX2_peak_7,1.254235,.
3,chr1,894443,894943,L2-3_CUX2_peak_8,25.241473,.
4,chr1,895685,896185,L2-3_CUX2_peak_9,4.546601,.


In [None]:
# For every cell type of interest, keep the consensus peaks whose `peak_id` mentions
# that cell type and whose score passes the threshold, then write them to one
# tab-separated, header-less file per cell type.

# Coordinate-based label for each peak, formatted "<chrom>_<start>_<end>".
consensus_regions['peak_name'] = (
    consensus_regions['chrom'].astype(str)
    + '_' + consensus_regions['start'].astype(str)
    + '_' + consensus_regions['end'].astype(str)
)

# Minimum score (inclusive) a peak needs to be considered valid.
score_threshold = 1.0

# Directory that receives the per-cell-type files; created if it does not exist yet.
cell_type_dir = os.path.join(out_dir, "cell_type_consensus_regions")
os.makedirs(cell_type_dir, exist_ok=True)

# The score filter does not depend on the cell type, so compute it once.
passes_score = consensus_regions['score'] >= score_threshold

for cell_type in cell_types:
    # Literal substring match: cell-type labels are plain names, not regular
    # expressions, and rows with a missing peak_id must count as "no match"
    # instead of putting NaN into the boolean mask.
    from_cell_type = consensus_regions['peak_id'].str.contains(cell_type, regex=False, na=False)

    # `.assign` returns a new frame with the extra column appended, so nothing is
    # written into a slice of `consensus_regions` (avoids SettingWithCopyWarning).
    cell_type_regions = consensus_regions.loc[from_cell_type & passes_score].assign(cell_type=cell_type)

    cell_type_regions_file = os.path.join(cell_type_dir, f"{cell_type}_consensus_regions.bed")
    cell_type_regions.to_csv(cell_type_regions_file, sep='\t', header=False, index=False)
    print(f"Saved valid consensus regions for cell type {cell_type} to {cell_type_regions_file}")