1. Data Exploration
2. Importing and Formating snATAC data
3. Getting pseudobulk profiles from cell annotations
4. Infering consensus peaks

# Environment

In [2]:
# Standard library imports
import os
import gc
import sys
import pickle
import importlib

# Data manipulation imports
import pandas as pd

# Visualization imports
import matplotlib.pyplot as plt
import seaborn as sns

# pycisTopic imports
import pycisTopic
from pycisTopic.pseudobulk_peak_calling import export_pseudobulk, peak_calling
from pycisTopic.iterative_peak_calling import get_consensus_peaks

importlib.reload(pycisTopic)
from pycisTopic import *
pycisTopic.__version__

sys.path.insert(0, "/home/michal.kubacki/Githubs/Re-MEND/code/External_Datasets/GeneSet_Derivation/Herring_scenic/helpers")
import config
importlib.reload(config)
from config import *
n_cpu = 32


In [3]:
#################################################################
reference = "hg19"


# neurons_set = "all_excitatory"
# neurons_set = "all_inhibitory"
neurons_set = "all_excitatory_all_ages"
# neurons_set = "all_inhibitory_all_ages"

cells_dict = {
    "all_inhibitory"            :   ['SST', 'VIP', 'MGE_dev'],
    "all_inhibitory_all_ages"   :   ['VIP', 'SST', 'PV', 'MGE_dev'],
    "all_excitatory"            :   ['L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev'],
    "all_excitatory_all_ages"   :   ['L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev']
}

ages_dict = {
    "all_inhibitory"            :   ['1m','3m','6m','10m','1y','2y','4y','ga22','ga24'],
    "all_inhibitory_all_ages"   :   ['1m','3m','6m','10m','1y','2y','4y','6y','10y','16y','20y','40y','ga22','ga24'],
    "all_excitatory"            :   ['1m','3m','6m','10m','1y','2y','4y','ga22','ga24'],
    "all_excitatory_all_ages"   :   ['1m','3m','6m','10m','1y','2y','4y','6y','10y','16y','20y','40y','ga22','ga24']
}

out_dir, in_dir, root_dir, tmp_dir, data_folder = set_output_folders(reference, neurons_set)

sel_celltypes  = cells_dict[neurons_set]
sel_ages = ages_dict[neurons_set]

#################################################################

root_dir: /group/testa/michal.kubacki/herring
out_dir: /group/testa/michal.kubacki/herring/output_hg19_all_excitatory
in_dir: /group/testa/michal.kubacki/herring/data
tmp_dir: /group/testa/michal.kubacki/herring/tmp


In [4]:
ATAC_metadata_path = os.path.join(in_dir, "Processed_data_ATAC_BCs-meta-data.csv")

In [5]:
fragments_dict = select_files(reference, selected_fragments = sel_ages)

All fragments: {'ga22': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138510_RL2366_ga22_snATAC_fragments.tsv.gz', '1y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138526_RL2209_1y_snATAC_fragments.tsv.gz', '14y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138542_RL2372_14y_snATAC_fragments.tsv.gz', 'ga24': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138512_RL2207_ga24_snATAC_fragments.tsv.gz', '2y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138529_RL1784_2y_snATAC_fragments.tsv.gz', '16y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138544_RL1785_16y_snATAC_fragments.tsv.gz', '1m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138515_RL2367_1m_snATAC_fragments.tsv.gz', '4y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138532_RL2210_4y_snATAC_fragments.tsv.gz', '20y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138548_RL2085_20y_snATAC_fragment

In [6]:
fragments_dict

{'1m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138515_RL2367_1m_snATAC_fragments.tsv.gz',
 '3m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138518_RL1914_3m_snATAC_fragments.tsv.gz',
 '6m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138521_RL2208_6m_snATAC_fragments.tsv.gz',
 '10m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138523_RL2371_10m_snATAC_fragments.tsv.gz',
 '1y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138526_RL2209_1y_snATAC_fragments.tsv.gz',
 '2y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138529_RL1784_2y_snATAC_fragments.tsv.gz',
 '4y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138532_RL2210_4y_snATAC_fragments.tsv.gz',
 'ga22': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138510_RL2366_ga22_snATAC_fragments.tsv.gz',
 'ga24': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138512_RL2207_ga24_snATAC_fragments.tsv.gz'}

# Load ATAC metadata

In [7]:
pd.set_option('display.max_columns', None)
cells_data = pd.read_csv(ATAC_metadata_path, sep=",", index_col = 0)
cells_data.head()

Unnamed: 0,TSSEnrichment,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,PassQC,NucleosomeRatio,nMultiFrags,nMonoFrags,nFrags,nDiFrags,DoubletScore,DoubletEnrichment,BlacklistRatio,Clusters,predictedCell,predictedGroup,predictedScore,FRIP,ReadsInPeaks
RL1784#TTGCGGGCATTGCGAT-1,3.187,1534.0,9410.0,981.0,0.052331,1.0,0.939366,8204.0,46360.0,89909.0,35345.0,-0.0,1.2,0.005456,C28,GCTACCTCAGCTTCCT-RL2100_86d_v3,L2/3_CUX2_dev-1,0.335881,0.420796,74502.0
RL1784#ACATGCATCAATTCCT-1,2.553,1012.0,7014.0,700.0,0.047747,1.0,0.786993,5692.0,41102.0,73449.0,26655.0,-0.0,1.028571,0.004765,C46,TCCTTCTTCCTAAACG-RL2121_ga34_v3,L5/6_TLE4_SCUBE1,0.541803,0.384517,56111.0
RL1784#CCGTGAGCAGGTAGCA-1,2.914,1167.0,7173.0,1268.0,0.050797,1.0,0.954166,6839.0,36130.0,70604.0,27635.0,-0.0,0.9,0.00898,C46,CCTTTGGGTGTATTCG-RL2108_179d_v3,L5/6_TLE4_SORCS1,0.439298,0.398352,54136.0
RL1784#TTACTCACAACTCCCT-1,2.684,1095.0,7214.0,1044.0,0.051844,1.0,0.771999,7073.0,39263.0,69574.0,23238.0,-0.0,1.1,0.007503,C46,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.372978,0.361398,49385.0
RL1784#CCACAGGAGACACGGT-1,3.119,1208.0,7018.0,737.0,0.052616,1.0,1.795448,8900.0,23857.0,66691.0,33934.0,-0.0,0.914286,0.005525,C46,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.326004,0.442014,57643.0


## Add new columns

In [8]:
def process_age(name):
    parts = name.split('-')[1]
    processed_name = parts.split('_')[1]
    return processed_name

def process_name(name):
    parts = name.split('_')[:2]
    processed_name = '_'.join(parts)
    processed_name = processed_name.replace('/', '-')
    return processed_name

def process_chem(name):
    processed_name = name.split('_')[-1]
    return processed_name

def map_major_clust(name, mapped_names):
    for prefix, name_list in mapped_names.items():
        if name in name_list:
            return prefix
    return process_name(name)

cells_data['age'] = cells_data.predictedCell.apply(process_age)
cells_data['chem'] = cells_data.predictedCell.apply(process_chem)

unique_names = cells_data.predictedGroup.unique()

prefixes = sel_celltypes
mapped_names = {}

for prefix in prefixes:
    mapped_names[prefix] = []
    for name in unique_names:
        if name.startswith(prefix):
            mapped_names[prefix].append(name)

cells_data['major_clust'] = cells_data.predictedGroup.apply(lambda x: map_major_clust(x, mapped_names))

In [9]:
# def process_age(name):
#     parts = name.split('-')[1]
#     processed_name = parts.split('_')[1]
#     return processed_name

# def process_name(name):
#     parts = name.split('_')[:2]
#     processed_name = '_'.join(parts)
#     processed_name = processed_name.replace('/', '-')
#     return processed_name

# def process_chem(name):
#     processed_name = name.split('_')[-1]
#     return processed_name

# cells_data['age'] = cells_data.predictedCell.apply(process_age)
# cells_data['chem'] = cells_data.predictedCell.apply(process_chem)

# unique_names = cells_data.predictedGroup.unique()
# name_mapping = {name: process_name(name) for name in unique_names}
# cells_data['major_clust'] = cells_data.predictedGroup.map(name_mapping)

## Match `ages` from the fragments files

In [10]:
mapping = {
    '2d': '1m',
    '34d': '1m',
    '86d': '3m',
    '118d': '3m',
    '179d': '6m',
    '301d': '10m',
    '422d': '1y',
    '2yr': '2y',
    '627d': '2y',
    '3yr': '4y',
    '4yr': '4y',
    '6yr': '6y',
    '8yr': '8y',
    '10yr': '10y',
    '12yr': '14y',
    '14yr': '14y',
    '16yr': '16y',
    '17yr': '16y',
    '20yr': '20y',
    '25yr': '25y',
    '40yr': '40y',
    'ga22': 'ga22',
    'ga24': 'ga24',
    'ga34': 'ga24'
}

cells_data["age_mapped"] = [mapping.get(age, age) for age in cells_data.age]
cells_data["age_mapped"].unique()

array(['3m', 'ga24', '6m', 'ga22', '1m', '10m', '2y', '16y', '4y', '1y',
       '20y', '6y', '14y', '40y', '25y', '10y', '8y'], dtype=object)

# Filter ATAC metadata

In [11]:
print(cells_data.shape)

(87339, 24)


In [13]:
cells_data = cells_data[cells_data['major_clust'].isin(sel_celltypes)]
print(cells_data.shape)

(32365, 24)


In [14]:
print(cells_data['chem'].value_counts())
print(cells_data['PassQC'].value_counts())

v3    29982
v2     2383
Name: chem, dtype: int64
1.0    32365
Name: PassQC, dtype: int64


In [15]:
cells_data = cells_data[cells_data['chem']=="v3"]
print(cells_data.shape)

(29982, 24)


In [16]:
cells_data = cells_data[cells_data['age_mapped'].isin(sel_ages)]
print(cells_data.shape)

(22558, 24)


# Format indexes

In [17]:
def format_index(index):
    parts = index.split("#")
    formatted_index = parts[1]
    return formatted_index

cells_data = cells_data.rename(index=format_index)

In [18]:
cells_data["old_index"] = cells_data.index
cells_data.index = cells_data.index + "-" + cells_data["age_mapped"]

In [19]:
cells_data.head()

Unnamed: 0,TSSEnrichment,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,PassQC,NucleosomeRatio,nMultiFrags,nMonoFrags,nFrags,nDiFrags,DoubletScore,DoubletEnrichment,BlacklistRatio,Clusters,predictedCell,predictedGroup,predictedScore,FRIP,ReadsInPeaks,age,chem,major_clust,age_mapped,old_index
TTGCGGGCATTGCGAT-1-3m,3.187,1534.0,9410.0,981.0,0.052331,1.0,0.939366,8204.0,46360.0,89909.0,35345.0,-0.0,1.2,0.005456,C28,GCTACCTCAGCTTCCT-RL2100_86d_v3,L2/3_CUX2_dev-1,0.335881,0.420796,74502.0,86d,v3,L2-3_CUX2,3m,TTGCGGGCATTGCGAT-1
ACATGCATCAATTCCT-1-ga24,2.553,1012.0,7014.0,700.0,0.047747,1.0,0.786993,5692.0,41102.0,73449.0,26655.0,-0.0,1.028571,0.004765,C46,TCCTTCTTCCTAAACG-RL2121_ga34_v3,L5/6_TLE4_SCUBE1,0.541803,0.384517,56111.0,ga34,v3,L5-6_TLE4,ga24,ACATGCATCAATTCCT-1
CCGTGAGCAGGTAGCA-1-6m,2.914,1167.0,7173.0,1268.0,0.050797,1.0,0.954166,6839.0,36130.0,70604.0,27635.0,-0.0,0.9,0.00898,C46,CCTTTGGGTGTATTCG-RL2108_179d_v3,L5/6_TLE4_SORCS1,0.439298,0.398352,54136.0,179d,v3,L5-6_TLE4,6m,CCGTGAGCAGGTAGCA-1
TTACTCACAACTCCCT-1-ga24,2.684,1095.0,7214.0,1044.0,0.051844,1.0,0.771999,7073.0,39263.0,69574.0,23238.0,-0.0,1.1,0.007503,C46,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.372978,0.361398,49385.0,ga34,v3,L5-6_TLE4,ga24,TTACTCACAACTCCCT-1
CCACAGGAGACACGGT-1-ga24,3.119,1208.0,7018.0,737.0,0.052616,1.0,1.795448,8900.0,23857.0,66691.0,33934.0,-0.0,0.914286,0.005525,C46,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.326004,0.442014,57643.0,ga34,v3,L5-6_TLE4,ga24,CCACAGGAGACACGGT-1


In [20]:
cells_data = cells_data[cells_data.age_mapped.isin(fragments_dict)]
cells_data.head()

Unnamed: 0,TSSEnrichment,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,PassQC,NucleosomeRatio,nMultiFrags,nMonoFrags,nFrags,nDiFrags,DoubletScore,DoubletEnrichment,BlacklistRatio,Clusters,predictedCell,predictedGroup,predictedScore,FRIP,ReadsInPeaks,age,chem,major_clust,age_mapped,old_index
TTGCGGGCATTGCGAT-1-3m,3.187,1534.0,9410.0,981.0,0.052331,1.0,0.939366,8204.0,46360.0,89909.0,35345.0,-0.0,1.2,0.005456,C28,GCTACCTCAGCTTCCT-RL2100_86d_v3,L2/3_CUX2_dev-1,0.335881,0.420796,74502.0,86d,v3,L2-3_CUX2,3m,TTGCGGGCATTGCGAT-1
ACATGCATCAATTCCT-1-ga24,2.553,1012.0,7014.0,700.0,0.047747,1.0,0.786993,5692.0,41102.0,73449.0,26655.0,-0.0,1.028571,0.004765,C46,TCCTTCTTCCTAAACG-RL2121_ga34_v3,L5/6_TLE4_SCUBE1,0.541803,0.384517,56111.0,ga34,v3,L5-6_TLE4,ga24,ACATGCATCAATTCCT-1
CCGTGAGCAGGTAGCA-1-6m,2.914,1167.0,7173.0,1268.0,0.050797,1.0,0.954166,6839.0,36130.0,70604.0,27635.0,-0.0,0.9,0.00898,C46,CCTTTGGGTGTATTCG-RL2108_179d_v3,L5/6_TLE4_SORCS1,0.439298,0.398352,54136.0,179d,v3,L5-6_TLE4,6m,CCGTGAGCAGGTAGCA-1
TTACTCACAACTCCCT-1-ga24,2.684,1095.0,7214.0,1044.0,0.051844,1.0,0.771999,7073.0,39263.0,69574.0,23238.0,-0.0,1.1,0.007503,C46,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.372978,0.361398,49385.0,ga34,v3,L5-6_TLE4,ga24,TTACTCACAACTCCCT-1
CCACAGGAGACACGGT-1-ga24,3.119,1208.0,7018.0,737.0,0.052616,1.0,1.795448,8900.0,23857.0,66691.0,33934.0,-0.0,0.914286,0.005525,C46,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.326004,0.442014,57643.0,ga34,v3,L5-6_TLE4,ga24,CCACAGGAGACACGGT-1


In [21]:
print(cells_data.shape)

(22558, 25)


In [22]:
for cell in cells_data['major_clust'].unique():
    print(f"{cell}: {(cells_data['major_clust']==cell).sum()}")

L2-3_CUX2: 6355
L5-6_TLE4: 3292
L4_RORB: 5956
L5-6_THEMIS: 2071
PN_dev: 4884


In [23]:
cells_data.to_csv(os.path.join(out_dir, 'cells_data.csv'), index=True)

In [24]:
pd.reset_option('display.max_columns')

# Getting pseudobulk profiles from cell annotations

In [25]:
cell_data = pd.read_csv(os.path.join(out_dir, 'cells_data.csv'), index_col = 0)
cell_data.head()

Unnamed: 0,TSSEnrichment,ReadsInTSS,ReadsInPromoter,ReadsInBlacklist,PromoterRatio,PassQC,NucleosomeRatio,nMultiFrags,nMonoFrags,nFrags,...,predictedCell,predictedGroup,predictedScore,FRIP,ReadsInPeaks,age,chem,major_clust,age_mapped,old_index
TTGCGGGCATTGCGAT-1-3m,3.187,1534.0,9410.0,981.0,0.052331,1.0,0.939366,8204.0,46360.0,89909.0,...,GCTACCTCAGCTTCCT-RL2100_86d_v3,L2/3_CUX2_dev-1,0.335881,0.420796,74502.0,86d,v3,L2-3_CUX2,3m,TTGCGGGCATTGCGAT-1
ACATGCATCAATTCCT-1-ga24,2.553,1012.0,7014.0,700.0,0.047747,1.0,0.786993,5692.0,41102.0,73449.0,...,TCCTTCTTCCTAAACG-RL2121_ga34_v3,L5/6_TLE4_SCUBE1,0.541803,0.384517,56111.0,ga34,v3,L5-6_TLE4,ga24,ACATGCATCAATTCCT-1
CCGTGAGCAGGTAGCA-1-6m,2.914,1167.0,7173.0,1268.0,0.050797,1.0,0.954166,6839.0,36130.0,70604.0,...,CCTTTGGGTGTATTCG-RL2108_179d_v3,L5/6_TLE4_SORCS1,0.439298,0.398352,54136.0,179d,v3,L5-6_TLE4,6m,CCGTGAGCAGGTAGCA-1
TTACTCACAACTCCCT-1-ga24,2.684,1095.0,7214.0,1044.0,0.051844,1.0,0.771999,7073.0,39263.0,69574.0,...,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.372978,0.361398,49385.0,ga34,v3,L5-6_TLE4,ga24,TTACTCACAACTCCCT-1
CCACAGGAGACACGGT-1-ga24,3.119,1208.0,7018.0,737.0,0.052616,1.0,1.795448,8900.0,23857.0,66691.0,...,TATACCTAGGGCAACT-RL2121_ga34_v3,L5/6_TLE4_SORCS1,0.326004,0.442014,57643.0,ga34,v3,L5-6_TLE4,ga24,CCACAGGAGACACGGT-1


In [26]:
chromsizes = pd.read_table(
    os.path.join(in_dir, f"{reference}.chrom.sizes"),
    header = None,
    names = ["Chromosome", "End"]
)
chromsizes.insert(1, "Start", 0)
chromsizes.head()

Unnamed: 0,Chromosome,Start,End
0,chr1,0,249250621
1,chr2,0,243199373
2,chr3,0,198022430
3,chr4,0,191154276
4,chr5,0,180915260


potentialy beforehand you  might want to run `for file in *fragments.tsv.gz; do tabix -p bed "$file"; done`

In [27]:
gc.collect()

33160

In [28]:
fragments_dict

{'1m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138515_RL2367_1m_snATAC_fragments.tsv.gz',
 '3m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138518_RL1914_3m_snATAC_fragments.tsv.gz',
 '6m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138521_RL2208_6m_snATAC_fragments.tsv.gz',
 '10m': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138523_RL2371_10m_snATAC_fragments.tsv.gz',
 '1y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138526_RL2209_1y_snATAC_fragments.tsv.gz',
 '2y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138529_RL1784_2y_snATAC_fragments.tsv.gz',
 '4y': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138532_RL2210_4y_snATAC_fragments.tsv.gz',
 'ga22': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138510_RL2366_ga22_snATAC_fragments.tsv.gz',
 'ga24': '/group/testa/michal.kubacki/herring/data/GSE168408_RAW/GSM5138512_RL2207_ga24_snATAC_fragments.tsv.gz'}

In [29]:
#########  Requied modification to the export_pseudobulk function ###

# bed_paths = {}
# for cell_type in cell_data[variable].unique():
#     _bed_fname = os.path.join(
#         bed_path,
#         f"{_santize_string_for_filename(cell_type)}.fragments.tsv.gz")
#     if os.path.exists(_bed_fname):
#         bed_paths[cell_type] = _bed_fname
#     else:
#         log.warning(f"Missing fragments for {cell_type}!")

# # log.info("generating bigwig files")
# # joblib.Parallel(n_jobs=n_cpu)(
# #     joblib.delayed(_generate_bigwig)
# #     (
# #         path_to_fragments = bed_paths[cell_type],
# #         chromsizes = chromsizes_dict,
# #         normalize_bigwig = normalize_bigwig,
# #         bw_filename = os.path.join(bigwig_path, f"{_santize_string_for_filename(cell_type)}.bw"),
# #         log = log
# #     )
# #     for cell_type in bed_paths.keys()
# # )
# # bw_paths = {}
# # for cell_type in cell_data[variable].unique():
# #     _bw_fname = os.path.join(
# #         bigwig_path,
# #         f"{_santize_string_for_filename(cell_type)}.bw")
# #     if os.path.exists(_bw_fname):
# #         bw_paths[cell_type] = _bw_fname
# #     else:
# #         log.warning(f"Missing bigwig for {cell_type}!")

# # return bw_paths, bed_paths
# return  bed_paths

In [32]:
# %%script false --no-raise-error

os.makedirs(os.path.join(out_dir, "consensus_peak_calling"), exist_ok = True)
os.makedirs(os.path.join(out_dir, "consensus_peak_calling/pseudobulk_bed_files"), exist_ok = True)
os.makedirs(os.path.join(out_dir, "consensus_peak_calling/pseudobulk_bw_files"), exist_ok = True)

paths = export_pseudobulk(
    input_data = cells_data,
    variable = "major_clust",
    sample_id_col = "age_mapped",
    chromsizes = chromsizes,
    bigwig_path = os.path.join(root_dir, "", "consensus_peak_calling/pseudobulk_bw_files"),
    bed_path = os.path.join(out_dir, "consensus_peak_calling/pseudobulk_bed_files"),
    path_to_fragments = fragments_dict,
    n_cpu = 1,
    temp_dir = tmp_dir,
    split_pattern = "-"
)

2024-06-07 09:20:46,702 cisTopic     INFO     Splitting fragments by cell type.


In [None]:
bw_path, bed_paths = paths

NameError: name 'paths' is not defined

In [None]:
# %%script false --no-raise-error
with open(os.path.join(out_dir, "consensus_peak_calling/bed_paths.tsv"), "wt") as f:
    for v in bed_paths:
        _ = f.write(f"{v}\t{bed_paths[v]}\n")

: 

In [None]:
# %%script false --no-raise-error

# directory = os.path.join(out_dir, "consensus_peak_calling/pseudobulk_bed_files")

# # Get the list of file names in the directory
# file_names = os.listdir(directory)
# # Create a dictionary to store the file paths
# file_paths = {}

# # Iterate over the file names and store their paths in the dictionary
# for file_name in file_names:
#     file_path = os.path.join(directory, file_name)
#     file_paths[file_name] = file_path

# # Specify the output directory and file name
# output_file = "bed_paths.tsv"

# # Write the file paths to the output file
# with open(os.path.join(out_dir, "consensus_peak_calling/", output_file), "wt") as f:
#     for file_name, file_path in file_paths.items():
#         f.write(f"{file_name}\t{file_path}\n")

: 

# Inferring consensus peaks

In [None]:
bed_paths = {}
with open(os.path.join(out_dir, "consensus_peak_calling/bed_paths.tsv")) as f:
    for line in f:
        v, p = line.strip().split("\t")
        bed_paths.update({v: p})

: 

In [None]:
bed_paths

: 

In [None]:
# %%script false --no-raise-error 
import logging

macs_path = "/home/michal.kubacki/.conda/envs/scenicplus/bin/macs2" 
# macs_path = "macs2"

os.makedirs(os.path.join(out_dir, "consensus_peak_calling/MACS"), exist_ok = True)

narrow_peak_dict = peak_calling(
    macs_path = macs_path,
    bed_paths = bed_paths,
    outdir = os.path.join(os.path.join(out_dir, "consensus_peak_calling/MACS")),
    genome_size = 'hs',
    n_cpu = 1, # n_cpu,
    input_format = 'BEDPE',
    shift = 73,
    ext_size = 146,
    keep_dup = 'all',
    q_value = 0.05,
    _temp_dir = '/tmp',
    skip_empty_peaks=True,
    logging_level=logging.DEBUG
)

: 

In [None]:
# %%script false --no-raise-error

# Other param
peak_half_width=250
path_to_blacklist=os.path.join(in_dir, f"{reference}-blacklist.v2.bed")
# Get consensus peaks
consensus_peaks = get_consensus_peaks(
    narrow_peaks_dict = narrow_peak_dict,
    peak_half_width = peak_half_width,
    chromsizes = chromsizes,
    path_to_blacklist = path_to_blacklist)

: 

In [None]:
# %%script false --no-raise-error

consensus_peaks.to_bed(
    path = os.path.join(out_dir, "consensus_peak_calling/consensus_regions.bed"),
    keep = True,
    compression = 'infer',
    chain = False)

: 