In [None]:
# Maddy Peng -- 2/10/25

## Tobias workflow

Starting from filtering the BAM files with given barcodes for each sample and cell type.
**See R script for generating these barcodes and peak files**

In [1]:
# Install pysam if you don't have it already
!pip install pysam

Defaulting to user installation because normal site-packages is not writeable


In [1]:
import subprocess
import pysam
import pandas as pd
import numpy as np
import os
import re

In [2]:
# Get working directory
!pwd

/groups/mb928_gp/mp4486/tobias


In [3]:
# Read in metadata to get diagnoses
meta = pd.read_csv("../Multiome_process/meta/current_working_multiome_meta (1).csv")

  meta = pd.read_csv("../Multiome_process/meta/current_working_multiome_meta (1).csv")


In [4]:
meta['sample']

0         CM021
1         CM021
2         CM021
3         CM021
4         CM021
          ...  
495032    MM152
495033    MM152
495034    MM152
495035    MM152
495036    MM152
Name: sample, Length: 495037, dtype: object

In [5]:
# Get IDs from the metadata
ids = list(set(meta['sample']))
len(ids)

74

In [6]:
# Filter meta to just get the sample and donor/diagnosis
meta = meta[meta['sample'].isin(ids)]
dx = meta[['sample','Diagnosis']].drop_duplicates()

In [7]:
dx

Unnamed: 0,sample,Diagnosis
0,CM021,NONE
2222,CM023,NONE
7526,CM024,NONE
14429,CM025,MDD
20707,CM026,NONE
...,...,...
486334,MM148,NONE
489199,MM149,NONE
490177,MM150,NONE
491058,MM151,NONE


In [8]:
# Create a dictionary for control and mdd samples
dx_dict = {'control':[], 'mdd':[]}
for i in range(len(dx)):
    if dx.iloc[i]['Diagnosis'][0] == 'N':
        dx_dict['control'].append(dx.iloc[i]['sample'])
    else:
        dx_dict['mdd'].append(dx.iloc[i]['sample'])

In [33]:
cell_types = ['GC.1', 'ExN.1', 'InN.5']

In [34]:
# Get all the bam file paths per sample
def get_bam_path(i):
    if i[:2] == 'CM':
        path = "/groups/mb928_gp/data_backup/ngs/release/singleCell/220502_MAURA_CHEICK_16_HUMAN_SEQUENCING/bamfiles/"

    else:
        if re.match(r"^(00[1-9]|0[12][0-9]|03[01])$", i[2:5]):
            path = "/groups/mb928_gp/data_backup/230817_MAURA_MADELINE_12_HUMAN_10X/bamfiles/"

        elif re.match(r"^(14[5-9]|15[0-2])$", i[2:5]):
            path = "/groups/mb928_gp/data_backup/250131_MAURA_MADELINE_7_HUMAN_LIBRARY_SELF_3B_PE150_X/BAM_Multiome/"
        else:    
            path = "/groups/mb928_gp/data_backup/240806_MAURA_MADELINE_6_HUMAN_LIBRARY_10X/BAM/"

    return path + f"{i}_atac_possorted_bam.bam"

    
bam_paths = [get_bam_path(i) for i in ids]

In [35]:
len(bam_paths)

74

In [36]:
# Define filtering function: Use pysam for filtering bam by barcode
def filter_bam_by_barcode(input_bam, barcode_file, output_bam):
    """Filter a BAM file to keep only reads with barcodes from a given list."""
    if os.path.exists(output_bam):
        print(f"{output_bam} already exits!")
        return

    if not os.path.exists(barcode_file):
        print(f"{barcode_file} does not exist ..... skipping .......")
        return

    print(input_bam)
        
    # Read barcode list from file
    with open(barcode_file, 'r') as f:
        barcodes = set(line.strip() for line in f)
    
    # Open input BAM and create an output BAM
    with pysam.AlignmentFile(input_bam, "r") as bam_in, \
         pysam.AlignmentFile(output_bam, "wb", template=bam_in) as bam_out:
        
        for read in bam_in:
            # Extract barcode from read (Assumes barcode is stored in a tag, e.g., 'CB' for 10x data)
            barcode = read.get_tag('CB') if read.has_tag('CB') else None
            
            if barcode and f"CB:Z:{barcode}" in barcodes:
                bam_out.write(read)

    print(f"Filtered BAM saved: {output_bam}")


def process_sample(args):
    c, sample_id, input_bam = args
    output_path = f"CURRENT/filtered_bams/{c}"
    os.makedirs(output_path, exist_ok=True)

    barcode_file = f"CURRENT/barcodes/{sample_id}_{c}.txt"
    output_bam = f"{output_path}/{sample_id}_filtered.bam"
    
    filter_bam_by_barcode(input_bam, barcode_file, output_bam)

In [37]:
# Check number of cores available
!nproc

128


In [None]:
# Nested for loop: for each cell type loop through each sample 
from concurrent.futures import ProcessPoolExecutor

# Prepare all tasks
tasks = []
for c in cell_types:
    for i in range(len(ids)):
        tasks.append((c, ids[i], bam_paths[i]))

# Run in parallel
with ProcessPoolExecutor(max_workers=128) as executor:
    executor.map(process_sample, tasks)

In [39]:
# Define merge bam file function: use pysam to merge
def merge_bam_files(args):
    """Merge BAM files for a given cell type and condition."""
    condition, cell_type, bam_list, output_dir = args
    
    if not bam_list:
        return f"No BAMs to merge for {cell_type} - {condition}"

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    output_path = os.path.join(output_dir, f"{condition}_merged.bam")

    if os.path.exists(output_path):
        print(f"{condition} {cell_type} already exists")
        return
    
    pysam.merge("-f", output_path, *bam_list)
    print(f"Merged {condition} BAM for {cell_type}: {output_path}")

# Prepare tasks
merge_tasks = []
for c in cell_types:
    bam_path = f"CURRENT/filtered_bams/{c}"

    ctrl_bams = [f"{bam_path}/{i}_filtered.bam" for i in dx_dict['control'] if os.path.exists(f"{bam_path}/{i}_filtered.bam")]
    mdd_bams = [f"{bam_path}/{i}_filtered.bam" for i in dx_dict['mdd'] if os.path.exists(f"{bam_path}/{i}_filtered.bam")]

    merge_tasks.append(("control", c, ctrl_bams, f"CURRENT/merged_bams/{c}"))
    merge_tasks.append(("mdd", c, mdd_bams, f"CURRENT/merged_bams/{c}"))

In [None]:
# Run in parallel
with ProcessPoolExecutor(max_workers=8) as executor:
    executor.map(merge_bam_files, merge_tasks)

control ImGC.2 already existscontrol NB already exists



In [None]:
!pip install tobias

In [19]:
cell_types

['NSC', 'INP']

In [23]:
!pwd

/groups/mb928_gp/mp4486/tobias


In [None]:
# Run TOBIAS ATACorrect

import os
import subprocess
from concurrent.futures import ProcessPoolExecutor

# Constants
genome_fasta = "/groups/mb928_gp/adr2189/TOBIAS/Tobias_footprinting_non_GN/ATACorrect_input/genome.fa"
blacklist = "/groups/mb928_gp/adr2189/TOBIAS/Tobias_footprinting_non_GN/ATACorrect_input/ENCFF356LFX_blacklist.bed"
peaks_dir = 'CURRENT/peak_files'
output_root = 'CURRENT/merged_bams'

# Build tasks: (group, cell_type, bam_file, peak_file, output_dir)
tasks = []
for group in ['control','mdd']:
    for c in cell_types:
        bam_file = f"{output_root}/{c}/{group}_merged.bam"
        peak_file = f"{peaks_dir}/{c}.bed"
        output_dir = f"{output_root}/{c}"
        output_prefix = f"{output_dir}/{group}_corrected"

        if os.path.exists(bam_file) and os.path.exists(peak_file):
            tasks.append((bam_file, peak_file, output_prefix, output_dir))

print(tasks)
# Function to run TOBIAS ATACorrect
def run_tobias(task):
    bam_file, peaks_file, output_prefix, outdir = task
    cell_type = os.path.basename(peaks_file).split('.')[0]
    print(f"Running TOBIAS ATACorrect for {cell_type}")

    command = [
        "TOBIAS", "ATACorrect",
        "--bam", bam_file,
        "--genome", genome_fasta,
        "--blacklist", blacklist,
        "--peaks", peaks_file,
        "--outdir", outdir,
        "--prefix", os.path.basename(output_prefix),
        "--cores", "4"
    ]
    
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(result.stdout.decode())
    if result.stderr:
        print(result.stderr.decode())

# Run in parallel
with ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(run_tobias, tasks)

In [None]:
# Run TOBIAS footprint scores

from concurrent.futures import ProcessPoolExecutor

peaks_dir = 'CURRENT/peak_files'
genome_dir = 'CURRENT/merged_bams'
conditions = ['control', 'mdd']

def prepare_args(condition, cell_type):
    corrected_file = f"{genome_dir}/{cell_type}/{condition}_corrected_corrected.bw"
    peaks_file = f"{peaks_dir}/{cell_type}.bed"
    output_dir = f"{genome_dir}/{cell_type}/footprints"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f"{condition}_footprints.bw")
    return (condition, cell_type, corrected_file, peaks_file, output_file)

def process_task(args):
    condition, cell_type, corrected_file, peaks_file, output_file = args
    print(f"Running TOBIAS FootprintScores for {cell_type} ({condition})")
    
    command = [
        "TOBIAS", "FootprintScores",
        "--signal", corrected_file,
        "--regions", peaks_file,
        "--output", output_file,
        "--cores", "4"
    ]

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    print(f"[{condition} - {cell_type}] STDOUT:\n{result.stdout.decode()}")
    print(f"[{condition} - {cell_type}] STDERR:\n{result.stderr.decode()}")

# Create task list
args_list = [prepare_args(cond, ct) for cond in conditions for ct in cell_types]

# Run in parallel
with ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(process_task, args_list)

In [None]:
# UROPA
import os
import subprocess
from concurrent.futures import ProcessPoolExecutor

# Directory containing peak files
peaks_dir = 'CURRENT/peak_files'
output_dir = 'CURRENT/uropa_output'
os.makedirs(output_dir, exist_ok=True)

# GTF file path
gtf_file = "/groups/mb928_gp/adr2189/TOBIAS/Tobias_footprinting_non_GN/ATACorrect_input/Homo_sapiens.GRCh38.113.chr.gtf"

# Function to run uropa for a single peak file
def run_uropa(peak_file):
    print(f"Running uropa for: {peak_file}")
    command = [
        "uropa",
        "--bed", peak_file,
        "--gtf", gtf_file,
        "--show_attributes", "gene_id", "gene_name",
        "--feature_anchor", "start",
        "--distance", "20000", "10000",
        "--feature", "gene",
        "--outdir", output_dir
    ]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(result.stdout.decode())
    print(result.stderr.decode())

# Get all .bed files in the peak directory
peak_files = [os.path.join(peaks_dir, f"{c}.bed") for c in cell_types]

# Run in parallel
with ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(run_uropa, peak_files)

In [None]:
#Cut files
uropa_output_dir = 'CURRENT/uropa_output'

# Function to run cut header
def run_cut_header(cell_type):
    final_hits = os.path.join(uropa_output_dir, f"{cell_type}_finalhits.txt")
    header_output = os.path.join(uropa_output_dir, f"{cell_type}_annotated_header.txt")
    
    command = f"cut -f 1-6,16-17 {final_hits} | head -n 1 > {header_output}"
    subprocess.run(command, shell=True, check=True)
    print(f"Header saved for {cell_type}")

# Function to run cut body (without header)
def run_cut(cell_type):
    final_hits = os.path.join(uropa_output_dir, f"{cell_type}_finalhits.txt")
    body_output = os.path.join(uropa_output_dir, f"{cell_type}_annotated.bed")
    
    command = f"cut -f 1-6,16-17 {final_hits} | tail -n +2 > {body_output}"
    subprocess.run(command, shell=True, check=True)
    print(f"Annotated BED saved for {cell_type}")

# Parallel execution
with ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(run_cut_header, cell_types)
    executor.map(run_cut, cell_types)

In [None]:
# Run bin detect

genome_fasta = "/groups/mb928_gp/adr2189/TOBIAS/Tobias_footprinting_non_GN/ATACorrect_input/genome.fa"
motif_file = "/groups/mb928_gp/adr2189/TOBIAS/Tobias_footprinting_non_GN/JASPAR2024.jaspar"
uropa_output_dir = 'CURRENT/uropa_output'

def run_bin_detect(cell_type):
    base = 'CURRENT/merged_bams'
    ctrl_fp = f"{base}/{cell_type}/footprints/control_footprints.bw"
    mdd_fp = f"{base}/{cell_type}/footprints/mdd_footprints.bw"
    
    peaks = f"{uropa_output_dir}/{cell_type}_annotated.bed"
    header = f"{uropa_output_dir}/{cell_type}_annotated_header.txt"
    output_dir = f"CURRENT/BINDetect_output/{cell_type}"
    os.makedirs(output_dir, exist_ok=True)

    cmd = [
        "TOBIAS", "BINDetect",
        "--motifs", motif_file,
        "--signals", mdd_fp, ctrl_fp,
        "--genome", genome_fasta,
        "--peaks", peaks,
        "--peak_header", header,
        "--outdir", output_dir,
        "--cond_names", f"{cell_type}MDD", f"{cell_type}CTRL",
        "--cores", "4"
    ]

    try:
        subprocess.run(cmd, check=True)
        print(f"BINDetect ran for {cell_type}")
    except subprocess.CalledProcessError as e:
        print(f"Error running BINDetect for {cell_type}: {e}")

# Run in parallel
with ProcessPoolExecutor(max_workers=4) as executor:
    executor.map(run_bin_detect, cell_types)

In [18]:
!pwd

/groups/mb928_gp/mp4486/tobias


In [31]:
import os
os.environ["OPENBLAS_NUM_THREADS"] = "4" 

In [34]:
import subprocess
import pandas as pd
from concurrent.futures import ThreadPoolExecutor
import os

# List of cell types
cell_types = ["GN.1", "GN.2", "ExN.1", "InN.5"]

# Significance thresholds
PVAL_THRESHOLD = 0.05
CHANGE_THRESHOLD = 0.1

# Function to extract significant TFs for a given cell type
def get_significant_tfs(cell):
    bindetect_path = f"CURRENT/BINDetect_output/{cell}/bindetect_results.txt"
    if not os.path.exists(bindetect_path):
        print(f"‚ö†Ô∏è  Missing BINDetect file for {cell}")
        return []

    df = pd.read_csv(bindetect_path, sep="\t")

    # Dynamically build column names
    change_col = f"{cell}MDD_{cell}CTRL_change"
    pval_col = f"{cell}MDD_{cell}CTRL_pvalue"

    if change_col not in df.columns or pval_col not in df.columns:
        print(f"‚ö†Ô∏è  Columns missing in {bindetect_path}: {change_col}, {pval_col}")
        return []

    sig_df = df[(df[pval_col] < PVAL_THRESHOLD) & (df[change_col].abs() > CHANGE_THRESHOLD)]

    # Format: Name_MotifID
    significant_tfs = (sig_df["name"] + "_" + sig_df["motif_id"]).tolist()
    return significant_tfs

# Function to run TOBIAS PlotAggregate for a (cell, tf) combo
def run_plotaggregate(cell_tf):
    cell, tf = cell_tf
    tfbs_bed = f"CURRENT/BINDetect_output/{cell}/{tf}/beds/{tf}_all.bed"
    output_file = f"CURRENT/plots/{cell}_{tf}_plotaggregate.pdf"
    title = f"{cell} {tf} Aggregated Signals"

    if not os.path.exists(tfbs_bed):
        print(f"‚ö†Ô∏è  Missing TFBS bed file: {tfbs_bed}")
        return

    cmd = [
        "TOBIAS", "PlotAggregate",
        "--signals",
        f"CURRENT/merged_bams/{cell}/footprints/control_footprints.bw",
        f"CURRENT/merged_bams/{cell}/footprints/mdd_footprints.bw",
        "--TFBS", tfbs_bed,
        "--output", output_file,
        "--signal_labels", "control", "mdd",
        "--title", title
    ]

    try:
        subprocess.run(cmd, check=True)
        print(f"‚úÖ Finished: {cell} x {tf}")
    except subprocess.CalledProcessError as e:
        print(f"‚ùå Error running for {cell} x {tf}: {e}")

# Build task list
tasks = []
for cell in cell_types:
    tfs = get_significant_tfs(cell)
    print(f"üìå {cell}: {len(tfs)} significant TFs")
    tasks.extend([(cell, tf) for tf in tfs])

with ThreadPoolExecutor(max_workers=16) as executor:
        executor.map(run_plotaggregate, tasks)

üìå GN.1: 240 significant TFs
üìå GN.2: 11 significant TFs
üìå ExN.1: 82 significant TFs
üìå InN.5: 163 significant TFs
‚ö†Ô∏è  Missing TFBS bed file: CURRENT/BINDetect_output/GN.1/NFIC::TLX1_MA0119.1/beds/NFIC::TLX1_MA0119.1_all.bed
# TOBIAS 0.17.1 PlotAggregate (run started 2025-04-23 14:13:27.845175)
# TOBIAS 0.17.1 PlotAggregate (run started 2025-04-23 14:13:27.845191)
# TOBIAS 0.17.1 PlotAggregate (run started 2025-04-23 14:13:27.845198)
# Working directory: /groups/mb928_gp/mp4486/tobias
# Working directory: /groups/mb928_gp/mp4486/tobias
# Command line call: TOBIAS PlotAggregate --signals CURRENT/merged_bams/GN.1/footprints/control_footprints.bw CURRENT/merged_bams/GN.1/footprints/mdd_footprints.bw --TFBS CURRENT/BINDetect_output/GN.1/TFAP4_MA0691.1/beds/TFAP4_MA0691.1_all.bed --output CURRENT/plots/GN.1_TFAP4_MA0691.1_plotaggregate.pdf --signal_labels control mdd --title GN.1 TFAP4_MA0691.1 Aggregated Signals

# Working directory: /groups/mb928_gp/mp4486/tobias
# Command li

In [33]:
tasks

[('GN.1', 'Arnt_MA0004.1'),
 ('GN.1', 'NFIC::TLX1_MA0119.1'),
 ('GN.1', 'Arid3a_MA0151.1'),
 ('GN.1', 'PLAG1_MA0163.1'),
 ('GN.1', 'Tfcp2l1_MA0145.2'),
 ('GN.1', 'KLF5_MA0599.1'),
 ('GN.1', 'Creb3l2_MA0608.1'),
 ('GN.1', 'HNF1B_MA0153.2'),
 ('GN.1', 'JDP2_MA0655.1'),
 ('GN.1', 'MEF2B_MA0660.1'),
 ('GN.1', 'MLX_MA0663.1'),
 ('GN.1', 'TFAP4_MA0691.1'),
 ('GN.1', 'ZIC1_MA0696.1'),
 ('GN.1', 'PHOX2A_MA0713.1'),
 ('GN.1', 'PROP1_MA0715.1'),
 ('GN.1', 'EGR2_MA0472.2'),
 ('GN.1', 'GLIS2_MA0736.1'),
 ('GN.1', 'KLF16_MA0741.1'),
 ('GN.1', 'MEF2D_MA0773.1'),
 ('GN.1', 'PAX3_MA0780.1'),
 ('GN.1', 'POU3F2_MA0787.1'),
 ('GN.1', 'POU3F3_MA0788.1'),
 ('GN.1', 'TFAP2B_MA0813.1'),
 ('GN.1', 'TFAP2C_MA0815.1'),
 ('GN.1', 'Ascl2_MA0816.1'),
 ('GN.1', 'HEY1_MA0823.1'),
 ('GN.1', 'FOXB1_MA0845.1'),
 ('GN.1', 'FOXC1_MA0032.2'),
 ('GN.1', 'TFAP2A_MA0872.1'),
 ('GN.1', 'FERD3L_MA1485.1'),
 ('GN.1', 'ZBTB32_MA1580.1'),
 ('GN.1', 'ZNF8_MA1718.1'),
 ('GN.1', 'Foxn1_MA1684.1'),
 ('GN.1', 'POU2F1::SOX2_MA1962.1'),

In [30]:
import subprocess
from concurrent.futures import ThreadPoolExecutor

# List of cell types
cell_types = ["GN.1", "ExN.1", "InN.5"]  # Add all your cell types here

def run_plotaggregate(cell_tf):
    cell, tf = cell_tf
    tfbs_bed = f"CURRENT/BINDetect_output/{cell}/{tf}/beds/{tf}_all.bed"
    output_file = f"CURRENT/plots/{cell}_{tf}_plotaggregate.pdf"
    title = f"{cell} {tf} Aggregated Signals"

    cmd = [
        "TOBIAS", "PlotAggregate",
        "--signals",
        f"CURRENT/merged_bams/{cell}/footprints/control_footprints.bw",
        f"CURRENT/merged_bams/{cell}/footprints/mdd_footprints.bw",
        "--TFBS", tfbs_bed,
        "--output", output_file,
        "--signal_labels", "control", "mdd",
        "--title", title
    ]

    try:
        subprocess.run(cmd, check=True)
        print(f"‚úÖ Finished: {cell} x {tf}")
    except subprocess.CalledProcessError as e:
        print(f"‚ùå Error running for {cell} x {tf}: {e}")

# Generate all combinations of cell types and TFs
tasks = [
    ('GN.2', 'Nrf1_MA0506.3'),
]

with ThreadPoolExecutor(max_workers=16) as executor:
        executor.map(run_plotaggregate, tasks)

# TOBIAS 0.17.1 PlotAggregate (run started 2025-04-23 13:50:59.234902)
# TOBIAS 0.17.1 PlotAggregate (run started 2025-04-23 13:50:59.234905)
# TOBIAS 0.17.1 PlotAggregate (run started 2025-04-23 13:50:59.234908)
# Working directory: /groups/mb928_gp/mp4486/tobias
# Command line call: TOBIAS PlotAggregate --signals CURRENT/merged_bams/GN.1/footprints/control_footprints.bw CURRENT/merged_bams/GN.1/footprints/mdd_footprints.bw --TFBS CURRENT/BINDetect_output/GN.1/Neurod2_MA0668.3/beds/Neurod2_MA0668.3_all.bed --output CURRENT/plots/GN.1_Neurod2_MA0668.3_plotaggregate.pdf --signal_labels control mdd --title GN.1 Neurod2_MA0668.3 Aggregated Signals

# Working directory: /groups/mb928_gp/mp4486/tobias
# Working directory: /groups/mb928_gp/mp4486/tobias
# Command line call: TOBIAS PlotAggregate --signals CURRENT/merged_bams/GN.1/footprints/control_footprints.bw CURRENT/merged_bams/GN.1/footprints/mdd_footprints.bw --TFBS CURRENT/BINDetect_output/GN.1/ZNF354A_MA1978.2/beds/ZNF354A_MA1978.2_al

ERROR: File "CURRENT/BINDetect_output/GN.2/Nrf1_MA0506.3.2/beds/Nrf1_MA0506.3.2_all.bed" does not exists


2025-04-23 13:50:59 (2258544) [STATS]	COUNT MEF2A_MA0052.5_all: 37957 sites
2025-04-23 13:50:59 (2258544) [INFO]	Reading signal from bigwigs
2025-04-23 13:50:59 (2258544) [INFO]	- Reading signal from control
2025-04-23 13:50:59 (2258556) [STATS]	COUNT POU4F1_MA0790.2_all: 41889 sites
2025-04-23 13:50:59 (2258556) [INFO]	Reading signal from bigwigs
2025-04-23 13:50:59 (2258556) [INFO]	- Reading signal from control
2025-04-23 13:50:59 (2258540) [STATS]	COUNT ZNF354A_MA1978.2_all: 45823 sites
2025-04-23 13:50:59 (2258540) [INFO]	Reading signal from bigwigs
2025-04-23 13:50:59 (2258540) [INFO]	- Reading signal from control
‚ùå Error running for GN.2 x Nrf1_MA0506.3.2: Command '['TOBIAS', 'PlotAggregate', '--signals', 'CURRENT/merged_bams/GN.2/footprints/control_footprints.bw', 'CURRENT/merged_bams/GN.2/footprints/mdd_footprints.bw', '--TFBS', 'CURRENT/BINDetect_output/GN.2/Nrf1_MA0506.3.2/beds/Nrf1_MA0506.3.2_all.bed', '--output', 'CURRENT/plots/GN.2_Nrf1_MA0506.3.2_plotaggregate.pdf', '--

In [27]:
# Out of all these, aggregate the spreadsheets

from collections import defaultdict

cell_types = ['ExN.1', 'GN.1', 'GN.2', 'InN.5']

# Read the RTF and parse TFs for each cell type
celltype_to_tfs = defaultdict(list)

with open('CURRENT/TFs_to_use.txt', 'r') as f:
    current_celltype = None
    for line in f:
        line = line.strip()
        if not line:
            continue
     
        if line in cell_types:  
            current_celltype = line
        else:
            tf = line.strip()
            if current_celltype is not None:
                celltype_to_tfs[current_celltype].append(tf)
            else:
                print(f"‚ö†Ô∏è Found TF without cell type: {tf}")
print(f"Parsed cell types: {list(celltype_to_tfs.keys())}")

Parsed cell types: ['ExN.1', 'GN.1', 'GN.2', 'InN.5']


In [28]:
import os
output_dir = "combined_TF_hits_per_celltype"
os.makedirs(output_dir, exist_ok=True)

celltype_dfs = {}

for celltype, tf_list in celltype_to_tfs.items():
    print(f"\nProcessing {celltype} with {len(tf_list)} TFs...")
    dfs = []
    cell_dir = os.path.join('CURRENT/BINDetect_output', celltype)
    
    for tf in tf_list:
        csv_file = os.path.join(cell_dir,tf,f"{tf}_overview.txt")
        if os.path.exists(csv_file):
            df = pd.read_csv(csv_file, sep="\t")
            dfs.append(df)
        else:
            print(f"‚ö†Ô∏è Missing: {csv_file}")
    
    if dfs:
        combined_df = pd.concat(dfs, ignore_index=True)
        celltype_dfs[celltype] = combined_df
        combined_df.to_csv(os.path.join(output_dir, f"{celltype}_combined.csv"), index=False)
    else:
        print(f"‚ùó No TF files found for {celltype}!")


Processing ExN.1 with 48 TFs...

Processing GN.1 with 63 TFs...

Processing GN.2 with 3 TFs...

Processing InN.5 with 47 TFs...


In [34]:
celltype_dfs['ExN.1']

Unnamed: 0,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,peak_score,peak_strand,gene_id,gene_name,ExN.1MDD_score,ExN.1CTRL_score,ExN.1MDD_bound,ExN.1CTRL_bound,ExN.1MDD_ExN.1CTRL_log2fc
0,GL000194.1,55947,55955,CTCFL_MA1102.3,8.76966,-,GL000194.1,55808,56722,peak_6,.,.,,,0.04061,0.04195,0,0,-0.02634
1,GL000194.1,58435,58443,CTCFL_MA1102.3,7.56089,+,GL000194.1,58351,58695,peak_8,.,.,,,0.03660,0.03447,0,0,0.04542
2,GL000194.1,68032,68040,CTCFL_MA1102.3,7.56089,-,GL000194.1,67958,68971,peak_11,.,.,,,0.03447,0.03158,0,0,0.06401
3,GL000194.1,68142,68150,CTCFL_MA1102.3,8.76966,+,GL000194.1,67958,68971,peak_11,.,.,,,0.03511,0.02238,0,0,0.30282
4,GL000194.1,71739,71747,CTCFL_MA1102.3,7.34821,-,GL000194.1,71707,71927,peak_12,.,.,,,0.03614,0.02471,0,0,0.26437
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4028396,chrY,19077139,19077153,ZIC1_MA0696.1,6.61887,-,chrY,19075515,19078146,peak_229899,.,.,ENSG00000310485,,0.22560,0.18743,1,0,0.23125
4028397,chrY,19567593,19567607,ZIC1_MA0696.1,4.72802,-,chrY,19566500,19568218,peak_229901,.,.,ENSG00000131002,TXLNGY,0.20418,0.22325,0,1,-0.11197
4028398,chrY,19744381,19744395,ZIC1_MA0696.1,5.61109,-,chrY,19743926,19745415,peak_229907,.,.,ENSG00000288049,,0.05755,0.04629,0,0,0.19358
4028399,chrY,20978937,20978951,ZIC1_MA0696.1,5.80581,-,chrY,20978846,20979298,peak_229918,.,.,,,0.06520,0.08041,0,0,-0.20949


In [8]:
de_rna = pd.read_csv('../Multiome_process/DE_RESULTS/dif_expression_mdd_vs_ctrl_leiden_annotations.csv')
de_atac = pd.read_csv('../Multiome_process/DE_RESULTS/DE_from_predicted_gene_activity_mdd_vs_ctrl.csv')

de_rna = de_rna[de_rna['adj.P.Val'] < 0.05]
de_atac = de_atac[de_atac['adj.P.Val'] < 0.05]

Unnamed: 0.1,Unnamed: 0,logFC,AveExpr,t,P.Value,adj.P.Val,B,cell_type,gene
0,LINC02315,2.519347,2.738877,6.642613,7.553579e-09,0.000120,7.801319,Astro.1,LINC02315
1,FP671120.4,1.427939,3.204372,5.122715,2.933403e-06,0.023344,3.708431,Astro.1,FP671120.4
2,ACOT1,1.670513,1.890780,4.815490,9.218714e-06,0.041702,2.200764,Astro.1,ACOT1
3,MIR2052HG,0.566993,6.254350,4.670235,1.568618e-05,0.041702,2.894991,Astro.1,MIR2052HG
4,CACNB2,-0.468203,9.722582,-4.636752,1.771383e-05,0.041702,2.754153,Astro.1,CACNB2
...,...,...,...,...,...,...,...,...,...
459427,TNFRSF1925,-0.748854,4.613302,-5.149835,2.514253e-06,0.008209,4.401316,OPC,TNFRSF19
459428,AC090403.116,-2.053451,3.900041,-4.881804,6.906411e-06,0.018791,3.340676,OPC,AC090403.1
459429,MIR3681HG23,-0.724127,8.138444,-4.693994,1.383843e-05,0.028820,2.923820,OPC,MIR3681HG
459430,AC005906.227,-0.536407,6.102415,-4.688440,1.412333e-05,0.028820,2.993036,OPC,AC005906.2


In [54]:
for c in cell_types:
    df = celltype_dfs[c]
    rna = de_rna[de_rna['cell_type'] == c]
    atac = de_atac[de_atac['cell_type'] == c]
    overlap_rna = list(set(df['gene_name']) & set(rna['gene']))
    overlap_atac = list(set(df['gene_name']) & set(atac['gene']))

    overlaps = list(set(overlap_rna + overlap_atac))
    print(len(overlaps))
    celltype_dfs[c] = df[df['gene_name'].isin(overlaps)]

107
1487
25
25


In [126]:
for c in cell_types:
    df = celltype_dfs[c]
    # df = df[(df[f"{c}MDD_bound"]==0) | (df[f"{c}CTRL_bound"]==0)]
    
    # rna = de_rna[de_rna['cell_type'] == c]
    # overlap_rna = list(set(df['gene_name']) & set(rna['gene']))
    
    # df = df[df['gene_name'].isin(overlap_rna)]
    # celltype_dfs[c] = df    
    df.to_csv(os.path.join('combined_TF_hits_per_celltype/', f"{c}_combined_filtered_final.csv"), index=False)

In [5]:
cell_types = ['GN.1', 'GN.2', 'ExN.1', 'InN.5']
celltype_dfs = {}

for c in cell_types:
    df = pd.read_csv(os.path.join('combined_TF_hits_per_celltype/', f"{c}_combined_filtered_final.csv"))
    celltype_dfs[c] = df

len(celltype_dfs)

4

In [19]:
celltype_dfs['GN.1'].sort_values('GN.1MDD_GN.1CTRL_log2fc', ascending=False).tail(20)

Unnamed: 0,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,peak_score,peak_strand,gene_id,gene_name,GN.1MDD_score,GN.1CTRL_score,GN.1MDD_bound,GN.1CTRL_bound,GN.1MDD_GN.1CTRL_log2fc
114567,chr1,228635401,228635412,ZNF682_MA1599.2,7.14135,+,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.24288,0.54277,0,1,-1.04151
111881,chr1,228635349,228635359,ZNF610_MA1713.2,7.04504,+,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.40497,0,1,-1.05507
40792,chr1,228635372,228635383,KLF16_MA0741.1,8.37561,+,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748
62089,chr1,228635372,228635383,SP3_MA0746.3,8.49302,+,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748
24853,chr1,228635373,228635382,KLF10_MA1511.2,7.05394,-,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748
20209,chr1,228635373,228635382,KLF6_MA1517.2,8.2917,+,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748
64734,chr1,228635373,228635382,SP4_MA0685.2,6.26553,-,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748
28141,chr1,228635373,228635383,KLF11_MA1512.2,9.08053,+,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748
22002,chr1,228635373,228635381,KLF7_MA1959.2,7.70673,-,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748
67687,chr1,228635373,228635383,SP9_MA1564.2,9.10381,+,chr1,228635126,228635663,peak_14349,.,.,ENSG00000199270,RNA5S12,0.17391,0.41826,0,1,-1.09748


In [6]:
celltype_dfs['GN.1'].value_counts(['TFBS_name','gene_name']).to_frame().head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,count
TFBS_name,gene_name,Unnamed: 2_level_1
ZNF454_MA1712.2,DAP,6
PATZ1_MA1961.2,DAP,4
KLF15_MA1513.2,DAP,4
ZBED4_MA2328.1,NETO2,4
ZNF530_MA1981.2,SUSD4,3
PATZ1_MA1961.2,DPF3,3
ZBED4_MA2328.1,ZMIZ1,3
ZNF93_MA1721.2,DAP,3
KLF12_MA0742.2,DAP,3
E2F6_MA0471.3,HNRNPR,3


In [136]:
celltype_dfs['GN.1'].value_counts(['TFBS_name','gene_name'])

TFBS_name        gene_name
ZNF454_MA1712.2  DAP          6
PATZ1_MA1961.2   DAP          4
KLF15_MA1513.2   DAP          4
ZBED4_MA2328.1   NETO2        4
ZNF530_MA1981.2  SUSD4        3
                             ..
PLAG1_MA0163.1   AKAP5        1
                 BRINP3       1
                 FADS3        1
                 FAM118A      1
PATZ1_MA1961.2   RUNX1T1      1
Name: count, Length: 815, dtype: int64

In [7]:
gn1_stk = celltype_dfs['GN.1'][celltype_dfs['GN.1']['gene_name'] == 'STK33']
stk_regions = gn1_stk[['TFBS_chr', 'TFBS_start', 'TFBS_end']].astype(str).agg('-'.join, axis=1)
stk_regions.to_csv('CURRENT/highlight_regions/DAP.csv')

In [140]:
gn1_dap['highlight_region'] = gn1_dap[['TFBS_chr', 'TFBS_start', 'TFBS_end']].astype(str).agg('-'.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gn1_dap['highlight_region'] = gn1_dap[['TFBS_chr', 'TFBS_start', 'TFBS_end']].astype(str).agg('-'.join, axis=1)


In [144]:
gn1_dap = gn1_dap[(gn1_dap['TFBS_name'] == 'ZNF454_MA1712.2') | (gn1_dap['TFBS_name'] == 'PATZ1_MA1961.2') | (gn1_dap['TFBS_name'] == 'KLF15_MA1513.2')]

In [149]:
gn1_dap[['TFBS_name', 'gene_name', 'highlight_region', 'GN.1MDD_bound', 'GN.1MDD_GN.1CTRL_log2fc']]

Unnamed: 0,TFBS_name,gene_name,highlight_region,GN.1MDD_bound,GN.1MDD_GN.1CTRL_log2fc
1276058,KLF15_MA1513.2,DAP,chr5-10760870-10760878,1,0.15424
1276059,KLF15_MA1513.2,DAP,chr5-10760881-10760889,1,0.14383
1276060,KLF15_MA1513.2,DAP,chr5-10760940-10760948,1,0.42277
1276067,KLF15_MA1513.2,DAP,chr5-10761406-10761414,1,0.33202
1613861,PATZ1_MA1961.2,DAP,chr5-10760869-10760880,1,0.15424
1613862,PATZ1_MA1961.2,DAP,chr5-10760939-10760950,1,0.42277
1613867,PATZ1_MA1961.2,DAP,chr5-10761404-10761415,1,0.33202
1613868,PATZ1_MA1961.2,DAP,chr5-10761416-10761427,1,0.33202
3670971,ZNF454_MA1712.2,DAP,chr5-10760726-10760743,1,0.20824
3670972,ZNF454_MA1712.2,DAP,chr5-10760769-10760786,0,-0.10754


In [8]:
gn1_stk

Unnamed: 0,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,peak_score,peak_strand,gene_id,gene_name,GN.1MDD_score,GN.1CTRL_score,GN.1MDD_bound,GN.1CTRL_bound,GN.1MDD_GN.1CTRL_log2fc
116,chr11,8612754,8612762,KLF4_MA0039.5,7.17966,+,chr11,8612218,8613143,peak_24945,.,.,ENSG00000130413,STK33,0.32035,0.15041,1,0,0.91862
729,chr11,8612738,8612752,ZNF93_MA1721.2,9.86667,+,chr11,8612218,8613143,peak_24945,.,.,ENSG00000130413,STK33,0.35848,0.15722,1,0,1.01297
932,chr11,8612681,8612692,ZNF682_MA1599.2,7.26837,+,chr11,8612218,8613143,peak_24945,.,.,ENSG00000130413,STK33,0.29257,0.14367,1,0,0.8549


## Finding the TFs and genes with highest logFC in RNA DEGs

In [6]:
gn1_genes = list(set(celltype_dfs['GN.1']['gene_name']))
gn2_genes = list(set(celltype_dfs['GN.2']['gene_name']))
ex1_genes = list(set(celltype_dfs['ExN.1']['gene_name']))
in5_genes = list(set(celltype_dfs['InN.5']['gene_name']))

In [9]:
gn1_footprints_degs = de_rna[(de_rna['gene'].isin(gn1_genes)) & (de_rna['cell_type']=='GN.1')]
gn2_footprints_degs = de_rna[(de_rna['gene'].isin(gn2_genes)) & (de_rna['cell_type']=='GN.2')]
ex1_footprints_degs = de_rna[(de_rna['gene'].isin(ex1_genes)) & (de_rna['cell_type']=='ExN.1')]
in5_footprints_degs = de_rna[(de_rna['gene'].isin(in5_genes)) & (de_rna['cell_type']=='InN.5')]

In [10]:
gn2_footprints_degs


Unnamed: 0.1,Unnamed: 0,logFC,AveExpr,t,P.Value,adj.P.Val,B,cell_type,gene
238363,ZDHHC11B13,-0.942788,4.233546,-5.073574,4e-06,0.013788,3.706624,GN.2,ZDHHC11B


In [19]:
celltype_dfs['GN.1']=celltype_dfs['GN.1'].merge(gn1_footprints_degs[['gene', 'logFC', 'AveExpr', 'adj.P.Val']],
                                                 how='left',
                                                 left_on = 'gene_name', right_on='gene')

celltype_dfs['GN.2']=celltype_dfs['GN.2'].merge(gn2_footprints_degs[['gene', 'logFC', 'AveExpr', 'adj.P.Val']],
                                                 how='left',
                                                 left_on = 'gene_name', right_on='gene')

celltype_dfs['ExN.1']=celltype_dfs['ExN.1'].merge(ex1_footprints_degs[['gene', 'logFC', 'AveExpr', 'adj.P.Val']],
                                                 how='left',
                                                 left_on = 'gene_name', right_on='gene')

celltype_dfs['InN.5']=celltype_dfs['InN.5'].merge(in5_footprints_degs[['gene', 'logFC', 'AveExpr', 'adj.P.Val']],
                                                 how='left',
                                                 left_on = 'gene_name', right_on='gene')

In [51]:
set(celltype_dfs['GN.1']['gene'])

{'AKAP5',
 'ARHGAP28',
 'BLCAP',
 'BRI3BP',
 'BRINP3',
 'BTBD16',
 'C1RL',
 'CCDC170',
 'CCND2',
 'CCND2-AS1',
 'CD99L2',
 'COA6',
 'CXCL12',
 'CYTH3',
 'DAP',
 'DEK',
 'DNAJC15',
 'DPF3',
 'FADS3',
 'FAM118A',
 'FAM53B',
 'FBLN7',
 'FRAS1',
 'GFOD1',
 'GNA14',
 'GRIN3A',
 'GUCA1B',
 'HELB',
 'HIP1',
 'HNRNPR',
 'IFIT1',
 'IFIT5',
 'IL21R-AS1',
 'IQGAP2',
 'ITPK1',
 'JAM3',
 'KCNG1',
 'LAMP5',
 'LINC00534',
 'LINC01931',
 'LINC02248',
 'LURAP1L',
 'LYRM7',
 'MAPK4',
 'MCM4',
 'MCOLN3',
 'MGAT5B',
 'NETO2',
 'NPAS3',
 'NRG1',
 'NUCKS1',
 'PDGFC',
 'PIK3R3',
 'PJA2',
 'PLXNA3',
 'RAB22A',
 'RBBP7',
 'RBM20',
 'RIF1',
 'RUNX1T1',
 'SCART1',
 'SDC2',
 'SEMA5B',
 'SEMA6D',
 'SERTM1',
 'SESN3',
 'SGSM2',
 'SH3GL3',
 'SLC25A18',
 'SLC35F3',
 'SNTG2',
 'SPNS2',
 'ST3GAL6',
 'ST8SIA3',
 'STK33',
 'SUSD4',
 'SYT13',
 'SYT6',
 'TAP2',
 'TERF2IP',
 'TMCO1',
 'TMEM220',
 'VASH1',
 'YWHAQ',
 'ZDHHC11B',
 'ZMIZ1',
 'ZNF483'}

In [50]:
celltype_dfs['GN.1'][celltype_dfs['GN.1']['gene_name'] ==  'SEMA5B']

Unnamed: 0,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,...,AveExpr_x,adj.P.Val_x,gene_y,logFC_y,AveExpr_y,adj.P.Val_y,gene,logFC,AveExpr,adj.P.Val
557,chr3,123027791,123027801,KLF11_MA1512.2,7.22719,-,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
545,chr3,123027519,123027529,ZBED4_MA2328.1,9.57853,-,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
556,chr3,123028009,123028019,ZNF148_MA1653.2,7.61743,+,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
558,chr3,123027800,123027810,ZNF148_MA1653.2,8.54208,-,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
559,chr3,123027787,123027795,MAZ_MA1522.2,10.55435,-,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
546,chr3,123027786,123027795,KLF12_MA0742.2,6.70351,+,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
547,chr3,123027791,123027801,SP9_MA1564.2,8.36596,-,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
544,chr3,123027004,123027013,KLF14_MA0740.2,8.07385,+,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
551,chr3,123027784,123027798,ZNF530_MA1981.2,7.56801,+,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465
568,chr3,123027792,123027801,KLF10_MA1511.2,8.08997,+,chr3,123026159,123028776,peak_113568,...,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465,SEMA5B,-0.236242,6.285134,0.033465


In [36]:
celltype_dfs['GN.1']= celltype_dfs['GN.1'].sort_values('logFC', ascending=False)
gn1_use = celltype_dfs['GN.1'][['gene', 'TFBS_name', 'logFC', 'AveExpr','GN.1MDD_bound','TFBS_chr', 'TFBS_start', 'TFBS_end']]

In [37]:
gn1_use['highlight_region'] = gn1_use[['TFBS_chr', 'TFBS_start', 'TFBS_end']].astype(str).agg('-'.join, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gn1_use['highlight_region'] = gn1_use[['TFBS_chr', 'TFBS_start', 'TFBS_end']].astype(str).agg('-'.join, axis=1)


In [38]:
gn1_use.sort_values('AveExpr', ascending=False).head(100).sort_values('logFC', ascending=False)

Unnamed: 0,gene,TFBS_name,logFC,AveExpr,GN.1MDD_bound,TFBS_chr,TFBS_start,TFBS_end,highlight_region
101,NRG1,SP4_MA0685.2,0.678469,8.423643,1,chr8,31640308,31640317,chr8-31640308-31640317
110,NRG1,KLF10_MA1511.2,0.678469,8.423643,1,chr8,31640308,31640317,chr8-31640308-31640317
117,NRG1,TFAP2B_MA0811.2,0.678469,8.423643,1,chr8,31640024,31640035,chr8-31640024-31640035
112,NRG1,KLF14_MA0740.2,0.678469,8.423643,1,chr8,31640308,31640317,chr8-31640308-31640317
109,NRG1,KLF12_MA0742.2,0.678469,8.423643,1,chr8,31640308,31640317,chr8-31640308-31640317
...,...,...,...,...,...,...,...,...,...
897,SLC35F3,SP4_MA0685.2,-0.642352,9.293730,0,chr1,233905376,233905385,chr1-233905376-233905385
886,SLC35F3,TFDP1_MA1122.2,-0.642352,9.293730,0,chr1,233905369,233905377,chr1-233905369-233905377
883,SLC35F3,ZNF148_MA1653.2,-0.642352,9.293730,0,chr1,233905466,233905476,chr1-233905466-233905476
881,SLC35F3,SP2_MA0516.3,-0.642352,9.293730,0,chr1,233905466,233905475,chr1-233905466-233905475


In [53]:
len(list(gn1_use[gn1_use['gene']== 'SEMA5B']['highlight_region']))


37

In [54]:
celltype_dfs['GN.1'][celltype_dfs['GN.1']['gene']== 'SUSD4']

Unnamed: 0,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,...,AveExpr_x,adj.P.Val_x,gene_y,logFC_y,AveExpr_y,adj.P.Val_y,gene,logFC,AveExpr,adj.P.Val
522,chr1,223364095,223364105,ZNF148_MA1653.2,8.54208,+,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
528,chr1,223364089,223364098,SP2_MA0516.3,7.71798,-,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
521,chr1,223363237,223363257,ZNF320_MA1976.2,5.81996,-,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
520,chr1,223364932,223364940,KLF4_MA0039.5,8.27062,+,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
523,chr1,223364095,223364105,ZNF281_MA1630.3,10.03539,-,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
517,chr1,223365196,223365210,ZNF530_MA1981.2,8.55087,-,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
518,chr1,223364089,223364098,KLF12_MA0742.2,6.66238,-,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
519,chr1,223364577,223364587,ZNF148_MA1653.2,7.61743,+,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
504,chr1,223364544,223364558,ZNF574_MA1982.2,6.96527,-,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316
505,chr1,223365009,223365017,PLAGL2_MA1548.2,7.59842,+,chr1,223362972,223365521,peak_13937,...,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316,SUSD4,-0.210015,8.79127,0.040316


In [101]:
celltype_dfs['GN.1'].to_csv(os.path.join('combined_TF_hits_per_celltype/', f"GN.1_combined_filtered_final.csv"), index=False)