In [None]:
import gc
import os
import sys
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import celloracle as co
from celloracle import motif_analysis as ma
co.__version__

from dotenv import load_dotenv
load_dotenv()
sys.path.insert(0, os.getenv('PROJECT_FUNCTIONS_PATH'))

from grn_helpers import set_custom_folders

In [2]:
# --- Run configuration -------------------------------------------------------
# Value handed to the motif scan as its `n_cpus` argument.
n_cpus = 8

# Neuron set to process. Keys supported by the cell-type lookup in this
# notebook: "L2-3_CUX2", "all_ex", "all_ex_all_ages".
neurons_set = "L2-3_CUX2"

# Reference genome identifier passed to TFinfo as `ref_genome`.
reference = "hg19"

# Project root, read from the BASE_PATH environment variable (None when unset).
root_dir = os.environ.get('BASE_PATH')

In [None]:
# Resolve the working folders for the selected neuron set.
# NOTE: `root_dir` is both an argument and one of the returned values, so it is
# rebound here; re-running this cell passes the previously returned value back in.
(
    output_dir,
    input_dir,
    root_dir,
    tmp_dir,
    in_dir_from_scenic,
) = set_custom_folders(root_dir, neurons_set)

# Cell types shared by the two "all excitatory" neuron sets.
_excitatory_celltypes = ('L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev')

# Maps each supported `neurons_set` value to the cell types it covers.
# Each key gets its own list object, as in a plain dict literal.
celltypes_dict = {
    "all_ex": list(_excitatory_celltypes),
    "all_ex_all_ages": list(_excitatory_celltypes),
    "L2-3_CUX2": ['L2-3_CUX2'],
}

# Cell types processed in this run (raises KeyError for an unknown neurons_set).
sel_celltypes = celltypes_dict[neurons_set]

In [4]:
# Motif collection used for scanning: loaded by name through celloracle's
# motif_analysis module (a custom TF motif set could be substituted here).
motifs = ma.load_motifs("CisBP_ver2_Homo_sapiens.pfm")

# Base GRN table read from the input folder; used below only as the
# denominator when comparing non-zero counts.
base_GRN_path = os.path.join(input_dir, "2023_11_tfi.celloracle.parquet")
base_GRN = pd.read_parquet(base_GRN_path, engine='pyarrow')

# Count of non-zero cells from the third column onward.
# Presumably the first two columns are peak_id / gene_short_name — confirm.
base_GRN_tf_columns = base_GRN.iloc[:, 2:]
base_GRN_non_zero = base_GRN_tf_columns.astype(bool).sum().sum()

In [None]:
# Preview the first five entries of `motifs`.
motifs[:5]

In [None]:
# Preview the first rows of the base GRN table.
base_GRN.head()

In [None]:
# Display the number of non-zero entries counted in the base GRN.
base_GRN_non_zero

In [None]:
# For every selected cell type: scan its peaks for TF motifs, save the raw scan
# (HDF5) and the filtered peak-by-TF table (parquet), then report how many
# non-zero entries the table has relative to the base GRN.
for cell_type in sel_celltypes:
    # Processed peaks for this cell type, read from output_dir.
    peaks_path = os.path.join(output_dir, f'processed_peak_file_{cell_type}.csv')

    # peaks_path already contains output_dir; joining it with output_dir a second
    # time would duplicate the directory whenever output_dir is a relative path.
    peaks = pd.read_csv(peaks_path, index_col=0)

    tfi = ma.TFinfo(peak_data_frame=peaks,
                    ref_genome=reference,
                    genomes_dir=None)

    # Free memory before the scan.
    gc.collect()

    # NOTE(review): passing motifs=None presumably falls back to celloracle's
    # default motif set — confirm against the celloracle docs.
    tfi.scan(fpr=0.01,
             motifs=motifs,
             verbose=True, n_cpus=n_cpus)

    # Persist the unfiltered scan result so filtering can be redone later.
    file_name = os.path.join(output_dir, f"{cell_type}.celloracle.tfinfo")
    tfi.to_hdf5(file_path=file_name)

    # Start from a clean state, then keep only motif hits scoring at least 8.
    tfi.reset_filtering()
    tfi.filter_motifs_by_score(threshold=8)
    tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)

    # Save the resulting table for this cell type.
    file_path = os.path.join(output_dir, f"{cell_type}.celloracle.parquet")
    df = tfi.to_dataframe()
    df.to_parquet(file_path)

    # Non-zero entries from the third column onward, relative to the base GRN.
    GRN_non_zero = df.iloc[:, 2:].astype(bool).sum().sum()

    # Label the ratio so runs over several cell types stay readable.
    print(f"{cell_type}: {GRN_non_zero/base_GRN_non_zero}")

In [12]:
# Load back saved file
# The cell type is hard-coded here, so this rebinds `cell_type`, `file_path`
# and `df` regardless of which `neurons_set` was configured above.
cell_type = "L2-3_CUX2"
file_path = os.path.join(output_dir, f"{cell_type}.celloracle.parquet")

# Read the per-cell-type table written to the output folder.
df = pd.read_parquet(file_path)


In [13]:
# Preview the first rows of the reloaded table.
df.head()

Unnamed: 0,peak_id,gene_short_name,AC023509.3,AC138696.1,AHR,AIRE,ALX1,ALX3,ALX4,ANHX,...,ZSCAN22,ZSCAN23,ZSCAN29,ZSCAN30,ZSCAN31,ZSCAN4,ZSCAN5,ZSCAN5C,ZSCAN9,ZZZ3
0,chr10_100027739_100028239,LOXL4,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,chr10_100205694_100206194,LOC101927278,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,chr10_100206328_100206828,HPS1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,chr10_101088830_101089330,CNNM1,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,chr10_101190344_101190844,GOT1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Fraction of non-zero entries in the reloaded table relative to the base GRN.
# Only columns from the third one onward are counted (the first two shown in
# the preview are peak_id and gene_short_name).
tf_columns = df.iloc[:, 2:]
GRN_non_zero = tf_columns.astype(bool).sum().sum()
GRN_non_zero_ratio = GRN_non_zero / base_GRN_non_zero
print(GRN_non_zero_ratio)

0.09130496847754192
