# Environment

In [1]:
import gc
import os
import importlib
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Set working directory
# work_dir = '/home/michal.kubacki/Githubs/GeneScore/trimmed_GRN_derivation'
# work_dir = 'D:/Github/GeneScore/trimmed_GRN_derivation'
work_dir = '/mnt/d/Github/GeneScore/trimmed_GRN_derivation'
os.chdir(work_dir)

# Load environment variables from .env file
from dotenv import load_dotenv

# Explicitly specify the path to the .env file
env_path = os.path.join(work_dir, '.env')
load_dotenv(env_path)

# Get environment variables with error handling
project_functions_path = os.getenv('PROJECT_FUNCTIONS_PATH')
if not project_functions_path:
    raise ValueError("PROJECT_FUNCTIONS_PATH environment variable not found in .env file")

print(f"Using PROJECT_FUNCTIONS_PATH: {project_functions_path}")
sys.path.insert(0, project_functions_path)

# Try to import from project_functions
try:
    from grn_helpers import *
except ImportError:
    print("Warning: Could not import from project_functions path, trying absolute path")
    # Try absolute import path as fallback
    # sys.path.insert(0, '/home/michal.kubacki/Githubs/GeneScore/project_functions')
    # sys.path.insert(0, 'D:/Github/GeneScore/project_functions')
    sys.path.insert(0,'/mnt/d/Github/GeneScore/project_functions')
    from grn_helpers import *

Using PROJECT_FUNCTIONS_PATH: /mnt/d/Github/GeneScore/project_functions


In [2]:
n_cpus = 8
neurons_set = "L2-3_CUX2"
# neurons_set = "all_ex"
# neurons_set = "all_ex_all_ages"
root_dir = os.getenv('BASE_PATH')

In [3]:
cells_dict = {
    "all_ex"            :   ['L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev'],
    "all_ex_all_ages"   :   ['L5-6_TLE4', 'L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'PN_dev'],
    "L2-3_CUX2"         :   ['L2-3_CUX2']
}

ages_dict = {
    "all_ex"            :   ['1m','3m','6m','10m','1y','2y','4y','ga22','ga24'],
    "all_ex_all_ages"   :   ['1m','3m','6m','10m','1y','2y','4y','6y','10y','16y','20y','40y','ga22','ga24'],
    "L2-3_CUX2"         :   ['1m','3m','6m','10m','1y','2y','4y','ga22','ga24']
}

output_dir, input_dir, root_dir, tmp_dir, in_dir_from_scenic = set_custom_folders(root_dir, neurons_set)

sel_celltypes  = cells_dict[neurons_set]
sel_ages = ages_dict[neurons_set]

root_dir: /mnt/d/Github/GeneScore/herring_minimal
out_dir: /mnt/d/Github/GeneScore/herring_minimal/L2-3_CUX2/celloracle
in_dir: /mnt/d/Github/GeneScore/herring_minimal/data
tmp_dir: /mnt/d/Github/GeneScore/herring_minimal/celloracle/tmp


In [4]:

plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 600

# Check results

In [5]:
cell_type = sel_celltypes[0]

In [6]:
file_path = os.path.join(output_dir, f"{cell_type}.celloracle.parquet")

In [7]:
df = pd.read_parquet(file_path)

In [8]:
df.shape

(21264, 1080)

In [9]:
# total number of non zero elements
result = df.iloc[:, 2:].astype(bool).sum().sum()
print(result)

5754903


In [10]:
selected_columns = df.iloc[:, 2:]
row_counts = selected_columns.apply(lambda row: len(row[row != 0.0]), axis=1)
print(list(row_counts))

first_row = df.iloc[0]
non_zero_elements = first_row[first_row != 0]
print(non_zero_elements)

[282, 348, 312, 289, 307, 285, 264, 274, 299, 216, 323, 294, 213, 252, 278, 285, 285, 268, 239, 169, 301, 237, 280, 232, 270, 274, 338, 188, 260, 260, 298, 283, 266, 232, 232, 268, 180, 274, 278, 253, 297, 281, 299, 299, 273, 266, 275, 253, 266, 271, 255, 280, 218, 319, 287, 286, 241, 229, 260, 295, 267, 328, 313, 280, 293, 278, 310, 260, 241, 295, 252, 188, 275, 333, 251, 226, 260, 202, 265, 314, 231, 306, 209, 209, 247, 275, 276, 276, 298, 298, 281, 325, 307, 260, 240, 266, 314, 218, 204, 266, 310, 310, 249, 256, 311, 290, 290, 299, 275, 254, 286, 260, 257, 237, 238, 293, 298, 270, 269, 254, 285, 289, 227, 273, 273, 284, 280, 282, 250, 185, 245, 257, 282, 204, 292, 228, 239, 265, 191, 287, 199, 211, 207, 265, 315, 267, 273, 273, 296, 200, 284, 306, 279, 241, 219, 316, 310, 264, 263, 273, 277, 316, 276, 313, 286, 291, 345, 273, 203, 347, 263, 266, 269, 319, 319, 222, 260, 267, 267, 212, 252, 237, 208, 280, 230, 312, 254, 302, 294, 226, 241, 217, 275, 237, 323, 315, 330, 217, 380, 333,

## Compare with `2023_11_tfi.celloracle.parquet`

In [11]:
base_GRN = pd.read_parquet(os.path.join(input_dir, "2023_11_tfi.celloracle.parquet"), engine='pyarrow')

In [12]:
base_GRN.head()

Unnamed: 0,peak_id,gene_short_name,9430076C15RIK,AC002126.6,AC012531.1,AC226150.2,AFP,AHR,AHRR,AIRE,...,ZNF784,ZNF8,ZNF816,ZNF85,ZSCAN10,ZSCAN16,ZSCAN22,ZSCAN26,ZSCAN31,ZSCAN4
0,chr10_100009478_100010332,DNMBP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,chr10_100045768_100046671,CHUK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chr10_100045768_100046671,ERLIN1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chr10_100185577_100186445,ERLIN1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chr10_100229128_100229954,CHUK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
base_GRN.shape

(272543, 1084)

In [14]:
selected_columns = base_GRN.iloc[:, 2:]
row_counts = selected_columns.apply(lambda row: len(row[row != 0.0]), axis=1)
print(list(row_counts[:100]))

first_row = base_GRN.iloc[0]
non_zero_elements = first_row[first_row != 0]
print(non_zero_elements)

[104, 47, 47, 122, 117, 85, 78, 35, 35, 35, 35, 35, 98, 98, 98, 98, 98, 98, 44, 44, 114, 68, 86, 85, 61, 61, 61, 73, 77, 58, 131, 117, 117, 104, 104, 117, 117, 117, 117, 117, 117, 117, 117, 117, 117, 135, 135, 135, 135, 158, 158, 158, 158, 158, 158, 158, 158, 158, 65, 65, 65, 65, 113, 113, 113, 113, 113, 122, 122, 122, 122, 122, 122, 122, 122, 122, 125, 57, 63, 63, 63, 63, 61, 42, 42, 42, 92, 76, 76, 76, 76, 76, 76, 76, 76, 76, 110, 110, 110, 110]
peak_id            chr10_100009478_100010332
gene_short_name                        DNMBP
ATF3                                     1.0
BACH1                                    1.0
BACH2                                    1.0
                             ...            
ZIC3                                     1.0
ZIC4                                     1.0
ZIC5                                     1.0
ZNF143                                   1.0
ZNF350                                   1.0
Name: 0, Length: 106, dtype: object


In [15]:
result2 = base_GRN.iloc[:, 2:].astype(bool).sum().sum()
print(result2)

21064823


In [16]:
print(result/result2)

0.273199684611639
