# Environment

In [155]:
import itertools
import os
import sys

import anndata as ad
import ipywidgets as widgets
import numpy as np

from dotenv import load_dotenv
load_dotenv()
# os.getenv returns None when PROJECT_FUNCTIONS_PATH is unset; only prepend a real
# value so that sys.path never receives a None entry.
project_functions_path = os.getenv('PROJECT_FUNCTIONS_PATH')
if project_functions_path is not None:
    sys.path.insert(0, project_functions_path)

from evaluated_helpers import (
    load_GRNs_gene_sets,
    remove_duplicates_preserve_order_GRNs,
    boxplot_EDCs_GRN_scores_parameters_local
)

from gene_scoring import score_genes

In [156]:
# Execution switches read by the cells below.
recompute = True      # True: score from the .h5ad input and save; False: reload the saved .loom
plotting = True       # build and display the interactive plot widgets
gpu_support = False   # forwarded to score_genes(gpu=...)

In [157]:
# Resolve the project directories from the BASE_PATH environment variable
# (the environment is populated by load_dotenv() in the first cell).
base_path = os.getenv('BASE_PATH')
if base_path is None:
    # os.getenv returns None for an unset variable; fail here with an explicit
    # message rather than with a TypeError from os.path.join below.
    raise RuntimeError(
        "Environment variable BASE_PATH is not set; "
        "define it in the environment or in the .env file."
    )
root_dir = base_path  # handed to load_GRNs_gene_sets(root_dir=...)
data_path = os.path.join(base_path, "data")
output_path = os.path.join(base_path, "all_ex/results")

# Load Precomputed Scores

In [158]:
# When scores are not being recomputed, reload the scored AnnData from the loom
# file in `output_path` (same file name the save cell writes).
if not recompute:
    file_name = os.path.join(output_path, f"EDCs_andata_scored_GRNs.loom")
    adata = ad.read_loom(file_name, sparse=False)

    # Keep the index that read_loom produced in its own column, then switch the
    # variable names to the values stored in the 'var_names' column.
    adata.var['original_var_names'] = adata.var.index
    adata.var_names = adata.var['var_names']

    adata.var_names_make_unique()
    # NOTE(review): set_index() rebuilds the index from the raw 'var_names' column,
    # which looks like it would discard any suffixes var_names_make_unique() just
    # added — confirm whether duplicate names can occur in this file.
    adata.var.set_index('var_names', inplace=True)

# Load Gene Sets Data

In [159]:
# Load the gene sets from the project root; the helper returns two objects that are
# unpacked here (the second one is indexed as set -> cell type -> scored gene below).
gene_sets_dict, gene_sets_dict_cell_type_first = load_GRNs_gene_sets(root_dir=root_dir)
# Backward-compatible alias: the original (misspelled) name is still referenced
# by other cells.
gere_sets_dict = gene_sets_dict

In [160]:
# Pass the cell-type-first dictionary through the de-duplication helper and keep the
# result under the same name (presumably drops repeated entries while keeping their
# order, as the helper's name indicates — see evaluated_helpers).
gene_sets_dict_cell_type_first = remove_duplicates_preserve_order_GRNs(
    gene_sets_dict_cell_type_first
)

In [161]:
# Walk one path through the nested gene-set dictionary
# (set -> cell type -> scored gene), always taking the first key, and print
# what is available at each level.
sets = [*gere_sets_dict]
print(sets)

set_selected = sets[0]
_by_cell_type = gene_sets_dict_cell_type_first[set_selected]
cell_types = [*_by_cell_type.keys()]
print(cell_types)

cell_type_selected = cell_types[0]
_by_scored_gene = _by_cell_type[cell_type_selected]
scored_genes = [*_by_scored_gene.keys()]
print(scored_genes)

# Number of target genes recorded for the first scored gene.
scored_gene_selected = scored_genes[0]
print(len(_by_scored_gene[scored_gene_selected]['targets']))

['all_ex']
['L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'L5-6_TLE4', 'PN_dev']
['AHR', 'ARNT', 'ARNT2', 'CLOCK', 'AR', 'NR1I2', 'NR1I3', 'NR3C1', 'NR3C2', 'ESR1', 'RARA', 'ESR2', 'THRB', 'THRA']
30


# Load Expression Data

In [162]:
# The expression data is read only when scores are recomputed; otherwise `adata`
# comes from the loom file loaded earlier.
if recompute:
    expression_file = os.path.join(data_path, 'CTL04_EDCs.h5ad')
    adata = ad.read_h5ad(expression_file)

In [163]:
# Display the AnnData summary (dimensions, obs/var columns, layers).
adata

AnnData object with n_obs × n_vars = 121 × 13908
    obs: 'InternalUniqueID', 'HRID', 'Specimen', 'Timepoint', 'Condition', 'Concentration', 'Line', 'Sex', 'Project', 'Seq.run', 'FASTQ.R1', 'FASTQ.R2', 'RequestedCoverage', 'ExperimentCode', 'SeqApproach', 'RNASelection', 'SeqPlatform', 'NrSequencingRuns', 'lib.size', 'Treatment', 'norm.factors'
    var: 'Gene', 'EnsGene', 'HGNCSymbol', 'GeneName', 'GeneBiotype', 'description', 'Chr', 'Start', 'End'
    layers: 'counts', 'cpm'

In [164]:
# Build one "<Condition>_<Concentration>" label per sample.
# The column key keeps its existing spelling because later cells read it by that name.
_condition_labels = adata.obs['Condition'].tolist()
_concentration_labels = adata.obs['Concentration'].tolist()
adata.obs['condition_concentraion'] = [
    '_'.join(pair) for pair in zip(_condition_labels, _concentration_labels)
]

In [165]:
# Distinct condition/concentration labels present in the data.
adata.obs['condition_concentraion'].unique()

array(['3PBA_0.1X', '3PBA_1X', '3PBA_10X', '3PBA_100X', 'BPA_0.1X',
       'BPA_10X', 'BPA_100X', 'BPF_0.1X', 'BPF_1X', 'BPF_10X', 'BPF_100X',
       'DMSO_0.1', 'DPHP_0.1X', 'DPHP_1X', 'DPHP_10X', 'DPHP_100X',
       'MBzP_0.1X', 'MBzP_1X', 'MBzP_10X', 'MBzP_100X', 'MEP_0.1X',
       'MEP_1X', 'MEP_10X', 'MEP_100X', 'MIX_0.1X', 'MIX_1X', 'MIX_10X',
       'TCP_0.1X', 'TCP_1X', 'TCP_10X', 'TCP_100X'], dtype=object)

In [166]:
# Distinct values of the Condition column.
adata.obs['Condition'].unique()

['3PBA', 'BPA', 'BPF', 'DMSO', 'DPHP', 'MBzP', 'MEP', 'MIX', 'TCP']
Categories (9, object): ['3PBA', 'BPA', 'BPF', 'DMSO', ..., 'MBzP', 'MEP', 'MIX', 'TCP']

In [167]:
# Count the samples carrying each condition/concentration label and report them
# as "<label>: <count>" strings.
_combined_labels = adata.obs['condition_concentraion']
samples_count = [
    f"{label}: {np.sum(_combined_labels == label)}"
    for label in _combined_labels.unique()
]
print(samples_count)

['3PBA_0.1X: 4', '3PBA_1X: 4', '3PBA_10X: 4', '3PBA_100X: 4', 'BPA_0.1X: 3', 'BPA_10X: 2', 'BPA_100X: 2', 'BPF_0.1X: 4', 'BPF_1X: 4', 'BPF_10X: 4', 'BPF_100X: 4', 'DMSO_0.1: 6', 'DPHP_0.1X: 4', 'DPHP_1X: 4', 'DPHP_10X: 4', 'DPHP_100X: 4', 'MBzP_0.1X: 4', 'MBzP_1X: 4', 'MBzP_10X: 4', 'MBzP_100X: 4', 'MEP_0.1X: 4', 'MEP_1X: 4', 'MEP_10X: 4', 'MEP_100X: 4', 'MIX_0.1X: 4', 'MIX_1X: 4', 'MIX_10X: 4', 'TCP_0.1X: 4', 'TCP_1X: 4', 'TCP_10X: 4', 'TCP_100X: 4']


# Scoring

In [168]:
# Cell types available under the "all_ex" gene set.
print([*gene_sets_dict_cell_type_first["all_ex"].keys()])

['L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'L5-6_TLE4', 'PN_dev']


In [169]:
# Scored genes iterated by the scoring cell and offered in the "Scored Gene" dropdown.
gois = [
    'AHR', 'AR', 'NR1I2', 'NR1I3', 'NR3C1', 'NR3C2',
    'ESR1', 'RARA', 'ESR2', 'THRB', 'THRA',
]

# Gene-set collection name.
gene_sets = ['all_ex']

# Cell types offered in the "Cell Type" dropdown.
cell_types = [
    'L2-3_CUX2',
    'L4_RORB',
    'L5-6_THEMIS',
    'L5-6_TLE4',
    'PN_dev',
]

In [170]:
# Display the AnnData summary again; obs now also lists 'condition_concentraion'.
adata

AnnData object with n_obs × n_vars = 121 × 13908
    obs: 'InternalUniqueID', 'HRID', 'Specimen', 'Timepoint', 'Condition', 'Concentration', 'Line', 'Sex', 'Project', 'Seq.run', 'FASTQ.R1', 'FASTQ.R2', 'RequestedCoverage', 'ExperimentCode', 'SeqApproach', 'RNASelection', 'SeqPlatform', 'NrSequencingRuns', 'lib.size', 'Treatment', 'norm.factors', 'condition_concentraion'
    var: 'Gene', 'EnsGene', 'HGNCSymbol', 'GeneName', 'GeneBiotype', 'description', 'Chr', 'Start', 'End'
    layers: 'counts', 'cpm'

In [171]:
# Score every (gene set, cell type, scored gene) entry once for each of the 32
# combinations of the five scoring options. Each call gets a distinct score_name
# that encodes the entry and the option values; score_genes is called with
# copy=False and return_scores=False (presumably it stores the score on `adata`
# under that name — confirm in gene_scoring).
if recompute:
    # Arguments that are identical for every call.
    fixed_score_kwargs = dict(
        ctrl_size=50,
        gene_pool=None,
        n_bins=25,
        random_state=0,
        copy=False,
        used_layer='cpm',
        return_scores=False,
        weighted=True,
        abs_diff=False,
        gpu=gpu_support,
        chunk_size=10000,
        disable_chunking=True,
        conditions_labels='Condition',
        debug=False,
    )

    # itertools.product varies the right-most factor fastest, i.e. it yields the
    # combinations in the same order as the equivalent nested for-loops.
    parameter_grid = itertools.product(
        [True, False],    # control
        ['DMSO', None],   # control_condition
        [True, False],    # normalize_weights
        [True, False],    # scaling_only_based_on_control
        [True, False],    # scale_by_variance
    )

    for (control, control_condition, normalize_weights,
         scaling_only_based_on_control, scale_by_variance) in parameter_grid:
        for gene_set, cell_type_grns in gene_sets_dict_cell_type_first.items():
            for cell_type, scored_gene_grns in cell_type_grns.items():
                for goi in gois:
                    # Entry holding the target genes and their weights for this scored gene.
                    grn = scored_gene_grns[goi]
                    score_genes(
                        adata,
                        gene_list=grn['targets'],
                        gene_weights=grn['coef_mean'],
                        score_name=(
                            f'gene_score_{gene_set}_{cell_type}_{goi}_{control}_'
                            f'normalized_{normalize_weights}_'
                            f'scaled_{scale_by_variance}_'
                            f'cc_{control_condition}_'
                            f'sc_{scaling_only_based_on_control}'
                        ),
                        control=control,
                        scale_by_variance=scale_by_variance,
                        normalize_weights=normalize_weights,
                        control_condition=control_condition,
                        scaling_only_based_on_control=scaling_only_based_on_control,
                        **fixed_score_kwargs,
                    )

# Save Results

In [172]:
# Persist the scored AnnData as a loom file so it can be reloaded when
# `recompute` is False.
if recompute:
    # Create the results directory first so the write cannot fail on a missing
    # folder after the scoring run has finished.
    os.makedirs(output_path, exist_ok=True)
    file_name = os.path.join(output_path, "EDCs_andata_scored_GRNs.loom")
    adata.write_loom(file_name)

# Plotting

In [173]:
%%capture
if plotting:
    def _dropdown_with_first_selected(options, description):
        """Return an enabled Dropdown over `options` whose initial value is the first option."""
        choices = list(options)
        return widgets.Dropdown(
            options=choices,
            value=choices[0],
            description=description,
            disabled=False,
        )

    # Options shared by every on/off control; the values are strings, not booleans.
    _flag_choices = ['True', 'False']

    # One dropdown per argument of the plotting helper, created in display order.
    condition_dropdown = _dropdown_with_first_selected(
        adata.obs.Condition.unique(), 'Condition:'
    )
    gene_set_dropdown = _dropdown_with_first_selected(
        gene_sets_dict_cell_type_first.keys(), 'Gene Set:'
    )
    control_dropdown = _dropdown_with_first_selected(_flag_choices, 'Control:')
    control_condition_dropdown = _dropdown_with_first_selected(
        ['DMSO', "None"], 'Condition Control:'
    )
    normalized_dropdown = _dropdown_with_first_selected(
        _flag_choices, 'Normalized weights:'
    )
    scaled_dropdown = _dropdown_with_first_selected(
        _flag_choices, 'Scale by variance:'
    )
    scaling_only_based_on_control_dropdown = _dropdown_with_first_selected(
        _flag_choices, 'Scale only with Control:'
    )
    cell_type_dropdown = _dropdown_with_first_selected(cell_types, 'Cell Type:')
    scored_gene_dropdown = _dropdown_with_first_selected(gois, 'Scored Gene:')

## Display

In [174]:
if plotting:
    # Keyword of the plotting helper -> widget that supplies its value.
    # `adata` is wrapped in widgets.fixed so it is passed through without a control.
    plot_controls = {
        'adata': widgets.fixed(adata),
        'conditions': condition_dropdown,
        'gene_set': gene_set_dropdown,
        'cell_type': cell_type_dropdown,
        'goi': scored_gene_dropdown,
        'control': control_dropdown,
        'normalize_weights': normalized_dropdown,
        'scale_by_variance': scaled_dropdown,
        'control_condition': control_condition_dropdown,
        'scaling_only_based_on_control': scaling_only_based_on_control_dropdown,
    }
    interactive_plot = widgets.interactive(
        boxplot_EDCs_GRN_scores_parameters_local,
        **plot_controls,  # type: ignore
    )

    display(interactive_plot)

interactive(children=(Dropdown(description='Condition:', options=('3PBA', 'BPA', 'BPF', 'DMSO', 'DPHP', 'MBzP'…