# Environment

In [37]:
import base64
import csv
import gzip
import importlib
import io
import itertools
import json
import os
import sys

import anndata as ad
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from IPython.display import display
from PIL import Image
from plotly.subplots import make_subplots

# Put the project's helper-module folder first on the import path.
# NOTE(review): this is an absolute, user-specific path; the helper imports
# below only resolve on a machine where this checkout exists.
sys.path.insert(0, "/home/michal.kubacki/Githubs/GeneScore/project_functions")

# Each helper module is reloaded right after being imported so that the
# kernel executes the current version of the .py file.
import load_gene_sets 
importlib.reload(load_gene_sets)
# NOTE(review): star import - names used later without a definition in this
# notebook (e.g. remove_duplicates_preserve_order_GRNs) presumably come from
# here or from ploting_workbook; confirm which module owns each one.
from load_gene_sets import *

# Used below only through its qualified name (gene_scoring_edited_opt.score_genes).
import gene_scoring_edited_opt
importlib.reload(gene_scoring_edited_opt)

import ploting_workbook 
importlib.reload(ploting_workbook)
# NOTE(review): second star import; it can silently shadow names brought in by
# the first one.
from ploting_workbook import *

In [38]:
# Forwarded as the `gpu` argument of gene_scoring_edited_opt.score_genes.
gpu_support = False
# True: read CTL04.h5ad, compute every score and write the result as a loom file.
# False: skip those steps and reload the previously written loom file instead.
recompute = True
# Gates every plotting cell (function definition, dropdowns, interactive views).
plotting = True

In [39]:
# Project root on the shared filesystem. The input and output folders are
# derived from it so that the three locations cannot drift apart when the
# project is moved; the resulting strings are the same as the former literals.
root_dir = "/group/testa/michal.kubacki/herring_minimal"
data_path = os.path.join(root_dir, "data")
output_path = os.path.join(root_dir, "all_ex", "results")

# Load Precomputed Scores

In [40]:
%%capture
# When scores are not recomputed, reload the AnnData from the loom file that
# the "Save results" cell writes under the same name in output_path.
if not recompute:
    file_name = os.path.join(output_path, f"andata_scored_GRNs.loom")
    adata = ad.read_loom(file_name, sparse=False)

    # Keep the index produced by read_loom in a column before replacing it.
    adata.var['original_var_names'] = adata.var.index
    # Use the 'var_names' column stored in the loom file as variable names.
    adata.var_names = adata.var['var_names']

    adata.var_names_make_unique()
    # NOTE(review): this replaces the index of adata.var with the raw
    # 'var_names' column *after* var_names_make_unique() ran; if that column
    # holds duplicates it may undo the de-duplication - confirm the intent
    # (possibly only meant to drop the now-redundant column).
    adata.var.set_index('var_names', inplace=True)

# Load Gene Sets Data

In [41]:
def load_GRNs_gene_sets(root_dir, gene_set_list = ["all_ex"], weights_list="scores_grn_all_from_comb_run_new.csv"):
    """Load GRN edge tables and index them by regulator and by cell type.

    For every gene set the CSV file
    ``<root_dir>/<gene_set>/celloracle/<weights_list>`` is read. Each table
    must contain the columns ``source``, ``target``, ``score``, ``coef_mean``
    and ``celltype``.

    Parameters
    ----------
    root_dir : str
        Folder that contains one sub-folder per gene set.
    gene_set_list : iterable of str, or str
        Gene-set names to load. A single name given as a plain string is
        treated as a one-element list. The default list is never mutated.
    weights_list : str
        File name of the CSV inside each ``celloracle`` folder.

    Returns
    -------
    tuple of (dict, dict)
        ``gene_sets_dict[gene_set][source][celltype]`` and
        ``gene_sets_dict_cell_type_first[gene_set][celltype][source]``.
        Every leaf is a dict with three parallel lists, in file row order:
        ``'targets'``, ``'scored_coef_mean'`` (``score * coef_mean``) and
        ``'coef_mean'``.

    Raises
    ------
    KeyError
        If a table lacks one of the required columns.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(gene_set_list, str):
        gene_set_list = [gene_set_list]

    required_columns = ('source', 'target', 'score', 'coef_mean', 'celltype')

    # Load data for each gene set
    gene_sets = {}
    for gene_set in gene_set_list:
        csv_path = os.path.join(root_dir, f"{gene_set}", "celloracle", weights_list)
        table = pd.read_csv(csv_path)
        missing = [column for column in required_columns if column not in table.columns]
        if missing:
            raise KeyError(f"{csv_path} is missing required column(s): {missing}")
        gene_sets[gene_set] = table

    # Sets Formatting
    gene_sets_dict = {}
    gene_sets_dict_cell_type_first = {}

    for key, table in gene_sets.items():
        by_goi = gene_sets_dict[key] = {}
        by_cell_type = gene_sets_dict_cell_type_first[key] = {}

        # Convert and multiply whole columns once instead of building a
        # Series per row with iterrows() and calling float() on each cell.
        coef_mean = table['coef_mean'].astype(float)
        scored_coef_mean = table['score'].astype(float) * coef_mean

        rows = zip(
            table['source'].tolist(),
            table['target'].tolist(),
            scored_coef_mean.tolist(),
            coef_mean.tolist(),
            table['celltype'].tolist(),
        )
        for goi, target, score1, score2, source in rows:
            # The same edge is filed twice: regulator -> cell type (format 1)
            # and cell type -> regulator (format 2).
            for outer, first_key, second_key in ((by_goi, goi, source), (by_cell_type, source, goi)):
                inner = outer.setdefault(first_key, {})
                leaf = inner.get(second_key)
                if leaf is None:
                    leaf = inner[second_key] = {'targets': [], 'scored_coef_mean': [], 'coef_mean': []}
                leaf['targets'].append(target)
                leaf['scored_coef_mean'].append(score1)
                leaf['coef_mean'].append(score2)

    print(gene_sets_dict_cell_type_first.keys())

    return gene_sets_dict, gene_sets_dict_cell_type_first

In [42]:
# Load both views of the GRN tables with the function's default gene set and file name.
# NOTE(review): "gere_sets_dict" looks like a typo for "gene_sets_dict"; a later
# cell reads the variable under this spelling, so rename both places together.
gere_sets_dict, gene_sets_dict_cell_type_first = load_GRNs_gene_sets(root_dir=root_dir)

dict_keys(['all_ex'])


In [43]:
# The result is bound to the same name, so the dictionary as originally loaded
# is no longer reachable after this cell has run.
# NOTE(review): remove_duplicates_preserve_order_GRNs is not defined in this
# notebook; presumably it is provided by one of the star imports - confirm.
gene_sets_dict_cell_type_first = remove_duplicates_preserve_order_GRNs(gene_sets_dict_cell_type_first)

In [44]:
# Walk down the nested dictionary one level at a time, always following the
# first entry, and print what each level contains.
sets = list(gere_sets_dict.keys())
print(sets)

set_selected = sets[0]
grns_by_cell_type = gene_sets_dict_cell_type_first[set_selected]
cell_types = list(grns_by_cell_type.keys())
print(cell_types)

cell_type_selected = cell_types[0]
grns_by_gene = grns_by_cell_type[cell_type_selected]
scored_genes = list(grns_by_gene.keys())
print(scored_genes)

scored_gene_selected = scored_genes[0]
# Number of target genes listed for the first regulator of the first cell type.
first_targets = grns_by_gene[scored_gene_selected]['targets']
print(len(first_targets))

['all_ex']
['L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'L5-6_TLE4', 'PN_dev']
['AHR', 'ARNT', 'ARNT2', 'CLOCK', 'AR', 'NR1I2', 'NR1I3', 'NR3C1', 'NR3C2', 'ESR1', 'RARA', 'ESR2', 'THRB', 'THRA']
30


# Load Expression Data

In [45]:
# Only needed when scores are recomputed; otherwise `adata` is loaded from the
# previously saved loom file in the "Load Precomputed Scores" cell.
if recompute:
    adata = ad.read_h5ad(os.path.join(data_path, 'CTL04.h5ad'))

In [46]:
# One triple of sample-type labels per pathway prefix: the shared "DMSO" label
# followed by the prefix's "_Ag" and "_Inh" labels.
conditions = [
    ["DMSO", f"{pathway}_Ag", f"{pathway}_Inh"]
    for pathway in ("Ret", "AhHyd", "Andr", "LivX", "GC", "Estr", "Thyr")
]

# Scoring

In [47]:
# Cell types available for the "all_ex" gene set (second level of the
# cell-type-first dictionary).
print(list(gene_sets_dict_cell_type_first["all_ex"].keys()))

['L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'L5-6_TLE4', 'PN_dev']


In [48]:
# Regulators to score. ARNT, ARNT2 and CLOCK, which the gene list printed
# earlier shows for the first cell type, are not included here.
gois = ['AHR', 'AR', 'NR1I2', 'NR1I3', 'NR3C1', 'NR3C2', 'ESR1', 'RARA', 'ESR2', 'THRB', 'THRA']
gene_sets = ['all_ex']
# Re-binds `cell_types` (first set from the dictionary keys in the inspection
# cell) to an explicit list with the same five labels that cell printed.
cell_types = ['L2-3_CUX2', 'L4_RORB', 'L5-6_THEMIS', 'L5-6_TLE4', 'PN_dev']

In [49]:
if recompute:
    # Score every (option combination, gene set, cell type, regulator) tuple.
    # itertools.product varies its right-most iterable fastest, so the calls
    # happen in exactly the order the former eight nested loops produced.
    parameter_grid = itertools.product(
        [True, False],   # control
        ['DMSO', None],  # control_condition
        [True, False],   # normalize_weights
        [True, False],   # scaling_only_based_on_control
        [True, False],   # scale_by_variance
        gene_sets,
        cell_types,
        gois,
    )
    for (control, control_condition, normalize_weights, scaling_only_based_on_control,
         scale_by_variance, gene_set, cell_type, goi) in parameter_grid:
        # Targets and weights of this regulator's GRN in this cell type.
        grn = gene_sets_dict_cell_type_first[gene_set][cell_type][goi]
        gene_scoring_edited_opt.score_genes(
            adata,
            gene_list=grn['targets'],
            gene_weights=grn['coef_mean'],
            # The name encodes every option; the plotting code rebuilds the
            # same string to find the score again.
            score_name=(
                f'gene_score_{gene_set}_{cell_type}_{goi}_{control}_'
                f'normalized_{normalize_weights}_'
                f'scaled_{scale_by_variance}_'
                f'cc_{control_condition}_'
                f'sc_{scaling_only_based_on_control}'
            ),
            ctrl_size=50,
            gene_pool=None,
            n_bins=25,
            random_state=0,
            copy=False,
            used_layer='cpm',
            return_scores=False,
            control=control,
            weighted=True,
            abs_diff=False,
            gpu=gpu_support,
            chunk_size=10000,
            disable_chunking=True,
            scale_by_variance=scale_by_variance,
            normalize_weights=normalize_weights,
            conditions_labels='Condition',
            control_condition=control_condition,
            debug=False,
            scaling_only_based_on_control=scaling_only_based_on_control,
        )

In [50]:
# Display the AnnData summary; after scoring, the obs listing includes the
# 'gene_score_...' columns.
adata

AnnData object with n_obs × n_vars = 66 × 19892
    obs: 'InternalUniqueID', 'HRID', 'Subject', 'Specimen', 'Condition', 'Treatment', 'Pathway', 'Type', 'Line', 'Sex', 'Project', 'Seq.run', 'FASTQ.R1', 'FASTQ.R2', 'RequestedCoverage', 'ExperimentCode', 'SeqApproach', 'RNASelection', 'SeqPlatform', 'lib.size', 'norm.factors', 'sample_type', 'gene_score_all_ex_L2-3_CUX2_AHR_True_normalized_True_scaled_True_cc_DMSO_sc_True', 'gene_score_all_ex_L2-3_CUX2_AR_True_normalized_True_scaled_True_cc_DMSO_sc_True', 'gene_score_all_ex_L2-3_CUX2_NR1I2_True_normalized_True_scaled_True_cc_DMSO_sc_True', 'gene_score_all_ex_L2-3_CUX2_NR1I3_True_normalized_True_scaled_True_cc_DMSO_sc_True', 'gene_score_all_ex_L2-3_CUX2_NR3C1_True_normalized_True_scaled_True_cc_DMSO_sc_True', 'gene_score_all_ex_L2-3_CUX2_NR3C2_True_normalized_True_scaled_True_cc_DMSO_sc_True', 'gene_score_all_ex_L2-3_CUX2_ESR1_True_normalized_True_scaled_True_cc_DMSO_sc_True', 'gene_score_all_ex_L2-3_CUX2_RARA_True_normalized_True_scaled_

# Plot scores for a single condition

## Configure

In [51]:
if plotting:
    def boxplot_Reference_GRN_scores_local(adata, control, control_condition, condition, gene_set, cell_type, goi, normalize_weights, scale_by_variance, scaling_only_based_on_control, prefix=""):
        """Draw a box plot with overlaid points of one stored gene score.

        One box is drawn per label in ``condition``. The score is read from
        the ``adata.obs`` column named::

            {prefix}gene_score_{gene_set}_{cell_type}_{goi}_{control}_
            normalized_{normalize_weights}_scaled_{scale_by_variance}_
            cc_{control_condition}_sc_{scaling_only_based_on_control}

        (written here on several lines; the real name has no line breaks).
        The option arguments are only interpolated into that name, so a
        boolean and its string form (``True`` / ``'True'``) select the same
        column.

        Parameters
        ----------
        adata : object with an ``obs`` DataFrame
            ``obs`` must contain a 'sample_type' column and the score column.
        control, control_condition, normalize_weights, scale_by_variance,
        scaling_only_based_on_control : bool, str or None
            Scoring options that identify the score column.
        condition : sequence of str
            'sample_type' values to plot, in display order. Any number of
            labels is accepted.
        gene_set, cell_type, goi : str
            Gene set, cell type and regulator that identify the score column.
        prefix : str, optional
            Text prepended to the score column name.

        Returns
        -------
        matplotlib.figure.Figure or None
            The figure, or ``None`` when no observation has one of the
            requested sample types.

        Raises
        ------
        KeyError
            If the score column does not exist in ``adata.obs``.
        """
        # Closes every open pyplot figure before drawing a new one.
        plt.close('all')
        print(f"control: {control}, variance: {scale_by_variance}, scaling_only_based_on_control: {scaling_only_based_on_control}")

        # Only obs is needed, so filter the DataFrame directly instead of
        # slicing the AnnData object once per condition.
        obs = adata.obs
        obs_selected = obs[obs['sample_type'].isin(condition)]

        if len(obs_selected) == 0:
            print(f'No data to plot for the selected condition: {condition}')
            return None

        selection = (
            f'{prefix}'
            f'gene_score_{gene_set}_{cell_type}_{goi}_{control}_'
            f'normalized_{normalize_weights}_'
            f'scaled_{scale_by_variance}_'
            f'cc_{control_condition}_'
            f'sc_{scaling_only_based_on_control}'
        )
        # Scores are collected before the figure is created, so a missing
        # column raises without leaving an empty figure behind.
        scores_per_condition = [
            obs_selected.loc[obs_selected['sample_type'] == label, selection].values
            for label in condition
        ]

        fig, ax = plt.subplots(figsize=(4, 6))
        sns.boxplot(data=scores_per_condition, notch=False,
                    boxprops=dict(alpha=0.5),
                    ax=ax)
        sns.stripplot(data=scores_per_condition,
                      jitter=True, color=".3", linewidth=1, ax=ax)

        ax.set_xticks(range(len(condition)))
        ax.set_xticklabels(list(condition), fontsize=12)
        ax.set_title(f'Gene Scores - {goi}\n cell_type: {cell_type}', fontsize=16)
        ax.set_ylabel("Gene Score", fontsize=12)

        plt.tight_layout()
        return fig

In [52]:
%%capture
if plotting:
    def _first_selected_dropdown(options, description):
        """Return an enabled Dropdown whose initial value is its first option."""
        choices = list(options)
        return widgets.Dropdown(
            options=choices,
            value=choices[0],
            description=description,
            disabled=False,
        )

    # Which triple of sample types, and which gene set, to show.
    condition_dropdown = _first_selected_dropdown(conditions, 'Condition:')
    gene_set_dropdown = _first_selected_dropdown(gene_sets_dict_cell_type_first.keys(), 'Gene Set:')

    # Scoring options; the values are strings and end up interpolated into the
    # name of the score column that gets plotted.
    control_dropdown = _first_selected_dropdown(['True', 'False'], 'Control:')
    control_condition_dropdown = _first_selected_dropdown(['DMSO', "None"], 'Condition Control:')
    normalized_dropdown = _first_selected_dropdown(['True', 'False'], 'Normalized weights:')
    scaled_dropdown = _first_selected_dropdown(['True', 'False'], 'Scale by variance:')
    scaling_only_based_on_control_dropdown = _first_selected_dropdown(['True', 'False'], 'Scale only with Control:')

    # Which cell type's GRN and which regulator to show.
    cell_type_dropdown = _first_selected_dropdown(cell_types, 'Cell Type:')
    scored_gene_dropdown = _first_selected_dropdown(gois, 'Scored Gene:')

## Display

In [53]:
if plotting:
    # Argument name -> widget (or fixed value) handed to the plotting function.
    single_condition_controls = {
        'adata': widgets.fixed(adata),
        'control': control_dropdown,
        'control_condition': control_condition_dropdown,
        'condition': condition_dropdown,
        'gene_set': gene_set_dropdown,
        'cell_type': cell_type_dropdown,
        'goi': scored_gene_dropdown,
        'normalize_weights': normalized_dropdown,
        'scale_by_variance': scaled_dropdown,
        'scaling_only_based_on_control': scaling_only_based_on_control_dropdown,
    }
    interactive_plot = widgets.interactive(
        boxplot_Reference_GRN_scores_local, **single_condition_controls
    )

    display(interactive_plot)

interactive(children=(Dropdown(description='Control:', options=('True', 'False'), value='True'), Dropdown(desc…

# Plot scores across all the conditions

In [54]:
if plotting:
    # Argument name -> widget (or fixed value); the full list of condition
    # triples is passed as a fixed value rather than chosen from a dropdown.
    all_conditions_controls = {
        'adata': widgets.fixed(adata),
        'conditions': widgets.fixed(conditions),
        'gene_set': gene_set_dropdown,
        'cell_type': cell_type_dropdown,
        'goi': scored_gene_dropdown,
        'control': control_dropdown,
        'control_condition': control_condition_dropdown,
        'normalize_weights': normalized_dropdown,
        'scale_by_variance': scaled_dropdown,
        'scaling_only_based_on_control': scaling_only_based_on_control_dropdown,
    }
    interactive_plot = widgets.interactive(
        boxplot_Reference_GRN_scores_parameters_all_conditions, **all_conditions_controls
    )

    display(interactive_plot)

interactive(children=(Dropdown(description='Gene Set:', options=('all_ex',), value='all_ex'), Dropdown(descrip…

In [55]:
if plotting:
    # Argument name -> widget (or fixed value); the full list of regulators
    # is passed as a fixed value rather than chosen from a dropdown.
    all_gois_controls = {
        'adata': widgets.fixed(adata),
        'gene_set': gene_set_dropdown,
        'condition': condition_dropdown,
        'cell_type': cell_type_dropdown,
        'gois': widgets.fixed(gois),
        'control': control_dropdown,
        'control_condition': control_condition_dropdown,
        'normalize_weights': normalized_dropdown,
        'scale_by_variance': scaled_dropdown,
        'scaling_only_based_on_control': scaling_only_based_on_control_dropdown,
    }
    interactive_plot = widgets.interactive(
        boxplot_Reference_GRN_scores_parameters_all_gois, **all_gois_controls
    )

    display(interactive_plot)

interactive(children=(Dropdown(description='Gene Set:', options=('all_ex',), value='all_ex'), Dropdown(descrip…

# Save results

In [56]:
if recompute:
    # Persist the scored AnnData so that a later run with recompute = False
    # can reload it from the same location instead of scoring again.
    # The results folder is created first so the write does not fail on a
    # fresh checkout where it does not exist yet.
    os.makedirs(output_path, exist_ok=True)
    file_name = os.path.join(output_path, "andata_scored_GRNs.loom")
    adata.write_loom(file_name)