## PAGA Basic Analysis (Only Main Analysis)

- This notebook covers basic Leiden clustering only, with no downstream time-series analysis

- Mean parameter values are projected onto the embedding space for visualization

- Distributions of genes of known function should be assembled in this notebook

- Finally, basic clustering and ontology enrichment are present at the end

- This notebook should be used to decide on clustering parameters and the resulting paga_df saved to disk

In [None]:
import ast
import copy
import random
import warnings

import anndata
import dask
import dask.array as da
import dask.dataframe as dd
import holoviews as hv
import igraph as ig
import leidenalg
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import networkx as nx
import numpy as np
import pandas as pd
import pylab
import scanpy as sc
import scipy as sp
import scipy.cluster.hierarchy as sch
import scipy.sparse
import scipy.stats
import seaborn as sns
import sklearn as skl
import umap
from igraph.drawing.text import TextDrawer
from matplotlib import pyplot as plt
from scanpy.plotting.palettes import default_20, vega_20_scanpy
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import (
    cosine_distances,
    euclidean_distances,
    manhattan_distances,
)
from tslearn.barycenters import (
    dtw_barycenter_averaging,
    euclidean_barycenter,
    softdtw_barycenter,
)
from tslearn.metrics import cdist_soft_dtw, cdist_soft_dtw_normalized
from tslearn.neighbors import KNeighborsTimeSeries

import paulssonlab.deaton.trenchripper.trenchripper as tr

# Render HoloViews output through the Bokeh backend.
hv.extension("bokeh")
# Seed both the stdlib and the NumPy global RNGs so random draws made later in
# the notebook (e.g. np.random.choice) are repeatable between runs.
random.seed(42)
np.random.seed(42)

# "once" prints a matching UserWarning only the first time it is raised.
warnings.filterwarnings(action="once", category=UserWarning)

### Load Data From First Notebook

In [None]:
# Per-observation table, stored as a pickle next to the AnnData file.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
paga_df_only = pd.read_pickle("./2021-12-07_paga_df_only.pkl")
paga_df = sc.read("./2021-12-07_paga_df.h5ad")
# Give the AnnData its own deep copy of the table, so later in-place edits to
# paga_df_only are not reflected in paga_df.obs.
paga_df.obs = copy.deepcopy(paga_df_only)

In [None]:
# "Target Sites" holds Python-literal strings; parse each one back into the
# object it spells out (literal_eval only accepts literals, never arbitrary code).
paga_df_only["Target Sites"] = paga_df_only["Target Sites"].apply(ast.literal_eval)

In [None]:
import goatools
import goatools.base
from goatools.anno.gaf_reader import GafReader
from goatools.base import download_go_basic_obo
from goatools.go_enrichment import GOEnrichmentStudy
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.obo_parser import GODag
from goatools.semantic import TermCounts, get_info_content, semantic_similarity


def search_go(ns2assoc, obodag, inv_gene_to_id, go_term):
    """Return the sorted, de-duplicated genes annotated with a GO term or any descendant.

    Parameters
    ----------
    ns2assoc : mapping
        Namespace abbreviation ("BP", "MF", "CC") -> {gene id: collection of GO ids}.
    obodag : mapping
        GO id -> term object exposing ``name``, ``namespace`` and
        ``get_all_children()``.
    inv_gene_to_id : mapping
        Gene id -> gene name; only looked up for genes that match.
    go_term : str
        GO id to search for.

    Returns
    -------
    list
        Sorted unique gene names whose annotations contain ``go_term`` or one
        of its descendants, within the namespace of ``go_term``.
    """
    namespace_abbv = {
        "biological_process": "BP",
        "molecular_function": "MF",
        "cellular_component": "CC",
    }

    term = obodag[go_term]
    print("Searching for " + str(term.name))
    namespace = namespace_abbv[term.namespace]

    # Collect the query term and all descendants once, then make a single pass
    # over the associations instead of rescanning them for every child term.
    wanted_terms = {go_term}
    wanted_terms.update(term.get_all_children())

    matching_genes = {
        inv_gene_to_id[gene_id]
        for gene_id, annotated_terms in ns2assoc[namespace].items()
        if not wanted_terms.isdisjoint(annotated_terms)
    }
    return sorted(matching_genes)


def selection_fn(item, gene_name):
    """Return ``item["TargetID"]`` when ``item["Gene"]`` equals ``gene_name``, else 0."""
    return item["TargetID"] if item["Gene"] == gene_name else 0


def highlight_gene_group(an_df, selection_list):
    """Plot UMAP panels highlighting each selected gene, plus one combined panel.

    Works on a deep copy of ``an_df``, so the caller's object is not modified.

    Parameters
    ----------
    an_df : AnnData-like
        Object accepted by ``sc.pl.umap`` whose ``obs`` has a "Gene" column.
    selection_list : iterable
        Gene names to highlight. Names absent from ``obs["Gene"]`` are dropped;
        the remainder are de-duplicated and sorted.

    Returns
    -------
    The figure returned by ``sc.pl.umap(..., return_fig=True)``, with one panel
    per retained gene followed by an "All Genes" panel.
    """
    highlight_genes_df = copy.deepcopy(an_df)

    # Declare both categories up front. Converting with astype("category") and
    # then reordering to [True, False] raises ValueError whenever a mask holds
    # only one of the two values (e.g. nothing in selection_list is present).
    flag_dtype = pd.CategoricalDtype(categories=[True, False])

    selection_list = sorted(
        set(highlight_genes_df.obs["Gene"].unique().tolist()) & set(selection_list)
    )

    panel_keys = []
    for i, selected_gene in enumerate(selection_list):
        panel_key = "Selected Genes: " + str(i)
        highlight_genes_df.obs[panel_key] = (
            highlight_genes_df.obs["Gene"] == selected_gene
        ).astype(flag_dtype)
        panel_keys.append(panel_key)

    highlight_genes_df.obs["All Genes"] = (
        highlight_genes_df.obs["Gene"].isin(selection_list).astype(flag_dtype)
    )

    fig = sc.pl.umap(
        highlight_genes_df,
        title=selection_list + ["All Genes"],
        color=panel_keys + ["All Genes"],
        groups=[True],
        show=False,
        legend_loc="right margin",
        add_outline=False,
        size=50,
        return_fig=True,
        palette={True: "red", False: "lightgrey"},
    )

    return fig


def highlight_sgrnas(an_df, selection_list):
    """Plot a UMAP highlighting the observations whose index label is selected.

    Works on a deep copy of ``an_df``, so the caller's object is not modified.

    Parameters
    ----------
    an_df : AnnData-like
        Object accepted by ``sc.pl.umap``.
    selection_list : iterable
        Labels of ``an_df.obs.index`` to highlight; labels not present in the
        index are simply not highlighted.

    Returns
    -------
    The figure returned by ``sc.pl.umap(..., return_fig=True)`` with a single
    "All sgRNAs" panel.
    """
    highlight_genes_df = copy.deepcopy(an_df)
    obs_index = highlight_genes_df.obs.index

    # Declare both categories up front. Converting with astype("category") and
    # then reordering to [True, False] raises ValueError whenever the mask is
    # all-True or all-False (e.g. no requested label is present).
    flag_dtype = pd.CategoricalDtype(categories=[True, False])

    is_selected = pd.Series(obs_index.isin(list(selection_list)), index=obs_index)
    highlight_genes_df.obs["All sgRNAs"] = is_selected.astype(flag_dtype)

    fig = sc.pl.umap(
        highlight_genes_df,
        title="All sgRNAs",
        color="All sgRNAs",
        groups=[True],
        show=False,
        legend_loc="right margin",
        add_outline=False,
        size=50,
        return_fig=True,
        palette={True: "red", False: "lightgrey"},
    )

    return fig

## 3) Divergence Detection

- this is not currently working well

- need a better way of accounting for "consistently" divergent genes

- perhaps aggregate over target sites?

In [None]:
# Work on a copy of the pairwise "soft_dtw" matrix and zero everything below
# the diagonal so each unordered pair is represented once.
dist_mat = np.triu(copy.deepcopy(paga_df.obsp["soft_dtw"]))
# Only strictly positive entries are treated as usable pairs; this also drops
# the zeroed lower triangle (and any NaN, since NaN > 0 is False).
upper_tri_mask = dist_mat > 0.0
gene_list = sorted(paga_df.obs["Gene"].unique().tolist())

In [None]:
# All-True boolean mask with the shape of dist_mat.
# NOTE(review): the following cell rebuilds unmatched_gene_mask from scratch,
# so this assignment looks redundant.
unmatched_gene_mask = np.ones(dist_mat.shape, dtype=bool)

In [None]:
import itertools

# Upper bound on how many unmatched distances are drawn for plotting.
n_unmatched_samples = 100000

all_matched_vals = []
gene_groups = {}

for gene_i in gene_list:
    # Positions of the observations targeting this gene, in ascending order.
    gene_idx = np.flatnonzero((paga_df.obs["Gene"] == gene_i).values)
    # Restrict to the gene-by-gene sub-block rather than building a full
    # N x N outer mask per gene; row-major order of the result is unchanged.
    block = np.ix_(gene_idx, gene_idx)
    masked_vals = dist_mat[block][upper_tri_mask[block]]
    masked_vals = masked_vals[~np.isnan(masked_vals)].tolist()
    all_matched_vals += masked_vals
    gene_groups[gene_i] = masked_vals

# A pair is "unmatched" when it is a usable upper-triangle entry and the two
# observations target different genes.
gene_codes, _ = pd.factorize(paga_df.obs["Gene"])
same_gene_mask = gene_codes[:, None] == gene_codes[None, :]
unmatched_gene_mask = upper_tri_mask & ~same_gene_mask

all_unmatched_vals = dist_mat[unmatched_gene_mask].tolist()
# Sampling without replacement raises if more values are requested than
# exist, so cap the sample size at the number of unmatched values.
sampled_unmatched_vals = np.random.choice(
    all_unmatched_vals,
    replace=False,
    size=(min(n_unmatched_samples, len(all_unmatched_vals)),),
)

In [None]:
# Same binning for both distributions: matched pairs first, then the sampled
# unmatched pairs, each shown as its own figure.
for distance_vals in (all_matched_vals, sampled_unmatched_vals):
    plt.hist(distance_vals, bins=50, range=(0, 1000))
    plt.show()

In [None]:
# Fraction of all collected pairs that are same-gene (matched) pairs, and its
# complement.
n_matched_vals, n_unmatched_vals = len(all_matched_vals), len(all_unmatched_vals)
p_match = n_matched_vals / (n_matched_vals + n_unmatched_vals)
p_unmatch = 1.0 - p_match

### t-test with multihypothesis correction

In [None]:
import statsmodels.stats.multitest

# Reference value: mean distance over all same-gene pairs.
mean_match = np.mean(all_matched_vals)

# Only genes with more than one within-gene distance are tested.
testable_groups = {
    gene: distances for gene, distances in gene_groups.items() if len(distances) > 1
}

gene_group_ttest_gene = np.array(list(testable_groups.keys()))
# One-sided, one-sample t-test per gene: are its distances greater than the
# overall matched mean?
gene_group_ttest_pval = np.array(
    [
        sp.stats.ttest_1samp(distances, mean_match, alternative="greater").pvalue
        for distances in testable_groups.values()
    ]
)

# FDR correction across genes with method="indep".
rejected, pval_corr = statsmodels.stats.multitest.fdrcorrection(
    gene_group_ttest_pval, alpha=0.00001, method="indep", is_sorted=False
)

In [None]:
# Genes whose null hypothesis was rejected after FDR correction.
gene_group_ttest_gene[rejected]

### Highlight Genes of Interest

In [None]:
# One UMAP panel per rejected gene plus a combined "All Genes" panel.
fig = highlight_gene_group(paga_df, gene_group_ttest_gene[rejected])
# fig.savefig("./3_Divergent_Genes/divergent_genes.png",dpi=150)

In [None]:
# Display the "Target Sites" column, already parsed by ast.literal_eval above.
paga_df_only["Target Sites"]

#### Get RegulonDB Files

In [None]:
import urllib.request

from Bio import SeqIO
from dna_features_viewer import BiopythonTranslator

In [None]:
# Fetch the U00096.3 GenBank record from RegulonDB into the working directory
# (downloaded again every time this cell runs).
urllib.request.urlretrieve(
    url="http://regulondb.ccg.unam.mx/menu/download/datasets/files/U00096.3.gbk",
    filename="./U00096.3.gbk",
)

In [None]:
# Fetch the RegulonDB promoter table into the working directory
# (downloaded again every time this cell runs).
urllib.request.urlretrieve(
    url="http://regulondb.ccg.unam.mx/menu/download/datasets/files/PromoterSet.txt",
    filename="./PromoterSet.txt",
)

In [None]:
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation, SeqFeature
from dna_features_viewer import BiopythonTranslator


class sgRNA_Explorer(BiopythonTranslator):
    """Translator that colours features by type for sgRNA target-site plots.

    Colours assigned by ``compute_feature_color``:

    - "CDS": blue (#1f77b4)
    - "terminator": green (#279e68)
    - "promoter": purple (#aa40fc)
    - "sgRNA": red (#d62728)
    - any other feature type: light blue (#aec7e8)

    Parameters
    ----------
    ignored_features_types : list of str, optional
        Stored on the instance as ``ignored_features_types``. Defaults to
        ``["CDS"]``.
    """

    _FEATURE_COLORS = {
        "CDS": "#1f77b4",
        "terminator": "#279e68",
        "promoter": "#aa40fc",
        "sgRNA": "#d62728",
    }
    _FALLBACK_COLOR = "#aec7e8"

    def __init__(self, ignored_features_types=None):
        # Build the default per instance: a list literal in the signature
        # would be a single object shared by every instance.
        if ignored_features_types is None:
            ignored_features_types = ["CDS"]
        self.ignored_features_types = ignored_features_types
        super().__init__()

    def compute_feature_color(self, feature):
        """Return the hex colour for ``feature`` based on ``feature.type``."""
        return self._FEATURE_COLORS.get(feature.type, self._FALLBACK_COLOR)


def add_promoters_to_genbank(genome_record, promoter_df):
    """Append one zero-length "promoter" feature per row of ``promoter_df``.

    Each feature is located at the row's "TSS" (start == end), carries the
    row's "Name" under ``qualifiers["gene"]``, and has strand +1 when "Strand"
    is "forward" and -1 otherwise.

    Note that ``genome_record`` is modified in place: its ``features``
    attribute is replaced by the extended list, and the same object is
    returned.

    Parameters
    ----------
    genome_record : SeqRecord-like
        Record whose ``features`` list is extended.
    promoter_df : pandas.DataFrame
        Table with "Strand", "TSS" and "Name" columns.

    Returns
    -------
    The same ``genome_record`` object.
    """
    promoter_feature_list = []
    for strand_label, tss, name in zip(
        promoter_df["Strand"], promoter_df["TSS"], promoter_df["Name"]
    ):
        strand = 1 if strand_label == "forward" else -1
        # Strand goes on the location; newer Biopython releases no longer
        # accept it as a SeqFeature argument.
        promoter_feature = SeqFeature(
            location=FeatureLocation(tss, tss, strand=strand),
            type="promoter",
        )
        promoter_feature.qualifiers["gene"] = name
        promoter_feature_list.append(promoter_feature)

    genome_record.features = genome_record.features + promoter_feature_list
    return genome_record


def display_target_sites(
    genome_record, target_site_list, translator, view_pad=1000, sgrna_length=20
):
    """Plot one padded genome window per target site with the sgRNA marked.

    Parameters
    ----------
    genome_record : SeqRecord-like
        Sliceable record; each window is ``genome_record[start:end]``.
    target_site_list : iterable
        Target sites indexed as ``(start, end, strand)``, with strand given
        as "+" or "-".
    translator : object
        Provides ``translate_record(record)`` returning a plottable record.
    view_pad : int, optional
        Number of positions shown on each side of the target site.
    sgrna_length : int, optional
        Length of the drawn sgRNA feature, measured from the target start.

    Returns
    -------
    None. One figure is drawn per target site.
    """
    strand_dict = {"+": 1, "-": -1}

    for target_site in target_site_list:
        # Clamp at 0: a negative slice start would be interpreted relative to
        # the end of the record rather than as "before the first position".
        start_coord = max(target_site[0] - view_pad, 0)
        end_coord = target_site[1] + view_pad

        sub_genome_record = genome_record[start_coord:end_coord]

        # Offset of the target inside the window; equals view_pad unless the
        # window was clamped above.
        sgrna_start = target_site[0] - start_coord
        sgRNA = SeqFeature(
            location=FeatureLocation(
                sgrna_start,
                sgrna_start + sgrna_length,
                strand=strand_dict[target_site[2]],
            ),
            type="sgRNA",
        )
        sgRNA.qualifiers["gene"] = "sgRNA"

        sub_genome_record.features = sub_genome_record.features + [sgRNA]
        graphic_record = translator.translate_record(sub_genome_record)

        graphic_record.plot(figure_width=10, strand_in_label_threshold=7)


def display_target_sites_single_locus(
    genome_record, target_site_dict, translator, view_pad=2000, outer_context_pad=20000
):
    """Plot every target site in ``target_site_dict`` on one shared genome window.

    The window is centred on the first entry of the dict: a region of
    ``outer_context_pad`` around it is sliced out of the genome, every target
    site is added as an "sgRNA" feature labelled with its key, and the plot is
    cropped to ``view_pad`` around that first site.

    Parameters
    ----------
    genome_record : SeqRecord-like
        Sliceable record.
    target_site_dict : dict
        Key -> target site indexed as ``(start, end, strand)``, with strand
        given as "+" or "-".
    translator : object
        Provides ``translate_record(record)``; the result must support
        ``crop`` and ``plot``.
    view_pad : int, optional
        Positions shown on each side of the first target site.
    outer_context_pad : int, optional
        Positions sliced out of the genome on each side of the first site.

    Returns
    -------
    None. Nothing is drawn when ``target_site_dict`` is empty.
    """
    if not target_site_dict:
        # No anchor site to centre the window on.
        return

    strand_dict = {"+": 1, "-": -1}

    anchor_site = next(iter(target_site_dict.values()))

    # Clamp at 0: a negative slice start would be interpreted relative to the
    # end of the record rather than as "before the first position".
    outer_start_coord = max(anchor_site[0] - outer_context_pad, 0)
    outer_end_coord = anchor_site[1] + outer_context_pad

    # Anchor coordinates expressed relative to the sliced window.
    start_coord = anchor_site[0] - outer_start_coord
    end_coord = anchor_site[1] - outer_start_coord

    sub_genome_record = genome_record[outer_start_coord:outer_end_coord]

    sgrna_features = []
    for targetid, target_site in target_site_dict.items():
        sgRNA = SeqFeature(
            location=FeatureLocation(
                target_site[0] - outer_start_coord,
                target_site[1] - outer_start_coord,
                strand=strand_dict[target_site[2]],
            ),
            type="sgRNA",
        )
        sgRNA.qualifiers["gene"] = str(targetid)
        sgrna_features.append(sgRNA)

    sub_genome_record.features = sub_genome_record.features + sgrna_features
    graphic_record = translator.translate_record(sub_genome_record)

    cropped_record = graphic_record.crop(
        (max(start_coord - view_pad, 0), end_coord + view_pad)
    )

    cropped_record.plot(figure_width=10, strand_in_label_threshold=7)


def gene_to_target_dict(df, gene_name):
    """Map each TargetID of ``gene_name`` to a single target site.

    Rows of ``df`` whose "Gene" equals ``gene_name`` are grouped by
    "TargetID". For each group the "Target Sites" value of its first row is
    unwrapped; when that value holds several sites, the last one is the one
    kept for the TargetID, and an empty value contributes no entry.

    Returns
    -------
    tuple
        ``(target_site_dict, subset_df)`` where ``subset_df`` is the filtered
        frame for the gene.
    """
    subset_df = df[df["Gene"] == gene_name]

    subset_target_site_dict = {}
    for target_id, target_rows in subset_df.groupby("TargetID"):
        first_row_sites = target_rows["Target Sites"].iloc[0]
        # Unwrap the container: each assignment overwrites the previous one,
        # so the last listed site wins.
        for site in first_row_sites:
            subset_target_site_dict[target_id] = site

    return subset_target_site_dict, subset_df

#### Make Reference SeqRecord

In [None]:
genome_record = SeqIO.read("./U00096.3.gbk", "genbank")

# Column names for the promoter table; the first 37 lines of the file are
# skipped before parsing.
promoter_columns = [
    "ID",
    "Name",
    "Strand",
    "TSS",
    "Sigma Factor",
    "Sequence",
    "Evidence",
    "Confidence",
]
promoter_df = pd.read_csv(
    "./PromoterSet.txt", sep="\t", skiprows=37, names=promoter_columns
)
# Keep only promoters whose confidence level is "Strong".
promoter_df = promoter_df.loc[promoter_df["Confidence"] == "Strong"]
# add_promoters_to_genbank extends genome_record in place and returns it, so
# both names refer to the same annotated record.
genome_record_merged = add_promoters_to_genbank(genome_record, promoter_df)

In [None]:
# Display the genome record with the promoter features added.
genome_record_merged

#### Initialize Viewer

In [None]:
# Default construction: ignored_features_types is left at ["CDS"].
translator = sgRNA_Explorer()

In [None]:
# Observations belonging to any gene whose t-test null was rejected.
is_divergent_gene = paga_df.obs["Gene"].isin(gene_group_ttest_gene[rejected])
divergent_df = paga_df.obs.loc[is_divergent_gene]

In [None]:
# Distinct gene names present in the divergent subset.
divergent_df["Gene"].unique().tolist()

In [None]:
# obs column to colour by -> panel title, in plotting order.
leiden_panels = {
    "leiden_lowres": "Leiden Resolution=0.25",
    "leiden": "Leiden Resolution=1.",
    "leiden_highres": "Leiden Resolution=1.5",
}
fig = sc.pl.umap(
    paga_df,
    color=list(leiden_panels.keys()),
    title=list(leiden_panels.values()),
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)
# fig.savefig("./1_Global_Analysis/Global_PAGA.png",dpi=150)

In [None]:
# For every divergent gene: UMAP highlight, genome view of its target sites,
# then the low-resolution Leiden cluster of each TargetID.
for gene in divergent_df["Gene"].unique().tolist():
    target_site_dict, subset_df = gene_to_target_dict(paga_df_only, gene)

    highlight_gene_group(paga_df, [gene])
    plt.show()

    display_target_sites_single_locus(
        genome_record=genome_record,
        target_site_dict=target_site_dict,
        translator=translator,
        view_pad=1000,
    )
    rows_by_target = subset_df.reset_index().set_index("TargetID").sort_index()
    print(rows_by_target["leiden_lowres"])
    plt.show()