# Test Case Mortality

Creating a random set of fictitious patients to show how the code works without using patient data.

In [None]:
import os

# Show the working directory, move up one level, then show it again.
# NOTE(review): presumably the notebook sits one folder below the repository
# root and this makes `hypmmm` importable - confirm against the repo layout.
print(os.getcwd())
os.chdir("../")
print(os.getcwd())

import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

from hypmmm import (
    build_model,
    centrality,
    create_figures,
    remove_correction,
    utils,
    weight_functions,
)

# Notebook-wide matplotlib defaults: 10 x 10 figures and font size 10.
plt.rcParams["figure.figsize"] = (10, 10)
plt.rcParams["font.size"] = 10

### Example patients definition to input format

**Patients 1, 2 and 3:**
- Each has diseases A, B, and C (i.e. `binmat: [1, 1, 1]`)
- Disease progression is A -> B -> C (i.e. `conds_worklist: [0, 1, 2]`)
- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)


**Patient 4:**
- Has diseases A, B, and C (i.e. `binmat: [1, 1, 1]`)
- Disease progression is C -> A -> B (i.e. `conds_worklist: [2, 0, 1]`)
- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)


**Patient 5:**
- Has diseases B and C (i.e. `binmat: [0, 1, 1]`)
- Disease progression is B -> C (i.e. `conds_worklist: [1, 2, -1]`)
- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)


**Patient 6:**
- Has disease B (i.e. `binmat: [0, 1, 0]`)
- No disease progression (just B) (i.e. `conds_worklist: [1, -1, -1]`)
- No duplicates are present, and only has one condition (i.e. `idx_worklist: [-2, -1, -1]`)


**Patient 7:**
- Has disease C (i.e. `binmat: [0, 0, 1]`)
- No disease progression (just C) (i.e. `conds_worklist: [2, -1, -1]`)
- No duplicates are present, and only has one condition (i.e. `idx_worklist: [-2, -1, -1]`)


**Patient 8:**
- Has diseases A, B and C (i.e. `binmat: [1, 1, 1]`)
- Disease progression is B -> A -> C (i.e. `conds_worklist: [1, 0, 2]`)
- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)


**Patient 9:**
- Has diseases A and B (i.e. `binmat: [1, 1, 0]`)
- Disease progression is B -> A (i.e. `conds_worklist: [1, 0, -1]`)
- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)


**Patient 10:**
- Has diseases A and C (i.e. `binmat: [1, 0, 1]`)
- Disease progression is A -> C (i.e. `conds_worklist: [0, 2, -1]`)
- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)

In [None]:
n_diseases = 3
# Disease labels: the first n_diseases capital letters ("A", "B", "C").
colarr = np.asarray(list(string.ascii_uppercase[:n_diseases]), dtype="<U24")

# Fictitious trajectory of 'diseases' A, B and C
# One tuple per patient (patients 1 to 10, in order), listing the conditions in
# order of onset. Patients with a single condition are written with that
# condition repeated, e.g. ("B", "B") for patient 6.
edge_list = [
    ("A", "B", "C"),
    ("A", "B", "C"),
    ("A", "B", "C"),
    ("C", "A", "B"),
    ("B", "C"),
    ("B", "B"),
    ("C", "C"),
    ("B", "A", "C"),
    ("B", "A"),
    ("A", "C"),
]

# Turn the trajectories into the three input arrays used by the model
# (binmat, conds_worklist, idx_worklist).
binmat, conds_worklist, idx_worklist = utils.create_initial_worklists(
    n_diseases, edge_list
)

## With or without mortality?



Mortality | Short Description | Detail
:-:|:-|:-
No Mortality | 0 mortality nodes <br /> 0 alive nodes | Nothing to represent progression to death
Mortality | 1 mortality node $M$ <br /> 1 alive node $S$ | A single node for death and a single node to represent the individual still alive by the end of the cohort period of analysis (PoA)

## No Mortality

In [None]:
# One entry per patient, all set to -1, so fictitious mortality is ignored.
end_prog = np.full(binmat.shape[0], -1, dtype=np.int8)
end_type = 0
mort_type = None

# Compute the incidence matrix and the hyperedge / hyperarc / node weights
# using the modified Sorensen-Dice coefficient.
inc_mat, weights, mort_colarr = build_model.compute_weights(
    binmat,
    conds_worklist,
    idx_worklist,
    colarr,
    "progression",
    weight_functions.modified_sorensen_dice_coefficient,
    end_prog,
    dice_type=1,
    end_type=end_type,
    plot=False,
    ret_inc_mat=True,
    sort_weights=False,
)

# weights[0], weights[1] and weights[2] hold the hyperedge, hyperarc and node
# tables respectively; pull out the weight and label column of each.
hyperedge_weights, hyperedge_titles = (
    np.array(weights[0][column]) for column in ("weight", "disease set")
)
hyperarc_weights, progression_titles = (
    np.array(weights[1][column]) for column in ("weight", "progression")
)
node_weights, node_titles = (
    np.array(weights[2][column]) for column in ("weight", "node")
)

In [None]:
# Prepare the tail/head incidence matrices, hyperarc data, node weights and
# degree vectors from the no-mortality model (mort_type is None here).
output = build_model.setup_vars(
    inc_mat,
    n_diseases,
    hyperarc_weights,
    progression_titles,
    node_weights,
    mort_type=mort_type,
)
# setup_vars returns five items; node_weights is rebound to the returned value.
inc_mat_data, hyperarc_data, node_weights, node_degs, edge_degs = output

# Tail and head incidence matrices
inc_mat_tail, inc_mat_head = inc_mat_data

# Hyperarc weights and their titles
edge_weights, hyperarc_titles = hyperarc_data

# Tail and head degrees for nodes and for edges
node_degree_tail, node_degree_head = node_degs
edge_degree_tail, edge_degree_head = edge_degs

In [None]:
# Bundle the incidence matrices, weights and degrees for the PageRank call.
inpt = (
    (inc_mat_tail, inc_mat_head),
    (edge_weights, node_weights),
    node_degs,
    edge_degree_tail,
)

# Successor PageRank of every node.
node_pagerank = centrality.pagerank_centrality(
    inpt,
    rep="standard",
    typ="successor",
    tolerance=1e-10,
    max_iterations=1000,
    is_irreducible=True,
    weight_resultant=False,
    random_seed=None,
    eps=0.00001,
)

all_node_sc_evc = pd.DataFrame(
    {"Disease": mort_colarr, "Successor PageRank": node_pagerank}
)

# Rank once (highest score first) and derive both outputs from that ranking.
succ_ranked = all_node_sc_evc.sort_values(
    by="Successor PageRank", ascending=False
).reset_index(drop=True)
succ_order = succ_ranked.Disease
succ_node_sc_evc = succ_ranked.round({"Successor PageRank": 3})

In [None]:
# Bundle the incidence matrices, weights and degrees for the PageRank call.
inpt = (
    (inc_mat_tail, inc_mat_head),
    (edge_weights, node_weights),
    node_degs,
    edge_degree_tail,
)

# Predecessor PageRank of every node.
# NOTE(review): tolerance, weight_resultant and eps differ from the successor
# PageRank call earlier in this notebook - confirm this is intentional.
node_pagerank = centrality.pagerank_centrality(
    inpt,
    rep="standard",
    typ="predecessor",
    tolerance=1e-5,
    max_iterations=1_000,
    is_irreducible=True,
    weight_resultant=True,
    random_seed=None,
)

all_node_pr_evc = pd.DataFrame(
    {"Disease": mort_colarr, "Predecessor PageRank": node_pagerank}
)

# Rank once (highest score first) and derive both outputs from that ranking.
pred_ranked = all_node_pr_evc.sort_values(
    by="Predecessor PageRank", ascending=False
).reset_index(drop=True)
pred_order = pred_ranked.Disease
pred_node_pd_evc = pred_ranked.round({"Predecessor PageRank": 3})

In [None]:
# Relabel the score columns for the no-mortality model, then join the two
# tables on their shared "Disease" column.
succ_node_sc_evc = succ_node_sc_evc.set_axis(
    ["Disease", "No Mort Suc PageRank"], axis=1
)
pred_node_pd_evc = pred_node_pd_evc.set_axis(
    ["Disease", "No Mort Pred PageRank"], axis=1
)
no_mort_pagerank = pd.merge(pred_node_pd_evc, succ_node_sc_evc)

In [None]:
# Scatterplot of successor and predecessor PageRank per disease for the
# no-mortality model.
create_figures.pagerank_scatter(
    suc_col=no_mort_pagerank["No Mort Suc PageRank"],
    pred_col=no_mort_pagerank["No Mort Pred PageRank"],
    dis_col=no_mort_pagerank["Disease"],
)

## Mortality Type 1 with Remove PageRank Correction

### Example patients definition to input format

**Patients 1, 2, 3, 5 and 10:**
- Have Mort end nodes

**Patients 4, 6, 7, 8, and 9:**
- Have Alive end nodes

In [None]:
# One end-of-progression flag per patient; per the patient description above,
# patients 1, 2, 3, 5 and 10 (value 1) end in Mort and the rest (value 0) in Alive.
end_prog = np.array([1, 1, 1, 0, 1, 0, 0, 0, 0, 1], dtype=np.int8)

# Any -1 entry switches mortality off; otherwise mortality type 1 is used.
mortality_disabled = bool(np.any(end_prog == -1))
mort_type = None if mortality_disabled else 1
end_type = 0 if mortality_disabled else mort_type
mort_incl = not mortality_disabled

# Corrected PageRank table plus the mortality-model incidence matrices,
# weights and hyperarc titles.
(
    remove_cor_df,
    mort1_inc_mat_tail,
    mort1_inc_mat_head,
    mort1_hyperarc_weights,
    mort1_node_weights,
    mort1_hyperarc_titles,
) = remove_correction.calc_remove_pagerank(
    n_diseases,
    binmat,
    conds_worklist,
    idx_worklist,
    contribution_type="progression",
    weight_function=weight_functions.modified_sorensen_dice_coefficient,
    complete_denom=1,
    end_prog=end_prog,
    plot=False,
)

In [None]:
# Scatterplot for Corrected PageRank
# Scatterplot for Corrected PageRank: corrected successor and predecessor
# scores per disease, taken from remove_cor_df.
create_figures.pagerank_scatter(
    suc_col=remove_cor_df["Corrected Suc PageRank"],
    pred_col=remove_cor_df["Corrected Pred PageRank"],
    dis_col=remove_cor_df["Disease"],
)

In [None]:
# Construct incidence matrix for undirected representation of directed hypergraph
# Stack the tail and head incidence matrices along axis 0 to give a single
# incidence matrix, then display it.
incidence_matrix = np.concatenate([mort1_inc_mat_tail, mort1_inc_mat_head], axis=0)
incidence_matrix

In [None]:
# Display the node weights returned by calc_remove_pagerank.
mort1_node_weights

In [None]:
# Eigenvector centrality computed with rep="dual" on the stacked incidence matrix.
# NOTE(review): presumably "dual" yields one score per hyperarc - the result is
# paired with mort1_hyperarc_titles below; confirm against centrality docs.
hyperarc_centrality = centrality.eigenvector_centrality(
    incidence_matrix,
    mort1_hyperarc_weights,
    mort1_node_weights,
    rep="dual",
    tolerance=1e-6,
    max_iterations=1000,
    weight_resultant=True,
    random_seed=None,
)

# Degree of each hyperarc: the first n_diseases titles are given degree 2; for
# the rest it is the number of comma-separated parts of the title plus one.
# NOTE(review): the first n_diseases titles are presumably single-disease
# (self-loop) hyperarcs - confirm against calc_remove_pagerank.
n_conds = n_diseases * [2] + [
    len(d.split(",")) + 1 for d in mort1_hyperarc_titles[n_diseases:]
]

# Table of hyperarc title, degree and centrality rounded to 3 decimal places.
hyperarc_evc = pd.DataFrame(
    {
        "Disease": mort1_hyperarc_titles,
        "Degree": n_conds,
        "Eigenvector Centrality": np.round(hyperarc_centrality, 3),
    },
)
# Display only: the sorted copy is not stored, hyperarc_evc keeps its order.
hyperarc_evc.sort_values(by="Degree", ascending=True).reset_index(drop=True)

In [None]:
def generate_forward_prog(disease_set, hyperarc_evc, n, max_degree):
    """
    Given a disease set, generate a tree of likely onward disease progressions using
    the hyperarc eigenvector centrality values.

    Hyperarcs whose tail holds the same diseases as ``disease_set`` are candidates.
    The comparison ignores the order in which the diseases are written, so the set
    reached via "A, C -> B" (i.e. "A, C, B") still matches a tail written "A, B, C".

    Args:
        disease_set (str) : Observed disease set. Must be of format
        "DIS1, DIS2, ..., DISn-1"

        hyperarc_evc (pd.DataFrame) : Dataframe of hyperarc eigenvector centrality values
        with columns "Disease", "Degree" and "Eigenvector Centrality".

        n (int) : Number of progressions to generate at this level. Deeper levels
        always branch into (at most) 2 progressions.

        max_degree (int) : Maximum degree disease progression to generate.

    Returns:
        list or None : None when the next degree (number of diseases in
        ``disease_set`` plus one) is not below ``max_degree``. Otherwise a list of
        ``n`` lists. Each used slot holds the progression title followed, when the
        recursion went one level deeper, by the nested list for that progression;
        unused slots stay empty.
    """
    members = disease_set.split(", ")
    deg = len(members) + 1
    if deg >= max_degree:
        return None

    pathways = [[] for _ in range(n)]

    # Candidate hyperarcs: right degree and a tail holding exactly these diseases.
    target = sorted(members)
    deg_hyperarc_evc = hyperarc_evc[hyperarc_evc.Degree == deg]
    tail_matches = np.asarray(
        [
            sorted(title.split(" -> ")[0].split(", ")) == target
            for title in deg_hyperarc_evc.Disease
        ],
        dtype=bool,
    )
    ranked = deg_hyperarc_evc.loc[tail_matches].sort_values(
        by="Eigenvector Centrality", ascending=False, axis=0
    )

    for slot, prog in zip(pathways, ranked.Disease.iloc[:n]):
        slot.append(prog)
        # The tail plus the head forms the disease set for the next level.
        next_set = ", ".join(prog.split(" -> "))
        onward = generate_forward_prog(next_set, hyperarc_evc, 2, max_degree)
        if onward is not None:
            slot.append(onward)

    return pathways

In [None]:
def generate_pathways(progression, hyperarc_evc, n):
    """
    Given a disease progression specified, use the hyperarc eigenvector centrality
    ranking of observed disease progressions to generate the observed disease
    progressions that come directly before the specified one.

    A hyperarc "T -> h" qualifies when its head ``h`` is one of the diseases in the
    tail of ``progression`` and its tail ``T`` holds exactly the remaining diseases of
    that tail (order ignored). The head is compared by equality, so a disease named
    "A" does not match a head named "AB".

    Args:
        progression (str) : Observed disease progression. Must be of format
        "DIS1, DIS2, ..., DISn-1 -> DISn"

        hyperarc_evc (pd.DataFrame) : Dataframe of hyperarc eigenvector centrality values
        with columns "Disease", "Degree" and "Eigenvector Centrality".

        n (int) : Number of progressions to return.

    Returns:
        list : Up to ``n`` hyperarc titles, highest eigenvector centrality first.
        Empty when no preceding progression is found.
    """
    # Extract tail set; preceding hyperarcs have Degree equal to its size
    prog_tail = progression.split(" -> ")[0].split(", ")
    deg_hyperarc_evc = hyperarc_evc[hyperarc_evc.Degree == len(prog_tail)].sort_values(
        by="Eigenvector Centrality", ascending=False, axis=0, kind="stable"
    )

    # Flag the hyperarcs whose head and tail together rebuild the tail set
    matches = []
    for disease_prog in deg_hyperarc_evc.Disease:
        tail_str, _, diseases_head = disease_prog.partition(" -> ")
        if diseases_head not in prog_tail:
            matches.append(False)
            continue
        remaining = prog_tail.copy()
        remaining.remove(diseases_head)
        matches.append(sorted(remaining) == sorted(tail_str.split(", ")))

    pathway_progressions = deg_hyperarc_evc.loc[np.asarray(matches, dtype=bool)]

    # Extract n of these pathways and return. The rows are already ordered by
    # centrality, and an empty selection simply yields an empty list.
    print(f"Found {pathway_progressions.shape[0]} possible pathways...")
    return list(pathway_progressions.Disease.iloc[:n])

In [None]:
def generate_all_pathways(progression, hyperarc_evc, n):
    """
    Trace an observed disease progression backwards, collecting the likely
    pathways that lead to it according to the hyperarc centralities.

    Args:
        progression (str) : Observed disease progression. Must be of format
        "DIS1, DIS2, ..., DISn-1 -> DISn"

        hyperarc_evc (pd.DataFrame) : Dataframe of hyperarc eigenvector centrality values.

        n (int) : Number of progressions to return.

    Returns:
        dict : Empty when the tail of ``progression`` holds a single disease.
        Otherwise ``progression`` maps to the list returned by
        ``generate_pathways``, and each of those progressions maps either to the
        values of its own recursive result or, when that result is empty, to a
        single "TAIL -> TAIL" entry.
    """
    print(progression)
    tail_members = progression.split(" -> ")[0].split(", ")
    tree = {}

    # A single-disease tail has nothing earlier to trace back to.
    if len(tail_members) == 1:
        return tree

    predecessors = generate_pathways(progression, hyperarc_evc, n)
    tree[progression] = predecessors
    for earlier in predecessors:
        sub_tree = generate_all_pathways(earlier, hyperarc_evc, n)
        if sub_tree:
            tree[earlier] = list(sub_tree.values())
        else:
            origin = earlier.split(" -> ")[0]
            tree[earlier] = [f"{origin} -> {origin}"]

    return tree

In [None]:
# Trace the pathways leading to mortality from diseases A, B and C, keeping up
# to 3 progressions per step.
progression = "A, B, C -> MORT"
test = generate_all_pathways(progression, hyperarc_evc, 3)

In [None]:
# Gets the top n (3) centralities for each degree type
top_n = 3
max_d = 4
top_by_degree = []
for deg in range(1, max_d + 1):
    deg_hyperarc_evc = hyperarc_evc[hyperarc_evc.Degree == deg].sort_values(
        by="Eigenvector Centrality", ascending=False, axis=0
    )
    top_by_degree.append(deg_hyperarc_evc.iloc[:top_n])

# Degrees were visited in ascending order, so the rows are already grouped by
# degree and ordered by centrality within each degree.
top_progressions = pd.concat(top_by_degree, axis=0)

# Rank (0 = most central) of each progression within its own degree. Counting
# per degree keeps the ranks aligned when a degree has fewer than top_n rows.
rank_progressions = top_progressions.copy()
rank_progressions.index = (
    rank_progressions.groupby("Degree", sort=False).cumcount().to_numpy()
)
# Move the rank into a leading "index" column.
rank_progressions.reset_index(inplace=True)

rank_progressions

In [None]:
# For every ranked progression of degree `deg`, record the positions of the
# ranked progressions one degree higher whose tail holds exactly its diseases.
prog_idx_lists = []
for deg in range(1, max_d):
    same_deg = rank_progressions[rank_progressions.Degree == deg].reset_index(
        drop=True
    )
    # Sorted tail members of each ranked progression one degree above.
    higher_tails = [
        np.sort(title.split(" -> ")[0].split(", "))
        for title in rank_progressions[rank_progressions.Degree == deg + 1].Disease
    ]
    links = [[] for _ in range(same_deg.shape[0])]
    for row_pos, title in enumerate(same_deg.Disease):
        parts = title.split(" -> ")
        if deg == 1:
            # Degree 1: only the tail string is compared.
            members = parts[0]
        else:
            # Otherwise compare the sorted tail members plus the head.
            members = np.sort(parts[0].split(", ") + [parts[1]])
        for col_pos, tail in enumerate(higher_tails):
            if np.all(members == tail):
                links[row_pos].append(col_pos)
    prog_idx_lists.append(links)

In [None]:
# Draw the disease-progression pathways figure from the ranked progressions and
# the links between consecutive degrees.
create_figures.dis_prog_paths(rank_progressions, max_d, top_n, prog_idx_lists)