# DESMOND2 
Method demo on random data

In [None]:
# colab initialization
!git clone https://github.com/ozolotareva/DESMOND2
!pip3 install fisher

In [None]:
import sys,os
import numpy as np
import pandas as pd
import random
import math
from time import time
from datetime import datetime
import matplotlib
import matplotlib.pyplot as plt

from DESMOND2 import method2  # DESMOND2 functions 


# Load exprs data (generate some random matrix)

In [None]:
def make_random_df(seed=0):
    """Return a 30x20 demo matrix of Gaussian noise with planted blocks.

    The matrix is drawn from a standard normal distribution (rounded to
    three decimals) after seeding NumPy's global RNG with ``seed``.  Three
    rectangular regions are shifted up by 5 and one region is shifted down
    by 5, so that the demo contains clearly visible biclusters.

    Rows are labelled ``'row <i>'`` and columns ``'col <j>'``.
    """
    shift = 5
    # Regions (row slice, column slice) that get raised / lowered by `shift`.
    raised_blocks = (
        np.s_[1:10, 1:10],
        np.s_[22:28, 2:7],
        np.s_[12:26:2, 8:15],
    )
    lowered_block = np.s_[11:25:2, 13:19]

    np.random.seed(seed)
    matrix = np.random.normal(size=(30, 20)).round(3)

    for block in raised_blocks:
        matrix[block] += shift
    matrix[lowered_block] -= shift

    return pd.DataFrame(matrix).rename(
        index=lambda i: f'row {i}',
        columns=lambda j: f'col {j}',
    )
    

# Build the demo expression matrix with the default seed (0).
exprs = make_random_df()

# For real data, load a table from disk instead, for example:
# exprs = pd.read_csv("TCGA_micro_nolog2.z.13K_common.tsv", sep="\t",index_col=0)

# Display the matrix with a colour gradient computed over the whole table
# (axis=None) and values formatted with three decimals.
exprs.style.background_gradient(axis=None, low=0, high=2).format('{:.3f}')


## Setting default metaparams

In [None]:
# --- model and filtering parameters ---
# min_n_samples is passed to GM_binarization and genesets2biclusters;
# -1 presumably means "choose automatically" (TODO confirm in method2).
min_n_samples = -1
alpha = 0.1
beta_K = 100.0
snr_pval = 0.05  # p-value cutoff passed to rand_norm_splits
f = 0.25  # min fraction of time in a module (not referenced by the cells shown here)

# --- convergence settings for the sampling step ---
max_n_steps = 100
n_steps_averaged = 10
n_points_fit = 20
n_steps_for_convergence = 10

# --- output toggles ---
verbose = True
plot_all = True

# Seed passed to rand_norm_splits and GM_binarization.
seed = 0

# 0) Find the trend between SNR and bicluster size

1) generate random splits for all bicluster sizes [min_n_samples, N/2]

2) find SNR_thr(size) for a p-value cutoff, e.g. p_val =0.05

`get_min_snr` (computed in the next cell) is a function of bicluster size that gives the SNR threshold for that size.


In [None]:
# Derive SNR thresholds for a range of bicluster sizes; the first argument is
# the number of columns (samples) in the expression matrix.
sizes, thresholds = method2.rand_norm_splits(exprs.shape[1], snr_pval=snr_pval, seed=seed)

# Fit the size -> threshold trend; the result is passed to GM_binarization later.
get_min_snr = method2.get_trend(sizes, thresholds, plot=plot_all)

# 1) Assign patients to edges
 * GMM
 * filtering by sample number and SNR

In [None]:
# Binarize the expression matrix with method2.GM_binarization, using the
# size-dependent SNR threshold function and the minimal sample count set above.
# The result is indexed by direction (e.g. "UP") in the following cells.
binarized_expressions = method2.GM_binarization(
    exprs,
    get_min_snr,
    min_n_samples,
    # Honour the notebook-level toggles instead of hard-coding True, so that
    # switching `verbose` / `plot_all` off in the settings cell takes effect here.
    verbose=verbose,
    plot=plot_all,
    plot_SNR_thr=3.0,
    show_fits=[],
    seed=seed,
)

# 2) Sample module memberships

In [None]:
# Select which binarized matrix to analyse; the GM_binarization result is
# indexed by this direction label.
direction = "UP"
# direction = "DOWN"  # alternative: uncomment to analyse the other direction
exprs_bin = binarized_expressions[direction]
# Show the transposed binarized matrix as the cell output.
exprs_bin.T

In [None]:
print("Direction:", direction)

# Initialise the sampler state from the binarized matrix.  The six returned
# objects are passed to method2.sampling in the next cell.
(
    moduleSizes,
    gene2Samples,
    nOnesPerSampleInModules,
    gene2Module,
    moduleOneFreqs,
    LP
) = method2.set_initial_conditions(
    exprs_bin,
    alpha,
    beta_K,
    verbose=verbose,  # follow the notebook-level toggle rather than a hard-coded True
)

K = len(moduleSizes)
# N is the second dimension of gene2Samples; it is used to compute p0 below.
N = gene2Samples.shape[1]
# Report the memory footprint of LP in MiB.
print("\t\tLP memory, {:.2f} M".format(LP.nbytes/(1024*1024)), file=sys.stdout)

In [None]:
# Precompute log-scale constants once; they are passed unchanged to the sampler.
p0 = N*np.log(0.5) + np.log(beta_K)
match_score    = np.log((0.5*alpha + 1) / alpha)
mismatch_score = np.log((0.5*alpha + 0) / alpha)
bK_1 = math.log(1 + beta_K)

# Run the sampler and time it.  It returns the history of gene-to-module
# assignments, a step count (used below to slice the tail of the history)
# and the per-step numbers plotted by plot_convergence.
t0 = time()
gene2Module_history, sampling_steps, n_skipping_genes = method2.sampling(
    LP,
    gene2Module,
    gene2Samples,
    nOnesPerSampleInModules,
    moduleSizes,
    moduleOneFreqs,
    p0,
    match_score,
    mismatch_score,
    bK_1,
    alpha,
    beta_K,
    max_n_steps = max_n_steps,
    n_steps_averaged = n_steps_averaged,
    n_points_fit = n_points_fit,
    tol = 0.1,
    n_steps_for_convergence = n_steps_for_convergence,
    verbose = verbose  # follow the notebook-level toggle rather than a hard-coded True
)
print("time:\tSampling (%s steps) fininshed in %s s." %(len(gene2Module_history),round(time()-t0,2)), file = sys.stdout)


In [None]:
# Plot the sampler's convergence trace.  The second argument is the number of
# history entries that precede the final `sampling_steps` entries.
method2.plot_convergence(
    n_skipping_genes[0:],
    len(gene2Module_history) - sampling_steps,
    alpha=alpha,
    beta_K=beta_K,
    n_steps_averaged=n_steps_averaged,
    n_points_fit=n_points_fit,
    n_steps_for_convergence=n_steps_for_convergence,
    save_plot=False,
)


In [None]:
# Build consensus modules from the last `sampling_steps` entries of the history.
# NOTE(review): if sampling_steps were ever 0, the slice [-0:] would select the
# WHOLE history rather than nothing - confirm method2.sampling never returns 0.
consensus = method2.get_consensus_modules(
    gene2Module_history[-sampling_steps:],
    verbose=verbose,  # follow the notebook-level toggle rather than a hard-coded True
)

# 3) Define biclusters

In [None]:
# Restrict the expression matrix to the genes present in the binarized matrix
# (its columns) and keep the label arrays needed to map integer positions back
# to gene / sample names.
genes = exprs_bin.columns.values
exprs_subset = exprs.loc[genes, :]
ints2g_names = exprs_subset.index.values
ints2s_names = exprs_subset.columns.values

# Plain ndarray view of the same data, plus per-gene sums and sums of squares.
exprs_np = exprs_subset.values
exprs_sums = exprs_np.sum(axis=1)
exprs_sq_sums = np.square(exprs_np).sum(axis=1)

# Number of columns (samples) of the full expression matrix.
N = exprs.shape[1]
exprs_data = (N, exprs_sums, exprs_sq_sums)

In [None]:
# Convert the consensus gene sets into biclusters.  Each returned item is used
# below as a mapping with 'genes' and 'samples' entries.
filtered_bics = method2.genesets2biclusters(
    consensus,
    exprs_np,
    exprs_data,
    ints2g_names,
    ints2s_names,
    min_SNR=0,
    min_n_samples=min_n_samples,
    min_n_genes=2,
    verbose=verbose,
)

## Found clusters

In [None]:
# Qualitative palette used to tell the biclusters apart.
colors = plt.get_cmap('tab20b')

# Start from the same gradient view as the input display, then paint each
# bicluster's (genes x samples) cells with its own solid colour.
style = exprs.style.background_gradient(axis=None, low=0, high=2).format('{:.3f}')
for i, bic in enumerate(filtered_bics):
    # Cycle through the palette: an integer index >= colors.N is mapped to the
    # colormap's "over" colour, so every further bicluster would look the same.
    bic_color = matplotlib.colors.to_hex(colors(i % colors.N))
    # NOTE(review): bic['genes'] / bic['samples'] are presumably sets - confirm
    # in method2.  Lists are passed because recent pandas rejects set indexers;
    # list() is harmless if they are already list-like.
    style = style.set_properties(
        **{'background-color': bic_color},
        subset=(list(bic['genes']), list(bic['samples'])),
    )
style