# Compute diffusion scores in Python

In [1]:
import networkx as nx
import numpy as np
import logging
import scipy as sp
from math import pi, sqrt
import sys
import os

# Lower the root logger threshold to DEBUG so that the INFO messages logged
# by the test helpers further down are displayed in the notebook.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Module-level logging.debug() installs a default handler on the root logger
# when it has none; this call also checks that logging output is visible.
logging.debug("test")

# NOTE: '__file__' is a quoted string literal here, not the module attribute,
# so it is resolved against the current working directory; dir_path is the
# directory that would contain a file with that literal name.
dir_path = os.path.dirname(os.path.realpath('__file__'))

DEBUG:root:test


#### Import kernel functions from diffuPy

The kernel functions are imported from the package. Despite this, the function implementations are in this notebook (with a final `_imp` in the function name).

In [2]:
from diffuPy.kernels import commute_time_kernel, p_step_kernel, inverse_cosine_kernel, diffusion_kernel, regularised_laplacian_kernel

from diffuPy.matrix import Matrix, LaplacianMatrix

from diffuPy.miscellaneous import get_label_list_graph

from diffuPy.diffuse_raw import diffuse_raw

from diffuPy.validate_inputs import _validate_scores, _validate_graph, _validate_K

### Import example graph

In [3]:
# Load the example network from GML, labelling the nodes by their `id` attribute.
G = nx.read_gml(
    dir_path + '/04_unit_testing/_graph.gml',
    label='id',
)

### General functions

#### Labels mapping

In [4]:
def csv_labeled_matrix_to_matrix(path):
    """Read a labelled CSV file and wrap its numeric body in a ``Matrix``.

    The first row and the first column of the file are treated as headers:
    they are stripped from the numeric body and handed to ``Matrix`` as the
    third and fourth positional arguments (presumably row and column labels
    -- confirm against the ``Matrix`` constructor).

    :param path: path of the CSV file to load.
    :return: ``Matrix`` built from the float-converted body, the result of
        splitting the file's base name on ``'.csv'`` (a list, passed
        unchanged), the first column without its header cell and the first
        row without its leading cell.
    """
    raw = np.genfromtxt(path, dtype=None, delimiter=',')

    # Everything below the header row, with the leading label cell of each
    # record dropped, converted cell by cell to float.
    values = np.array(
        [[float(cell) for cell in record[1:]] for record in raw[1:]]
    )
    name_parts = os.path.basename(path).split('.csv')
    first_column = raw[1:, 0]
    first_row = raw[0, 1:]

    return Matrix(values, name_parts, first_column, first_row)

In [5]:
def run_score_method_test(method, G, input_scores, test_output_scores):
    """Diffuse ``input_scores`` over ``G`` and compare with a reference result.

    :param method: name of the scoring method, forwarded to ``diffuse``.
    :param G: graph on which the scores are diffused.
    :param input_scores: initial scores handed to ``diffuse``.
    :param test_output_scores: expected result; a ``Matrix`` is unwrapped to
        its ``mat`` attribute before the comparison.
    :raises AssertionError: if the computed and expected values are not
        element-wise close according to ``np.allclose``.
    """
    computed = diffuse(input_scores, method, graph = G)

    # Compare the underlying arrays, not the Matrix wrappers.
    computed = computed.mat if isinstance(computed, Matrix) else computed
    expected = (
        test_output_scores.mat
        if isinstance(test_output_scores, Matrix)
        else test_output_scores
    )

    for title, values in (('Computed matrix', computed), ('Test matrix', expected)):
        logging.info(' %s  \n %s\n', title, values)

    # Equality up to floating-point tolerance.
    assert np.allclose(computed, expected)
    logging.info(' Test ' + method + ' passed')

#### Helpers

In [6]:
# In which format is the input? Tell apart a matrix from a list of matrices.
def which_format(x):
    """Classify the container used for the input scores.

    The previous implementation passed the boolean result of an inner
    ``isinstance`` call as the type argument of an outer ``isinstance`` call,
    so every call raised ``TypeError`` before any format could be returned.

    :param x: input scores; a 2-D ``numpy.ndarray``, a list of flat rows
        (lists or 1-D arrays), or a list of anything else.
    :return: ``"matrix"`` for a 2-D array or a non-empty list whose items are
        all flat rows; ``"list"`` for any other list (e.g. a list of
        matrices).
    :raises ValueError: if ``x`` is neither a list nor a 2-D array. The
        ``"vector"`` format is not recognised.
    """
    def _is_row(candidate):
        # A row is flat: a 1-D array, or a list holding no nested containers.
        if isinstance(candidate, np.ndarray):
            return candidate.ndim == 1
        return isinstance(candidate, list) and not any(
            isinstance(item, (list, np.ndarray)) for item in candidate
        )

    if isinstance(x, np.ndarray):
        if x.ndim == 2:
            return "matrix"
    elif isinstance(x, list):
        if x and all(_is_row(row) for row in x):
            return "matrix"
        return "list"

    # Interpolate the class name; passing it as a second argument to
    # ValueError would leave the '%s' placeholder unformatted.
    raise ValueError(
        'Non-recognised input scores format, object of class:  %s'
        % x.__class__.__name__
    )

## Diffuse scores

Diffuse scores on a network. Diffuse takes a network in networkx format and an initial state to score all the nodes in the network.

In [7]:
def _require_binary_scores(scores):
    """Raise ``ValueError`` unless every entry of ``scores`` is 0 or 1."""
    for score, _i, _j in scores.__iter__(get_labels = False, get_indices = True):
        if score not in [0, 1]:
            raise ValueError("Input scores must be binary.")


def _relabel_zeros_as_negatives(scores):
    """Overwrite, in place, every 0 entry of ``scores.mat`` with -1."""
    for score, i, j in scores.__iter__(get_labels = False, get_indices = True):
        if score == 0:
            scores.mat[j, i] = -1


def diffuse(scores,
    method,
    graph = None,
    **kargs):
    """Diffuse input scores over a network with the requested scoring method.

    :param scores: input scores; must pass ``_validate_scores``. It is
        iterated with ``__iter__(get_labels=False, get_indices=True)`` and
        exposes ``mat``, ``rows_labels`` and ``row_bind``.
    :param method: one of ``"raw"``, ``"z"``, ``"ml"`` or ``"gm"``.
    :param graph: network to diffuse on, or ``None`` when a kernel is given
        through the keyword argument ``K``.
    :param kargs: extra keyword arguments forwarded to ``diffuse_raw``; for
        ``"gm"`` without a graph, ``kargs['K'].rows_labels`` supplies the
        node names.
    :return: whatever ``diffuse_raw`` returns for the prepared scores.
    :raises ValueError: if neither ``graph`` nor ``K`` is supplied, if
        ``"ml"``/``"gm"`` receive non-binary scores, or if ``method`` is not
        one of the supported names.

    Note: ``"ml"`` and ``"gm"`` modify ``scores.mat`` in place (zeros become
    -1); ``"gm"`` additionally calls ``scores.row_bind`` and discards its
    return value, so it presumably extends ``scores`` in place -- confirm
    against ``Matrix.row_bind``.
    """
    # sanity checks
    _validate_scores(scores)

    # Check if we have a graph or a kernel. Compare against None explicitly:
    # a container-like graph with no nodes is falsy and would otherwise be
    # mistaken for a missing graph.
    if graph is not None:
        format_network = "graph"
    elif "K" in kargs:
        format_network = "kernel"
    else:
        raise ValueError("Neither a graph 'graph' or a kernel 'K' has been provided.")

    # Diffuse raw
    if method == "raw":
        return diffuse_raw(graph = graph, scores = scores, **kargs)

    # z scores
    if method == "z":
        return diffuse_raw(graph = graph, scores = scores, z = True, **kargs)

    if method == "ml":
        # Validate everything first so that invalid input is rejected before
        # any entry has been rewritten.
        _require_binary_scores(scores)
        _relabel_zeros_as_negatives(scores)

        return diffuse_raw(graph = graph, scores = scores, **kargs)

    if method == "gm":
        _require_binary_scores(scores)

        # Node names of the whole network, from the graph or from the kernel.
        if format_network == "graph":
            names_ordered = get_label_list_graph(graph, 'name')
        else:
            names_ordered = kargs['K'].rows_labels

        # Nodes of the network that carry no input score.
        ids_nobkgd = set(names_ordered) - set(scores.rows_labels)
        n_tot = len(names_ordered)

        n_bkgd = scores.mat.shape[0]

        # Normalisation has to be performed for each column, as it depends
        # on the number of positives and negatives. n_pos and n_neg count
        # them per column; this must happen while the scores are still 0/1.
        n_pos = np.sum(scores.mat, axis=0)
        n_neg = n_bkgd - n_pos

        # biases
        p = (n_pos - n_neg)/n_tot

        _relabel_zeros_as_negatives(scores)

        # add biases (each column has its bias): one row per unscored node,
        # each holding the bias of the corresponding column.
        bias_rows = np.transpose(
            np.array([np.repeat(bias, n_tot - n_bkgd) for bias in p])
        )
        scores.row_bind(bias_rows, ids_nobkgd)

        return diffuse_raw(graph = graph, scores = scores, **kargs)

    # Previously an unsupported method silently returned None.
    raise ValueError("Unknown diffusion method: %s" % (method,))

### Raw scores test

In [8]:
# Diffuse the example input with the 'raw' method and compare it with the stored reference.
run_score_method_test(
    'raw',
    G,
    Matrix.from_csv(dir_path + '/scores_test/input_methods_scores.csv'),
    Matrix.from_csv(dir_path + '/scores_test/output_raw_scores.csv'),
)


  m = np.genfromtxt(path, dtype=None, delimiter=',')
INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[0.04214556 0.50326422]
 [0.02803012 0.61130974]
 [0.07392846 0.60340568]
 [0.02882746 0.50479119]
 [0.02498394 0.5953641 ]
 [0.02694918 0.5304442 ]
 [0.04164558 0.51905041]
 [0.02889637 0.51799777]
 [0.03051561 0.57061128]
 [0.02342762 0.48754876]
 [0.02099228 0.43087285]
 [0.02743536 0.62909988]
 [0.03170256 0.61312181]
 [0.02845791 0.49601417]
 [0.02020737 0.41095152]
 [0.02814164 0.61780424]
 [0.02503623 0.58131454]
 [0.01942714 0.63091609]
 [0.02439709 0.56344145]
 [0.13784437 0.56701818]
 [0.02120247 0.46841617]
 [0.05397049 0.56280449]
 [0.03590104 0.68344146]
 [0.0431597  0.44611294]
 [0.01995905 0.60352157]
 [0.02123165 0.57968164]
 [0.01967457 0.63579363]
 [0.05469101 0.41506672]
 [0.02302591 0.6108705 ]
 [0.21329129 0.40056129]
 [0.04744166 0.67397161]
 [0.02109237 0.45186963]
 [0.01860675 0.3841622 ]
 [0

### z-scores test

In [9]:
# Diffuse the example input with the 'z' method and compare it with the stored reference.
run_score_method_test(
    'z',
    G,
    Matrix.from_csv(dir_path + '/scores_test/input_methods_scores.csv'),
    Matrix.from_csv(dir_path + '/scores_test/output_z_scores.csv'),
)


INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[ 5.72900503e-01 -7.55909691e-01]
 [-1.13346069e-01  1.20963780e+00]
 [ 4.08043369e+00  1.70100923e+00]
 [-9.23887165e-02 -1.22144207e+00]
 [-4.09220982e-01  1.26901322e+00]
 [-2.51410271e-01 -5.52587495e-01]
 [ 1.11370470e+00 -1.01489876e+00]
 [-5.88803789e-02 -5.85447605e-01]
 [ 3.71221934e-02  5.08832701e-01]
 [-2.57795602e-01 -8.39949280e-01]
 [-4.84654538e-01 -2.19779428e+00]
 [-1.25700004e-01  1.32936466e+00]
 [ 1.33054511e-01  1.69147879e+00]
 [-7.37208835e-02 -8.84951997e-01]
 [-2.17625149e-01 -1.05958311e+00]
 [-6.20089510e-02  7.75782122e-01]
 [-2.22217600e-01  4.80697206e-01]
 [-3.76246194e-01  9.87354874e-01]
 [-3.13109721e-01  2.57565602e-01]
 [ 5.05401771e+00  2.73471068e-01]
 [-3.80558984e-01 -1.21010824e+00]
 [ 1.10604763e+00  2.02589992e-01]
 [ 1.31770256e-01  1.02173534e+00]
 [ 6.24516771e-01 -1.69051256e+00]
 [-3.50165695e-01  6.40009514e-01]
 [-3.8

###  ml scores test

In [10]:
# Diffuse the example input with the 'ml' method and compare it with the stored reference.
run_score_method_test(
    'ml',
    G,
    Matrix.from_csv(dir_path + '/scores_test/input_methods_scores.csv'),
    Matrix.from_csv(dir_path + '/scores_test/output_ml_scores.csv'),
)


INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[-0.91570887  0.00652844]
 [-0.94393976  0.22261947]
 [-0.85214307  0.20681137]
 [-0.94234508  0.00958238]
 [-0.95003212  0.1907282 ]
 [-0.94610164  0.06088839]
 [-0.91670884  0.03810082]
 [-0.94220725  0.03599554]
 [-0.93896878  0.14122256]
 [-0.95314475 -0.02490249]
 [-0.95801544 -0.13825431]
 [-0.94512927  0.25819976]
 [-0.93659489  0.22624361]
 [-0.94308418 -0.00797166]
 [-0.95958525 -0.17809697]
 [-0.94371672  0.23560849]
 [-0.94992754  0.16262909]
 [-0.96114573  0.26183217]
 [-0.95120582  0.12688291]
 [-0.72431125  0.13403637]
 [-0.95759506 -0.06316767]
 [-0.89205902  0.12560897]
 [-0.92819792  0.36688291]
 [-0.9136806  -0.10777413]
 [-0.9600819   0.20704313]
 [-0.9575367   0.15936329]
 [-0.96065085  0.27158725]
 [-0.89061798 -0.16986657]
 [-0.95394818  0.221741  ]
 [-0.57341743 -0.19887743]
 [-0.90511667  0.34794323]
 [-0.95781525 -0.09626073]
 [-0.96278649 -0.

### gm scores test: input unlabeled p-value score prediction

In [11]:
# Diffuse the unlabeled example input with the 'gm' method and compare it with the stored reference.
run_score_method_test(
    'gm',
    G,
    Matrix.from_csv(dir_path + '/scores_test/input_unlabeled_scores.csv'),
    Matrix.from_csv(dir_path + '/scores_test/output_gm_scores.csv'),
)


INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[-9.04126012e-01  2.06500769e-01]
 [-9.29041712e-01  1.90001698e-01]
 [-8.34711481e-01  1.90686950e-01]
 [-9.30071604e-01  9.14153380e-02]
 [-9.38759692e-01  1.29573249e-01]
 [-9.30359027e-01  6.14179913e-02]
 [-8.94973965e-01  1.57124624e-01]
 [-9.30345277e-01  1.50988772e-02]
 [-9.24874850e-01  1.82741005e-01]
 [-9.41186885e-01  2.37131391e-01]
 [-9.43807953e-01  1.14354195e-01]
 [-9.28934825e-01 -2.95367824e-02]
 [-9.23972255e-01  1.29250938e-01]
 [-9.24435596e-01 -4.96910204e-04]
 [-9.49865999e-01  3.01522529e-01]
 [-9.32529434e-01  1.21846360e-01]
 [-9.33824555e-01  2.07889709e-01]
 [-9.44925099e-01 -6.77257034e-02]
 [-9.35048901e-01  1.11589007e-01]
 [-7.13642294e-01  2.87172503e-01]
 [-9.46125678e-01  2.20843297e-02]
 [-8.76776481e-01  1.29940846e-01]
 [-9.14681789e-01  3.84453318e-01]
 [-9.03535754e-01  2.06309645e-01]
 [-9.51026338e-01  2.00596303e-01]
 [-9.4