# Compute diffusion scores in Python

In [1]:
import networkx as nx
import numpy as np
import logging
import scipy as sp
from math import pi, sqrt
import sys
import os 

# Let the root logger emit everything from DEBUG upwards so that the messages
# logged by the cells below show up in the cell output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Smoke-test the configuration. The module-level logging.debug() (as opposed
# to logger.debug()) also installs a default handler on the root logger when
# none is configured yet.
logging.debug("test")

# Note the quotes: this resolves the *string literal* '__file__', not the
# __file__ variable (presumably because the latter is undefined in a notebook).
# realpath() anchors the dummy name to the current working directory and
# dirname() strips it again, so dir_path is the working directory; the data
# paths used further down are built relative to it.
dir_path = os.path.dirname(os.path.realpath('__file__'))

DEBUG:root:test


#### Import kernel functions from diffuPy

The kernel functions are imported from the package. Despite this, the function implementations are in this notebook (with a final `_imp` in the function name).

In [2]:
from diffuPy.kernels import commute_time_kernel, p_step_kernel, inverse_cosine_kernel, diffusion_kernel, regularised_laplacian_kernel

from diffuPy.matrix import Matrix, LaplacianMatrix

from diffuPy.miscellaneous import get_label_list_graph

from diffuPy.diffuse_raw import diffuse_raw

from diffuPy.validate_inputs import _validate_scores, _validate_graph, _validate_K

### Import example graph

In [3]:
# Example network handed to every scoring test in this notebook; node labels
# are taken from the GML `id` attribute.
G = nx.read_gml(
    dir_path+'/04_unit_testing/_graph.gml',
    label='id',
)

### General functions

#### Labels mapping

In [4]:
def csv_labeled_matrix_to_matrix(path):
    """Load a labelled CSV table and wrap it in a ``Matrix``.

    The first row of the file supplies the column labels and the first column
    supplies the row labels; every remaining cell is converted to ``float``.

    :param path: location of the comma-separated file to read.
    :return: ``Matrix`` built, in this positional order, from the numeric
        cells, the file's base name split on ``'.csv'``, the row labels and
        the column labels.
    """
    table = np.genfromtxt(path, dtype=None, delimiter=',')

    # Drop the header row and the label column, keeping only the numbers.
    values = np.array([[float(cell) for cell in row[1:]] for row in table[1:]])

    # NOTE(review): str.split returns a list (e.g. ['scores', '']), not the
    # bare file name -- confirm this is what Matrix expects here.
    name_parts = os.path.basename(path).split('.csv')

    row_labels = table[1:, 0]
    col_labels = table[0, 1:]
    return Matrix(values, name_parts, row_labels, col_labels)

In [5]:
def run_score_method_test(method, G, input_scores, test_output_scores):
    """Diffuse ``input_scores`` with ``method`` and compare to a reference.

    :param method: name of the scoring method, forwarded to ``diffuse``.
    :param G: graph forwarded to ``diffuse`` as its ``graph`` argument.
    :param input_scores: initial scores to diffuse.
    :param test_output_scores: expected result; a ``Matrix`` or anything
        ``np.allclose`` accepts.
    :raises AssertionError: when the computed and expected values are not
        element-wise close (``np.allclose`` default tolerances).
    """
    def _unwrap(candidate):
        # Matrix objects carry their numeric payload in ``.mat``.
        return candidate.mat if isinstance(candidate, Matrix) else candidate

    computed_output_scores = _unwrap(diffuse(input_scores, method, graph = G))
    test_output_scores = _unwrap(test_output_scores)

    for title, values in (('Computed matrix', computed_output_scores),
                          ('Test matrix', test_output_scores)):
        logging.info(' %s  \n %s\n', title, values)

    # Floating-point results: compare with a tolerance instead of exactly.
    assert np.allclose(computed_output_scores, test_output_scores)
    logging.info(' Test '+ method+' passed')

#### Helpers

In [6]:
def which_format(x):
    """Tell which format the input scores are in.

    * ``"matrix"`` -- ``x`` is indexable and its first element is itself a
      ``list`` or ``np.ndarray`` (nested rows, or a 2-D array).
    * ``"list"`` -- ``x`` is any other ``list`` (including an empty one).

    The "vector" case of the R original has not been ported yet:
    ``if is.numeric(x) and is.null(dim(x)): return "vector"``

    :param x: candidate input scores.
    :return: ``"matrix"`` or ``"list"``.
    :raises ValueError: if ``x`` matches neither format.
    """
    # Peek at the first element without blowing up on empty or
    # non-indexable inputs; those simply cannot be a "matrix".
    try:
        first = x[0]
    except (IndexError, KeyError, TypeError):
        first = None

    # isinstance needs a type (or tuple of types) as its second argument, so
    # the element check has to be made on x[0] directly.
    if isinstance(first, (list, np.ndarray)):
        return "matrix"
    if isinstance(x, list):
        return "list"

    raise ValueError('Non-recognised input scores format, object of class:  %s' % x.__class__.__name__)

## Diffuse scores

Diffuse scores on a network. `diffuse` takes a network in NetworkX format and an initial state (the input scores) and uses them to score all the nodes in the network.

In [7]:
def _require_binary_scores(scores):
    """Raise ``ValueError`` unless every entry of ``scores`` is 0 or 1."""
    for score, _i, _j in scores.__iter__(get_labels = False, get_indices = True):
        if score not in [0, 1]:
            raise ValueError("Input scores must be binary.")


def _recode_zeros_as_negatives(scores):
    """Overwrite, in place, every 0 entry of ``scores.mat`` with -1."""
    for score, i, j in scores.__iter__(get_labels = False, get_indices = True):
        if score == 0:
            # NOTE(review): the pair is unpacked as (i, j) but the matrix is
            # addressed as [j, i] -- presumably the iterator yields
            # (column, row); confirm against Matrix.__iter__.
            scores.mat[j, i] = -1


def diffuse(scores, method, graph=None, **kargs):
    """Diffuse ``scores`` over a network with the chosen scoring method.

    :param scores: input scores; checked with ``_validate_scores``. For the
        "ml" and "gm" methods the object must expose ``mat``,
        ``rows_labels``, ``row_bind`` and the keyword-driven ``__iter__``
        used below, and it is MODIFIED IN PLACE (zeros become -1; "gm" also
        appends bias rows).
    :param method: one of ``"raw"``, ``"z"``, ``"ml"`` or ``"gm"``.
    :param graph: network to diffuse on; may be omitted when a kernel is
        passed as ``K`` in ``kargs``.
    :param kargs: forwarded unchanged to ``diffuse_raw`` (including ``K``).
    :return: whatever ``diffuse_raw`` returns for the prepared scores.
    :raises ValueError: if neither ``graph`` nor ``K`` is given, if "ml"/"gm"
        scores are not binary, or if ``method`` is not recognised.
    """
    # sanity checks
    _validate_scores(scores)

    # Check if we have a graph or a kernel. Compare against None instead of
    # relying on truthiness: an empty networkx graph has length 0 and would
    # otherwise be mistaken for "no graph supplied".
    has_graph = graph is not None
    if not has_graph and "K" not in kargs:
        raise ValueError("Neither a graph 'graph' or a kernel 'K' has been provided.")

    # Diffuse raw
    if method == "raw":
        return diffuse_raw(graph = graph, scores = scores, **kargs)

    # z scores
    if method == "z":
        return diffuse_raw(graph, scores, z = True, **kargs)

    # ml: binary labels, with the zeros recoded as -1
    if method == "ml":
        # Validate everything first so invalid input never leaves the caller
        # with a half-recoded matrix.
        _require_binary_scores(scores)
        _recode_zeros_as_negatives(scores)
        return diffuse_raw(graph, scores, **kargs)

    # gm: like ml, plus a bias term for the nodes missing from the input
    if method == "gm":
        _require_binary_scores(scores)

        # Have to match row names with the background, taken from the graph
        # when there is one and from the supplied kernel otherwise.
        if has_graph:
            names_ordered = get_label_list_graph(graph, 'name')
        else:
            names_ordered = kargs["K"].rows_labels

        ids_nobkgd = set(names_ordered) - set(scores.rows_labels)
        n_tot = len(names_ordered)
        n_bkgd = scores.mat.shape[0]

        # normalisation has to be performed for each column, as it depends
        # on the number of positives and negatives; n_pos and n_neg hold
        # one count per column (computed while the matrix is still 0/1)
        n_pos = np.sum(scores.mat, axis=0)
        n_neg = n_bkgd - n_pos

        # biases
        p = (n_pos - n_neg)/n_tot

        _recode_zeros_as_negatives(scores)

        # add biases (each column has its bias): one row per node that is
        # absent from the input, every row holding the per-column biases
        scores.row_bind(np.transpose(np.array([np.repeat(bias, n_tot - n_bkgd) for bias in p])), ids_nobkgd)

        return diffuse_raw(graph, scores, **kargs)

    # Fail loudly instead of silently returning None for a typo in `method`.
    raise ValueError("Unknown diffusion method {!r}; expected one of 'raw', 'z', 'ml', 'gm'.".format(method))

### Raw scores test

In [8]:
# Check the 'raw' method against its stored reference output.
run_score_method_test(
    'raw',
    G,
    Matrix.from_csv(dir_path+'/scores_test/input_methods_scores.csv'),
    Matrix.from_csv(dir_path+'/scores_test/output_raw_scores.csv'),
)


  m = np.genfromtxt(path, dtype=None, delimiter=',')
INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[0.04214556 0.55614125]
 [0.02803012 0.55360892]
 [0.07392846 0.53154586]
 [0.02882746 0.50678179]
 [0.02498394 0.50949228]
 [0.02694918 0.442188  ]
 [0.04164558 0.52415819]
 [0.02889637 0.45216136]
 [0.03051561 0.53949119]
 [0.02342762 0.55466937]
 [0.02099228 0.39124988]
 [0.02743536 0.50483844]
 [0.03170256 0.45090718]
 [0.02845791 0.51940618]
 [0.02020737 0.35096041]
 [0.02814164 0.62753275]
 [0.02503623 0.46702074]
 [0.01942714 0.53144226]
 [0.02439709 0.52196247]
 [0.13784437 0.48001579]
 [0.02120247 0.39101081]
 [0.05397049 0.54440321]
 [0.03590104 0.65232824]
 [0.0431597  0.53525182]
 [0.01995905 0.36530151]
 [0.02123165 0.40021575]
 [0.01967457 0.37046459]
 [0.05469101 0.36665175]
 [0.02302591 0.39597344]
 [0.21329129 0.4339912 ]
 [0.04744166 0.58203765]
 [0.02109237 0.40453122]
 [0.01860675 0.35493876]
 [0

### z-scores test

In [9]:
# Check the 'z' method against its stored reference output.
run_score_method_test(
    'z',
    G,
    Matrix.from_csv(dir_path+'/scores_test/input_methods_scores.csv'),
    Matrix.from_csv(dir_path+'/scores_test/output_z_scores.csv'),
)


INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[ 5.72900503e-01  1.22632540e+00]
 [-1.13346069e-01  1.44617769e+00]
 [ 4.08043369e+00  1.63485142e+00]
 [-9.23887165e-02  7.20534307e-01]
 [-4.09220982e-01  8.21537631e-01]
 [-2.51410271e-01 -1.06394775e+00]
 [ 1.11370470e+00  1.44193077e+00]
 [-5.88803789e-02 -5.07132028e-01]
 [ 3.71221934e-02  1.46247368e+00]
 [-2.57795602e-01  1.00004557e+00]
 [-4.84654538e-01 -1.63046134e+00]
 [-1.25700004e-01  4.15678908e-01]
 [ 1.33054511e-01 -7.76315525e-01]
 [-7.37208835e-02  6.43234902e-01]
 [-2.17625149e-01 -9.79167636e-01]
 [-6.20089510e-02  1.68087973e+00]
 [-2.22217600e-01 -1.98399641e-01]
 [-3.76246194e-01  6.25062950e-01]
 [-3.13109721e-01  8.00696493e-01]
 [ 5.05401771e+00  2.52614672e-04]
 [-3.80558984e-01 -1.31438532e+00]
 [ 1.10604763e+00  1.01467900e+00]
 [ 1.31770256e-01  1.31392146e+00]
 [ 6.24516771e-01  8.95300441e-01]
 [-3.50165695e-01 -1.36577970e+00]
 [-3.8

###  ml scores test

In [10]:
# Check the 'ml' method against its stored reference output.
run_score_method_test(
    'ml',
    G,
    Matrix.from_csv(dir_path+'/scores_test/input_methods_scores.csv'),
    Matrix.from_csv(dir_path+'/scores_test/output_ml_scores.csv'),
)


INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[-0.91570887  0.1122825 ]
 [-0.94393976  0.10721784]
 [-0.85214307  0.06309172]
 [-0.94234508  0.01356357]
 [-0.95003212  0.01898455]
 [-0.94610164 -0.115624  ]
 [-0.91670884  0.04831638]
 [-0.94220725 -0.09567727]
 [-0.93896878  0.07898238]
 [-0.95314475  0.10933875]
 [-0.95801544 -0.21750023]
 [-0.94512927  0.00967689]
 [-0.93659489 -0.09818564]
 [-0.94308418  0.03881236]
 [-0.95958525 -0.29807918]
 [-0.94371672  0.2550655 ]
 [-0.94992754 -0.06595853]
 [-0.96114573  0.06288452]
 [-0.95120582  0.04392493]
 [-0.72431125 -0.03996843]
 [-0.95759506 -0.21797839]
 [-0.89205902  0.08880642]
 [-0.92819792  0.30465648]
 [-0.9136806   0.07050363]
 [-0.9600819  -0.26939699]
 [-0.9575367  -0.1995685 ]
 [-0.96065085 -0.25907081]
 [-0.89061798 -0.2666965 ]
 [-0.95394818 -0.20805311]
 [-0.57341743 -0.1320176 ]
 [-0.90511667  0.1640753 ]
 [-0.95781525 -0.19093756]
 [-0.96278649 -0.

### gm scores test: input unlabeled p-value score prediction

In [11]:
# Check the 'gm' method (unlabelled input) against its stored reference output.
run_score_method_test(
    'gm',
    G,
    Matrix.from_csv(dir_path+'/scores_test/input_unlabeled_scores.csv'),
    Matrix.from_csv(dir_path+'/scores_test/output_gm_scores.csv'),
)


INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[-0.90412601  0.11094189]
 [-0.92904171  0.09957647]
 [-0.83471148 -0.04802936]
 [-0.9300716   0.05318328]
 [-0.93875969 -0.00142241]
 [-0.93035903 -0.06685027]
 [-0.89497396 -0.0149064 ]
 [-0.93034528  0.12256431]
 [-0.92487485  0.01406011]
 [-0.94118688 -0.09587087]
 [-0.94380795  0.05403366]
 [-0.92893482  0.10066829]
 [-0.92397226  0.07601672]
 [-0.9244356   0.04133007]
 [-0.949866   -0.23642709]
 [-0.93252943 -0.05072414]
 [-0.93382456  0.16519214]
 [-0.9449251   0.08792037]
 [-0.9350489   0.12477806]
 [-0.71364229 -0.14136472]
 [-0.94612568  0.01807568]
 [-0.87677648  0.14179206]
 [-0.91468179 -0.24083983]
 [-0.90353575 -0.17467165]
 [-0.95102634  0.16765241]
 [-0.94773179 -0.04075892]
 [-0.94988838 -0.0791288 ]
 [-0.88019445  0.23339427]
 [-0.93560112  0.02837897]
 [-0.56246549  0.15736274]
 [-0.89489717 -0.24760536]
 [-0.93962298 -0.11026327]
 [-0.95281155 -0.