# Compute diffusion scores in Python

In [1]:
import networkx as nx
import numpy as np
import logging
import scipy as sp
from math import pi, sqrt
import sys
import os 

# Lower the root logger threshold to DEBUG and emit one probe record so the
# notebook shows that log output is being displayed.
logging.getLogger().setLevel(logging.DEBUG)
logger = logging.getLogger()
logging.debug("test")

# '__file__' is intentionally a string literal here: it is resolved like any
# relative path, so its parent directory is the current working directory.
notebook_anchor = os.path.realpath('__file__')
dir_path = os.path.dirname(notebook_anchor)

DEBUG:root:test


#### Import kernel functions from diffuPy

The kernel functions are imported from the package. Nevertheless, their implementations are also given in this notebook (marked by a trailing `_imp` in the function name).

In [2]:
from diffupy.kernel import commute_time_kernel, p_step_kernel, inverse_cosine_kernel, diffusion_kernel, regularised_laplacian_kernel

from diffupy.matrix import Matrix, LaplacianMatrix

from diffupy.miscellaneous import get_label_list_graph

from diffupy.diffuse_raw import diffuse_raw

### Import example graph

In [3]:
# Load the example network; label='id' makes read_gml name the nodes after
# their GML 'id' attribute.
example_graph_path = dir_path+'/04_unit_testing/_graph.gml'
G = nx.read_gml(example_graph_path, label='id')

### General functions

#### Labels mapping

In [4]:
def csv_labeled_matrix_to_matrix(path):
    """Load a labelled CSV file into a ``Matrix``.

    The first row and the first column of the file are treated as labels;
    every remaining cell is converted to ``float``.

    :param path: location of the comma-separated file.
    :return: ``Matrix`` built, in positional order, from the numeric body,
        the result of splitting the file's base name on ``'.csv'``, the
        first-column labels and the first-row labels.
    """
    table = np.genfromtxt(path, dtype=None, delimiter=',')

    # Drop the header row and the label column, converting the rest to floats.
    numeric_body = np.array(
        [[float(cell) for cell in record[1:]] for record in table[1:]]
    )
    name_parts = os.path.basename(path).split('.csv')

    return Matrix(numeric_body, name_parts, table[1:, 0], table[0, 1:])

In [5]:
def run_score_test(score_func, G, input_scores, test_output_scores, z = False):
    """Run a scoring function and assert that it reproduces reference scores.

    :param score_func: callable invoked as ``score_func(G, input_scores, z)``.
    :param G: graph handed to ``score_func``.
    :param input_scores: input scores handed to ``score_func``.
    :param test_output_scores: expected result (``Matrix`` or array-like).
    :param z: flag forwarded as the third positional argument.
    :raises AssertionError: if the computed and expected values are not close
        according to ``np.allclose``.
    """
    def _unwrap(candidate):
        # Matrix objects are compared through their underlying ``mat``.
        return candidate.mat if isinstance(candidate, Matrix) else candidate

    computed = _unwrap(score_func(G, input_scores, z))
    expected = _unwrap(test_output_scores)

    for title, values in (('Computed matrix', computed), ('Test matrix', expected)):
        logging.info(' %s  \n %s\n', title, values)

    # Tolerance-based comparison, because the scores are floating point.
    assert np.allclose(computed, expected)
    logging.info(' Test '+ score_func.__name__ +' passed')

#### Helpers

In [6]:
# In which format is the input? Tell apart matrix or list of matrices
def which_format(x):
    """Classify the container format of the input scores.

    :param x: candidate scores object.
    :return: ``"matrix"`` for a NumPy array with at least two dimensions or a
        non-empty list whose first element is itself a list / NumPy array
        (a row-wise nested matrix); ``"list"`` for any other list, including
        the empty list.
    :raises ValueError: for every other input (e.g. scalars or 1-D arrays).
    """
    # The R original also distinguishes "vector":
    # if is.numeric(x) and is.null(dim(x)): return "vector"
    if isinstance(x, np.ndarray):
        if x.ndim >= 2:
            return "matrix"
    elif isinstance(x, list):
        # Only look at the first element when there is one, so an empty list
        # is classified instead of raising IndexError.
        if x and isinstance(x[0], (list, np.ndarray)):
            return "matrix"
        return "list"

    raise ValueError(
        'Non-recognised input scores format, object of class:  %s' % x.__class__.__name__
    )

#### Checkers

In [7]:
# TODO
#.check_method
#.check_metric

# Check scores sanity
def check_scores(scores):
    """Validate a scores object before diffusion.

    ``scores`` must expose ``mat`` (the score values), ``rows_labels`` and
    ``cols_labels``.

    :param scores: scores object to validate.
    :raises ValueError: if there are no column labels, the values are not
        numeric or contain NaN, a row / column label is missing, or a column
        has a standard deviation that is NaN or 0.
    """
    def _is_missing(label):
        # None, the 'Nan' sentinel and float NaN all count as a missing label.
        if label is None:
            return True
        if isinstance(label, str):
            return label == 'Nan'
        return isinstance(label, float) and label != label

    cols_labels = scores.cols_labels
    # len() works for lists and NumPy arrays alike; `== []` does not.
    if cols_labels is None or len(cols_labels) == 0:
        raise ValueError("Scores must be a named list but supplied list contains no names.")

    mat = np.asarray(scores.mat)
    # Covers Python and NumPy integers/floats; strings, objects and booleans fail.
    if not np.issubdtype(mat.dtype, np.number):
        raise ValueError("The scores in background are not numeric.")

    if np.isnan(mat).any():
        raise ValueError("Scores input cannot contain NA, but the background does.")

    rows_labels = scores.rows_labels
    if rows_labels is None or any(_is_missing(label) for label in rows_labels):
        raise ValueError("The scores in background must have rownames according to the scored nodes.")
    if any(_is_missing(label) for label in cols_labels):
        raise ValueError("The scores in background must have colnames to differentiate score sets.")

    # One standard deviation per score column.
    column_sds = np.atleast_1d(np.std(mat, axis=0))
    for sd, col_label in zip(column_sds, cols_labels):
        if np.isnan(sd):
            raise ValueError("Standard deviation in background is NA in column:" + str(col_label))
        if sd == 0:
            raise ValueError("Standard deviation in background is 0 in column:" + str(col_label))

In [8]:
# Check graph sanity
def check_graph(graph):
    """Validate a graph before it is used for diffusion.

    :param graph: object expected to be a NetworkX graph whose nodes carry a
        ``'name'`` attribute (read through ``get_label_list_graph``).
    :raises ValueError: if the graph is missing or not a NetworkX graph, node
        names are missing / NA / duplicated, or some (but not all) edges lack
        a usable ``'weight'``.
    :raises Warning: if the graph is directed or has a negative edge weight.
    """
    def _is_na(value):
        # None and float NaN are treated as NA.
        return value is None or (isinstance(value, float) and value != value)

    def _is_sentinel(value):
        return value is None or (isinstance(value, str) and value in ('NA', 'Nan'))

    if _is_sentinel(graph):
        raise ValueError("'graph' missing")

    if not isinstance(graph, nx.Graph):
        raise ValueError("'graph' must be an NetworkX graph object")

    nodes_names = get_label_list_graph(graph, 'name')
    if _is_sentinel(nodes_names):
        raise ValueError("'graph' must have node names.")

    nodes_names = list(nodes_names)
    # Test each name: any() only returns a bool, so `any(...) is None` never fired.
    if any(_is_na(name) for name in nodes_names):
        raise ValueError("'graph' cannot have NA as node names")

    if len(set(nodes_names)) != len(nodes_names):
        raise ValueError("'graph' has non-unique names! Please check that the names are unique.")

    # NOTE(review): the two `raise Warning` statements abort the check although
    # their messages read as advisory; kept as exceptions so callers that
    # catch Warning keep working -- confirm whether warnings.warn was intended.
    if nx.is_directed(graph):
        raise Warning("graph' should be an undirected NetworkX graph object.")

    edge_weights = nx.get_edge_attributes(graph, 'weight')
    if edge_weights:
        weights = list(edge_weights.values())
        # get_edge_attributes only lists edges that carry the attribute, so a
        # shorter list means that some edges have no weight at all.
        if len(weights) != graph.number_of_edges() or any(_is_na(w) for w in weights):
            raise ValueError("'graph' cannot contain NA edge weights, all must have weights.")
        if any(w < 0 for w in weights):
            raise Warning("'graph' should not contain negative edge weights.")


In [9]:
# Check kernel sanity
def check_K(K):
    """Validate a kernel before it is used for diffusion.

    :param K: kernel to validate; must be a ``Matrix`` exposing ``mat``,
        ``rows_labels`` and ``cols_labels``.
    :raises ValueError: if ``K`` is not a ``Matrix``, is not numeric, is not
        square, lacks row / column labels, has differing row and column
        labels, contains NaN values or NA labels, or has duplicated labels.
    """
    def _is_empty(labels):
        # len() works for lists and NumPy arrays alike; `== []` does not.
        return labels is None or len(labels) == 0

    def _is_na_label(label):
        return label is None or (isinstance(label, str) and label == 'Nan')

    if not isinstance(K, Matrix):
        raise ValueError("'K' must be a matrix")

    mat = np.asarray(K.mat)
    # Covers Python and NumPy integers/floats; strings, objects and booleans fail.
    if not np.issubdtype(mat.dtype, np.number):
        raise ValueError("'K' must be a numeric matrix, but it is not numeric.")

    if mat.ndim != 2:
        raise ValueError("'K' must be a square matrix, but it has " + str(mat.ndim) + " dimension(s).")
    n_rows, n_cols = mat.shape
    if n_rows != n_cols:
        raise ValueError("'K' must be a square matrix, but it has "+str(n_rows)+" rows and "+str(n_cols)+" columns.")

    if _is_empty(K.rows_labels):
        raise ValueError("'K' kernel must have row names.")

    if _is_empty(K.cols_labels):
        raise ValueError("'K' kernel must have column names.")

    # Compare as plain lists so NumPy label arrays do not yield an ambiguous
    # element-wise result.
    rows_labels = list(K.rows_labels)
    cols_labels = list(K.cols_labels)
    if rows_labels != cols_labels:
        raise ValueError("'K' rownames and colnames must coincide.")

    if np.isnan(mat).any():
        raise ValueError("'K' cannot contain NA values.")

    if any(_is_na_label(label) for label in rows_labels + cols_labels):
        raise ValueError("'K' dimnames cannot be NA.")

    if len(set(rows_labels)) != len(rows_labels):
        raise ValueError("'K' cannot contain duplicated row names.")

    if len(set(cols_labels)) != len(cols_labels):
        raise ValueError("'K' cannot contain duplicated column names.")

## Diffuse scores

Diffuse scores on a network. `diffuse` takes a network in NetworkX format and an initial state, and uses them to score all the nodes in the network.

In [10]:
def diffuse(scores,
    method,
    graph = None,
    **kargs):
    """Diffuse input scores over a network with the selected method.

    The network is given either as ``graph`` or as a kernel passed through the
    keyword argument ``K``; all extra keyword arguments are forwarded to
    ``diffuse_raw``.

    Supported methods:

    * ``"raw"`` -- ``diffuse_raw`` on the scores as given.
    * ``"z"``   -- ``diffuse_raw`` with ``z = True``.
    * ``"ml"``  -- binary scores; zeros are replaced by -1 before diffusing.
    * ``"gm"``  -- binary scores; zeros are replaced by -1 and one row per
      network node missing from the scores is appended, holding the
      per-column bias ``(n_pos - n_neg) / n_tot``.

    For ``"ml"`` and ``"gm"`` the array ``scores.mat`` is modified in place.

    :param scores: scores object exposing ``mat``, ``rows_labels``,
        ``cols_labels`` and (for ``"gm"``) ``row_bind``.
    :param method: one of ``"raw"``, ``"z"``, ``"ml"``, ``"gm"``.
    :param graph: network to diffuse on, or None when a kernel ``K`` is given.
    :param kargs: forwarded to ``diffuse_raw``; may contain the kernel ``K``.
    :return: whatever ``diffuse_raw`` returns.
    :raises ValueError: if neither ``graph`` nor ``K`` is given, the method is
        not supported, or ``"ml"`` / ``"gm"`` receive non-binary scores.
    """
    # sanity checks
    check_scores(scores)

    # Check if we have a graph or a kernel
    if graph is not None:
        format_network = "graph"
    else:
        if "K" not in kargs:
            raise ValueError("Neither a graph 'graph' or a kernel 'K' were provided.")

        format_network = "kernel"

    # Diffuse raw
    if method == "raw":
        return diffuse_raw(graph = graph, scores = scores, **kargs)

    # z scores
    if method == "z":
        return diffuse_raw(graph, scores, z = True, **kargs)

    if method not in ("ml", "gm"):
        raise ValueError("Unsupported diffusion method: " + str(method))

    # Both remaining methods require binary labels.
    if not np.isin(scores.mat, (0, 1)).all():
        raise ValueError("Input scores must be binary.")

    if method == "ml":
        scores.mat[scores.mat == 0] = -1
        return diffuse_raw(graph, scores, **kargs)

    # method == "gm"
    if format_network == "graph":
        names_ordered = list(get_label_list_graph(graph, 'name'))
    else:
        names_ordered = list(kargs["K"].rows_labels)

    # Nodes of the network without a score, kept in network order so the
    # appended rows and their labels line up deterministically.
    background = set(scores.rows_labels)
    ids_nobkgd = [name for name in names_ordered if name not in background]
    n_tot = len(names_ordered)
    n_bkgd = scores.mat.shape[0]

    # Normalisation is performed per column, as it depends on the number of
    # positives and negatives; these must be counted while the matrix still
    # holds 0/1 values.
    n_pos = np.sum(scores.mat, axis=0)
    n_neg = n_bkgd - n_pos

    # biases (each column has its own)
    p = (n_pos - n_neg)/n_tot

    scores.mat[scores.mat == 0] = -1

    # Append one row per unscored node; column j is filled with p[j].
    # NOTE(review): assumes row_bind(rows, labels) expects a 2-D block of
    # shape (n_new_rows, n_cols) -- confirm against Matrix.row_bind.
    if ids_nobkgd:
        scores.row_bind(np.tile(p, (len(ids_nobkgd), 1)), ids_nobkgd)

    # TODO: sort the rows as in the original graph (names_ordered)

    return diffuse_raw(graph, scores, **kargs)

### Raw scores test

In [11]:
# Raw diffusion: diffuse_raw must reproduce the stored reference scores.
raw_input_scores = Matrix.from_csv(dir_path+'/scores_test/input_scores.csv')
raw_reference_scores = Matrix.from_csv(dir_path+'/scores_test/output_scores.csv')
run_score_test(diffuse_raw, G, raw_input_scores, raw_reference_scores)


  m = np.genfromtxt(path, dtype=None, delimiter=',')
  if scores.cols_labels == []:
INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[0.04214556 0.6160887 ]
 [0.02803012 0.52670386]
 [0.07392846 0.5939416 ]
 [0.02882746 0.56798698]
 [0.02498394 0.59266735]
 [0.02694918 0.49485222]
 [0.04164558 0.60225341]
 [0.02889637 0.56096404]
 [0.03051561 0.57069925]
 [0.02342762 0.47268313]
 [0.02099228 0.62264055]
 [0.02743536 0.63092324]
 [0.03170256 0.59075792]
 [0.02845791 0.530376  ]
 [0.02020737 0.6621209 ]
 [0.02814164 0.51668837]
 [0.02503623 0.62459717]
 [0.01942714 0.4771247 ]
 [0.02439709 0.63394709]
 [0.13784437 0.65513821]
 [0.02120247 0.64641259]
 [0.05397049 0.50302451]
 [0.03590104 0.68072472]
 [0.0431597  0.43430101]
 [0.01995905 0.63175857]
 [0.02123165 0.47331063]
 [0.01967457 0.48424492]
 [0.05469101 0.66325373]
 [0.02302591 0.58192566]
 [0.21329129 0.46346626]
 [0.04744166 0.69950698]
 [0.02109237 0.4091133

### z-scores test

In [12]:
# z-score diffusion: same input scores, compared against the stored z-score
# reference with the z flag switched on.
z_input_scores = Matrix.from_csv(dir_path+'/scores_test/input_scores.csv')
z_reference_scores = Matrix.from_csv(dir_path+'/scores_test/output_z_scores.csv')
run_score_test(diffuse_raw, G, z_input_scores, z_reference_scores, z = True)

INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[ 5.72900503e-01  9.09207602e-01]
 [-1.13346069e-01 -6.58395727e-01]
 [ 4.08043369e+00  1.08347464e+00]
 [-9.23887165e-02  2.16271649e-01]
 [-4.09220982e-01  9.15872583e-01]
 [-2.51410271e-01 -1.84498242e+00]
 [ 1.11370470e+00  1.38866286e+00]
 [-5.88803789e-02  1.76754269e-02]
 [ 3.71221934e-02  2.64722305e-01]
 [-2.57795602e-01 -1.17700250e+00]
 [-4.84654538e-01  1.15824144e+00]
 [-1.25700004e-01  1.19460436e+00]
 [ 1.33054511e-01  8.26059471e-01]
 [-7.37208835e-02 -4.86688190e-01]
 [-2.17625149e-01  7.79921088e-01]
 [-6.20089510e-02 -4.96654871e-01]
 [-2.22217600e-01  9.93816633e-01]
 [-3.76246194e-01 -1.01351644e+00]
 [-3.13109721e-01  1.42013606e+00]
 [ 5.05401771e+00  1.53221736e+00]
 [-3.80558984e-01  1.28458951e+00]
 [ 1.10604763e+00 -9.03464626e-01]
 [ 1.31770256e-01  9.26426743e-01]
 [ 6.24516771e-01 -2.05000986e+00]
 [-3.50165695e-01  8.60000438e-01]
 [-3.8