# Compute diffusion scores in Python

In [1]:
import networkx as nx
import numpy as np
import logging
import scipy as sp
from math import pi, sqrt
import sys
import os 

# Set the root logger's level to DEBUG and emit one debug record as a smoke test.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

# NOTE: '__file__' is a quoted string literal here, not the module variable.
# realpath() therefore resolves it against the current working directory, and
# dir_path ends up being that directory; every data path below is built from it.
dir_path = os.path.dirname(os.path.realpath('__file__'))

DEBUG:root:test


#### Import kernel functions from diffuPy

The kernel functions are imported from the package. Despite this, the function implementations are in this notebook (with a final `_imp` in the function name).

In [2]:
from diffupy.kernel import commute_time_kernel, p_step_kernel, inverse_cosine_kernel, diffusion_kernel, regularised_laplacian_kernel

from diffupy.matrix import Matrix, LaplacianMatrix

from diffupy.miscellaneous import get_label_list_graph

### Import example graph

In [3]:
# Load the example graph from the GML file stored under dir_path.
# label='id' asks networkx to name the parsed nodes after their GML `id` attribute.
G = nx.read_gml(dir_path+'/04_unit_testing/_graph.gml', label='id')

### General functions

#### Labels mapping

In [4]:
def csv_labeled_matrix_to_matrix(path):
    """Load a labelled CSV file and wrap its numeric body in a ``Matrix``.

    The first row and the first column of the file are treated as labels and
    are excluded from the numeric data.

    :param path: location of the comma-separated file.
    :return: a ``Matrix`` built from (1) the float-converted body, (2) the
        result of splitting the file's base name on ``'.csv'``, (3) the first
        column without its header cell and (4) the first row without its
        corner cell.
    """
    # dtype=None lets numpy infer the dtype of the parsed cells.
    raw_table = np.genfromtxt(path, dtype=None, delimiter=',')

    # Everything below the header row and right of the label column, as floats.
    numeric_body = np.array(
        [[float(cell) for cell in record[1:]] for record in raw_table[1:]]
    )

    # NOTE(review): str.split returns a list (e.g. ['name', '']), not a plain
    # string - confirm that this is what Matrix expects in this position.
    name_parts = os.path.basename(path).split('.csv')
    first_column = raw_table[1:, 0]
    first_row = raw_table[0, 1:]

    return Matrix(numeric_body, name_parts, first_column, first_row)

In [5]:
def run_score_test(score_func, G, input_scores, test_output_scores, z = False):
    """Run a scoring function and assert that its result matches a reference.

    :param score_func: callable invoked as ``score_func(G, input_scores, z)``.
    :param G: graph handed through to ``score_func``.
    :param input_scores: input scores handed through to ``score_func``.
    :param test_output_scores: expected result; a ``Matrix`` is unwrapped to
        its ``mat`` attribute before comparing.
    :param z: flag handed through to ``score_func`` (defaults to ``False``).
    :raises AssertionError: if the computed and expected values are not
        element-wise close according to ``np.allclose``.
    """
    def _unwrap(candidate):
        # Matrix objects carry their values in `.mat`; anything else is used as is.
        return candidate.mat if isinstance(candidate, Matrix) else candidate

    computed_output_scores = _unwrap(score_func(G, input_scores, z))
    test_output_scores = _unwrap(test_output_scores)

    # Log both operands so that a failing comparison can be inspected.
    for title, values in (
        ('Computed matrix', computed_output_scores),
        ('Test matrix', test_output_scores),
    ):
        logging.info(' %s  \n %s\n', title, values)

    # Tolerance-based comparison, since the values are floating point.
    assert np.allclose(computed_output_scores, test_output_scores)
    logging.info(' Test '+ score_func.__name__ +' passed')

#### Helpers

In [6]:
# In which format is the input? Tell apart matrix or list
# (TODO: the "vector" format is not detected yet).
def which_format(x):
    """Classify the container format of an input scores object.

    :param x: object to classify.
    :return: ``"matrix"`` when the first element of ``x`` is itself a list or
        a numpy array (list of rows, 2-D array, ...); ``"list"`` for any other
        Python list, including the empty list.
    :raises ValueError: if ``x`` matches none of the recognised formats.
    """
    # Peek at the first element without crashing on empty or non-indexable input.
    try:
        first = x[0]
    except (IndexError, KeyError, TypeError):
        first = None

    # isinstance() needs a type (or tuple of types) as its second argument;
    # the nesting of the rows is what makes the input a matrix.
    if isinstance(first, (list, np.ndarray)):
        return "matrix"

    if isinstance(x, list):
        return "list"

    # ValueError does not interpolate '%s' placeholders by itself, so the
    # message is formatted explicitly.
    raise ValueError('Non-recognised input scores format, object of class:  %s' % x.__class__.__name__)

#### Checkers

In [7]:
# TODO
#.check_method
#.check_metric

# Check scores sanity
def check_scores(scores):
    """Validate a scores object before it is diffused.

    :param scores: object exposing ``mat`` (array of scores), ``rows_labels``
        and ``cols_labels``.
    :raises ValueError: if the column labels are missing or empty, if the
        values are not integer/float numbers, if any value is NaN, if any row
        or column label is missing, or if the standard deviation of a column
        is NaN or zero.
    """
    # TODO which_format

    def _is_missing(label):
        # None, the 'Nan' placeholder string and a real float NaN all count as missing.
        return label is None or label == 'Nan' or (isinstance(label, float) and label != label)

    def _has_missing(labels):
        return labels is None or any(_is_missing(label) for label in labels)

    cols_labels = scores.cols_labels
    # len() instead of truthiness so that array-like label containers work too.
    if cols_labels is None or len(cols_labels) == 0:
        raise ValueError("Scores must be a named list but supplied list contains no names.")

    mat = np.asarray(scores.mat)

    # Checking the dtype once covers every element and, unlike isinstance
    # against the builtin int/float, accepts numpy integer and float scalars.
    if not (np.issubdtype(mat.dtype, np.integer) or np.issubdtype(mat.dtype, np.floating)):
        raise ValueError("The scores in background are not numeric.")

    # NaN never compares equal to anything, so it has to be detected with isnan.
    if np.isnan(mat).any():
        raise ValueError("Scores input cannot contain NA, but the background does.")

    # Labels are read from the attributes directly so that each message is
    # guaranteed to describe the axis it was raised for.
    if _has_missing(scores.rows_labels):
        raise ValueError("The scores in background must have rownames according to the scored nodes.")
    if _has_missing(cols_labels):
        raise ValueError("The scores in background must have colnames to differentiate score sets.")

    # One standard deviation per column; atleast_1d keeps a 1-D input iterable.
    column_sds = np.atleast_1d(np.std(mat, axis=0))
    for sd, col_label in zip(column_sds, cols_labels):
        if np.isnan(sd):
            raise ValueError("Standard deviation in background is NA in column:" + str(col_label))
        if sd == 0:
            raise ValueError("Standard deviation in background is 0 in column:" + str(col_label))

In [8]:
# Check graph sanity
def check_graph(graph):
    """Validate the graph that scores are going to be diffused on.

    :param graph: object expected to be a NetworkX graph.
    :raises ValueError: if the graph is missing, is not a ``nx.Graph``, has
        missing or duplicated node names, or has an edge weight that is
        ``None`` or NaN.

    A directed graph and negative edge weights are reported through
    ``warnings.warn`` and do not abort the check, matching the "should"
    wording of their messages.
    """
    # Local import: the notebook's import cell does not provide `warnings`.
    import warnings

    def _is_placeholder(value):
        # None or one of the string placeholders used for a missing value.
        return value is None or (isinstance(value, str) and value in ('NA', 'Nan'))

    if _is_placeholder(graph):
        raise ValueError("'graph' missing")

    if not isinstance(graph, nx.Graph):
        raise ValueError("'graph' must be an NetworkX graph object")

    nodes_names = get_label_list_graph(graph, 'name')
    if _is_placeholder(nodes_names):
        raise ValueError("'graph' must have node names.")

    # any() returns a bool, so `any(names) is None` could never be true;
    # each name has to be tested individually.
    if any(name is None for name in nodes_names):
        raise ValueError("'graph' cannot have NA as node names")

    if len(np.unique(nodes_names)) != len(nodes_names):
        raise ValueError("'graph' has non-unique names! Please check that the names are unique.")

    if nx.is_directed(graph):
        warnings.warn("'graph' should be an undirected NetworkX graph object.")

    # get_edge_attributes returns {edge: weight}; the checks apply to the
    # weights (values), not to the edge tuples (keys).
    weights = list(nx.get_edge_attributes(graph, 'weight').values())
    if weights:
        if any(weight is None or weight != weight for weight in weights):
            raise ValueError("'graph' cannot contain NA edge weights, all must have weights.")
        if any(weight < 0 for weight in weights):
            warnings.warn("'graph' should not contain negative edge weights.")


In [9]:
# Check kernel sanity
def check_K(K):
    """Validate a kernel matrix before it is used for diffusion.

    :param K: kernel; must be a ``Matrix`` exposing ``mat``, ``rows_labels``
        and ``cols_labels``.
    :raises ValueError: if ``K`` is not a ``Matrix``, is not numeric, is not a
        square 2-D matrix, lacks row or column names, has row names that
        differ from its column names, contains NaN values, or has missing or
        duplicated names.
    """
    def _is_missing(label):
        # None, the 'Nan' placeholder string and a real float NaN all count as missing.
        return label is None or label == 'Nan' or (isinstance(label, float) and label != label)

    if not isinstance(K, Matrix):
        raise ValueError("'K' must be a matrix")

    mat = np.asarray(K.mat)

    # Check numeric type once through the dtype; this also accepts numpy
    # integer and float scalars, which are not instances of the builtin int.
    if not (np.issubdtype(mat.dtype, np.integer) or np.issubdtype(mat.dtype, np.floating)):
        raise ValueError("'K' must be a numeric matrix, but it is not numeric.")

    if mat.ndim != 2:
        raise ValueError("'K' must be a matrix")

    n_rows, n_cols = mat.shape
    if n_rows != n_cols:
        raise ValueError("'K' must be a square matrix, but it has "+str(n_rows)+" rows and "+str(n_cols)+" columns.")

    # Plain lists make the emptiness and equality tests well defined even
    # when the labels are stored in numpy arrays.
    rows_labels = [] if K.rows_labels is None else list(K.rows_labels)
    cols_labels = [] if K.cols_labels is None else list(K.cols_labels)

    if not rows_labels:
        raise ValueError("'K' kernel must have row names.")

    if not cols_labels:
        raise ValueError("'K' kernel must have column names.")

    if rows_labels != cols_labels:
        raise ValueError("'K' rownames and colnames must coincide.")

    # NaN never compares equal to anything, so it has to be detected with isnan.
    if np.isnan(mat).any():
        raise ValueError("'K' cannot contain NA values.")

    if any(_is_missing(label) for label in rows_labels + cols_labels):
        raise ValueError("'K' dimnames cannot be NA.")

    if len(np.unique(rows_labels)) != len(rows_labels):
        raise ValueError("'K' cannot contain duplicated row names.")

    if len(np.unique(cols_labels)) != len(cols_labels):
        raise ValueError("'K' cannot contain duplicated column names.")

## Diffuse scores

Diffuse scores on a network. The function diffuse takes a network in NetworkX format and an initial state to score all the nodes in the network.

In [10]:
def calculate_scores(col_ind, scores, diff, const_mean, const_var):
    """Standardise one column of raw diffusion scores.

    :param col_ind: index of the column to process in ``scores`` and ``diff``.
    :param scores: 2-D array holding the input scores.
    :param diff: 2-D array holding the raw diffused scores.
    :param const_mean: factor multiplied by the column sum to obtain the means.
    :param const_var: factor multiplied by ``n*sum(x**2) - sum(x)**2`` to
        obtain the variances, ``n`` being ``len(scores)``.
    :return: ``(raw column - means) / sqrt(variances)``.
    """
    input_column = scores[:, col_ind]
    raw_column = diff[:, col_ind]
    n_rows = len(scores)

    # The means and variances only depend on the first and second moments of
    # the input column, so this should hold for non-binary inputs as well.
    first_moment = np.sum(input_column)
    second_moment = np.sum(input_column**2)

    expected = const_mean * first_moment
    variance = const_var * (n_rows * second_moment - first_moment**2)

    return (raw_column - expected) / np.sqrt(variance)

def diffuse_raw(graph, scores, z = False, K = None, *argv):
    """Diffuse input scores through a kernel, optionally returning z-scores.

    :param graph: graph used to compute the kernel; only checked and used when
        ``K`` is ``None``.
    :param scores: scores object; it is validated with ``check_scores`` and
        must provide ``match_rows``, ``mat``, ``rows_labels`` and
        ``cols_labels``.
    :param z: when falsy the raw product is returned, otherwise z-scores.
    :param K: optional precomputed kernel, validated with ``check_K``. When it
        is ``None`` the regularised Laplacian kernel of ``graph`` is computed.
    :param argv: extra positional arguments; accepted but not used here.
    :return: the array produced by ``np.matmul`` when ``z`` is falsy,
        otherwise a ``Matrix`` labelled with the rows and columns of the
        matched scores.
        NOTE(review): the two branches return different types - confirm that
        callers expect this.
    """
    # Sanity checks on the input scores
    check_scores(scores)

    # Kernel matrix: validate the supplied one, or derive it from the graph
    if K is not None:
        check_K(K)
        logging.info('Using supplied kernel matrix...')
    else:
        check_graph(graph)
        logging.info('Kernel not supplied. Computing regularised Laplacian kernel ...')
        K = regularised_laplacian_kernel(graph, normalized = False)
        logging.info('Done')

    # Match the rows of the scores against the kernel
    scores = scores.match_rows(K)

    # TODO: Sparse
    # scores.mat <- methods::as(scores[[scores.name]], "sparseMatrix")

    n_scored = len(scores.mat)
    # Only the first n_scored kernel columns take part in the computation.
    kernel_block = K.mat[:, :n_scored]

    # Raw scores
    raw_scores = np.matmul(kernel_block, scores.mat)

    # The raw product is the final answer unless z-scores were requested.
    if not z:
        return raw_scores

    # z-scores need the row sums of the kernel block and of its element-wise square.
    # NOTE(review): the plain row sums are rounded to two decimals while the
    # squared row sums are not - confirm that this rounding is intentional.
    row_sums = np.array([round(np.sum(kernel_row), 2) for kernel_row in kernel_block])
    row_sums_2 = np.array([np.sum(squared_row) for squared_row in kernel_block**2])

    # Terms that are constant over the score columns
    const_mean = row_sums / n_scored
    const_var = (n_scored*row_sums_2 - row_sums**2) / ((n_scored - 1)*(n_scored**2))

    # One standardised column per column of the raw scores; transposing puts
    # them back in column position.
    z_columns = [
        np.array(calculate_scores(column, scores.mat, raw_scores, const_mean, const_var))
        for column in range(len(raw_scores[0]))
    ]
    return Matrix(np.transpose(z_columns), scores.rows_labels, scores.cols_labels)

### Raw scores test

In [11]:
# Raw-score test: diffuse the CSV input scores on G with diffuse_raw (z left at
# its default of False) and compare the result with the CSV reference output.
# NOTE(review): the saved output of this cell ends in
# "AttributeError: 'str' object has no attribute 'decode'"; presumably it is
# raised while the CSV labels are parsed - confirm and fix before relying on this test.
run_score_test(diffuse_raw, G, Matrix.from_csv(dir_path+'/scores_test/input_scores.csv'), Matrix.from_csv(dir_path+'/scores_test/output_scores.csv'))

  m = np.genfromtxt(path, dtype=None, delimiter=',')
  


b'"V1"'
b'"V2"'
b'"V3"'
b'"V4"'
b'"V5"'
b'"V6"'
b'"V7"'
b'"V8"'
b'"V9"'
b'"V10"'
b'"V11"'
b'"V12"'
b'"V13"'
b'"V14"'
b'"V15"'
b'"V16"'
b'"V17"'
b'"V18"'
b'"V19"'
b'"V20"'
b'"V21"'
b'"V22"'
b'"V23"'
b'"V24"'
b'"V25"'
b'"V26"'
b'"V27"'
b'"V28"'
b'"V29"'
b'"V30"'
b'"V31"'
b'"V32"'
b'"V33"'
b'"V34"'
b'"V35"'
b'"V36"'
b'"V37"'
b'"V38"'
b'"V39"'
b'"V40"'
b'"V41"'
b'"V42"'
b'"V43"'
b'"V44"'
b'"V45"'
b'"V46"'
b'"V47"'
b'"V48"'
b'"V49"'
b'"V50"'
b'"V51"'
b'"V52"'
b'"V53"'
b'"V54"'
b'"V55"'
b'"V56"'
b'"V57"'
b'"V58"'
b'"V59"'
b'"V60"'
b'"V61"'
b'"V62"'
b'"V63"'
b'"V64"'
b'"V65"'
b'"V66"'
b'"V67"'
b'"V68"'
b'"V69"'
b'"V70"'
b'"V71"'
b'"V72"'
b'"V73"'
b'"V74"'
b'"V75"'
b'"V76"'
b'"V77"'
b'"V78"'
b'"V79"'
b'"V80"'
b'"V81"'
b'"V82"'
b'"V83"'
b'"V84"'
b'"V85"'
b'"V86"'
b'"V87"'
b'"V88"'
b'"V89"'
b'"V90"'
b'"V91"'
b'"V92"'
b'"V93"'
b'"V94"'
b'"V95"'
b'"V96"'
b'"V97"'
b'"V98"'
b'"V99"'
b'"V100"'
{'"V1"': 0, '"V2"': 1, '"V3"': 2, '"V4"': 3, '"V5"': 4, '"V6"': 5, '"V7"': 6, '"V8"': 7, '"V9"': 8, '"V10"':

AttributeError: 'str' object has no attribute 'decode'

### z-scores test

In [None]:
# z-score test: same input scores as the raw test, but with z = True, compared
# against the CSV z-score reference output.
run_score_test(diffuse_raw, G, Matrix.from_csv(dir_path+'/scores_test/input_scores.csv'), Matrix.from_csv(dir_path+'/scores_test/output_z_scores.csv'), z = True)