# Compute diffusion scores in Python

In [1]:
import networkx as nx
import numpy as np
import logging
import scipy as sp
from math import pi, sqrt
import sys
import os 

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")

dir_path = os.path.dirname(os.path.realpath('__file__'))

DEBUG:root:test


#### Import kernel functions from diffuPy

The kernel functions a imported from the package. Despite this the functions implementation are in this notebook (final _imp in the function name).

In [2]:
from diffupy.kernel import commute_time_kernel, p_step_kernel, inverse_cosine_kernel, diffusion_kernel, regularised_laplacian_kernel

from diffupy.matrix import Matrix, LaplacianMatrix

### Import example graph

In [3]:
G = nx.read_gml(dir_path+'/04_unit_testing/_graph.gml', label='id')

### General functions

#### Labels mapping

In [4]:
def csv_labeled_matrix_to_matrix(path):
    # Import matrix from csv file and remove headers
    m = np.genfromtxt(path, dtype=None, delimiter=',')
    return Matrix(np.array([[float(x) for x in a[1:]] for a in m[1:]]),os.path.basename(path).split('.csv'), m[1:,0], m[0, 1:])                                                                                              
                                                                                                                    

In [5]:
def run_score_test(score_func, G, input_scores, test_output_scores, z = False):
    
    computed_output_scores = score_func(G, input_scores, z)

    if isinstance(computed_output_scores, Matrix):
        computed_output_scores = computed_output_scores.mat
    
    if isinstance(test_output_scores, Matrix):
        test_output_scores = test_output_scores.mat
    
    logging.info(' %s  \n %s\n', 'Computed matrix', computed_output_scores)
    logging.info(' %s  \n %s\n', 'Test matrix', test_output_scores)
    # Assert rounded similarity (floating comma)
    assert np.allclose(computed_output_scores, test_output_scores)
    logging.info(' Test '+ score_func.__name__ +' passed')

#### Helpers

In [6]:
# In which format is the input? Tell apart vector, matrix or list of matrices
def which_format(x):
    # if is.numeric(x) and is.null(dim(x)): return "vector"
    if isinstance(x, isinstance(x[0], list)) or isinstance(x, isinstance(x[0], np.ndarray)): return "matrix"
    if isinstance(x, list): return "list"
    
    raise ValueError('Non-recognised input scores format, object of class:  %s', x.__class__.__name__)

#### Checks

In [7]:
# TO-DO
#.check_method
#.check_metric

def check_scores(scores):
    if not 'float' and 'int' in str(scores.mat.dtype):
        raise ValueError("The scores in background are not numeric.")
        
    for score, col_label, row_label in iter(scores):
        if not isinstance(score, float) and not isinstance(score, int):
            raise ValueError("The scores in background are not numeric")
        if score in ['Nan', None]:
            raise ValueError("Scores input cannot contain NA. But background .")
        if col_label in ['Nan', None]:
            raise ValueError("The scores in background must have rownames according to the scored nodes.")
        if row_label in ['Nan', None]:
            raise ValueError("The scores in background must have colnames to differentiate score sets.")
            
    std_mat = Matrix(np.std(scores.mat, axis=0), ['sd'], scores.cols_labels)
    print(std_mat)
    for sd, col_label, row_label in iter(std_mat):
        print(sd)
        if sd in ['Nan', None]:
            raise ValueError("Standard deviation in background is NA in column:" + str(col_label))
        if sd == 0:
            raise ValueError("Standard deviation in background is 0 in column:" + str(col_label))

## Diffuse scores

Diffuse scores on a network Function diffuse takes a network in networkx format and an initial state to score all the nodes in the network.

In [8]:
def calculate_scores(col_ind, scores, diff, const_mean, const_var):
    col_in = scores[:, col_ind]
    col_raw = diff[:, col_ind]

    s1 = np.sum(col_in)
    s2 = np.sum(col_in**2)

    # means and vars depend on first and second moments
    # of the input. This should be valid for non-binary
    # inputs as well
    score_means = const_mean*s1
    score_vars = const_var*(len(scores)*s2 - s1**2)
    
    return np.subtract(col_raw, score_means)/np.sqrt(score_vars)

def diffuse_raw (graph,
    scores,
    z = False,
    K = None,
    *argv):
    
    # TODO
    # sanity checks
    check_scores(scores)

    # Kernel matrix
    if K is None:
        # .check_graph(graph)
        logging.info('Kernel not supplied. Computing regularised Laplacian kernel ...')
        K = regularised_laplacian_kernel(graph, normalized = False)
        logging.info('Done')
    else:
        # .check_K(K)
        logging.info('Using supplied kernel matrix...')

    # Compute scores        

    # TODO: match indices, for now we assume indices match
    # for i, input_scores in enumerate(scores):
    #    for j, score in enumerate(input_scores):
    #       id_label_mapping(i)
    # input scores

    # TODO: Sparse
    # scores.mat <- methods::as(scores[[scores.name]], "sparseMatrix")

    n = len(scores.mat)
    
    K = K.mat
    
    # raw scores
    diff = np.matmul(K[:,:n], scores.mat)

    # Return base matrix if it is raw
    # Continue if we want z-scores
    if not z: 
        return diff

    # If we want z-scores, must compute rowmeans and rowmeans2
    row_sums = np.array([round(np.sum(row), 2) for row in K[:, :n]])
    row_sums_2 = np.array([np.sum(row) for row in K[:, :n]**2])

    # Constant terms over columns
    const_mean = row_sums/n
    const_var = np.subtract(n*row_sums_2, row_sums**2)/((n - 1)*(n**2))    
    
    return Matrix(np.transpose([np.array(calculate_scores(i, scores.mat, diff, const_mean, const_var)) for i in range(len(diff[0]))]), scores.rows_labels, scores.cols_labels) 

### Raw scores test

In [9]:
run_score_test(diffuse_raw, G, Matrix.from_csv(dir_path+'/scores_test/input_scores.csv'), Matrix.from_csv(dir_path+'/scores_test/output_scores.csv'))


  m = np.genfromtxt(path, dtype=None, delimiter=',')
INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[0.04214556 0.5214829 ]
 [0.02803012 0.51094974]
 [0.07392846 0.44912986]
 [0.02882746 0.4496067 ]
 [0.02498394 0.41749408]
 [0.02694918 0.44564431]
 [0.04164558 0.50412238]
 [0.02889637 0.41590905]
 [0.03051561 0.45756544]
 [0.02342762 0.59150125]
 [0.02099228 0.54286473]
 [0.02743536 0.43618487]
 [0.03170256 0.47188431]
 [0.02845791 0.51368974]
 [0.02020737 0.56976186]
 [0.02814164 0.39530545]
 [0.02503623 0.54527969]
 [0.01942714 0.41178428]
 [0.02439709 0.4188488 ]
 [0.13784437 0.48663452]
 [0.02120247 0.4773739 ]
 [0.05397049 0.54610831]
 [0.03590104 0.61605049]
 [0.0431597  0.57824798]
 [0.01995905 0.35958579]
 [0.02123165 0.45059075]
 [0.01967457 0.4200445 ]
 [0.05469101 0.61959678]
 [0.02302591 0.48778213]
 [0.21329129 0.4447339 ]
 [0.04744166 0.34758142]
 [0.02109237 0.54906121]
 [0.01860675 0.33645735]
 [0

matrix  
 [0.17058722 0.49959984] 
 row labels: ['sd'] 
 column labels: 
 [b'"input1"' b'"input2"'] 
 : 
0.17058722109232002
0.49959983987187156


### z-scores test

In [10]:
run_score_test(diffuse_raw, G, Matrix.from_csv(dir_path+'/scores_test/input_scores.csv'), Matrix.from_csv(dir_path+'/scores_test/output_z_scores.csv'), z = True)

INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[ 5.72900503e-01  6.68120609e-01]
 [-1.13346069e-01  6.08062494e-01]
 [ 4.08043369e+00 -9.79091266e-01]
 [-9.23887165e-02 -8.17698102e-01]
 [-4.09220982e-01 -1.74116662e+00]
 [-2.51410271e-01 -9.66694646e-01]
 [ 1.11370470e+00  7.87686379e-01]
 [-5.88803789e-02 -1.16753471e+00]
 [ 3.71221934e-02 -5.51509482e-01]
 [-2.57795602e-01  1.49333415e+00]
 [-4.84654538e-01  1.15491128e+00]
 [-1.25700004e-01 -7.33259529e-01]
 [ 1.33054511e-01 -2.16559877e-01]
 [-7.37208835e-02  5.49924340e-01]
 [-2.17625149e-01  6.81123592e-01]
 [-6.20089510e-02 -9.64947482e-01]
 [-2.22217600e-01  9.97858358e-01]
 [-3.76246194e-01 -8.28873395e-01]
 [-3.13109721e-01 -1.16684154e+00]
 [ 5.05401771e+00  1.06162939e-01]
 [-3.80558984e-01 -3.87878815e-02]
 [ 1.10604763e+00  1.04154297e+00]
 [ 1.31770256e-01  1.03732076e+00]
 [ 6.24516771e-01  1.59201034e+00]
 [-3.50165695e-01 -1.43383993e+00]
 [-3.8

matrix  
 [0.17058722 0.49959984] 
 row labels: ['sd'] 
 column labels: 
 [b'"input1"' b'"input2"'] 
 : 
0.17058722109232002
0.49959983987187156
