# Compute graph kernels in Python

In [37]:
import networkx as nx
import numpy as np
import logging
import scipy as sp
from math import pi, sqrt
import sys

# Configure the root logger (getLogger() without a name) so that DEBUG and
# INFO messages emitted via the module-level logging.* helpers are shown in
# the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Smoke test: emits one DEBUG record so the configuration can be checked.
logging.debug("test")

import os 
# NOTE: '__file__' is a quoted string literal here, not the __file__ module
# attribute. realpath() therefore resolves the relative name '__file__'
# against the current working directory, and dirname() strips it again, so
# dir_path ends up as the (resolved) working directory of the kernel.
dir_path = os.path.dirname(os.path.realpath('__file__'))

DEBUG:root:test


#### Import kernel functions from diffuPy

The kernel functions are imported from the diffuPy package. Despite this, the function implementations are also kept in this notebook (with a final `_imp` in the function name).

In [38]:
from diffupy.kernel import commute_time_kernel, p_step_kernel, inverse_cosine_kernel, diffusion_kernel, regularised_laplacian_kernel

### Import example graph

In [39]:
# Load the example graph from the GML file located under dir_path.
# label='id' makes networkx use the GML 'id' attribute as the node label.
G = nx.read_gml(dir_path+'/04_unit_testing/_graph.gml', label='id')

### General functions

#### Matrix import and test helpers

In [40]:
def csv_labeled_matrix_to_nparray(path):
    """Load a labelled CSV matrix and return only its numeric payload.

    The file is parsed with ``np.genfromtxt`` using ``','`` as delimiter, so
    every cell that cannot be parsed as a number (e.g. header or row-label
    text) becomes NaN. The first parsed row is discarded and, in each
    remaining row, all NaN entries are dropped.

    :param path: location of the CSV file, passed straight to ``np.genfromtxt``.
    :return: ``np.ndarray`` built from the NaN-free remainder of each row.
    """
    parsed = np.genfromtxt(path, delimiter=',')

    numeric_rows = []
    # Skip the first row (column headers), then strip the NaN cells that
    # the non-numeric labels were turned into.
    for parsed_row in parsed[1:]:
        kept = [value for value in parsed_row if not np.isnan(value)]
        numeric_rows.append(kept)

    return np.array(numeric_rows)

In [58]:
def run_score_test(score_func, G, input_scores, output_scores_test):
    """Run a scoring function and assert its result matches a reference.

    :param score_func: callable invoked as ``score_func(G, input_scores)``.
    :param G: graph object forwarded unchanged to ``score_func``.
    :param input_scores: input scores forwarded unchanged to ``score_func``.
    :param output_scores_test: expected output, compared with ``np.allclose``.
    :raises AssertionError: if the computed and expected scores are not
        element-wise close.
    """
    computed_output_scores = score_func(G, input_scores)

    # Log both matrices so a failing comparison can be inspected.
    to_report = (
        ('Computed matrix', computed_output_scores),
        ('Test matrix', output_scores_test),
    )
    for title, matrix in to_report:
        logging.info(' %s  \n %s\n', title, matrix)

    # Compare within floating point tolerance rather than exactly.
    matches = np.allclose(computed_output_scores, output_scores_test)
    assert matches

    logging.info(''.join([' Test ', score_func.__name__, ' passed']))

#### Labels mapping

In [42]:
def get_label_list_graph(graph):
    """Return the 'name' attribute of every node that has one.

    ``nx.get_node_attributes`` returns a dict keyed by node; iterating that
    dict directly yields only the keys, so the previous ``for k, v in ...``
    unpacking failed. The attribute values are taken explicitly here.

    :param graph: networkx graph whose nodes may carry a 'name' attribute.
    :return: list of the 'name' values, in the dict's iteration order.
    """
    return list(nx.get_node_attributes(graph, 'name').values())

def get_label_id_mapping_graph(graph):
    """Build a lookup from node 'name' attribute to node identifier.

    :param graph: networkx graph whose nodes may carry a 'name' attribute.
    :return: dict mapping each 'name' value to the node it belongs to. If
        several nodes share a name, the last one iterated wins.
    """
    names_by_node = nx.get_node_attributes(graph, 'name')

    label_to_node = {}
    for node_id, label in names_by_node.items():
        label_to_node[label] = node_id
    return label_to_node

def get_label_id_mapping_matrix(matrix, labels_x, labels_y = None):
    """Map labels to positional indices.

    The two branches were previously swapped: the nested mapping was built
    only when ``labels_y`` was missing (raising ``TypeError`` on
    ``enumerate(None)``), and a supplied ``labels_y`` was ignored.

    :param matrix: currently unused; kept for interface compatibility.
    :param labels_x: iterable of labels for the first set.
    :param labels_y: optional iterable of labels for the second set.
    :return: if ``labels_y`` is None, ``{label: index}`` over ``labels_x``.
        Otherwise a nested dict ``{label_i: {label_j: (j, i)}}`` where ``i``
        is the position of ``label_i`` in ``labels_x`` and ``j`` the position
        of ``label_j`` in ``labels_y``.
    """
    # Test against None (not truthiness) so numpy arrays and empty
    # sequences of labels are handled without ambiguity.
    if labels_y is None:
        return {label: idx for idx, label in enumerate(labels_x)}

    # Materialise once so a one-shot iterator can be reused for every row.
    labels_y = list(labels_y)
    return {
        label_i: {label_j: (j, i) for j, label_j in enumerate(labels_y)}
        for i, label_i in enumerate(labels_x)
    }

#### Helpers

In [43]:
# In which format is the input? Tell apart vector, matrix or list of matrices
def which_format(x):
    """Classify the container format of an input scores object.

    The previous check ``isinstance(x, isinstance(x[0], list))`` passed a
    bool as the class argument of the outer ``isinstance`` and therefore
    always raised ``TypeError``.

    :param x: object to classify.
    :return: ``"matrix"`` if ``x`` is a non-empty list or numpy array whose
        first element is itself a list or numpy array; ``"list"`` for any
        other list (including an empty one).
    :raises ValueError: for every other input. The "vector" format is not
        recognised yet.
    """
    # if is.numeric(x) and is.null(dim(x)): return "vector"
    nested_types = (list, np.ndarray)

    # 0-d arrays have no len() and cannot be indexed, so exclude them here.
    is_sequence = isinstance(x, list) or (isinstance(x, np.ndarray) and x.ndim > 0)
    if is_sequence and len(x) > 0 and isinstance(x[0], nested_types):
        return "matrix"
    if isinstance(x, list):
        return "list"

    # Interpolate the class name; exceptions do not apply %-formatting
    # to their arguments the way logging calls do.
    raise ValueError('Non-recognised input scores format, object of class:  %s' % x.__class__.__name__)


In [44]:
def set_diagonal_matrix(M, d):
    """Overwrite the main diagonal of ``M`` in place with values from ``d``.

    :param M: indexable matrix (``M[row][col]``) whose rows support ``len``;
        it is modified in place.
    :param d: indexable holding the new diagonal values; ``d[i]`` is written
        to ``M[i][i]``.
    :return: the same object ``M`` that was passed in.
    """
    for pos, row in enumerate(M):
        # A row only has a diagonal cell if it is long enough (non-square
        # matrices); off-diagonal cells keep their current values.
        if pos < len(row):
            M[pos][pos] = d[pos]
    return M

#### Checks

In [45]:
# TO-DO
#.check_method
#.check_metric

# Labels matrix !!

# def check_scores(scores):
#     form = which_format(scores)
#     #scores_names = names(scores)
#     #if is.null(scores_names):
#     #    raise ValueError("Scores must be a named list, but supplied list contains no names.")

#     plyr::l_ply(
#         scores_names,
#         function(mat_name) {
#             mat <- scores[[mat_name]]

#             if (!is.numeric(mat) & !("dgCMatrix" %in% class(mat))) {
#                 stop(
#                     "The scores in background ",
#                     mat_name,
#                     " are not numeric!"
#                 )
#             }
#             if (any(is.na(mat))) {
#                 stop(
#                     "Scores input cannot contain NA! ",
#                     "But background ",
#                     mat_name,
#                     " does!")
#             }
#             if (is.null(rownames(mat)))
#                 stop(
#                     "The scores in background ",
#                     mat_name,
#                     " must have rownames ",
#                     "according to the scored nodes!"
#                 )
#             if (is.null(colnames(mat)))
#                 stop(
#                     "The scores in background ",
#                     mat_name,
#                     " must have colnames ",
#                     "to differentiate score sets!"
#                 )

#             std <- apply(mat, 2, stats::sd)
#             std_zero <- which(std == 0)
#             std_na <- which(is.na(std))

#             if (length(std_na))
#                 warning(
#                     "Standard deviation in background ",
#                     mat_name,
#                     " is NA in columns: ",
#                     paste(std_na, collapse = ",")
#                 )
#             if (length(std_zero))
#                 warning(
#                     "Standard deviation in background ",
#                     mat_name,
#                     " is 0 in columns: ",
#                     paste(std_zero, collapse = ",")
#                 )
#         }
#     )
#     invisible()
# }

## Diffuse scores

Diffuse scores on a network. The function `diffuse_raw` takes a network in networkx format and an initial state to score all the nodes in the network.

In [52]:
def _resolve_score_state(name, value):
    """Return ``value``, or the module-level global ``name`` if it is None."""
    if value is not None:
        return value
    try:
        return globals()[name]
    except KeyError:
        raise NameError("name '%s' is not defined" % name)


def calculate_scores(col_ind, scores=None, diff=None, const_mean=None,
                     const_var=None, n=None):
    """Compute the z-scores of one column of diffused scores.

    All state is passed in explicitly. For backward compatibility with the
    original single-argument form, any argument left as None is looked up
    as a module-level global of the same name.

    :param col_ind: index of the score column to normalise.
    :param scores: 2-D input scores, one column per score set.
    :param diff: 2-D raw diffusion scores, same column layout as ``scores``.
    :param const_mean: per-node constant term of the mean (kernel row sums
        divided by ``n``).
    :param const_var: per-node constant term of the variance.
    :param n: number of scored nodes (rows of ``scores``).
    :return: 1-D array ``(raw - mean) / sqrt(variance)`` for that column.
        Entries are nan/inf where the variance is zero.
    """
    scores = np.asarray(_resolve_score_state('scores', scores))
    diff = np.asarray(_resolve_score_state('diff', diff))
    const_mean = np.asarray(_resolve_score_state('const_mean', const_mean))
    const_var = np.asarray(_resolve_score_state('const_var', const_var))
    n = _resolve_score_state('n', n)

    col_in = scores[:, col_ind]
    col_raw = diff[:, col_ind]

    # First and second moments of the input column. The second moment is
    # the sum of SQUARES (the earlier port used `*2` for R's `**2`).
    s1 = np.sum(col_in)
    s2 = np.sum(col_in**2)

    # means and vars depend on first and second moments
    # of the input. This should be valid for non-binary
    # inputs as well
    score_means = const_mean*s1
    score_vars = const_var*(n*s2 - s1**2)

    # np.sqrt works element-wise; math.sqrt only accepts scalars.
    return (col_raw - score_means)/np.sqrt(score_vars)


def diffuse_raw(graph,
    scores,
    z = False,
    K = None,
    *argv):
    """Diffuse input scores over a network with a kernel matrix.

    :param graph: network used to compute the kernel when ``K`` is None;
        not otherwise used.
    :param scores: input scores, one row per scored node and one column per
        score set (a 1-D vector is accepted as a single score set). Rows are
        assumed to be aligned with the first ``len(scores)`` kernel columns
        -- TODO: match by node label instead.
    :param z: if falsy, return the raw diffusion scores; otherwise return
        z-scores normalised with the analytical mean and variance.
    :param K: optional precomputed kernel, assumed to be a dense numpy
        array; if None the unnormalised regularised Laplacian kernel of
        ``graph`` is computed.
    :param argv: accepted for interface compatibility; unused.
    :return: numpy array of raw scores or z-scores, one row per kernel row.
    :raises ValueError: if z-scores are requested for fewer than two scored
        nodes (the variance term divides by ``n - 1``).
    """
    # TODO
    # sanity checks
    # .check_scores(scores)

    # Kernel matrix
    if K is None:
        # .check_graph(graph)
        logging.info('Kernel not supplied. Computing regularised Laplacian kernel ...')
        K = regularised_laplacian_kernel(graph, normalized = False)
        logging.info('Done')
    else:
        # .check_K(K)
        logging.info('Using supplied kernel matrix...')

    # TODO: Sparse
    # scores.mat <- methods::as(scores[[scores.name]], "sparseMatrix")

    n = len(scores)

    # Kernel columns belonging to the scored (background) nodes.
    K_bkgd = K[:, :n]

    # raw scores
    diff = np.matmul(K_bkgd, scores)

    # Return base matrix if it is raw
    # Continue if we want z-scores
    if not z:
        return diff

    if n < 2:
        raise ValueError('z-scores require at least two scored nodes, got %d' % n)

    # Work on 2-D views so a single score vector is handled like a matrix.
    scores_arr = np.asarray(scores, dtype=float)
    single_vector = scores_arr.ndim == 1
    scores_mat = scores_arr.reshape(n, -1)
    diff_mat = np.asarray(diff).reshape(len(diff), -1)

    # If we want z-scores, must compute row sums and row sums of squares
    row_sums = np.sum(K_bkgd, axis=1)
    row_sums2 = np.sum(K_bkgd**2, axis=1)

    # Constant terms over columns
    const_mean = row_sums/n
    const_var = (n*row_sums2 - row_sums**2)/((n - 1)*(n**2))

    n_cols = scores_mat.shape[1]
    if n_cols == 0:
        return np.empty((diff_mat.shape[0], 0))

    # One z-score vector per score column, assembled back into a matrix.
    z_columns = [
        calculate_scores(col_ind, scores_mat, diff_mat, const_mean, const_var, n)
        for col_ind in range(n_cols)
    ]
    z_scores = np.column_stack(z_columns)

    return z_scores[:, 0] if single_vector else z_scores

In [60]:
# Check diffuse_raw on the example graph: run_score_test calls it as
# diffuse_raw(G, input_scores), i.e. with the defaults z=False and K=None,
# so only the raw (non z-score) output is compared against the reference CSV.
run_score_test(diffuse_raw, G, csv_labeled_matrix_to_nparray(dir_path+'/scores_test/input_scores.csv'), csv_labeled_matrix_to_nparray(dir_path+'/scores_test/output_scores.csv'))

INFO:root:Kernel not supplied. Computing regularised Laplacian kernel ...
INFO:root:Done
INFO:root: Computed matrix  
 [[0.04214556 0.53003606]
 [0.02803012 0.45956238]
 [0.07392846 0.49160954]
 [0.02882746 0.47538022]
 [0.02498394 0.42005727]
 [0.02694918 0.5088062 ]
 [0.04164558 0.50024601]
 [0.02889637 0.45379271]
 [0.03051561 0.43356589]
 [0.02342762 0.51787904]
 [0.02099228 0.45112783]
 [0.02743536 0.55030449]
 [0.03170256 0.4819236 ]
 [0.02845791 0.56026797]
 [0.02020737 0.59566405]
 [0.02814164 0.52128379]
 [0.02503623 0.57553906]
 [0.01942714 0.58824644]
 [0.02439709 0.55225109]
 [0.13784437 0.51873062]
 [0.02120247 0.52154529]
 [0.05397049 0.59494176]
 [0.03590104 0.36285448]
 [0.0431597  0.38991442]
 [0.01995905 0.40193285]
 [0.02123165 0.48349573]
 [0.01967457 0.41526501]
 [0.05469101 0.65561964]
 [0.02302591 0.44442886]
 [0.21329129 0.39079524]
 [0.04744166 0.35423805]
 [0.02109237 0.40746168]
 [0.01860675 0.63729415]
 [0.03242628 0.61894899]
 [0.01849678 0.56423804]
 [0.02