From ca4228e062a1efb4b78fc2e6e7d5c0860b7aff9e Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 10 Apr 2020 18:26:30 +0200
Subject: [PATCH 01/17] Process network major recoding

---
 src/diffupy/constants.py       |  20 ++-
 src/diffupy/matrix.py          | 175 +++++++++++++++++++-----
 src/diffupy/process_network.py | 241 +++++++++++++++++++++++++++++++++
 3 files changed, 394 insertions(+), 42 deletions(-)
 create mode 100644 src/diffupy/process_network.py

diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py
index fb1d8e9..48daee2 100644
--- a/src/diffupy/constants.py
+++ b/src/diffupy/constants.py
@@ -66,22 +66,30 @@ def ensure_output_dirs():
 #: bel
 BEL = 'bel'
 #: node link json
-NODE_LINK_JSON = 'json'
+JSON = 'json'
 #: pickle
-BEL_PICKLE = 'pickle'
+PICKLE = 'pickle'
 #: gml
 GML = 'gml'
 #: edge list
 EDGE_LIST = '.lst'
 
-#: DiffuPath available network formats
-FORMATS = [
+#: DiffuPath available graph formats
+GRAPH_FORMATS = [
     CSV,
     TSV,
     GRAPHML,
     BEL,
-    NODE_LINK_JSON,
-    BEL_PICKLE,
+    JSON,
+    PICKLE,
+]
+
+#: DiffuPath available kernel formats
+KERNEL_FORMATS = [
+    CSV,
+    TSV,
+    JSON,
+    PICKLE,
 ]
 
 #: Separators
diff --git a/src/diffupy/matrix.py b/src/diffupy/matrix.py
index d3105d6..dd4d2b5 100644
--- a/src/diffupy/matrix.py
+++ b/src/diffupy/matrix.py
@@ -7,9 +7,11 @@
 
 import numpy as np
 import pandas as pd
+from diffupy.constants import CSV
+from networkx import DiGraph
 
 from .utils import get_label_ix_mapping, get_label_list_graph, get_laplacian, decode_labels, get_idx_scores_mapping, \
-    get_repeated_labels
+    get_repeated_labels, from_dataframe_file, from_nparray_to_df
 
 log = logging.getLogger(__name__)
 
@@ -23,15 +25,14 @@ class Matrix:
     """Matrix class."""
 
     def __init__(
-        self,
-        mat=None,
-        rows_labels=None,
-        cols_labels=None,
-        graph=None,
-        quadratic=False,
-        name='',
-        init_value=None,
-        **kwargs
+            self,
+            mat=None,
+            rows_labels=None,
+            cols_labels=None,
+            graph=None,
+            quadratic=False,
+            name='',
+            init_value=None,
     ):
         """Initialize matrix.
 
@@ -86,15 +87,15 @@ def __str__(self):
 
         return f"\nmatrix {self.name} \n  {s} \n "
 
-    def __iter__(self, **kargs):
+    def __iter__(self, **attr):
         """Help method for the iteration of the Matrix."""
         self.i = -1
         self.j = 0
 
-        if 'get_indices' in kargs:
-            self.get_indices = kargs['get_indices']
-        if 'get_labels' in kargs:
-            self.get_labels = kargs['get_labels']
+        if 'get_indices' in attr:
+            self.get_indices = attr['get_indices']
+        if 'get_labels' in attr:
+            self.get_labels = attr['get_labels']
 
         return self
 
@@ -463,23 +464,6 @@ def order_rows(self, reverse=True, col_ref_idx=None):
 
         return ordered_mat
 
-    """Import"""
-
-    def from_csv(self, csv_path):
-        """Import matrix from csv file using the headers as a Matrix class."""
-        m = np.genfromtxt(csv_path, dtype=None, delimiter=',')
-        return Matrix(
-            mat=np.array(
-                [
-                    [float(x)
-                     for x in a[1:]]
-                    for a in m[1:]
-                ]),
-            rows_labels=list(m[1:, 0]),
-            cols_labels=list(m[0, 1:]),
-            name=str(os.path.basename(csv_path).replace('.csv', ''))
-        )
-
     """Export"""
 
     def to_dict(self, ordered=True):
@@ -496,19 +480,138 @@ def to_dict(self, ordered=True):
 
         return d
 
+    def to_df(self, ordered=True):
+        """Export matrix as a data frame: cols_labels become the columns and rows_labels become the index."""
+        d = self.to_dict(ordered)
+
+        rows_labels = d.pop('rows_labels')
+
+        df = pd.DataFrame(d)
+        df.index = rows_labels  # a DataFrame has no ``rows`` attribute; the row labels belong in the index
+
+        return df
+
     def to_csv(self, path, file_name='_export.csv', index=False, ordered=True):
         """Export matrix to csv file using the headers (row_labels, cols_labels) of the Matrix class."""
         # Generate dataframe
-        df = pd.DataFrame(data=self.to_dict(ordered))
 
-        df.to_csv(os.path.join(path, self.name, file_name), index=index)
+        self.to_df(ordered).to_csv(os.path.join(path, self.name, file_name), index=index)
+
+    def to_nx_graph(self):
+        """Export matrix as a Graph using the headers (row_labels, cols_labels) of the Matrix class."""
+        if len(self.cols_labels) != len(self.rows_labels) or not self.quadratic:
+            raise ValueError('The matrix cannot be converted as a graph since it is not quadratic, which '
+                             'it is the used representation of a network (usually a kernel) as a Matrix.')
+
+        graph = DiGraph()
+
+        for score, sub_name, obj_name in self.__iter__(get_labels=True, get_indices=False):
+            if score != 0:
+                graph.add_edge(
+                    sub_name, obj_name,
+                )
+
+        return graph
+
+
+class MatrixFromDict(Matrix):
+    """Constructor matrix class for Dictionary data structure to Matrix conversion."""
+
+    def __init__(self, d, name=''):
+        """Build the matrix from a ``{col_label: column_values, 'rows_labels': [...]}`` dict, left unmodified."""
+        rows = list(d['rows_labels'])
+        cols = [key for key in d if key != 'rows_labels']
+
+        Matrix.__init__(self, mat=np.array([d[col] for col in cols]).T,  # values are columns: transpose to rows x cols
+                        rows_labels=rows,
+                        cols_labels=cols,
+                        quadratic=len(cols) == len(rows),
+                        name=name
+                        )
+
+
+class MatrixFromDataFrame(Matrix):
+    """Constructor matrix class for DataFrame to Matrix conversion."""
+
+    def __init__(self, df, name=''):
+        """Initialize the matrix with row labels taken from ``df.index`` and column labels from ``df.columns``."""
+        rows = list(df.index.values)
+        cols = list(df.columns.values)
+
+        Matrix.__init__(self, mat=df.to_numpy(),
+                        rows_labels=rows,
+                        cols_labels=cols,
+                        quadratic=len(cols) == len(rows),
+                        name=name
+                        )
+
+
+class MatrixFromNumpyArray(Matrix):
+    """Constructor matrix class for numpy array to Matrix conversion."""
+
+    def __init__(self, nparray, name=''):
+        """Initialize the matrix from the data frame that ``from_nparray_to_df`` builds out of ``nparray``."""
+
+        df = from_nparray_to_df(nparray)
+
+        rows = list(df.index.values)
+        cols = list(df.columns.values)
+
+        Matrix.__init__(self, mat=df.to_numpy(),
+                        rows_labels=rows,
+                        cols_labels=cols,
+                        quadratic=len(cols) == len(rows),
+                        name=name
+                        )
+
+
+class MatrixFromCSV(Matrix):
+    """Constructor matrix class for CSV to Matrix conversion."""
+
+    def __init__(self, csv_path, fmt=CSV, name=None):
+        """Initialize the matrix from a delimited file; ``name`` defaults to the file base name without '.csv'."""
+        df = from_dataframe_file(csv_path, fmt)
+
+        if name is None:
+            name = str(os.path.basename(csv_path).replace('.csv', ''))
+
+        rows = list(df.index.values)
+        cols = list(df.columns.values)
+
+        Matrix.__init__(self, mat=df.to_numpy(),
+                        rows_labels=rows,
+                        cols_labels=cols,
+                        quadratic=len(cols) == len(rows),
+                        name=name
+                        )
+
+
+class MatrixFromGraph(Matrix):
+    """Constructor matrix class for nx.Graph to Matrix conversion."""
+
+    # TODO : move instances initalization from global argument graph to here
+
+    def __init__(self, graph, node_argument='name', name=''):
+        # This initialization would make a matrix representing the graph (taking a graph argument as label)
+        rows = list(get_label_list_graph(graph, node_argument))
+
+        Matrix.__init__(self, rows_labels=rows,
+                        init_value=1,
+                        quadratic=True,
+                        name=name,
+                        )
 
 
 class LaplacianMatrix(Matrix):
     """Laplacian matrix class."""
 
-    def __init__(self, graph, normalized=False, name=''):
+    def __init__(self, graph, normalized=False, node_argument='name', name=''):
         """Initialize laplacian."""
         l_mat = get_laplacian(graph, normalized)
+        rows = list(get_label_list_graph(graph, node_argument))
 
-        Matrix.__init__(self, mat=l_mat, quadratic=True, name=name, graph=graph)
+        Matrix.__init__(self, mat=l_mat,
+                        rows_labels=rows,
+                        quadratic=True,
+                        name=name
+                        )
diff --git a/src/diffupy/process_network.py b/src/diffupy/process_network.py
new file mode 100644
index 0000000..2b2257b
--- /dev/null
+++ b/src/diffupy/process_network.py
@@ -0,0 +1,241 @@
+# -*- coding: utf-8 -*-
+
+"""Miscellaneous utils of the package."""
+
+import logging
+from typing import Tuple
+
+import numpy as np
+import pandas as pd
+import pybel
+from diffupy.matrix import Matrix, MatrixFromDataFrame, MatrixFromDict, MatrixFromNumpyArray
+from diffupy.utils import from_dataframe_file, format_checker, from_pickle, get_label_node, from_json
+from networkx import DiGraph, Graph, read_graphml, read_gml, node_link_graph, read_edgelist
+
+from .constants import *
+from .constants import CSV, TSV, GRAPHML, GML, BEL, PICKLE, EMOJI, GRAPH_FORMATS
+from .kernels import regularised_laplacian_kernel
+
+log = logging.getLogger(__name__)
+
+
+"""Process network as undefined format (could be represented as a graph or as a kernel)"""
+
+
+def get_kernel_and_graph_from_network_file(path: str) -> Tuple[Matrix, Graph]:
+    """Load the network at path both as a kernel and as a graph, deriving whichever one the file does not hold."""
+    graph = None
+    kernel = None
+
+    if path.endswith(tuple(KERNEL_FORMATS)):  # str.endswith takes a str or a tuple of str, never a list
+        try:
+            graph = process_graph_from_file(path)
+
+        except (ValueError, TypeError):  # not readable as a graph: fall back to reading the file as a kernel
+            kernel = process_kernel_from_file(path)
+
+    elif path.endswith(tuple(GRAPH_FORMATS)):
+        graph = process_graph_from_file(path)
+
+    else:
+        raise IOError(
+            f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: '
+            f'{GRAPH_FORMATS}'
+        )
+
+    if kernel is None and graph is not None:
+        kernel = regularised_laplacian_kernel(graph)
+
+    if kernel is not None and graph is None:
+        graph = kernel.to_nx_graph()
+
+    return kernel, graph
+
+
+def get_kernel_from_network_file(path: str) -> Matrix:
+    """Load the network at path as a kernel, computing it from the graph when the file holds a graph."""
+    if path.endswith(tuple(KERNEL_FORMATS)):  # str.endswith takes a str or a tuple of str, never a list
+        try:
+            graph = process_graph_from_file(path)
+
+        except (ValueError, TypeError):  # not readable as a graph: the file is treated as a stored kernel
+            return process_kernel_from_file(path)
+
+    elif path.endswith(tuple(GRAPH_FORMATS)):
+        graph = process_graph_from_file(path)
+
+    else:
+        raise IOError(
+            f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: '
+            f'{GRAPH_FORMATS}'
+        )
+
+    return regularised_laplacian_kernel(graph)
+
+
+def get_graph_from_network_file(path: str) -> Graph:
+    """Load the network at path as a graph, converting it from the kernel when the file holds a kernel."""
+    if path.endswith(tuple(KERNEL_FORMATS)):  # str.endswith takes a str or a tuple of str, never a list
+        try:
+            return process_graph_from_file(path)
+
+        except (ValueError, TypeError):  # not readable as a graph: load the kernel and convert it below
+            kernel = process_kernel_from_file(path)
+
+    elif path.endswith(tuple(GRAPH_FORMATS)):
+        return process_graph_from_file(path)
+
+    else:
+        raise IOError(
+            f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: '
+            f'{GRAPH_FORMATS}'
+        )
+
+    return kernel.to_nx_graph()
+
+
+"""Process input formats"""
+
+
+def process_graph_from_file(path: str) -> Graph:
+    """Load a graph from path, picking the reader by file extension; raise IOError for an unknown extension."""
+    if path.endswith(CSV):  # TSV has its own branch below; matching it here parsed TSV files as CSV
+        graph = get_graph_from_df(path, CSV)
+
+    elif path.endswith(TSV):
+        graph = get_graph_from_df(path, TSV)
+
+    elif path.endswith(PICKLE):
+        graph = pybel.from_pickle(path)
+
+    elif path.endswith(GRAPHML):
+        graph = read_graphml(path)
+
+    elif path.endswith(GML):
+        graph = read_gml(path)
+
+    elif path.endswith(BEL):
+        graph = pybel.from_path(path)
+
+    elif path.endswith(EDGE_LIST):
+        graph = read_edgelist(path)
+
+    elif path.endswith(JSON):
+        data = from_json(path)
+        graph = node_link_graph(data)
+    else:
+        raise IOError(
+            f'{EMOJI} The selected graph format is not valid. Please ensure you use one of the following formats: '
+            f'{GRAPH_FORMATS}'
+        )
+
+    log.info(
+        f'{EMOJI} Graph loaded with: \n'
+        f'{graph.number_of_nodes()} nodes\n'
+        f'{graph.number_of_edges()} edges\n'
+        f'{EMOJI}'
+    )
+
+    return graph
+
+
+def process_kernel_from_file(path: str) -> Matrix:
+    """Load kernel from path."""
+    if path.endswith(CSV):
+        raw_kernel = MatrixFromDataFrame(from_dataframe_file(path, CSV))
+
+    elif path.endswith(TSV):
+        raw_kernel = MatrixFromDataFrame(from_dataframe_file(path, TSV))
+
+    elif path.endswith(PICKLE):
+        raw_kernel = from_pickle(path)
+
+    elif path.endswith(JSON):
+        raw_kernel = from_json(path)
+
+    else:
+        raise IOError(
+            f'{EMOJI} The selected kernel format is not valid. Please ensure you use one of the following formats: '
+            f'{KERNEL_FORMATS}'
+        )
+
+    # Check imported type of kernel
+    if isinstance(raw_kernel, Matrix):
+        kernel = raw_kernel
+
+    elif isinstance(raw_kernel, dict):
+        kernel = MatrixFromDict(raw_kernel)
+
+    elif isinstance(raw_kernel, pd.DataFrame):
+        kernel = MatrixFromDataFrame(raw_kernel)
+
+    elif isinstance(raw_kernel, np.ndarray):
+        kernel = MatrixFromNumpyArray(raw_kernel)
+
+    else:
+        raise IOError(
+            f'{EMOJI} The imported kernel type is not valid. Please ensure it is provided as a diffupy '
+            f'Matrix, a Dict, NumpyArray or Pandas DataFrame. '
+        )
+
+    log.info(
+        f'{EMOJI} Kernel loaded with: \n'
+        f'{len(kernel.rows_labels)} nodes\n'
+        f'{EMOJI}'
+    )
+
+    return kernel
+
+
+def get_simple_graph_from_multigraph(multigraph):
+    """Convert undirected graph from multigraph."""
+    graph = Graph()
+    for u, v, data in multigraph.edges(data=True):
+        u = get_label_node(u)
+        v = get_label_node(v)
+
+        w = data['weight'] if 'weight' in data else 1.0
+        if graph.has_edge(u, v):
+            graph[u][v]['weight'] += w
+        else:
+            graph.add_edge(u, v, weight=w)
+
+    return graph
+
+
+def get_graph_from_df(path: str, sep: str) -> DiGraph:
+    """Return network from dataFrame."""
+    format_checker(sep)
+
+    df = from_dataframe_file(path, sep)
+
+    if SOURCE not in df.columns or TARGET not in df.columns:
+        raise ValueError(
+            f'Ensure that your file contains columns for {SOURCE} and {TARGET}. The column for {RELATION} is optional'
+            f'and can be omitted.'
+        )
+
+    graph = DiGraph()
+
+    for index, row in df.iterrows():
+
+        # Get node names from data frame
+        sub_name = row[SOURCE]
+        obj_name = row[TARGET]
+
+        if RELATION in df.columns:
+
+            relation = row[RELATION]
+
+            # Store edge in the graph
+            graph.add_edge(
+                sub_name, obj_name,
+                relation=relation,
+            )
+
+        else:
+            graph.add_edge(
+                sub_name, obj_name,
+            )
+
+    return graph

From ef5c6e43ccf70058964e534ff64342466f2f4b23 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 10 Apr 2020 18:36:23 +0200
Subject: [PATCH 02/17] Process data input major recoding and tests

---
 src/diffupy/constants.py                      |  16 +-
 src/diffupy/process_data_input.py             | 635 ++++++++++++++++++
 src/diffupy/process_input.py                  | 315 ---------
 tests/constants.py                            |   2 +-
 .../datasets/{node.csv => node_type_col.csv}  |   0
 tests/test_diffusion.py                       |   2 +-
 tests/test_input.py                           | 107 ++-
 7 files changed, 735 insertions(+), 342 deletions(-)
 create mode 100644 src/diffupy/process_data_input.py
 delete mode 100644 src/diffupy/process_input.py
 rename tests/resources/datasets/{node.csv => node_type_col.csv} (100%)

diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py
index 48daee2..354cd91 100644
--- a/src/diffupy/constants.py
+++ b/src/diffupy/constants.py
@@ -117,11 +117,23 @@ def ensure_output_dirs():
 
 #: Node name
 NODE = 'Node'
+LABEL = 'Label'
+ENTITY = 'Entity'
+GENE = 'Gene'
+
+NODE_LABELING= [
+    NODE,
+    LABEL,
+    ENTITY,
+    GENE
+]
+
 #: Node type
 NODE_TYPE = 'NodeType'
+#: Unspecified score type
+SCORE = 'Score'
 #: Log2 fold change (logFC)
 LOG_FC = 'LogFC'
 #: Statistical significance (p-value)
 P_VALUE = 'p-value'
-#: Label
-LABEL = 'Label'
+
diff --git a/src/diffupy/process_data_input.py b/src/diffupy/process_data_input.py
new file mode 100644
index 0000000..b198fc4
--- /dev/null
+++ b/src/diffupy/process_data_input.py
@@ -0,0 +1,635 @@
+# -*- coding: utf-8 -*-
+
+"""Main matrix class and processing of input data."""
+
+from typing import Dict, Optional, Union
+
+import numpy as np
+import pandas as pd
+
+from .constants import *
+from .matrix import Matrix
+from .utils import from_pickle, from_json, from_dataframe_file, from_nparray_to_df, get_random_value_from_dict, \
+    get_random_key_from_dict
+
+"""Process input data"""
+
+
+def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix],
+                                kernel: Matrix,
+                                background_labels: Union[list, dict] = None,
+                                method: Optional[str] = 'raw',
+                                binning: Optional[bool] = False,
+                                absolute_value: Optional[bool] = False,
+                                p_value: Optional[float] = None,
+                                threshold: Optional[float] = None,
+                                separator_str: Optional[str] = ', '
+                                ) -> Matrix:
+    """Process miscellaneous input data and format it for the diffusion computation function."""
+    # If specific label background not provided, get a list from kernel labels.
+    if not background_labels:
+        background_labels = list(kernel.rows_labels)
+        # TODO: Discuss store label classification (mapping or as a column argument) in kernel
+
+    # Pipeline the input, first preprocessing it, then mapping it to the background labels and finally formatting it.
+    return format_input_for_diffusion(map_labels_input(process_data_input(data_input,
+                                                                          method,
+                                                                          binning,
+                                                                          absolute_value,
+                                                                          p_value,
+                                                                          threshold,
+                                                                          separator_str
+                                                                          ),
+                                                       background_labels
+                                                       ),
+                                      kernel
+                                      )
+
+
+def process_data_input(data_input: Union[str, list, dict, np.ndarray, pd.DataFrame],
+                       method: str = 'raw',
+                       binning: bool = False,
+                       absolute_value: bool = False,
+                       p_value: float = None,
+                       threshold: Optional[float] = None,
+                       separator_str: Optional[str] = ', ',
+                       ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]:
+    """Process and pipeline the provided miscellaneous data input in standardized data structures for further processing."""
+    # Preprocess the raw input according is format types.
+    preprocessed_data = _process_data_input_format(data_input, separator_str)
+
+    # If the preprocessed input is a list or a label type dict (Dict[str, list]) of lists return it for categorical input generation.
+    if _label_list_data_struct_check(preprocessed_data) or _type_label_list_data_struct_check(preprocessed_data):
+        return preprocessed_data
+
+    # If the preprocessed input is a label type label-scores dict (Dict[str, pd.DataFrame]) pipeline it for scores codifying.
+    if isinstance(preprocessed_data, dict):
+        return {label_type: _codify_input_data(preprocessed_data_i,
+                                               method,
+                                               binning,
+                                               absolute_value,
+                                               p_value,
+                                               threshold
+                                               )
+                for label_type, preprocessed_data_i in preprocessed_data.items()
+                }
+
+    # If the preprocessed input is a scores-label dataframe (pd.DataFrame) pipeline it for scores codifying.
+    return _codify_input_data(preprocessed_data,
+                              method,
+                              binning,
+                              absolute_value,
+                              p_value,
+                              threshold
+                              )
+
+
+"""Process input formats"""
+
+
+def _process_data_input_format(raw_data_input: Union[str, list, dict, np.ndarray, pd.DataFrame],
+                               separ_str: str = ',') -> Union[pd.DataFrame, list, Dict[str, Union[pd.DataFrame, list]]]:
+    """Format the input as a label-score dataframe, a label list or a dict by label type, for later processing."""
+    if isinstance(raw_data_input, str):
+        # If the data input type is a string, mostly will be a path to the dataset file.
+        if os.path.isfile(raw_data_input):
+            return _process_data_input_format(_load_data_input_from_file(raw_data_input))
+        elif '/' in raw_data_input and separ_str not in ['/', ' /', '/ ']:
+            raise IOError(
+                f'{EMOJI} The file could not have been located in the provided data input path,.'
+            )
+        # If it is not a path, it is treated as a label list and split on the separator (not on the string itself).
+        else:
+            return _process_data_input_format(raw_data_input.split(separ_str))
+
+    if isinstance(raw_data_input, pd.DataFrame):
+        return raw_data_input
+
+    elif isinstance(raw_data_input, list) or isinstance(raw_data_input, set):
+        return list(set(raw_data_input))
+
+    elif isinstance(raw_data_input, np.ndarray):
+        return from_nparray_to_df(raw_data_input)
+
+    elif isinstance(raw_data_input, dict):
+        if _scores_dict_data_struct_check(raw_data_input):
+            return pd.DataFrame.from_dict(raw_data_input, orient='index')
+        else:  # Dict by label type: format each value recursively, keeping the caller's separator.
+            return {label_type: _process_data_input_format(data_i, separ_str) for label_type, data_i in raw_data_input.items()}
+
+    elif isinstance(raw_data_input, Matrix):
+        return raw_data_input.to_df()
+
+    else:
+        raise TypeError(
+            f'{EMOJI} The imported kernel type is not valid. Please ensure is provided as a diffupy '
+            f'Matrix, a Dict, NumpyArray or Pandas DataFrame. '
+        )
+
+
+def _load_data_input_from_file(path: str) -> Union[pd.DataFrame, list]:
+    """Load and process the input data according the input file format."""
+    if path.endswith(CSV):
+        return from_dataframe_file(path, CSV)
+
+    elif path.endswith(TSV):
+        return from_dataframe_file(path, TSV)
+
+    elif path.endswith(PICKLE):
+        return from_pickle(path)
+
+    elif path.endswith(JSON):
+        return from_json(path)
+
+    else:
+        raise IOError(
+            f'There is a problem with your file. Please ensure the file you submitted is correctly formatted with a'
+            f'.csv or .tsv file extension.'
+        )
+
+
+"""Pipeline input scores"""
+
+
+def _codify_input_data(df: pd.DataFrame,
+                       method: str,
+                       binning: bool,
+                       absolute_value: bool,
+                       p_value: float,
+                       threshold: Optional[float],
+                       ) -> Union[Dict[str, Dict[str, int]],
+                                  Dict[str, int]]:
+    """Process the input scores for the codifying process."""
+    # Ensure that node labeling is in the provided dataset.
+    if not any(n in df.columns for n in NODE_LABELING):
+        raise ValueError(
+            f'Ensure that your file contains a column {NODE_LABELING} with node IDs.'
+        )
+    # Standardize the title of the node column labeling column to 'label', for later processing.
+    elif LABEL not in df.columns:
+        for l in list(df.columns):
+            if l in NODE_LABELING:
+                df = df.rename(columns={l: LABEL})
+                break
+
+    # If node type provided in a column, classify in a dictionary the input codification by its node type.
+    if NODE_TYPE in df.columns:
+
+        node_types = list(set(df[NODE_TYPE]))  # Get the node types list set.
+        codified_by_type_dict = {}
+
+        for node_type in node_types:
+            # Filter the nodes by the iterable type.
+            df_by_type = df.loc[df[NODE_TYPE] == node_type]
+
+            # Codify the nodes for the iterable type.
+            codified_by_type_dict[node_type] = _codify_method_check(df_by_type,
+                                                                    method,
+                                                                    binning,
+                                                                    absolute_value,
+                                                                    p_value,
+                                                                    threshold
+                                                                    )
+        return codified_by_type_dict
+
+    else:
+        # Codify all the nodes of the dataframe.
+        return _codify_method_check(df,
+                                    method,
+                                    binning,
+                                    absolute_value,
+                                    p_value,
+                                    threshold
+                                    )
+
+
+def _codify_method_check(df: pd.DataFrame,
+                         method: str,
+                         binning: bool,
+                         absolute_value: bool,
+                         p_value: float,
+                         threshold: Optional[float],
+                         ) -> Dict[str, int]:
+    """Classify the input data codification according the diffusion method."""
+    # Prepare input data for quantitative diffusion scoring methods
+    if method == RAW or method == Z:
+        return _codify_quantitative_input_data(df, binning, absolute_value, p_value, threshold)
+
+    # Prepare input data for non-quantitative diffusion methods
+    elif method == ML or method == GM:
+        return _codify_non_quantitative_input_data(df, p_value, threshold)
+
+    else:
+        # TODO: ber_s, ber_p, mc
+        raise NotImplementedError('This diffusion method has not yet been implemented.')
+
+
+"""Assign binary labels to input for scoring methods that accept non-quantitative values"""
+
+
+def _codify_non_quantitative_input_data(
+        df: pd.DataFrame,
+        p_value: float,
+        threshold: Optional[float]
+) -> Dict[str, int]:
+    """Codify input data to get a set of labelled nodes for scoring methods that accept non-quantitative values."""
+    # LogFC provided in dataset and threshold given
+    if LOG_FC in df.columns and threshold:
+
+        # Label nodes with 1 if | logFC | passes threshold
+        df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1
+        # Label nodes with -1 if | logFC | below threshold
+        df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = -1
+
+        # If adjusted p-values and a cut-off are provided, label nodes that are not statistically significant with -1
+        if P_VALUE in df.columns and p_value is not None:
+            df.loc[df[P_VALUE] > p_value, SCORE] = -1
+
+        return df.set_index(LABEL if LABEL in df.columns else NODE)[SCORE].to_dict()
+
+    # No logFC or no threshold: every node gets 1. Index by the standardized LABEL column, else by a raw NODE column.
+    df[SCORE] = 1
+
+    return df.set_index(LABEL if LABEL in df.columns else NODE)[SCORE].to_dict()
+
+
+"""Assign binary labels to input for scoring methods that accept quantitative values"""
+
+
+def _codify_quantitative_input_data(
+        df: pd.DataFrame,
+        binning: bool,
+        absolute_value: bool,
+        p_value: float,
+        threshold: Optional[float],
+) -> Dict[str, int]:
+    """Codify input data to get a set of labelled nodes for scoring methods that accept quantitative values."""
+    # LogFC provided in dataset and threshold given
+    if LOG_FC in df.columns and threshold:
+
+        # Binarize labels with 1, 0 and/or -1
+        if binning is True:
+
+            # Add binning labels where | logFC | values above threshold are 1 and below are 0
+            if absolute_value is True:
+                return _bin_quantitative_input_by_abs_val(df, threshold, p_value)
+
+            # Add signed labels where | logFC | values above threshold are 1 or -1 (signed) and values below are 0
+
+            return _bin_quantitative_input_by_threshold(df, threshold, p_value)
+
+        # Labels are 0s or logFC values rather than binary values
+        else:
+            # Codify inputs with | logFC | if they pass threshold; otherwise assign label as 0
+            if absolute_value is True:
+                return _codify_quantitative_input_by_abs_val(df, threshold, p_value)
+
+            # Codify inputs with logFC if they pass threshold; otherwise assign label as 0
+            return _codify_quantitative_input_by_threshold(df, threshold, p_value)
+
+    # No logFC or no threshold: every node gets 1. Index by the standardized LABEL column, else by a raw NODE column.
+    df[SCORE] = 1
+
+    return df.set_index(LABEL if LABEL in df.columns else NODE)[SCORE].to_dict()
+
+
+def _bin_quantitative_input_by_abs_val(
+        df: pd.DataFrame,
+        threshold: float,
+        p_value: float,
+) -> Dict[str, int]:
+    """Process quantitative inputs and bin labels by absolute value (1 at or above the threshold, 0 below)."""
+    # Add label 1 if | logFC | is above threshold
+    df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1
+    # Add label 0 if | logFC | below threshold
+    df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0
+
+    # logFC and adjusted p-values are provided in dataset
+    if P_VALUE in df.columns:
+        return _remove_non_significant_entities(df, p_value)
+
+    return df.set_index(LABEL if LABEL in df.columns else NODE)[SCORE].to_dict()  # LABEL once standardized
+
+
+def _bin_quantitative_input_by_threshold(
+        df: pd.DataFrame,
+        threshold: float,
+        p_value: float,
+) -> Dict[str, int]:
+    """Process quantitative inputs and bin labels by threshold (1 above, 0 within, -1 below the negative threshold)."""
+    # Add label 1 if logFC is above threshold
+    df.loc[df[LOG_FC] >= threshold, SCORE] = 1
+    # Add label 0 if | logFC | below threshold
+    df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0
+    # Replace remaining score labels with -1 (| logFC | above threshold but negative); other columns keep their NaNs
+    df[SCORE] = df[SCORE].fillna(-1)
+
+    if p_value:
+        # LogFC values and adjusted p-values are provided in dataset
+        if P_VALUE in df.columns:
+            # Disregard entities if logFC adjusted p-value is not significant
+            return _remove_non_significant_entities(df, p_value)
+
+    return df.set_index(LABEL if LABEL in df.columns else NODE)[SCORE].to_dict()  # LABEL once standardized
+
+
+"""Assign logFC as labels for input for scoring methods that accept quantitative values"""
+
+
+def _codify_quantitative_input_by_abs_val(
+        df: pd.DataFrame,
+        threshold: float,
+        p_value: float,
+) -> Dict[str, int]:
+    """Score nodes with | logFC | when it reaches the threshold; nodes below it are scored 0."""
+    abs_log_fc = df[LOG_FC].abs()
+    # Nodes passing the threshold keep | logFC |, the ones falling below it are set to 0
+    df.loc[abs_log_fc >= threshold, SCORE] = abs_log_fc
+    df.loc[abs_log_fc < threshold, SCORE] = 0
+
+    # Only datasets providing adjusted p-values can be filtered by significance
+    if P_VALUE not in df.columns:
+        return df.set_index(NODE)[SCORE].to_dict()
+
+    # Disregard entities if logFC adjusted p-value is not significant
+    return _remove_non_significant_entities(df, p_value)
+
+
+def _codify_quantitative_input_by_threshold(
+        df: pd.DataFrame,
+        threshold: float,
+        p_value: float,
+) -> Dict[str, int]:
+    """Score nodes with their signed logFC when | logFC | reaches the threshold, otherwise with 0."""
+    passes_threshold = (df[LOG_FC]).abs() >= threshold
+    # Nodes beyond the threshold keep their signed logFC, whatever the sign; the ones below it are set to 0
+    df.loc[passes_threshold, SCORE] = df[LOG_FC]
+    df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0
+
+    # Disregard entities if logFC adjusted p-value is not significant (needs the p-value column in the dataset)
+    if P_VALUE in df.columns:
+        return _remove_non_significant_entities(df, p_value)
+
+    return df.set_index(NODE)[SCORE].to_dict()
+
+
+def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[str, int]:
+    """Score 0 the entities whose adjusted p-value is above p_value and return the node to score mapping."""
+    not_significant = df[P_VALUE] > p_value
+    df.loc[not_significant, SCORE] = 0
+    return df.set_index(NODE)[SCORE].to_dict()
+
+
+"""Data structures format checkers"""
+
+
+def _scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
+    """Check data structure type Dict[str, number]; float scores (e.g. logFC) are valid besides ints."""
+    return (isinstance(v, dict) and
+            isinstance(get_random_value_from_dict(v), (int, float, np.number))
+            )
+
+
+def _type_scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
+    """Check data structure type Dict[str, Dict[str, number]]; float scores are valid besides ints."""
+    return (isinstance(v, dict) and
+            isinstance(get_random_value_from_dict(v), dict) and
+            isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), (int, float, np.number))
+            )
+
+
+def _label_list_data_struct_check(v: Union[dict, list]) -> bool:
+    """Check data structure type list (only the container type is checked, not its elements)."""
+    return isinstance(v, list)
+
+
+def _type_label_list_data_struct_check(v: Union[dict, list]) -> bool:
+    """Check data structure type Dict[str, list], inspecting the value given by get_random_value_from_dict."""
+    if not isinstance(v, dict):
+        return False
+    return isinstance(get_random_value_from_dict(v), list)
+
+
+"""Mappers from input to network background"""
+
+
+def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]],
+                     background_labels: Union[Dict[str, list], list]) -> Union[Dict[str, int], list]:
+    """Map input labels to the background ones, per node type when the background is a type dict of label lists."""
+    if isinstance(background_labels, list):
+        return _map_labels_to_background(input_labels, background_labels)
+
+    if not isinstance(background_labels, dict):
+        raise IOError(
+            f'{EMOJI} The background mapping labels should be provided as a label list or as a type dict of label list.'
+        )
+
+    # Compute the mapping of every node type only once, then drop the types left without any mapped label
+    mappings = {node_type: _map_labels_to_background(input_labels, node_set, node_type)
+                for node_type, node_set in background_labels.items()
+                }
+    return {node_type: mapping for node_type, mapping in mappings.items() if mapping not in [[], {}]}
+
+
+def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
+                              background_labels: list,
+                              background_labels_type: str = None
+                              ) -> Union[Dict[str, Dict[str, int]],
+                                         Dict[str, int]]:
+    """Map input labels to the background labels, resolving typed inputs against the background node type."""
+    if _type_scores_dict_data_struct_check(input_labels) or _type_label_list_data_struct_check(input_labels):
+        if background_labels_type:
+            if background_labels_type in input_labels.keys():
+                return _map_labels(input_labels[background_labels_type], background_labels)
+        else:
+            # Map every input type only once, then drop the types left without any mapped label
+            mapped = {label_type: _map_labels(labels, background_labels) for label_type, labels in input_labels.items()}
+            return {label_type: mapping for label_type, mapping in mapped.items() if mapping not in [[], {}]}
+
+    # Reached by untyped inputs and by typed inputs lacking the requested background type: in both cases
+    # the input is mapped as a whole against the background labels
+    return _map_labels(input_labels, background_labels)
+
+
+def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
+                background_labels: list) -> Union[Dict[str, int], list]:
+    """Keep the input labels (or label scores) present in the background; type dicts are merged into one result."""
+    if _label_list_data_struct_check(input_labels):
+        return list(set(input_labels).intersection(set(background_labels)))
+
+    elif _scores_dict_data_struct_check(input_labels):
+        return {label: input_labels[label]
+                for label in background_labels if label in input_labels
+                }
+
+    elif _type_label_list_data_struct_check(input_labels):
+        # Concatenate the mapping of every type; only the values are needed, so the type keys are not iterated
+        mapped_labels = []
+        for label_list in input_labels.values():
+            mapped_labels += _map_labels(label_list, background_labels)
+        return mapped_labels
+
+    elif _type_scores_dict_data_struct_check(input_labels):
+        mapped_scores = {}
+        for scores_dict in input_labels.values():
+            mapped_scores.update(_map_labels(scores_dict, background_labels))
+        return mapped_scores
+
+    else:
+        raise TypeError(
+            f'{EMOJI} The input labels data structure can not be processed for label mapping'
+        )
+
+
+"""Generate/format data input as a vector/matrix for the diffusion computation matching the kernel rows"""
+
+
+def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]],
+                               kernel: Matrix) -> Matrix:
+    """Format/generate input vector/matrix according the data structure of the processed_data_input."""
+    if _label_list_data_struct_check(processed_input):
+        return format_categorical_input_vector_from_label_list(
+            rows_labeled=processed_input, col_label='scores', kernel=kernel)
+
+    elif _scores_dict_data_struct_check(processed_input):
+        return format_input_vector_from_scores_dict(processed_input, kernel)
+
+    elif _type_label_list_data_struct_check(processed_input):
+        # The matrix formatter takes the label lists, the column labels (one per type) and the kernel, in that order
+        return format_categorical_input_matrix_from_label_list(
+            list(processed_input.values()), list(processed_input.keys()), kernel)
+
+    elif _type_scores_dict_data_struct_check(processed_input):
+        return format_input_matrix_from_scores_dict(processed_input, kernel)
+
+    else:
+        raise TypeError(
+            f'{EMOJI} The label/scores mapping data structure can not be processed for the input formatting.'
+        )
+
+
+"""Generate categorical (non-quantitative) input vector matrix from raw input dataset labels"""
+
+
+def format_categorical_input_vector_from_label_list(rows_labeled,
+                                                    col_label,
+                                                    kernel,
+                                                    missing_value=-1,
+                                                    rows_unlabeled=None  # TODO: To discuss, to handle
+                                                    ) -> Matrix:
+    """Generate a categorical input vector: labeled rows are initialised to 1, optional unlabeled rows to 0."""
+    cols_labels = [col_label] if isinstance(col_label, str) else col_label
+
+    # Every labeled row starts with value 1
+    labeled_mat = Matrix(
+        rows_labels=list(rows_labeled), cols_labels=cols_labels, init_value=1
+    )
+
+    if rows_unlabeled:
+        # Explicitly unlabeled rows are bound to the labeled ones with value 0
+        unlabeled_mat = Matrix(rows_labels=list(rows_unlabeled), cols_labels=cols_labels, init_value=0)
+        labeled_mat.row_bind(matrix=unlabeled_mat)
+
+    # Presumably adds the kernel rows missing in the input with missing_value before matching the kernel rows
+    # (relies on Matrix.match_missing_rows / Matrix.match_rows, defined outside this module)
+    completed_mat = labeled_mat.match_missing_rows(kernel.rows_labels, missing_value)
+    return completed_mat.match_rows(kernel)
+
+
+def format_categorical_input_matrix_from_label_list(rows_labels,
+                                                    cols_labels: list,
+                                                    kernel,
+                                                    missing_value=-1,
+                                                    rows_unlabeled=None  # TODO: To discuss, to handle
+                                                    ) -> Matrix:
+    """Generate a categorical input matrix with one column per column label.
+
+    :param rows_labels: list with the labeled rows of each column (or a flat label list for a single column)
+    :param cols_labels: list with the label of each column
+    :param kernel: kernel whose rows the input is matched to
+    :param missing_value: value forwarded to the vector formatter for the rows missing in the input
+    :param rows_unlabeled: optional unlabeled rows, provided per column like rows_labels
+    :return: input matrix resulting from binding the vector of every column
+    :raises NotImplementedError: if cols_labels is not a list
+    """
+    if not isinstance(cols_labels, list):
+        raise NotImplementedError('The column labels should be provided as a list.')
+
+    if len(cols_labels) <= 1:
+        # A single column given in the per-column layout is unwrapped to the flat lists the vector formatter expects
+        if isinstance(rows_labels, list) and len(rows_labels) == 1 and isinstance(rows_labels[0], list):
+            rows_labels = rows_labels[0]
+            rows_unlabeled = rows_unlabeled[0] if rows_unlabeled else None
+        return format_categorical_input_vector_from_label_list(
+            rows_labels, cols_labels, kernel, missing_value, rows_unlabeled
+        )
+
+    # rows_unlabeled defaults to None and can not be indexed: use one None placeholder per column instead
+    unlabeled = rows_unlabeled if rows_unlabeled else [None] * len(rows_labels)
+    input_mat = format_categorical_input_vector_from_label_list(
+        rows_labels[0], cols_labels[0], kernel, missing_value, unlabeled[0]
+    )
+    for idx, row_label in enumerate(rows_labels[1:], start=1):
+        input_vector = format_categorical_input_vector_from_label_list(
+            row_label, cols_labels[idx], kernel, missing_value, unlabeled[idx]
+        )
+        input_mat.col_bind(matrix=input_vector)
+    return input_mat
+
+
+"""Generate quantitative or binarized/categorical input vector matrix from preprocessed input dataset scores"""
+
+
+def format_input_vector_from_scores_dict(scores_dict: dict,
+                                         kernel,
+                                         col_label: str = 'scores',
+                                         missing_value=-1,
+                                         rows_unlabeled=None  # TODO: To discuss, to handle
+                                         ) -> Matrix:
+    """Generate scores input vector from labels scores dict; optional unlabeled rows are bound with value 0."""
+    input_mat = Matrix(
+        mat=np.array(list(scores_dict.values())),
+        rows_labels=list(scores_dict.keys()),
+        cols_labels=[col_label]
+    )
+
+    if rows_unlabeled:
+        # The column label is wrapped in a list as above, so both matrices are built with the same column labels
+        input_mat.row_bind(
+            matrix=Matrix(
+                rows_labels=list(rows_unlabeled),
+                cols_labels=[col_label],
+                init_value=0)
+        )
+
+    return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel)
+
+
+def format_input_matrix_from_scores_dict(scores_dicts: Union[Dict[str, Dict[str, int]],
+                                                             Dict[str, int]],
+                                         kernel,
+                                         rows_unlabeled=None,  # TODO: To discuss, to handle
+                                         ) -> Matrix:
+    """Generate input matrix from labels scores dict and/or handle type classification by columns.
+
+    :param scores_dicts: scores dict of each node type (one column each) or a single flat scores dict
+    :param kernel: kernel whose rows the input is matched to
+    :param rows_unlabeled: optional unlabeled rows, forwarded to every node type column
+    :return: input matrix with one column per node type, or a single 'scores' column for a flat dict
+    """
+    # Only dict values are per-type scores; the argument is no longer mutated (entries used to be popped from it)
+    typed_scores = {node_type: scores for node_type, scores in scores_dicts.items() if isinstance(scores, dict)}
+
+    # A flat scores dict (label -> score) has no type classification and yields a single vector
+    if not typed_scores:
+        return format_input_vector_from_scores_dict(scores_dicts, kernel)
+
+    input_mat = None
+    for node_type, scores_dict in typed_scores.items():
+        column = format_input_vector_from_scores_dict(scores_dict, kernel, node_type, rows_unlabeled=rows_unlabeled)
+        # The first node type column initialises the matrix, the following ones are bound to it
+        if input_mat is None:
+            input_mat = column
+        else:
+            input_mat.col_bind(matrix=column)
+    return input_mat
diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
deleted file mode 100644
index bedc100..0000000
--- a/src/diffupy/process_input.py
+++ /dev/null
@@ -1,315 +0,0 @@
-# -*- coding: utf-8 -*-
-
-"""Main matrix class and processing of input data."""
-from typing import Dict, List, Optional
-
-import networkx as nx
-import pandas as pd
-
-from .constants import *
-from .matrix import Matrix
-
-"""Process datasets"""
-
-
-def process_input(
-        path: str,
-        method: str,
-        binning: bool,
-        absolute_value: bool,
-        p_value: float,
-        threshold: Optional[float],
-) -> Dict[str, int]:
-    """Read input file and ensure necessary columns exist."""
-    if path.endswith(CSV):
-        fmt = CSV
-
-    elif path.endswith(TSV):
-        fmt = TSV
-
-    else:
-        raise IOError(
-            f'There is a problem with your file. Please ensure the file you submitted is correctly formatted with a'
-            f'.csv or .tsv file extension.'
-        )
-
-    df = pd.read_csv(
-        path,
-        header=0,
-        sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV]
-    )
-
-    # Ensure that column Node is in dataset
-    if NODE not in df.columns:
-        raise ValueError(
-            f'Ensure that your file contains a column {NODE} with node IDs.'
-        )
-
-    # If logFC column not in dataFrame, ensure node type column is at least given
-    elif LOG_FC not in df.columns:
-        if NODE_TYPE not in df.columns:
-            raise ValueError(
-                f'Ensure that your file contains a column, {NODE_TYPE}, indicating node types.'
-            )
-
-    return _codify_input_data(df, method, binning, absolute_value, p_value, threshold)
-
-
-"""Codify input according to diffusion scoring method"""
-
-
-def _codify_input_data(
-        df: pd.DataFrame,
-        method: str,
-        binning: bool,
-        absolute_value: bool,
-        p_value: float,
-        threshold: Optional[float],
-) -> Dict[str, int]:
-    """Prepare input data for diffusion."""
-    # Prepare input data for quantitative diffusion scoring methods
-    if method == RAW or method == Z:
-        return _codify_quantitative_input_data(df, binning, absolute_value, p_value, threshold)
-
-    # Prepare input data for non-quantitative diffusion methods
-    elif method == ML or method == GM:
-        return _codify_non_quantitative_input_data(df, p_value, threshold)
-
-    else:
-        # TODO: ber_s, ber_p, mc
-        raise NotImplementedError('This diffusion method has not yet been implemented.')
-
-
-"""Assign binary labels to input for scoring methods that accept non-quantitative values"""
-
-
-def _codify_non_quantitative_input_data(
-        df: pd.DataFrame,
-        p_value: float,
-        threshold: Optional[float]
-) -> Dict[str, int]:
-    """Codify input data to get a set of labelled nodes for scoring methods that accept non-quantitative values."""
-    # LogFC provided in dataset and threshold given
-    if LOG_FC in df.columns and threshold:
-
-        # Label nodes with 1 if | logFC | passes threshold
-        df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = 1
-        # Label nodes with -1 if | logFC | below threshold
-        df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = -1
-
-        # If adjusted p-values are provided in dataset, label nodes that are not statistically significant with -1
-        if P_VALUE in df.columns:
-            df.loc[df[P_VALUE] > p_value, LABEL] = -1
-
-        return df.set_index(NODE)[LABEL].to_dict()
-
-    # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1
-    df[LABEL] = 1
-
-    return df.set_index(NODE)[LABEL].to_dict()
-
-
-"""Assign binary labels to input for scoring methods that accept quantitative values"""
-
-
-def _codify_quantitative_input_data(
-        df: pd.DataFrame,
-        binning: bool,
-        absolute_value: bool,
-        p_value: float,
-        threshold: Optional[float],
-) -> Dict[str, int]:
-    """Codify input data to get a set of labelled nodes for scoring methods that accept quantitative values."""
-    # LogFC provided in dataset and threshold given
-    if LOG_FC in df.columns and threshold:
-
-        # Binarize labels with 1, 0 and/or -1
-        if binning is True:
-
-            # Add binning labels where | logFC | values above threshold are 1 and below are 0
-            if absolute_value is True:
-                return _bin_quantitative_input_by_abs_val(df, threshold, p_value)
-
-            # Add signed labels where | logFC | values above threshold are 1 or -1 (signed) and values below are 0
-
-            return _bin_quantitative_input_by_threshold(df, threshold, p_value)
-
-        # Labels are 0s or logFC values rather than binary values
-        else:
-            # Codify inputs with | logFC | if they pass threshold; otherwise assign label as 0
-            if absolute_value is True:
-                return _codify_quantitative_input_by_abs_val(df, threshold, p_value)
-
-            # Codify inputs with logFC if they pass threshold; otherwise assign label as 0
-            return _codify_quantitative_input_by_threshold(df, threshold, p_value)
-
-    # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1
-    df[LABEL] = 1
-
-    # TODO handle NODE_TYPE
-    return df.set_index(NODE)[LABEL].to_dict()
-
-
-def _bin_quantitative_input_by_abs_val(
-        df: pd.DataFrame,
-        threshold: float,
-        p_value: float,
-) -> Dict[str, int]:
-    """Process quantitative inputs and bin labels by absolute value."""
-    # Add label 1 if | logFC | is above threshold
-    df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = 1
-    # Add label 0 if | logFC | below threshold
-    df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0
-
-    # logFC and adjusted p-values are provided in dataset
-    if P_VALUE in df.columns:
-        return _remove_non_significant_entities(df, p_value)
-
-    return df.set_index(NODE)[LABEL].to_dict()
-
-
-def _bin_quantitative_input_by_threshold(
-        df: pd.DataFrame,
-        threshold: float,
-        p_value: float,
-) -> Dict[str, int]:
-    """Process quantitative inputs and bin labels by threshold."""
-    # Add label 1 if logFC is above threshold
-    df.loc[df[LOG_FC] >= threshold, LABEL] = 1
-    # Add label 0 if | logFC | below threshold
-    df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0
-    # Replace remaining labels with -1 (i.e. | logFC | above threshold but sign is negative)
-    df = df.fillna(-1)
-
-    if p_value:
-        # LogFC values and adjusted p-values are provided in dataset
-        if P_VALUE in df.columns:
-            # Disregard entities if logFC adjusted p-value is not significant
-            return _remove_non_significant_entities(df, p_value)
-
-    return df.set_index(NODE)[LABEL].to_dict()
-
-
-"""Assign logFC as labels for input for scoring methods that accept quantitative values"""
-
-
-def _codify_quantitative_input_by_abs_val(
-        df: pd.DataFrame,
-        threshold: float,
-        p_value: float,
-) -> Dict[str, int]:
-    """Codify nodes with | logFC | if they pass threshold, otherwise label is 0."""
-    # Codify nodes with | logFC | if they pass threshold
-    df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = (df[LOG_FC]).abs()
-    # Codify nodes with label 0 if it falls below threshold
-    df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0
-
-    # LogFC and adjusted p-values are provided in dataset
-    if P_VALUE in df.columns:
-        # Disregard entities if logFC adjusted p-value is not significant
-        return _remove_non_significant_entities(df, p_value)
-
-    return df.set_index(NODE)[LABEL].to_dict()
-
-
-def _codify_quantitative_input_by_threshold(
-        df: pd.DataFrame,
-        threshold: float,
-        p_value: float,
-) -> Dict[str, int]:
-    """Codify inputs with logFC if they pass threshold value."""
-    df.loc[df[LOG_FC] >= threshold, LABEL] = df[LOG_FC]
-    df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0
-    df.loc[((df[LOG_FC]).abs() >= threshold) & ((df[LOG_FC]) < 0), LABEL] = df[LOG_FC]
-
-    # LogFC values and adjusted p-values are provided in dataset
-    if P_VALUE in df.columns:
-        # Disregard entities if logFC adjusted p-value is not significant
-        return _remove_non_significant_entities(df, p_value)
-
-    return df.set_index(NODE)[LABEL].to_dict()
-
-
-def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> pd.DataFrame:
-    # Label entity 0 if adjusted p-value for logFC is not significant
-    df.loc[df[P_VALUE] > p_value, LABEL] = 0
-
-    return df.set_index(NODE)[LABEL].to_dict()
-
-
-"""Map nodes from input to network"""
-
-
-def map_nodes(input_node_dict: Dict[str, int], network: nx.Graph) -> List:
-    """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes."""
-    # List of nodes in network
-    network_nodes = list(network.nodes)
-
-    return [input_node_dict[node] if node in input_node_dict else None for node in network_nodes]
-
-
-"""Generate input vector from dataset labels"""
-
-
-def generate_categoric_input_vector_from_labels(
-        rows_labeled,
-        col_label,
-        background_mat,
-        missing_value=-1,
-        rows_unlabeled=None,
-):
-    """Generate categoric input vector from labels."""
-    if isinstance(col_label, str):
-        col_label = [col_label]
-
-    input_mat = Matrix(
-        rows_labels=list(rows_labeled),
-        cols_labels=col_label,
-        init_value=1)
-    if rows_unlabeled:
-        input_mat.row_bind(
-            matrix=Matrix(
-                rows_labels=list(rows_unlabeled),
-                cols_labels=col_label,
-                init_value=0)
-        )
-
-    return input_mat.match_missing_rows(background_mat.rows_labels, missing_value).match_rows(background_mat)
-
-
-def generate_categoric_input_from_labels(
-        rows_labels,
-        cols_labels,
-        background_mat,
-        missing_value=-1,
-        rows_unlabeled=None,
-):
-    """Generate input vector from labels."""
-    if isinstance(cols_labels, list) and len(cols_labels) > 1:
-        input_mat = generate_categoric_input_vector_from_labels(
-            rows_labels[0],
-            cols_labels[0],
-            background_mat,
-            missing_value,
-            rows_unlabeled[0]
-        )
-
-        for idx, row_label in enumerate(rows_labels[1:]):
-            input_vector = generate_categoric_input_vector_from_labels(
-                row_label,
-                cols_labels[idx + 1],
-                background_mat,
-                missing_value,
-                rows_unlabeled[idx + 1],
-            )
-            input_mat.col_bind(matrix=input_vector)
-
-        return input_mat
-    else:
-        return generate_categoric_input_vector_from_labels(
-            rows_labels,
-            cols_labels,
-            background_mat,
-            missing_value,
-            rows_unlabeled
-        )
diff --git a/tests/constants.py b/tests/constants.py
index 690ab12..d60e38b 100644
--- a/tests/constants.py
+++ b/tests/constants.py
@@ -14,7 +14,7 @@
 REGULARISED_LAPLACIAN_KERNEL = os.path.join(RESOURCES_FOLDER, 'regularisedLaplacianKernel.csv')
 
 DATASETS_FOLDER = os.path.join(RESOURCES_FOLDER, 'datasets')
-NODE_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node.csv')
+NODE_TYPE_COL_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_type_col.csv')
 NODE_LOGFC_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_logfc.csv')
 NODE_LOGFC_PVAL_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_logfc_pval.csv')
 INPUT_SCORES = os.path.join(RESOURCES_FOLDER, 'input_scores.csv')
diff --git a/tests/resources/datasets/node.csv b/tests/resources/datasets/node_type_col.csv
similarity index 100%
rename from tests/resources/datasets/node.csv
rename to tests/resources/datasets/node_type_col.csv
diff --git a/tests/test_diffusion.py b/tests/test_diffusion.py
index 4fee51c..4e911df 100644
--- a/tests/test_diffusion.py
+++ b/tests/test_diffusion.py
@@ -10,7 +10,7 @@
 
 from diffupy.diffuse import diffuse
 from diffupy.matrix import Matrix
-from tests.constants import *
+from .constants import *
 
 log = logging.getLogger(__name__)
 
diff --git a/tests/test_input.py b/tests/test_input.py
index 47bc22c..dcf3704 100644
--- a/tests/test_input.py
+++ b/tests/test_input.py
@@ -7,11 +7,11 @@
 
 from diffupy.constants import *
 from diffupy.matrix import Matrix
-from diffupy.process_input import process_input, map_nodes
-from diffupy.utils import process_network
+from diffupy.process_data_input import process_data_input, _map_labels_to_background, map_labels_input
+from diffupy.process_network import get_graph_from_df
 from diffupy.validate_input import _validate_scores
 
-from tests.constants import *
+from .constants import *
 
 log = logging.getLogger(__name__)
 
@@ -21,24 +21,27 @@ class ValidateTest(unittest.TestCase):
 
     def test_quantitative_bin_id(self):
         """Test codify label_input for quantitative scoring methods- only entity IDs given (binary labels)."""
-        input = NODE_TEST_PATH
-        input_labels_dict = process_input(
+        input = NODE_TYPE_COL_TEST_PATH
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=None,
         )
-        self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 1})
+        print(input_labels_dict)
+        self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}})
 
     def test_quantitative_bin_fc_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC given (binary, signed labels)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5,
         )
+        print(input_labels_dict)
+
         self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': -1})
 
     def test_quantitative_bin_fc_abs(self):
         """Test codify label_input for quantitative scoring methods- logFC given (binary, absolute values)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': 1})
@@ -46,7 +49,7 @@ def test_quantitative_bin_fc_abs(self):
     def test_quantitative_bin_fcp_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (binary, signed labels)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': -1})
@@ -54,7 +57,7 @@ def test_quantitative_bin_fcp_sign(self):
     def test_quantitative_bin_fcp_abs(self):
         """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (binary, absolute values)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': 1})
@@ -62,7 +65,7 @@ def test_quantitative_bin_fcp_abs(self):
     def test_quantitative_fc_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC given (quantitative, signed labels)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2})
@@ -70,7 +73,7 @@ def test_quantitative_fc_sign(self):
     def test_quantitative_fc_abs(self):
         """Test codify label_input for quantitative scoring methods- logFC given (quant., absolute values)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2})
@@ -78,7 +81,7 @@ def test_quantitative_fc_abs(self):
     def test_quantitative_fcp_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (quant., signed labels)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2})
@@ -86,23 +89,23 @@ def test_quantitative_fcp_sign(self):
     def test_quantitative_fcp_abs(self):
         """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (quant., absolute values)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2})
 
     def test_non_quantitative_bin_id(self):
         """Test codify label_input for non-quantitative scoring methods- only entity IDs given (binary labels)."""
-        input = NODE_TEST_PATH
-        input_labels_dict = process_input(
+        input = NODE_TYPE_COL_TEST_PATH
+        input_labels_dict = process_data_input(
             input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=None,
         )
-        self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 1})
+        self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}})
 
     def test_non_quantitative_bin_fc_abs(self):
         """Test codify label_input for non-quantitative scoring methods- logFC given (binary, absolute values (sign))."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': -1, 'D': -1, 'E': 1})
@@ -110,14 +113,72 @@ def test_non_quantitative_bin_fc_abs(self):
     def test_non_quantitative_bin_fcp_abs(self):
         """Test codify label_input for non-quant. scoring methods- logFC and adj. p-value given (binary, absolute values)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': -1, 'B': 1, 'C': -1, 'D': -1, 'E': 1})
 
+    def test_map_labels_input_label_list_background_list(self):
+        """Test map label_input."""
+        mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'],
+                                   background_labels=['A', 'B', 'C'])
+
+        self.assertEqual(set(mapping), {'A', 'C', 'B'})
+
+    def test_map_labels_input_label_list_background_dict(self):
+        """Test map label_input."""
+        mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'],
+                                   background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']})
+
+        self.assertEqual(mapping, {'Gene': ['A', 'B'], 'Metabolite': ['C']})
+
+    def test_map_labels_input_type_dict_label_list_background_list(self):
+        """Test map label_input."""
+        mapping = map_labels_input(input_labels={'Gene': ['A', 'B'], 'Metabolite': ['C', 'D']},
+                                   background_labels=['A', 'B', 'C'])
+
+        self.assertEqual(mapping, {'Gene': ['A', 'B'], 'Metabolite': ['C']})
+
+    def test_map_labels_input_type_dict_label_dict_background_dict(self):
+        """Test map label_input."""
+        # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped.
+        mapping = map_labels_input(input_labels={'Gene': ['A'], 'Metabolite': ['C', 'B']},
+                                   background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']})
+
+        self.assertEqual(mapping, {'Gene': ['A']})
+
+    def test_map_labels_input_label_scores_dict_background_list(self):
+        """Test map label_input."""
+        mapping = map_labels_input(input_labels={'A': 1, 'B': 1, 'D': 1, 'E': 1},
+                                   background_labels=['B', 'C', 'D'])
+
+        self.assertEqual(mapping, {'B': 1, 'D': 1})
+
+    def test_map_labels_input_label_scores_dict_background_dict(self):
+        """Test map label_input."""
+        mapping = map_labels_input(input_labels={'A': 1, 'B': 1, 'D': 1, 'E': 1},
+                                   background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']})
+
+        self.assertEqual(mapping, {'Metabolite': {'D': 1}, 'Gene': {'A': 1, 'B': 1}})
+
+    def test_map_labels_input_type_dict_label_scores_dict_background_list(self):
+        """Test map label_input."""
+        mapping = map_labels_input(input_labels={'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}},
+                                   background_labels=['A', 'B', 'C'])
+
+        self.assertEqual(mapping, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1}})
+
+    def test_map_labels_input_type_dict_label_scores_dict_background_dict(self):
+        """Test map label_input."""
+        # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped.
+        mapping = map_labels_input(input_labels={'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}},
+                                   background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']})
+
+        self.assertEqual(mapping, {'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1}})
+
     def test_network(self):
         """Test generate graph from csv."""
-        graph = process_network(NETWORK_PATH, CSV)
+        graph = get_graph_from_df(NETWORK_PATH, CSV)
         graph_nodes = set(graph.nodes())
         graph_edges = set(graph.edges())
 
@@ -138,14 +199,14 @@ def test_network(self):
     def test_node_mapping(self):
         """Test mapping of nodes in label_input to nodes in network."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_input(
+        input_labels_dict = process_data_input(
             input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5,
         )
 
-        graph = process_network(NETWORK_PATH, CSV)
+        graph = get_graph_from_df(NETWORK_PATH, CSV)
         graph_nodes = list(graph.nodes())
 
-        mapped_nodes_list = map_nodes(input_labels_dict, graph_nodes)
+        mapped_nodes_list = _map_labels_to_background(input_labels_dict, graph_nodes)
 
         self.assertEqual(mapped_nodes_list, [0.0, 1.0, 0.0, 0.0, None, 1.0, None, None, None])
 

From dd8e4e13bd1611a2cb2359ce09736240cd44e31b Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 10 Apr 2020 18:37:27 +0200
Subject: [PATCH 03/17] Diffupy Process data input utils

---
 src/diffupy/utils.py | 179 +++++++++++--------------------------------
 1 file changed, 45 insertions(+), 134 deletions(-)

diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py
index d747520..ca4ae7b 100644
--- a/src/diffupy/utils.py
+++ b/src/diffupy/utils.py
@@ -5,24 +5,54 @@
 import json
 import logging
 import pickle
+import random
 import warnings
-from typing import List, Tuple
+from typing import List
 
 import networkx as nx
 import numpy as np
 import pandas as pd
 import pybel
-
-from networkx import DiGraph, read_graphml, read_gml, node_link_graph, read_edgelist
+from networkx import Graph
 
 from .constants import *
-from .constants import CSV, TSV, GRAPHML, GML, BEL, BEL_PICKLE, NODE_LINK_JSON, EMOJI, FORMATS
-
+from .constants import CSV, TSV, GRAPH_FORMATS
 
 log = logging.getLogger(__name__)
 
 
-def get_laplacian(graph: nx.Graph, normalized: bool = False) -> np.ndarray:
+def from_dataframe_file(path: str, fmt: str) -> pd.DataFrame:
+    """Read a CSV/TSV file (first row as header) into a data frame; raise ValueError for any other format."""
+    format_checker(fmt, [CSV, TSV])  # Only delimited text can be parsed here, not every graph format
+
+    return pd.read_csv(
+        path,
+        header=0,
+        sep=FORMAT_SEPARATOR_MAPPING[fmt]
+    )
+
+
+def from_json(path: str):
+    """Read from json file."""
+    with open(path) as f:
+        return json.load(f)
+
+
+def from_pickle(input_path):
+    """Read from pickle file. Only load trusted files: unpickling can execute arbitrary code."""
+    with open(input_path, 'rb') as f:
+        unpickler = pickle.Unpickler(f)
+        return unpickler.load()
+
+
+def from_nparray_to_df(nparray: np.ndarray) -> pd.DataFrame:
+    """Convert a 2D array whose first row holds the column labels and first column the row labels to a data frame."""
+    return pd.DataFrame(data=nparray[1:, 1:],
+                        index=nparray[1:, 0],
+                        columns=nparray[0, 1:])
+
+
+def get_laplacian(graph: Graph, normalized: bool = False) -> np.ndarray:
     """Return Laplacian matrix."""
     if nx.is_directed(graph):
         warnings.warn('Since graph is directed, it will be converted to an undirected graph.')
@@ -35,7 +65,7 @@ def get_laplacian(graph: nx.Graph, normalized: bool = False) -> np.ndarray:
     return nx.laplacian_matrix(graph).toarray()
 
 
-def set_diagonal_matrix(matrix, d):
+def set_diagonal_matrix(matrix: np.ndarray, d: list) -> np.ndarray:
     """Set diagonal matrix."""
     for j, row in enumerate(matrix):
         for i, x in enumerate(row):
@@ -157,137 +187,18 @@ def print_dict_dimensions(entities_db, title):
     print(f'Total: {total} ')
 
 
-def get_simple_graph_from_multigraph(multigraph):
-    """Convert undirected graph from multigraph."""
-    graph = nx.Graph()
-    for u, v, data in multigraph.edges(data=True):
-        u = get_label_node(u)
-        v = get_label_node(v)
-
-        w = data['weight'] if 'weight' in data else 1.0
-        if graph.has_edge(u, v):
-            graph[u][v]['weight'] += w
-        else:
-            graph.add_edge(u, v, weight=w)
-
-    return graph
-
-
-"""Check formats of networks """
-
-
-def _format_checker(fmt: str) -> None:
-    """Check column sep."""
-    if fmt not in FORMATS:
+def format_checker(fmt: str, fmt_list: list = GRAPH_FORMATS) -> None:
+    """Check formats."""
+    if fmt not in fmt_list:
         raise ValueError(
             f'The selected sep {fmt} is not valid. Please ensure you use one of the following formats: '
-            f'{FORMATS}'
+            f'{fmt_list}'
         )
 
 
-"""Process networks"""
-
-
-def _read_network_file(path: str, fmt: str) -> pd.DataFrame:
-    """Read network file."""
-    _format_checker(fmt)
-
-    df = pd.read_csv(
-        path,
-        header=0,
-        sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV]
-    )
-
-    if SOURCE not in df.columns or TARGET not in df.columns:
-        raise ValueError(
-            f'Ensure that your file contains columns for {SOURCE} and {TARGET}. The column for {RELATION} is optional'
-            f'and can be omitted.'
-        )
-
-    return df
-
-
-def process_network(path: str, sep: str) -> DiGraph:
-    """Return network from dataFrame."""
-    _format_checker(sep)
-
-    df = _read_network_file(path, sep)
-
-    graph = DiGraph()
-
-    for index, row in df.iterrows():
-
-        # Get node names from data frame
-        sub_name = row[SOURCE]
-        obj_name = row[TARGET]
-
-        if RELATION in df.columns:
-
-            relation = row[RELATION]
-
-            # Store edge in the graph
-            graph.add_edge(
-                sub_name, obj_name,
-                relation=relation,
-            )
-
-        else:
-            graph.add_edge(
-                sub_name, obj_name,
-            )
-
-    return graph
-
-
-def load_json_file(path: str) -> DiGraph:
-    """Read json file."""
-    with open(path) as f:
-        return json.load(f)
-
-
-def from_pickle(input_path):
-    """Read from pickle file."""
-    with open(input_path, 'rb') as f:
-        unpickler = pickle.Unpickler(f)
-        return unpickler.load()
-
-
-def process_network_from_cli(path: str) -> nx.Graph:
-    """Load network from path."""
-    if path.endswith(CSV):
-        graph = process_network(path, CSV)
-
-    elif path.endswith(TSV):
-        graph = process_network(path, TSV)
-
-    elif path.endswith(GRAPHML):
-        graph = read_graphml(path)
-
-    elif path.endswith(GML):
-        graph = read_gml(path)
-
-    elif path.endswith(BEL):
-        graph = pybel.from_path(path)
-
-    elif path.endswith(BEL_PICKLE):
-        graph = pybel.from_pickle(path)
-
-    elif path.endswith(EDGE_LIST):
-        graph = read_edgelist(path)
-
-    elif path.endswith(NODE_LINK_JSON):
-        data = load_json_file(path)
-        graph = node_link_graph(data)
-
-    else:
-        raise IOError(
-            f'{EMOJI} The selected format is not valid. Please ensure you use one of the following formats: '
-            f'{FORMATS}'
-        )
-    return graph
+def get_random_key_from_dict(d):
+    return random.choice(list(d.keys()))
 
 
-def process_kernel_from_cli(path: str):
-    """Process kernel from cli."""
-    # TODO process different kinds of input format kernel
-    return from_pickle(path)
+def get_random_value_from_dict(d):
+    return d[get_random_key_from_dict(d)]

From 78556e3f496a2af244c94c5ae22cff446eb5fd07 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Tue, 14 Apr 2020 18:24:34 +0200
Subject: [PATCH 04/17] Format inputs refactor and tested

---
 ...process_data_input.py => process_input.py} | 163 +++++++++++-------
 tests/test_input.py                           |  89 +++++++---
 2 files changed, 164 insertions(+), 88 deletions(-)
 rename src/diffupy/{process_data_input.py => process_input.py} (82%)

diff --git a/src/diffupy/process_data_input.py b/src/diffupy/process_input.py
similarity index 82%
rename from src/diffupy/process_data_input.py
rename to src/diffupy/process_input.py
index b198fc4..ef6875b 100644
--- a/src/diffupy/process_data_input.py
+++ b/src/diffupy/process_input.py
@@ -46,7 +46,7 @@ def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict,
                                       )
 
 
-def process_data_input(data_input: Union[str, list, dict, np.ndarray, pd.DataFrame],
+def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFrame],
                        method: str = 'raw',
                        binning: bool = False,
                        absolute_value: bool = False,
@@ -485,22 +485,34 @@ def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, i
 
 
 def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]],
-                               kernel: Matrix) -> Matrix:
+                               kernel: Matrix,
+                               missing_value: int = -1) -> Matrix:
     """Format/generate input vector/matrix according the data structure of the processed_data_input."""
     if _label_list_data_struct_check(processed_input):
         return format_categorical_input_vector_from_label_list(rows_labeled=processed_input,
                                                                col_label='scores',
-                                                               kernel=kernel
+                                                               kernel=kernel,
+                                                               missing_value=missing_value
                                                                )
 
-    elif _scores_dict_data_struct_check(processed_input):
-        return format_input_vector_from_scores_dict(processed_input, kernel)
+    elif _type_dict_label_list_data_struct_check(processed_input):
+        return format_categorical_input_matrix_from_label_list(rows_labels=list(processed_input.values()),
+                                                               cols_labels=list(processed_input.keys()),
+                                                               kernel=kernel,
+                                                               missing_value=missing_value
+                                                               )
 
-    elif _type_label_list_data_struct_check(processed_input):
-        return format_categorical_input_matrix_from_label_list(processed_input, kernel)
+    elif _label_scores_dict_data_struct_check(processed_input):
+        return format_input_vector_from_label_score_dict(labels_scores_dict=processed_input,
+                                                         kernel=kernel,
+                                                         missing_value=missing_value
+                                                         )
 
-    elif _type_scores_dict_data_struct_check(processed_input):
-        return format_input_matrix_from_scores_dict(processed_input, kernel)
+    elif _type_dict_label_scores_dict_data_struct_check(processed_input):
+        return format_input_matrix_from_type_label_score_dict(type_dict_labels_scores_dict=processed_input,
+                                                              kernel=kernel,
+                                                              missing_value=missing_value
+                                                              )
 
     else:
         raise TypeError(
@@ -511,48 +523,57 @@ def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict
 """Generate categorical (non-quantitative) input vector matrix from raw input dataset labels"""
 
 
-def format_categorical_input_vector_from_label_list(rows_labeled,
-                                                    col_label,
-                                                    kernel,
-                                                    missing_value=-1,
-                                                    rows_unlabeled=None  # TODO: To discuss, to handle
+def format_categorical_input_vector_from_label_list(rows_labeled: Union[set, list],
+                                                    col_label: Union[str, set, list],
+                                                    kernel: Matrix,
+                                                    missing_value: int = -1,
+                                                    rows_unlabeled=None,
+                                                    i: int = None
                                                     ) -> Matrix:
     """Generate categoric input vector from labels."""
     if isinstance(col_label, str):
         col_label = [col_label]
 
     input_mat = Matrix(
-        rows_labels=list(rows_labeled),
+        rows_labels=list(set(rows_labeled)),
         cols_labels=col_label,
-        init_value=1)
+        init_value=1  # By default the categorical input value is 1
+    )
+
     if rows_unlabeled:
+        if i is not None:  # i == 0 is a valid column index, so do not test truthiness
+            rows_unlabeled = rows_unlabeled[i]
+
         input_mat.row_bind(
             matrix=Matrix(
                 rows_labels=list(rows_unlabeled),
                 cols_labels=col_label,
-                init_value=0)
+                init_value=0  # By default the non labeled input value is 0
+            )
         )
 
     return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel)
 
 
-def format_categorical_input_matrix_from_label_list(rows_labels,
-                                                    cols_labels: list,
-                                                    kernel,
-                                                    missing_value=-1,
-                                                    rows_unlabeled=None  # TODO: To discuss, to handle
+def format_categorical_input_matrix_from_label_list(rows_labels: Union[set, list],
+                                                    cols_labels: Union[set, list],
+                                                    kernel: Matrix,
+                                                    missing_value: int = -1,
+                                                    rows_unlabeled=None
                                                     ) -> Matrix:
     """Generate input vector from labels."""
     if not isinstance(cols_labels, list):
         raise NotImplementedError('The column labels should be provided as a list.')
 
     if len(cols_labels) > 1:
+
         input_mat = format_categorical_input_vector_from_label_list(
             rows_labels[0],
             cols_labels[0],
             kernel,
             missing_value,
-            rows_unlabeled[0]
+            rows_unlabeled,
+            i=0
         )
 
         for idx, row_label in enumerate(rows_labels[1:]):
@@ -561,75 +582,85 @@ def format_categorical_input_matrix_from_label_list(rows_labels,
                 cols_labels[idx + 1],
                 kernel,
                 missing_value,
-                rows_unlabeled[idx + 1],
+                rows_unlabeled,
+                idx + 1
             )
             input_mat.col_bind(matrix=input_vector)
 
         return input_mat
 
-    elif isinstance(cols_labels, list):
-        return format_categorical_input_vector_from_label_list(
-            rows_labels,
-            cols_labels,
-            kernel,
-            missing_value,
-            rows_unlabeled
-        )
+    return format_categorical_input_vector_from_label_list(
+        rows_labels,
+        cols_labels,
+        kernel,
+        missing_value,
+        rows_unlabeled
+    )
 
 
 """Generate quantitative or binarized/categorical input vector matrix from preprocesed input dataset scores"""
 
 
-def format_input_vector_from_scores_dict(scores_dict: dict,
-                                         kernel,
-                                         col_label: str = 'scores',
-                                         missing_value=-1,
-                                         rows_unlabeled=None  # TODO: To discuss, to handle
-                                         ) -> Matrix:
+def format_input_vector_from_label_score_dict(labels_scores_dict: Dict[str, int],
+                                              kernel: Matrix,
+                                              col_label: str = 'scores',
+                                              missing_value: int = -1,
+                                              rows_unlabeled: dict = None,  # TODO: To discuss
+                                              type_k: bool = False
+                                              ) -> Matrix:
     """Generate scores input vector from labels scores dict."""
 
     input_mat = Matrix(
-        mat=np.array(list(scores_dict.values())),
-        rows_labels=list(scores_dict.keys()),
+        mat=np.transpose(np.array([list(labels_scores_dict.values())])),
+        rows_labels=list(labels_scores_dict.keys()),
         cols_labels=[col_label]
     )
 
     if rows_unlabeled:
+        if type_k:
+            rows_unlabeled = rows_unlabeled[col_label]
+
         input_mat.row_bind(
             matrix=Matrix(
-                rows_labels=list(rows_unlabeled),
-                cols_labels=col_label,
-                init_value=0)
+                mat=np.transpose(np.array([list(rows_unlabeled.values())])),
+                rows_labels=list(rows_unlabeled.keys()),
+                cols_labels=[col_label]
+            )
         )
 
     return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel)
 
 
-def format_input_matrix_from_scores_dict(scores_dicts: Union[Dict[str, Dict[str, int]],
-                                                             Dict[str, int]],
-                                         kernel,
-                                         rows_unlabeled=None,  # TODO: To discuss, to handle
-                                         ) -> Matrix:
+def format_input_matrix_from_type_label_score_dict(type_dict_labels_scores_dict: Union[Dict[str, Dict[str, int]],
+                                                                                       Dict[str, int]],
+                                                   kernel,
+                                                   missing_value: int = -1,
+                                                   rows_unlabeled=None,  # TODO: To discuss
+                                                   ) -> Matrix:
     """Generate input matrix from labels scores dict and/or handle type classification by columns."""
-    if _scores_dict_data_struct_check(scores_dicts):
-        scores_dicts.pop('node_types')
-
-        init_k = get_random_key_from_dict(scores_dicts)
-        init_v = scores_dicts.pop(init_k)
-        input_mat = format_input_vector_from_scores_dict(scores_dicts,
-                                                         kernel,
-                                                         col_label=init_k,
-                                                         rows_unlabeled=init_v
-                                                         )
-
-        for node_type, scores_dict in scores_dicts.items():
-            input_vector = format_input_vector_from_scores_dict(scores_dict,
-                                                                kernel,
-                                                                col_label=node_type,
-                                                                rows_unlabeled=rows_unlabeled
-                                                                )
+    if _type_dict_label_scores_dict_data_struct_check(type_dict_labels_scores_dict):
+        type_dict_labels_scores_dict = dict(type_dict_labels_scores_dict)  # Do not mutate the caller's dict
+        init_k = get_random_key_from_dict(type_dict_labels_scores_dict)
+        init_v = type_dict_labels_scores_dict.pop(init_k)
+
+        input_mat = format_input_vector_from_label_score_dict(init_v,
+                                                              kernel,
+                                                              init_k,
+                                                              missing_value,
+                                                              rows_unlabeled,
+                                                              True
+                                                              )
+
+        for node_type, scores_dict in type_dict_labels_scores_dict.items():
+            input_vector = format_input_vector_from_label_score_dict(scores_dict,
+                                                                     kernel,
+                                                                     node_type,
+                                                                     missing_value,
+                                                                     rows_unlabeled,
+                                                                     True
+                                                                     )
             input_mat.col_bind(matrix=input_vector)
 
         return input_mat
     else:
-        return format_input_vector_from_scores_dict(scores_dicts, kernel)
+        return format_input_vector_from_label_score_dict(type_dict_labels_scores_dict, kernel, missing_value=missing_value)
diff --git a/tests/test_input.py b/tests/test_input.py
index dcf3704..c141c85 100644
--- a/tests/test_input.py
+++ b/tests/test_input.py
@@ -5,9 +5,11 @@
 import logging
 import unittest
 
+import numpy as np
 from diffupy.constants import *
 from diffupy.matrix import Matrix
-from diffupy.process_data_input import process_data_input, _map_labels_to_background, map_labels_input
+from diffupy.process_input import process_input_data, map_labels_input, \
+    format_input_for_diffusion
 from diffupy.process_network import get_graph_from_df
 from diffupy.validate_input import _validate_scores
 
@@ -22,26 +24,23 @@ class ValidateTest(unittest.TestCase):
     def test_quantitative_bin_id(self):
         """Test codify label_input for quantitative scoring methods- only entity IDs given (binary labels)."""
         input = NODE_TYPE_COL_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=None,
         )
-        print(input_labels_dict)
         self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}})
 
     def test_quantitative_bin_fc_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC given (binary, signed labels)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5,
         )
-        print(input_labels_dict)
-
         self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': -1})
 
     def test_quantitative_bin_fc_abs(self):
         """Test codify label_input for quantitative scoring methods- logFC given (binary, absolute values)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': 1})
@@ -49,7 +48,7 @@ def test_quantitative_bin_fc_abs(self):
     def test_quantitative_bin_fcp_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (binary, signed labels)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': -1})
@@ -57,7 +56,7 @@ def test_quantitative_bin_fcp_sign(self):
     def test_quantitative_bin_fcp_abs(self):
         """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (binary, absolute values)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': 1})
@@ -65,7 +64,7 @@ def test_quantitative_bin_fcp_abs(self):
     def test_quantitative_fc_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC given (quantitative, signed labels)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2})
@@ -73,7 +72,7 @@ def test_quantitative_fc_sign(self):
     def test_quantitative_fc_abs(self):
         """Test codify label_input for quantitative scoring methods- logFC given (quant., absolute values)."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2})
@@ -81,7 +80,7 @@ def test_quantitative_fc_abs(self):
     def test_quantitative_fcp_sign(self):
         """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (quant., signed labels)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2})
@@ -89,7 +88,7 @@ def test_quantitative_fcp_sign(self):
     def test_quantitative_fcp_abs(self):
         """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (quant., absolute values)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2})
@@ -97,7 +96,7 @@ def test_quantitative_fcp_abs(self):
     def test_non_quantitative_bin_id(self):
         """Test codify label_input for non-quantitative scoring methods- only entity IDs given (binary labels)."""
         input = NODE_TYPE_COL_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=None,
         )
         self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}})
@@ -105,7 +104,7 @@ def test_non_quantitative_bin_id(self):
     def test_non_quantitative_bin_fc_abs(self):
         """Test codify label_input for non-quantitative scoring methods- logFC given (binary, absolute values (sign))."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': -1, 'D': -1, 'E': 1})
@@ -113,7 +112,7 @@ def test_non_quantitative_bin_fc_abs(self):
     def test_non_quantitative_bin_fcp_abs(self):
         """Test codify label_input for non-quant. scoring methods- logFC and adj. p-value given (binary, absolute values)."""
         input = NODE_LOGFC_PVAL_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5,
         )
         self.assertEqual(input_labels_dict, {'A': -1, 'B': 1, 'C': -1, 'D': -1, 'E': 1})
@@ -122,7 +121,7 @@ def test_map_labels_input_label_list_background_list(self):
         """Test map label_input."""
         mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'],
                                    background_labels=['A', 'B', 'C'])
-
+        # Compare as a set because the order of the mapped labels is not relevant.
         self.assertEqual(set(mapping), {'A', 'C', 'B'})
 
     def test_map_labels_input_label_list_background_dict(self):
@@ -141,7 +140,7 @@ def test_map_labels_input_type_dict_label_list_background_list(self):
 
     def test_map_labels_input_type_dict_label_dict_background_dict(self):
         """Test map label_input."""
-        # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped.
+        # If the labels are classified in another type ('D' and 'B'), since they do not match the background they will not be mapped.
         mapping = map_labels_input(input_labels={'Gene': ['A'], 'Metabolite': ['C', 'B']},
                                    background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']})
 
@@ -170,7 +169,7 @@ def test_map_labels_input_type_dict_label_scores_dict_background_list(self):
 
     def test_map_labels_input_type_dict_label_scores_dict_background_dict(self):
         """Test map label_input."""
-        # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped.
+        # If the labels are classified in another type ('D' and 'B'), since they do not match the background they will not be mapped.
         mapping = map_labels_input(input_labels={'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}},
                                    background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']})
 
@@ -199,16 +198,16 @@ def test_network(self):
     def test_node_mapping(self):
         """Test mapping of nodes in label_input to nodes in network."""
         input = NODE_LOGFC_TEST_PATH
-        input_labels_dict = process_data_input(
+        input_labels_dict = process_input_data(
             input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5,
         )
 
         graph = get_graph_from_df(NETWORK_PATH, CSV)
         graph_nodes = list(graph.nodes())
 
-        mapped_nodes_list = _map_labels_to_background(input_labels_dict, graph_nodes)
+        mapped_nodes_list = map_labels_input(input_labels_dict, graph_nodes)
 
-        self.assertEqual(mapped_nodes_list, [0.0, 1.0, 0.0, 0.0, None, 1.0, None, None, None])
+        self.assertEqual(mapped_nodes_list, {'A': 0.7, 'B': 1.2, 'C': 0.0, 'D': 0.0, 'E': 2.2})
 
     def test_validate_scores_1(self):
         """Test validate scores 1."""
@@ -248,3 +247,49 @@ def test_validate_scores_4(self):
         )
         with self.assertRaises(ValueError):
             _validate_scores(matrix)
+
+    kernel_test_1 = Matrix(
+        [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
+        cols_labels=['A', 'B', 'C', 'D'],
+        rows_labels=['A', 'B', 'C', 'D'],
+        name='Test Kernel 1'
+    )
+
+    kernel_test_2 = Matrix(
+        [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
+        cols_labels=['A', 'B', 'C', 'F'],
+        rows_labels=['A', 'B', 'C', 'F'],
+        name='Test Kernel 2'
+    )
+
+    kernel_test_3 = Matrix(
+        [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]],
+        cols_labels=['A', 'B', 'C', 'D', 'F'],
+        rows_labels=['A', 'B', 'C', 'D', 'F'],
+        name='Test Kernel 3'
+    )
+
+    def test_format_input_for_diffusion_label_list(self):
+        """Test formatting for diffusion of a mapped type dict of label-scores dicts."""
+
+        processed_mapped_nodes_list = format_input_for_diffusion(
+            map_labels_input({'Metabolite': {'C': -1}, 'Gene': {'A': 2, 'B': 1}, 'mirnas': {'A': 1, 'T': 1}},
+                             self.kernel_test_1.rows_labels),
+            self.kernel_test_1,
+        )
+
+        # TODO: Implement equality in Matrix; currently it raises an error if the column order is mixed.
+        #assert(np.allclose(processed_mapped_nodes_list.mat,
+        #                    np.array([[-1, 2, 1],
+        #                              [-1, 1, -1],
+        #                              [-1, -1, -1],
+        #                              [-1, -1, -1]]
+        #                             )
+        #                    )
+        #        )
+        #self.assertEqual(processed_mapped_nodes_list.cols_labels,
+        #                 ['Metabolite', 'Gene', 'mirnas']
+        #                 )
+        #self.assertEqual(processed_mapped_nodes_list.rows_labels,
+        #                 ['A', 'B', 'C', 'D']
+        #                 )

From 10f11b06bdce5412078f84cb666df302e10a01f8 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Wed, 15 Apr 2020 17:07:53 +0200
Subject: [PATCH 05/17] Mapping subsets labels, implemented as _map_label_dict
 and _map_label_list

---
 src/diffupy/process_input.py | 74 ++++++++++++++++++++++++++----------
 1 file changed, 54 insertions(+), 20 deletions(-)

diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index ef6875b..ca908df 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -383,18 +383,18 @@ def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[s
 """Data structures format checkers"""
 
 
-def _scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
+def _label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
     """Check data structure type Dict[str, int]."""
     return (isinstance(v, dict) and
-            isinstance(get_random_value_from_dict(v), int)
+            isinstance(get_random_value_from_dict(v), (int, float))
             )
 
 
-def _type_scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
+def _type_dict_label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
     """Check data structure type Dict[str, Dict[str, int]]."""
     return (isinstance(v, dict) and
             isinstance(get_random_value_from_dict(v), dict) and
-            isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), int)
+            isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), (int, float))
             )
 
 
@@ -403,7 +403,7 @@ def _label_list_data_struct_check(v: Union[dict, list]) -> bool:
     return isinstance(v, list)
 
 
-def _type_label_list_data_struct_check(v: Union[dict, list]) -> bool:
+def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool:
     """Check data structure type Dict[str, list]."""
     return (isinstance(v, dict) and
             isinstance(get_random_value_from_dict(v), list)
@@ -415,7 +415,7 @@ def _type_label_list_data_struct_check(v: Union[dict, list]) -> bool:
 
 def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]],
                      background_labels: Union[Dict[str, list], list]) -> Union[Dict[str, int], list]:
-    """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes."""
+    """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
     if isinstance(background_labels, list):
         return _map_labels_to_background(input_labels, background_labels)
 
@@ -436,8 +436,9 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]
                               background_labels_type: str = None
                               ) -> Union[Dict[str, Dict[str, int]],
                                          Dict[str, int]]:
-    """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes."""
-    if _type_scores_dict_data_struct_check(input_labels) or _type_label_list_data_struct_check(input_labels):
+    """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
+    if _type_dict_label_scores_dict_data_struct_check(input_labels) or _type_dict_label_list_data_struct_check(
+            input_labels):
         if background_labels_type:
             if background_labels_type in input_labels.keys():
                 return _map_labels(input_labels[background_labels_type], background_labels)
@@ -451,29 +452,62 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]
     return _map_labels(input_labels, background_labels)
 
 
+def _map_label_list(input_labels: Union[str, Set[str], List[str]],
+                    background_labels: List[str]) -> List[str]:
+    mapped_list = []
+    for label in input_labels:
+        if isinstance(label, str):
+            if label in background_labels:
+                mapped_list.append(label)
+        elif isinstance(label, set) or isinstance(label, list):
+            for sublabel in set(label):
+                if sublabel in background_labels:
+                    mapped_list.append(label)
+        else:
+            raise TypeError(
+                f'{EMOJI} The input label {label}  data structure can not be processed for label mapping'
+            )
+    return mapped_list
+
+
+def _map_label_dict(input_labels: Dict[Union[str, set], Union[int, float]],
+                    background_labels: list) -> Dict[str, Union[int, float]]:
+    mapped_dict = {}
+    for label, v in input_labels.items():
+        if isinstance(label, str):
+            if label in background_labels:
+                mapped_dict[label] = v
+        elif isinstance(label, set) or isinstance(label, list):
+            for sublabel in set(label):
+                if sublabel in background_labels:
+                    mapped_dict[label] = v
+        else:
+            raise TypeError(
+                f'{EMOJI} The input label {label}  data structure can not be processed for label mapping'
+            )
+    return mapped_dict
+
+
 def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
-                background_labels: list) -> Union[Dict[str, int], list]:
+                background_labels: list) -> Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]:
     """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes."""
     if _label_list_data_struct_check(input_labels):
-        return list(set(input_labels).intersection(set(background_labels)))
+        return _map_label_list(input_labels, background_labels)
 
-    elif _scores_dict_data_struct_check(input_labels):
-        return {labels: input_labels[labels]
-                for labels in background_labels
-                if labels in input_labels
-                }
+    elif _label_scores_dict_data_struct_check(input_labels):
+        return _map_label_dict(input_labels, background_labels)
 
-    elif _type_label_list_data_struct_check(input_labels):
+    elif _type_dict_label_list_data_struct_check(input_labels):
         l = []
         for type, label_list in input_labels.items():
             l += _map_labels(label_list, background_labels)
         return l
 
-    elif _type_scores_dict_data_struct_check(input_labels):
-        l = {}
+    elif _type_dict_label_scores_dict_data_struct_check(input_labels):
+        d = {}
         for type, scores_dict in input_labels.items():
-            l.update(_map_labels(scores_dict, background_labels))
-        return l
+            d.update(_map_labels(scores_dict, background_labels))
+        return d
 
     else:
         raise TypeError(

From 7c42d5dc83b4179d32fad772f108958393cac18c Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Thu, 16 Apr 2020 14:25:56 +0200
Subject: [PATCH 06/17] Parse xls added to diffuPy utils and as a process input
 option

---
 src/diffupy/constants.py     |   9 ++
 src/diffupy/process_input.py |  11 +-
 src/diffupy/utils.py         | 231 +++++++++++++++++++++++++++--------
 3 files changed, 198 insertions(+), 53 deletions(-)

diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py
index 354cd91..bf9ad8e 100644
--- a/src/diffupy/constants.py
+++ b/src/diffupy/constants.py
@@ -59,6 +59,10 @@ def ensure_output_dirs():
 
 #: csv
 CSV = 'csv'
+#: xml
+XML = 'xml'
+#: xmls
+XMLS = 'xmls'
 #: tsv
 TSV = 'tsv'
 #: graphML
@@ -74,6 +78,11 @@ def ensure_output_dirs():
 #: edge list
 EDGE_LIST = '.lst'
 
+XLS_FORMATS = [
+    XML,
+    XMLS
+]
+
 #: DiffuPath available graph formats
 GRAPH_FORMATS = [
     CSV,
diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index ca908df..58f0686 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -127,11 +127,20 @@ def _process_data_input_format(raw_data_input: Union[str, list, dict, np.ndarray
         )
 
 
-def _load_data_input_from_file(path: str) -> Union[pd.DataFrame, list]:
+def _load_data_input_from_file(path: str, **further_parse_args) -> Union[pd.DataFrame, list]:
     """Load and process the input data according the input file format."""
     if path.endswith(CSV):
         return from_dataframe_file(path, CSV)
 
+    elif path.endswith(XLS_FORMATS):
+        return parse_xls_to_df(path,
+                               further_parse_args.get('min_row'),
+                               further_parse_args.get('relevant_sheets'),
+                               further_parse_args.get('irrelevant_sheets'),
+                               further_parse_args.get('relevant_cols'),
+                               further_parse_args.get('irrelevant_cols')
+                               )
+
     elif path.endswith(TSV):
         return from_dataframe_file(path, TSV)
 
diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py
index ca4ae7b..ff31c87 100644
--- a/src/diffupy/utils.py
+++ b/src/diffupy/utils.py
@@ -7,10 +7,12 @@
 import pickle
 import random
 import warnings
-from typing import List
+from collections import defaultdict
+from typing import List, Union, Dict, Optional
 
 import networkx as nx
 import numpy as np
+import openpyxl as opxl
 import pandas as pd
 import pybel
 from networkx import Graph
@@ -20,36 +22,7 @@
 
 log = logging.getLogger(__name__)
 
-
-def from_dataframe_file(path: str, fmt: str) -> pd.DataFrame:
-    """Read network file."""
-    format_checker(fmt)
-
-    return pd.read_csv(
-        path,
-        header=0,
-        sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV]
-    )
-
-
-def from_json(path: str):
-    """Read from json file."""
-    with open(path) as f:
-        return json.load(f)
-
-
-def from_pickle(input_path):
-    """Read from pickle file."""
-    with open(input_path, 'rb') as f:
-        unpickler = pickle.Unpickler(f)
-        return unpickler.load()
-
-
-def from_nparray_to_df(nparray: np.ndarray) -> pd.DataFrame:
-    """Convert numpy array to data frame."""
-    return pd.DataFrame(data=nparray[1:, 1:],
-                        index=nparray[1:, 0],
-                        columns=nparray[0, 1:])
+"""Matrix/graph handling utils."""
 
 
 def get_laplacian(graph: Graph, normalized: bool = False) -> np.ndarray:
@@ -151,23 +124,6 @@ def get_idx_scores_mapping(scores):
     return {i: score for i, score in enumerate(scores)}
 
 
-def decode_labels(labels):
-    """Validate labels."""
-    labels_decode = []
-
-    for label in labels:
-        if not isinstance(label, str):
-
-            if isinstance(label, int):
-                label = str(label)
-            else:
-                label = label.decode('utf-8').replace('"', '')
-
-        labels_decode.append(label)
-
-    return labels_decode
-
-
 def print_dict_dimensions(entities_db, title):
     """Print dimension of the dictionary."""
     total = 0
@@ -187,6 +143,17 @@ def print_dict_dimensions(entities_db, title):
     print(f'Total: {total} ')
 
 
+def get_random_key_from_dict(d):
+    return random.choice(list(d.keys()))
+
+
+def get_random_value_from_dict(d):
+    return d[get_random_key_from_dict(d)]
+
+
+"""File loading utils."""
+
+
 def format_checker(fmt: str, fmt_list: list = GRAPH_FORMATS) -> None:
     """Check formats."""
     if fmt not in fmt_list:
@@ -196,9 +163,169 @@ def format_checker(fmt: str, fmt_list: list = GRAPH_FORMATS) -> None:
         )
 
 
-def get_random_key_from_dict(d):
-    return random.choice(list(d.keys()))
+def from_dataframe_file(path: str, fmt: str) -> pd.DataFrame:
+    """Read network file."""
+    format_checker(fmt)
+
+    return pd.read_csv(
+        path,
+        header=0,
+        sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV]
+    )
 
 
-def get_random_value_from_dict(d):
-    return d[get_random_key_from_dict(d)]
+def from_json(path: str):
+    """Read from json file."""
+    with open(path) as f:
+        return json.load(f)
+
+
+def from_pickle(input_path):
+    """Read from pickle file."""
+    with open(input_path, 'rb') as f:
+        unpickler = pickle.Unpickler(f)
+        return unpickler.load()
+
+
+def from_nparray_to_df(nparray: np.ndarray) -> pd.DataFrame:
+    """Convert numpy array to data frame."""
+    return pd.DataFrame(data=nparray[1:, 1:],
+                        index=nparray[1:, 0],
+                        columns=nparray[0, 1:])
+
+
+"""Data parsing utils."""
+
+
+def decode_labels(labels):
+    """Validate labels."""
+    labels_decode = []
+
+    for label in labels:
+        if not isinstance(label, str):
+
+            if isinstance(label, int):
+                label = str(label)
+            else:
+                label = label.decode('utf-8').replace('"', '')
+
+        labels_decode.append(label)
+
+    return labels_decode
+
+
+def munge_label(label: Union[str, int, float]) -> str:
+    """Munge label strings."""
+    remove_set = ['*', ' ', '|', '-', '"', "'", "↑", "↓", "\n"]
+    split_set = ['/']
+
+    label = str(label).lower()
+
+    for symb in remove_set:
+        if symb in label:
+            label = label.replace(symb, '')
+
+    for symb in split_set:
+        if symb in label:
+            label = tuple(set(label.split(symb)))
+            if len(label) == 1:
+                label = label[0]
+
+    return label
+
+
+def munge_label_list(labels: list):
+    """Munge labels list."""
+    return list(set([munge_label(label) for label in labels]))
+
+
+def munge_label_scores_dict(labels: dict) -> Dict[str, Union[list, int, str]]:
+    """Munge labels dict."""
+    return {munge_label(label): v for label, v in labels.items()}
+
+
+def munge_label_type_dict(label_dict: Dict[str, Union[list, int, str, dict]]) -> Dict[str, Union[list, int, str, dict]]:
+    """Munge labels type dict."""
+    type_label_dict = {}
+
+    for type_label, labels in label_dict.items():
+        if isinstance(labels, dict):
+            type_label_dict[type_label] = munge_label_scores_dict(labels)
+
+        elif isinstance(labels, dict):
+            type_label_dict[type_label] = munge_label_scores_dict(labels)
+
+    return type_label_dict
+
+
+def munge_cell(cell):
+    """Munge cell."""
+    if isinstance(cell, str):
+        if cell.replace(',', '').replace('.', '').replace('-', '').isnumeric():
+            return float(cell)
+        else:
+            return munge_label(cell)
+
+    elif isinstance(cell, float) or isinstance(cell, int):
+        return cell
+
+    else:
+        raise TypeError('The cell type could not be processed.')
+
+
+def parse_xls_sheet_to_df(sheet: opxl.workbook,
+                          min_row: Optional[int] = 1,
+                          relevant_cols: Optional[list] = None,
+                          irrelevant_cols: Optional[list] = None) -> pd.DataFrame:
+    """Process/format excel sheets to DataFrame."""
+    parsed_sheet_dict = defaultdict(list)
+
+    for col in sheet.iter_cols(min_row=min_row):
+        col_label = col[0].value
+
+        if relevant_cols is None and irrelevant_cols is None:
+            relevant_cols = [col_label]
+            irrelevant_cols = []
+        elif relevant_cols is None:
+            relevant_cols = []
+        elif irrelevant_cols is None:
+            irrelevant_cols = []
+
+        parsed_sheet_dict[col_label].append([munge_cell(cell.value)
+                                             for cell in col[1:]
+                                             if (col_label in relevant_cols or col_label not in irrelevant_cols) and
+                                             munge_cell(cell.value) != ''
+                                             ])
+
+    return pd.DataFrame.from_dict(parsed_sheet_dict)
+
+
+def parse_xls_to_df(path: str,
+                    min_row: Optional[int] = 1,
+                    relevant_sheets: Optional[list] = None,
+                    irrelevant_sheets: Optional[list] = None,
+                    relevant_cols: Optional[list] = None,
+                    irrelevant_cols: Optional[list] = None,
+                    ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]:
+    """Process excel file as a set (if several excel sheets) or a single dataframe."""
+    wb = opxl.load_workbook(filename=path)
+
+    sheets = wb.sheetnames
+    df_dict = {}
+
+    if relevant_sheets is None and irrelevant_sheets is None:
+        relevant_sheets = sheets
+        irrelevant_sheets = []
+    elif relevant_sheets is None:
+        relevant_sheets = []
+    elif irrelevant_sheets is None:
+        irrelevant_sheets = []
+
+    if len(sheets) > 1:
+        return {df_dict[sheets[ix].lower()]: parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols)
+                for ix, sheet in enumerate(wb)
+                if sheets[ix] in relevant_sheets or sheets[ix] not in irrelevant_sheets
+                }
+
+    else:
+        return parse_xls_sheet_to_df(wb[sheets[0]])

From 526781d6e71267d7e86ccefdb98a63849137fbd1 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Sun, 19 Apr 2020 17:37:11 +0200
Subject: [PATCH 07/17] General refactors and documentation in process_input

---
 src/diffupy/process_input.py | 149 +++++++++++++++++++++--------------
 1 file changed, 90 insertions(+), 59 deletions(-)

diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index 58f0686..f5325d0 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -2,7 +2,7 @@
 
 """Main matrix class and processing of input data."""
 
-from typing import Dict, Optional, Union
+from typing import Dict, Optional, Union, List, Set
 
 import numpy as np
 import pandas as pd
@@ -10,35 +10,49 @@
 from .constants import *
 from .matrix import Matrix
 from .utils import from_pickle, from_json, from_dataframe_file, from_nparray_to_df, get_random_value_from_dict, \
-    get_random_key_from_dict
+    get_random_key_from_dict, parse_xls_to_df
 
 """Process input data"""
 
 
 def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix],
                                 kernel: Matrix,
-                                background_labels: Union[list, dict] = None,
-                                method: Optional[str] = 'raw',
+                                method: str = 'raw',
                                 binning: Optional[bool] = False,
                                 absolute_value: Optional[bool] = False,
                                 p_value: Optional[float] = None,
                                 threshold: Optional[float] = None,
-                                separator_str: Optional[str] = ', '
+                                background_labels: Optional[Union[list, Dict[str, list]]] = None,
+                                **further_parse_args
                                 ) -> Matrix:
-    """Process miscellaneous input data and format it for the diffusion computation function."""
+    """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and
+    format it for the diffusion computation function.
+
+    :param data_input: A miscellaneous data input to be processed/formatted for the diffuPy diffusion computation.
+    :param kernel: A pre-computed kernel to perform the label mapping and the matching for the input formatting.
+    :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"].
+    :param binning: If logFC provided in dataset, convert logFC to binary.
+    :param absolute_value: Codify node labels by applying threshold to | logFC | in input.
+    :param p_value: Statistical significance.
+    :param threshold: Codify node labels by applying a threshold to logFC in input.
+    :param background_labels: Labels set to map the input labels, which can provide label classification by type dict.
+    :param further_parse_args: Arguments to refine the data input parsing, among which:
+                                for string list parsing: separ_str
+                                for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols
+                                for excel: relevant_sheets, irrelevant_sheets
+    """
     # If specific label background not provided, get a list from kernel labels.
     if not background_labels:
         background_labels = list(kernel.rows_labels)
-        # TODO: Discuss store label classification (mapping or as a column argument) in kernel
 
     # Pipeline the input, first preprocessing it, then mapping it to the background labels and finally formatting it.
-    return format_input_for_diffusion(map_labels_input(process_data_input(data_input,
+    return format_input_for_diffusion(map_labels_input(process_input_data(data_input,
                                                                           method,
                                                                           binning,
                                                                           absolute_value,
                                                                           p_value,
                                                                           threshold,
-                                                                          separator_str
+                                                                          **further_parse_args
                                                                           ),
                                                        background_labels
                                                        ),
@@ -52,14 +66,27 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra
                        absolute_value: bool = False,
                        p_value: float = None,
                        threshold: Optional[float] = None,
-                       separator_str: Optional[str] = ', ',
+                       **further_parse_args
                        ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]:
-    """Process and pipeline the provided miscellaneous data input in standardized data structures for further processing."""
-    # Preprocess the raw input according is format types.
-    preprocessed_data = _process_data_input_format(data_input, separator_str)
-
-    # If the preprocessed input is a list or a label type dict (Dict[str, list]) of lists return it for categorical input generation.
-    if _label_list_data_struct_check(preprocessed_data) or _type_label_list_data_struct_check(preprocessed_data):
+    """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures:
+    label list, type_dict label lists, label-scores dict or type_dict label-scores dicts.
+
+    :param data_input: A miscellaneous data input to be processed.
+    :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"]
+    :param binning: If logFC provided in dataset, convert logFC to binary.
+    :param absolute_value: Codify node labels by applying threshold to | logFC | in input.
+    :param p_value: Statistical significance.
+    :param threshold: Codify node labels by applying a threshold to logFC in input.
+    :param further_parse_args: Arguments to refine the data input parsing, among which:
+                                for string list parsing: separ_str
+                                for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols
+                                for excel: relevant_sheets, irrelevant_sheets
+    """
+    # Preprocess the raw input according to its data structure type.
+    preprocessed_data = _process_data_input_format(data_input, **further_parse_args)
+
+    # If the preprocessed input is a list or a label type dict (Dict[str, list]) return it for later categorical input generation.
+    if _label_list_data_struct_check(preprocessed_data) or _type_dict_label_list_data_struct_check(preprocessed_data):
         return preprocessed_data
 
     # If the preprocessed input is a label type label-scores dict (Dict[str, pd.DataFrame]) pipeline it for scores codifying.
@@ -88,35 +115,39 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra
 
 
 def _process_data_input_format(raw_data_input: Union[str, list, dict, np.ndarray, pd.DataFrame],
-                               separ_str: str = ',') -> Union[pd.DataFrame, list, Dict[str, Union[pd.DataFrame, list]]]:
+                               separ_str: str = ', ',
+                               **further_parse_args) -> Union[pd.DataFrame, list, Dict[str, Union[pd.DataFrame, list]]]:
     """Format the input as a label-score dataframe, a list or a labels or a type dict for latter input processing."""
     if isinstance(raw_data_input, str):
         # If the data input type is a string, mostly will be a path to the dataset file.
         if os.path.isfile(raw_data_input):
-            return _process_data_input_format(_load_data_input_from_file(raw_data_input))
+            return _process_data_input_format(_load_data_input_from_file(raw_data_input, **further_parse_args))
         elif '/' in raw_data_input and separ_str not in ['/', ' /', '/ ']:
             raise IOError(
                 f'{EMOJI} The file could not have been located in the provided data input path,.'
             )
-        # If it is not a path, will be treated as a label list with separator.
+        # If the data input is not identified as a path, it will be treated as a label list with an indicated separator.
         else:
-            return _process_data_input_format(raw_data_input.split(raw_data_input))
-
-    if isinstance(raw_data_input, pd.DataFrame):
-        return raw_data_input
+            return _process_data_input_format(raw_data_input.split(separ_str))
 
     elif isinstance(raw_data_input, list) or isinstance(raw_data_input, set):
         return list(set(raw_data_input))
 
-    elif isinstance(raw_data_input, np.ndarray):
-        return from_nparray_to_df(raw_data_input)
+    if isinstance(raw_data_input, pd.DataFrame):
+        return raw_data_input
 
     elif isinstance(raw_data_input, dict):
-        if _scores_dict_data_struct_check(raw_data_input):
+        # If the data input dict is a label-scores dict, codify it as a pandas DataFrame for later processing.
+        if _label_scores_dict_data_struct_check(raw_data_input):
             return pd.DataFrame.from_dict(raw_data_input, orient='index')
+        # Else it will be treated as a label_type dict, calling recursively the process input format for each type subset (key).
         else:
+            # It is assumed that all the dict values share the same data type.
             return {label_type: _process_data_input_format(data_i) for label_type, data_i in raw_data_input.items()}
 
+    elif isinstance(raw_data_input, np.ndarray):
+        return from_nparray_to_df(raw_data_input)
+
     elif isinstance(raw_data_input, Matrix):
         return raw_data_input.to_df()
 
@@ -168,13 +199,13 @@ def _codify_input_data(df: pd.DataFrame,
                        threshold: Optional[float],
                        ) -> Union[Dict[str, Dict[str, int]],
                                   Dict[str, int]]:
-    """Process the input scores for the codifying process."""
+    """Process the input scores dataframe for the codifying process."""
     # Ensure that node labeling is in the provided dataset.
     if not any(n in df.columns for n in NODE_LABELING):
         raise ValueError(
             f'Ensure that your file contains a column {NODE_LABELING} with node IDs.'
         )
-    # Standardize the title of the node column labeling column to 'label', for later processing.
+    # Standardize the title of the node column labeling column to 'Label', for later processing.
     elif LABEL not in df.columns:
         for l in list(df.columns):
             if l in NODE_LABELING:
@@ -230,10 +261,10 @@ def _codify_method_check(df: pd.DataFrame,
 
     else:
         # TODO: ber_s, ber_p, mc
-        raise NotImplementedError('This diffusion method has not yet been implemented.')
+        raise NotImplementedError('This diffusion method has not been yet implemented.')
 
 
-"""Assign binary labels to input for scoring methods that accept non-quantitative values"""
+"""Assign binary scores to input for scoring methods that ONLY accept non-quantitative values"""
 
 
 def _codify_non_quantitative_input_data(
@@ -241,7 +272,7 @@ def _codify_non_quantitative_input_data(
         p_value: float,
         threshold: Optional[float]
 ) -> Dict[str, int]:
-    """Codify input data to get a set of labelled nodes for scoring methods that accept non-quantitative values."""
+    """Codify input data to get a set of scored nodes for scoring methods that accept non-quantitative values."""
     # LogFC provided in dataset and threshold given
     if LOG_FC in df.columns and threshold:
 
@@ -250,19 +281,19 @@ def _codify_non_quantitative_input_data(
         # Label nodes with -1 if | logFC | below threshold
         df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = -1
 
-        # If adjusted p-values are provided in dataset, label nodes that are not statistically significant with -1
+        # If adjusted p-values are provided in dataset, score nodes that are not statistically significant with -1
         if P_VALUE in df.columns:
             df.loc[df[P_VALUE] > p_value, SCORE] = -1
 
-        return df.set_index(NODE)[SCORE].to_dict()
+        return df.set_index(LABEL)[SCORE].to_dict()
 
-    # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1
+    # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign scores as 1
     df[SCORE] = 1
 
-    return df.set_index(NODE)[SCORE].to_dict()
+    return df.set_index(LABEL)[SCORE].to_dict()
 
 
-"""Assign binary labels to input for scoring methods that accept quantitative values"""
+"""Assign binary scores to input for scoring methods that accept quantitative values"""
 
 
 def _codify_quantitative_input_data(
@@ -272,34 +303,34 @@ def _codify_quantitative_input_data(
         p_value: float,
         threshold: Optional[float],
 ) -> Dict[str, int]:
-    """Codify input data to get a set of labelled nodes for scoring methods that accept quantitative values."""
+    """Codify input data to get a set of scored nodes for scoring methods that accept quantitative values."""
     # LogFC provided in dataset and threshold given
     if LOG_FC in df.columns and threshold:
 
-        # Binarize labels with 1, 0 and/or -1
+        # Binarize scores with 1, 0 and/or -1
         if binning is True:
 
-            # Add binning labels where | logFC | values above threshold are 1 and below are 0
+            # Add binning scores where | logFC | values above threshold are 1 and below are 0
             if absolute_value is True:
                 return _bin_quantitative_input_by_abs_val(df, threshold, p_value)
 
-            # Add signed labels where | logFC | values above threshold are 1 or -1 (signed) and values below are 0
+            # Add signed scores where | logFC | values above threshold are 1 or -1 (signed) and values below are 0
 
             return _bin_quantitative_input_by_threshold(df, threshold, p_value)
 
         # Labels are 0s or logFC values rather than binary values
         else:
-            # Codify inputs with | logFC | if they pass threshold; otherwise assign label as 0
+            # Codify inputs with | logFC | if they pass threshold; otherwise assign score as 0
             if absolute_value is True:
                 return _codify_quantitative_input_by_abs_val(df, threshold, p_value)
 
-            # Codify inputs with logFC if they pass threshold; otherwise assign label as 0
+            # Codify inputs with logFC if they pass threshold; otherwise assign score as 0
             return _codify_quantitative_input_by_threshold(df, threshold, p_value)
 
-    # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1
+    # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign scores as 1
     df[SCORE] = 1
 
-    return df.set_index(NODE)[SCORE].to_dict()
+    return df.set_index(LABEL)[SCORE].to_dict()
 
 
 def _bin_quantitative_input_by_abs_val(
@@ -307,17 +338,17 @@ def _bin_quantitative_input_by_abs_val(
         threshold: float,
         p_value: float,
 ) -> Dict[str, int]:
-    """Process quantitative inputs and bin labels by absolute value."""
-    # Add label 1 if | logFC | is above threshold
+    """Process quantitative inputs and bin scores by absolute value."""
+    # Add score 1 if | logFC | is above threshold
     df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1
-    # Add label 0 if | logFC | below threshold
+    # Add score 0 if | logFC | below threshold
     df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0
 
     # logFC and adjusted p-values are provided in dataset
     if P_VALUE in df.columns:
         return _remove_non_significant_entities(df, p_value)
 
-    return df.set_index(NODE)[SCORE].to_dict()
+    return df.set_index(LABEL)[SCORE].to_dict()
 
 
 def _bin_quantitative_input_by_threshold(
@@ -325,12 +356,12 @@ def _bin_quantitative_input_by_threshold(
         threshold: float,
         p_value: float,
 ) -> Dict[str, int]:
-    """Process quantitative inputs and bin labels by threshold."""
-    # Add label 1 if logFC is above threshold
+    """Process quantitative inputs and bin scores by threshold."""
+    # Add score 1 if logFC is above threshold
     df.loc[df[LOG_FC] >= threshold, SCORE] = 1
-    # Add label 0 if | logFC | below threshold
+    # Add score 0 if | logFC | below threshold
     df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0
-    # Replace remaining labels with -1 (i.e. | logFC | above threshold but sign is negative)
+    # Replace remaining score with -1 (i.e. | logFC | above threshold but sign is negative)
     df = df.fillna(-1)
 
     if p_value:
@@ -339,10 +370,10 @@ def _bin_quantitative_input_by_threshold(
             # Disregard entities if logFC adjusted p-value is not significant
             return _remove_non_significant_entities(df, p_value)
 
-    return df.set_index(NODE)[SCORE].to_dict()
+    return df.set_index(LABEL)[SCORE].to_dict()
 
 
-"""Assign logFC as labels for input for scoring methods that accept quantitative values"""
+"""Assign logFC as score for input for scoring methods that accept quantitative values"""
 
 
 def _codify_quantitative_input_by_abs_val(
@@ -350,10 +381,10 @@ def _codify_quantitative_input_by_abs_val(
         threshold: float,
         p_value: float,
 ) -> Dict[str, int]:
-    """Codify nodes with | logFC | if they pass threshold, otherwise label is 0."""
+    """Codify nodes with | logFC | if they pass threshold, otherwise score is 0."""
     # Codify nodes with | logFC | if they pass threshold
     df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = (df[LOG_FC]).abs()
-    # Codify nodes with label 0 if it falls below threshold
+    # Codify nodes with score 0 if it falls below threshold
     df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0
 
     # LogFC and adjusted p-values are provided in dataset
@@ -361,7 +392,7 @@ def _codify_quantitative_input_by_abs_val(
         # Disregard entities if logFC adjusted p-value is not significant
         return _remove_non_significant_entities(df, p_value)
 
-    return df.set_index(NODE)[SCORE].to_dict()
+    return df.set_index(LABEL)[SCORE].to_dict()
 
 
 def _codify_quantitative_input_by_threshold(
@@ -379,14 +410,14 @@ def _codify_quantitative_input_by_threshold(
         # Disregard entities if logFC adjusted p-value is not significant
         return _remove_non_significant_entities(df, p_value)
 
-    return df.set_index(NODE)[SCORE].to_dict()
+    return df.set_index(LABEL)[SCORE].to_dict()
 
 
 def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[str, int]:
     # Label entity 0 if adjusted p-value for logFC is not significant
     df.loc[df[P_VALUE] > p_value, SCORE] = 0
 
-    return df.set_index(NODE)[SCORE].to_dict()
+    return df.set_index(LABEL)[SCORE].to_dict()
 
 
 """Data structures format checkers"""

From 9c8388e604240a3eff9e98a4444f1d35655ea47f Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Mon, 20 Apr 2020 13:39:06 +0200
Subject: [PATCH 08/17] General refactor in imports and function naming updates
 in diffuPy package

---
 src/diffupy/cli.py             | 61 ++++++++++++++++++----------------
 src/diffupy/diffuse.py         |  2 +-
 src/diffupy/matrix.py          |  4 +--
 src/diffupy/process_network.py |  6 ++--
 4 files changed, 38 insertions(+), 35 deletions(-)

diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py
index 7e6b0a7..c5cbdf8 100644
--- a/src/diffupy/cli.py
+++ b/src/diffupy/cli.py
@@ -10,12 +10,13 @@
 import time
 
 import click
+from diffupy.process_network import get_kernel_from_network_path
 
-from .constants import OUTPUT, METHODS, EMOJI
+from .constants import OUTPUT, METHODS, EMOJI, RAW
 from .diffuse import diffuse as run_diffusion
 from .kernels import regularised_laplacian_kernel
-from .process_input import process_input
-from .utils import process_network_from_cli
+from .process_input import process_input_data_for_diff
+from .process_network import process_graph_from_file
 
 logger = logging.getLogger(__name__)
 
@@ -42,9 +43,9 @@ def main():
 )
 @click.option('-l', '--log', is_flag=True, help='Activate debug mode')
 def kernel(
-    network: str,
-    output: str = OUTPUT,
-    log: bool = None
+        graph: str,
+        output: str = OUTPUT,
+        log: bool = None
 ):
     """Generate a kernel for a given network."""
     # Configure logging level
@@ -55,16 +56,16 @@ def kernel(
         logging.basicConfig(level=logging.INFO)
         logger.setLevel(logging.INFO)
 
-    click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}')
+    click.secho(f'{EMOJI} Loading graph from {graph} {EMOJI}')
 
-    graph = process_network_from_cli(network)
+    graph = process_graph_from_file(graph)
 
-    click.secho(f'{EMOJI} Calculating regularized Laplacian kernel. This might take a while... {EMOJI}')
+    click.secho(f'{EMOJI} Generating regularized Laplacian kernel from graph. This might take a while... {EMOJI}')
     exe_t_0 = time.time()
     background_mat = regularised_laplacian_kernel(graph)
     exe_t_f = time.time()
 
-    output_file = os.path.join(output, f'{network.split("/")[-1]}.pickle')
+    output_file = os.path.join(output, f'{graph.split("/")[-1]}.pickle')
 
     # Export numpy array
     with open(output_file, 'wb') as file:
@@ -98,7 +99,7 @@ def kernel(
     '-m', '--method',
     help='Diffusion method',
     type=click.Choice(METHODS),
-    required=True,
+    default=RAW,
 )
 @click.option(
     '-b', '--binarize',
@@ -112,6 +113,7 @@ def kernel(
 @click.option(
     '-t', '--threshold',
     help='Codify node labels by applying a threshold to logFC in input.',
+    default=None,
     type=float,
 )
 @click.option(
@@ -130,36 +132,37 @@ def kernel(
     show_default=True,
 )
 def diffuse(
-    network: str,
-    data: str,
-    output: str,
-    method: str,
-    binarize: bool,
-    absolute_value: bool,
-    threshold: float,
-    p_value: float,
+        input_data: str,
+        network: str,
+        output: str = sys.stdout,
+        method: str = RAW,
+        binarize: bool = True,
+        threshold: float = None,
+        absolute_value: bool = True,
+        p_value: float = 0.05,
 ):
     """Run a diffusion method over a network or pre-generated kernel."""
     click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}')
-    graph = process_network_from_cli(network)
 
-    click.secho(
-        f'{EMOJI} Graph loaded with: \n'
-        f'{graph.number_of_nodes()} nodes\n'
-        f'{graph.number_of_edges()} edges\n'
-        f'{EMOJI}'
-    )
+    kernel = get_kernel_from_network_path(network)
 
-    click.secho(f'Codifying data from {data}.')
+    click.secho(f'Codifying data from {input_data}.')
 
-    input_scores_dict = process_input(data, method, binarize, absolute_value, p_value, threshold)
+    input_scores_dict = process_input_data_for_diff(input_data,
+                                                    kernel,
+                                                    method,
+                                                    binarize,
+                                                    absolute_value,
+                                                    p_value,
+                                                    threshold,
+                                                    )
 
     click.secho(f'Running the diffusion algorithm.')
 
     results = run_diffusion(
         input_scores_dict,
         method,
-        graph,
+        k=kernel
     )
 
     json.dump(results, output, indent=2)
diff --git a/src/diffupy/diffuse.py b/src/diffupy/diffuse.py
index c6b8202..88befca 100644
--- a/src/diffupy/diffuse.py
+++ b/src/diffupy/diffuse.py
@@ -28,7 +28,7 @@ def diffuse(
 ) -> Matrix:
     """Run diffusion on a network given an input and a diffusion method.
 
-    :param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (List) or n-dimensional (Matrix).
+    :param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (Vector) or n-dimensional (Matrix).
     :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"]
     :param graph: A network as a graph. It could be optional if a Kernel is provided
     :param kwargs: Optional arguments:
diff --git a/src/diffupy/matrix.py b/src/diffupy/matrix.py
index dd4d2b5..bbda07e 100644
--- a/src/diffupy/matrix.py
+++ b/src/diffupy/matrix.py
@@ -80,7 +80,7 @@ def __init__(
 
     def __str__(self):
         """Return a string representation of the Matrix."""
-        s = f"        {self.cols_labels}"
+        s = f"  {self.cols_labels}"
 
         for i, row_label in enumerate(self.rows_labels):
             s += f"\n {row_label}  {self.mat[i]} "
@@ -589,7 +589,7 @@ def __init__(self, csv_path, fmt=CSV, name=None):
 class MatrixFromGraph(Matrix):
     """Constructor matrix class for nx.Graph to Matrix conversion."""
 
-    # TODO : move instances initalization from global argument graph to here
+    # TODO : move instances initialization from global argument graph to here
 
     def __init__(self, graph, node_argument='name', name=''):
         # This initialization would make a matrix representing the graph (taking a graph argument as label)
diff --git a/src/diffupy/process_network.py b/src/diffupy/process_network.py
index 2b2257b..4b699c8 100644
--- a/src/diffupy/process_network.py
+++ b/src/diffupy/process_network.py
@@ -22,7 +22,7 @@
 """Process network as undefined format (could represented as a graph or as a kernel)"""
 
 
-def get_kernel_and_graph_from_network_file(path: str) -> Tuple[Matrix, Graph]:
+def get_kernel_and_graph_from_network_path(path: str) -> Tuple[Matrix, Graph]:
     """Load network provided in cli as a kernel and as a graph."""
     graph = None
     kernel = None
@@ -52,7 +52,7 @@ def get_kernel_and_graph_from_network_file(path: str) -> Tuple[Matrix, Graph]:
     return kernel, graph
 
 
-def get_kernel_from_network_file(path: str) -> Matrix:
+def get_kernel_from_network_path(path: str) -> Matrix:
     """Load network provided in cli as a kernel."""
     if path.endswith(KERNEL_FORMATS):
         try:
@@ -73,7 +73,7 @@ def get_kernel_from_network_file(path: str) -> Matrix:
     return regularised_laplacian_kernel(graph)
 
 
-def get_graph_from_network_file(path: str) -> Graph:
+def get_graph_from_network_path(path: str) -> Graph:
     """Load network provided in cli as a graph."""
     if path.endswith(KERNEL_FORMATS):
         try:

From 8e01ad1f8c5904534af7ad25b351fe17294478b5 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Mon, 20 Apr 2020 14:07:49 +0200
Subject: [PATCH 09/17] Added feature to rename dataframe column titles according
 to the provided label_mapping (if any)

---
 src/diffupy/process_input.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index f5325d0..1ef3ee4 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -96,7 +96,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra
                                                binning,
                                                absolute_value,
                                                p_value,
-                                               threshold
+                                               threshold,
+                                               further_parse_args.get('cols_titles_mapping')
                                                )
                 for label_type, preprocessed_data_i in preprocessed_data.items()
                 }
@@ -107,7 +108,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra
                               binning,
                               absolute_value,
                               p_value,
-                              threshold
+                              threshold,
+                              further_parse_args.get('cols_titles_mapping')
                               )
 
 
@@ -197,6 +199,7 @@ def _codify_input_data(df: pd.DataFrame,
                        absolute_value: bool,
                        p_value: float,
                        threshold: Optional[float],
+                       cols_titles_mapping: Optional[Dict[str:str]] = None
                        ) -> Union[Dict[str, Dict[str, int]],
                                   Dict[str, int]]:
     """Process the input scores dataframe for the codifying process."""
@@ -205,8 +208,15 @@ def _codify_input_data(df: pd.DataFrame,
         raise ValueError(
             f'Ensure that your file contains a column {NODE_LABELING} with node IDs.'
         )
+
+    # Rename dataframe column titles according (if) provided label_mapping.
+    if cols_titles_mapping is not None:
+        for label_to_rename, new_name in cols_titles_mapping.items():
+            if label_to_rename in df.columns:
+                df = df.rename(columns={label_to_rename: new_name})
+
     # Standardize the title of the node column labeling column to 'Label', for later processing.
-    elif LABEL not in df.columns:
+    if LABEL not in df.columns:
         for l in list(df.columns):
             if l in NODE_LABELING:
                 df = df.rename(columns={l: LABEL})

From fc345ded76ca67d66ea1197dfaf6346669b6440f Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Tue, 21 Apr 2020 14:24:58 +0200
Subject: [PATCH 10/17] Excel parser refactor after testing

---
 src/diffupy/utils.py | 32 +++++++-------------------------
 1 file changed, 7 insertions(+), 25 deletions(-)

diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py
index ff31c87..f8d0ec8 100644
--- a/src/diffupy/utils.py
+++ b/src/diffupy/utils.py
@@ -278,24 +278,14 @@ def parse_xls_sheet_to_df(sheet: opxl.workbook,
                           relevant_cols: Optional[list] = None,
                           irrelevant_cols: Optional[list] = None) -> pd.DataFrame:
     """Process/format excel sheets to DataFrame."""
-    parsed_sheet_dict = defaultdict(list)
+    parsed_sheet_dict = {}
 
     for col in sheet.iter_cols(min_row=min_row):
         col_label = col[0].value
 
-        if relevant_cols is None and irrelevant_cols is None:
-            relevant_cols = [col_label]
-            irrelevant_cols = []
-        elif relevant_cols is None:
-            relevant_cols = []
-        elif irrelevant_cols is None:
-            irrelevant_cols = []
-
-        parsed_sheet_dict[col_label].append([munge_cell(cell.value)
-                                             for cell in col[1:]
-                                             if (col_label in relevant_cols or col_label not in irrelevant_cols) and
-                                             munge_cell(cell.value) != ''
-                                             ])
+        if ((relevant_cols is not None and col_label in relevant_cols) or
+                (irrelevant_cols is not None and col_label not in irrelevant_cols)):
+            parsed_sheet_dict[col_label] = [munge_cell(cell.value) for cell in col[1:]]
 
     return pd.DataFrame.from_dict(parsed_sheet_dict)
 
@@ -311,20 +301,12 @@ def parse_xls_to_df(path: str,
     wb = opxl.load_workbook(filename=path)
 
     sheets = wb.sheetnames
-    df_dict = {}
-
-    if relevant_sheets is None and irrelevant_sheets is None:
-        relevant_sheets = sheets
-        irrelevant_sheets = []
-    elif relevant_sheets is None:
-        relevant_sheets = []
-    elif irrelevant_sheets is None:
-        irrelevant_sheets = []
 
     if len(sheets) > 1:
-        return {df_dict[sheets[ix].lower()]: parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols)
+        return {sheets[ix].lower(): parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols)
                 for ix, sheet in enumerate(wb)
-                if sheets[ix] in relevant_sheets or sheets[ix] not in irrelevant_sheets
+                if (relevant_sheets is not None and sheets[ix] in relevant_sheets) or
+                (irrelevant_sheets is not None and sheets[ix] in irrelevant_sheets)
                 }
 
     else:

From ea597723681ec50c2c5fa7fbba098f91486116a9 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Tue, 21 Apr 2020 17:26:44 +0200
Subject: [PATCH 11/17] Process input refactor and process substrings feature

---
 src/diffupy/cli.py           |  22 ++--
 src/diffupy/constants.py     |  24 ++--
 src/diffupy/process_input.py | 207 ++++++++++++++++++++++-------------
 3 files changed, 151 insertions(+), 102 deletions(-)

diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py
index c5cbdf8..0c49023 100644
--- a/src/diffupy/cli.py
+++ b/src/diffupy/cli.py
@@ -15,7 +15,7 @@
 from .constants import OUTPUT, METHODS, EMOJI, RAW
 from .diffuse import diffuse as run_diffusion
 from .kernels import regularised_laplacian_kernel
-from .process_input import process_input_data_for_diff
+from .process_input import process_map_and_format_input_data_for_diff
 from .process_network import process_graph_from_file
 
 logger = logging.getLogger(__name__)
@@ -62,14 +62,14 @@ def kernel(
 
     click.secho(f'{EMOJI} Generating regularized Laplacian kernel from graph. This might take a while... {EMOJI}')
     exe_t_0 = time.time()
-    background_mat = regularised_laplacian_kernel(graph)
+    kernel = regularised_laplacian_kernel(graph)
     exe_t_f = time.time()
 
     output_file = os.path.join(output, f'{graph.split("/")[-1]}.pickle')
 
     # Export numpy array
     with open(output_file, 'wb') as file:
-        pickle.dump(background_mat, file, protocol=4)
+        pickle.dump(kernel, file, protocol=4)
 
     running_time = exe_t_f - exe_t_0
 
@@ -148,14 +148,14 @@ def diffuse(
 
     click.secho(f'Codifying data from {input_data}.')
 
-    input_scores_dict = process_input_data_for_diff(input_data,
-                                                    kernel,
-                                                    method,
-                                                    binarize,
-                                                    absolute_value,
-                                                    p_value,
-                                                    threshold,
-                                                    )
+    input_scores_dict = process_map_and_format_input_data_for_diff(input_data,
+                                                                   kernel,
+                                                                   method,
+                                                                   binarize,
+                                                                   absolute_value,
+                                                                   p_value,
+                                                                   threshold,
+                                                                   )
 
     click.secho(f'Running the diffusion algorithm.')
 
diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py
index bf9ad8e..581ce9e 100644
--- a/src/diffupy/constants.py
+++ b/src/diffupy/constants.py
@@ -60,9 +60,9 @@ def ensure_output_dirs():
 #: csv
 CSV = 'csv'
 #: xml
-XML = 'xml'
+XLS = 'xls'
 #: xmls
-XMLS = 'xmls'
+XLSX = 'xlsx'
 #: tsv
 TSV = 'tsv'
 #: graphML
@@ -78,28 +78,28 @@ def ensure_output_dirs():
 #: edge list
 EDGE_LIST = '.lst'
 
-XLS_FORMATS = [
-    XML,
-    XMLS
-]
+XLS_FORMATS = (
+    XLS,
+    XLSX
+)
 
-#: DiffuPath available graph formats
-GRAPH_FORMATS = [
+#: Available graph formats
+GRAPH_FORMATS = (
     CSV,
     TSV,
     GRAPHML,
     BEL,
     JSON,
     PICKLE,
-]
+)
 
-#: DiffuPath available kernel formats
-KERNEL_FORMATS = [
+#: Available kernel formats
+KERNEL_FORMATS = (
     CSV,
     TSV,
     JSON,
     PICKLE,
-]
+)
 
 #: Separators
 FORMAT_SEPARATOR_MAPPING = {
diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index 1ef3ee4..4e65078 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -2,7 +2,7 @@
 
 """Main matrix class and processing of input data."""
 
-from typing import Dict, Optional, Union, List, Set
+from typing import Dict, Optional, Union, List, Set, Tuple
 
 import numpy as np
 import pandas as pd
@@ -15,16 +15,16 @@
 """Process input data"""
 
 
-def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix],
-                                kernel: Matrix,
-                                method: str = 'raw',
-                                binning: Optional[bool] = False,
-                                absolute_value: Optional[bool] = False,
-                                p_value: Optional[float] = None,
-                                threshold: Optional[float] = None,
-                                background_labels: Optional[Union[list, Dict[str, list]]] = None,
-                                **further_parse_args
-                                ) -> Matrix:
+def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix],
+                                               kernel: Matrix,
+                                               method: str = 'raw',
+                                               binning: Optional[bool] = False,
+                                               absolute_value: Optional[bool] = False,
+                                               p_value: Optional[float] = None,
+                                               threshold: Optional[float] = None,
+                                               background_labels: Optional[Union[list, Dict[str, list]]] = None,
+                                               **further_parse_args
+                                               ) -> Matrix:
     """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and
     format it for the diffusion computation function.
 
@@ -40,6 +40,7 @@ def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict,
                                 for string list parsing: separ_str
                                 for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols
                                 for excel: relevant_sheets, irrelevant_sheets
+                                for mapping: check_substrings (as a bool if input list or list of labels types if input dict)
     """
     # If specific label background not provided, get a list from kernel labels.
     if not background_labels:
@@ -54,7 +55,8 @@ def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict,
                                                                           threshold,
                                                                           **further_parse_args
                                                                           ),
-                                                       background_labels
+                                                       background_labels,
+                                                       check_substrings=further_parse_args.get('check_substrings')
                                                        ),
                                       kernel
                                       )
@@ -64,8 +66,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra
                        method: str = 'raw',
                        binning: bool = False,
                        absolute_value: bool = False,
-                       p_value: float = None,
-                       threshold: Optional[float] = None,
+                       p_value: float = 0.05,
+                       threshold: Optional[float] = 0.5,
                        **further_parse_args
                        ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]:
     """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures:
@@ -199,22 +201,22 @@ def _codify_input_data(df: pd.DataFrame,
                        absolute_value: bool,
                        p_value: float,
                        threshold: Optional[float],
-                       cols_titles_mapping: Optional[Dict[str:str]] = None
+                       cols_titles_mapping: Optional[Dict[str, str]] = None
                        ) -> Union[Dict[str, Dict[str, int]],
                                   Dict[str, int]]:
     """Process the input scores dataframe for the codifying process."""
-    # Ensure that node labeling is in the provided dataset.
-    if not any(n in df.columns for n in NODE_LABELING):
-        raise ValueError(
-            f'Ensure that your file contains a column {NODE_LABELING} with node IDs.'
-        )
-
     # Rename dataframe column titles according (if) provided label_mapping.
     if cols_titles_mapping is not None:
         for label_to_rename, new_name in cols_titles_mapping.items():
             if label_to_rename in df.columns:
                 df = df.rename(columns={label_to_rename: new_name})
 
+    # Ensure that node labeling is in the provided dataset.
+    if not any(n in df.columns for n in NODE_LABELING):
+        raise ValueError(
+            f'Ensure that your file contains a column {NODE_LABELING} with node IDs.'
+        )
+
     # Standardize the title of the node column labeling column to 'Label', for later processing.
     if LABEL not in df.columns:
         for l in list(df.columns):
@@ -464,16 +466,25 @@ def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool:
 
 
 def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]],
-                     background_labels: Union[Dict[str, list], list]) -> Union[Dict[str, int], list]:
+                     background_labels: Union[Dict[str, list], list],
+                     check_substrings: Union[List, bool] = None) -> Union[Dict[str, int], list]:
     """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
     if isinstance(background_labels, list):
-        return _map_labels_to_background(input_labels, background_labels)
+        return _map_labels_to_background(input_labels,
+                                         background_labels,
+                                         check_substring=check_substrings)
 
     elif isinstance(background_labels, dict):
-        return {node_type: _map_labels_to_background(input_labels, node_set, node_type)
+        return {node_type: _map_labels_to_background(input_labels,
+                                                     node_set,
+                                                     background_labels_type=node_type,
+                                                     check_substring=check_substrings)
                 for node_type, node_set
                 in background_labels.items()
-                if _map_labels_to_background(input_labels, node_set, node_type) not in [[], {}]
+                if _map_labels_to_background(input_labels,
+                                             node_set,
+                                             background_labels_type=node_type,
+                                             check_substring=check_substrings) not in [[], {}]
                 }
     else:
         raise IOError(
@@ -481,88 +492,126 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st
         )
 
 
+def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
+                background_labels: list,
+                check_substrings: bool = False) -> Union[
+    list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]:
+    """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes."""
+    if _label_list_data_struct_check(input_labels):
+        return _map_label_list(input_labels, background_labels, check_substrings)
+
+    elif _label_scores_dict_data_struct_check(input_labels):
+        return _map_label_dict(input_labels, background_labels, check_substrings)
+
+    elif _type_dict_label_list_data_struct_check(input_labels):
+        l = []
+        for type, label_list in input_labels.items():
+            l += _map_labels(label_list, background_labels, check_substrings)
+        return l
+
+    elif _type_dict_label_scores_dict_data_struct_check(input_labels):
+        d = {}
+        for type, scores_dict in input_labels.items():
+            d.update(_map_labels(scores_dict, background_labels, check_substrings))
+        return d
+
+    else:
+        raise TypeError(
+            f'{EMOJI} The input labels data structure can not be processed for label mapping'
+        )
+
+
 def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
                               background_labels: list,
-                              background_labels_type: str = None
+                              background_labels_type: str = None,
+                              check_substring: Union[List, bool] = None
                               ) -> Union[Dict[str, Dict[str, int]],
                                          Dict[str, int]]:
     """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
-    if _type_dict_label_scores_dict_data_struct_check(input_labels) or _type_dict_label_list_data_struct_check(
-            input_labels):
-        if background_labels_type:
-            if background_labels_type in input_labels.keys():
-                return _map_labels(input_labels[background_labels_type], background_labels)
-        else:
-            return {
-                type: _map_labels(label_list, background_labels)
-                for type, label_list in input_labels.items()
-                if _map_labels(label_list, background_labels) not in [[], {}]
-            }
+    if _type_dict_label_scores_dict_data_struct_check(input_labels) or \
+            _type_dict_label_list_data_struct_check(input_labels):
+
+        if background_labels_type and background_labels_type in input_labels.keys():
+            return _map_labels(input_labels[background_labels_type], background_labels,
+                               check_substring is not None and background_labels_type in check_substring)
+        return {
+            type: _map_labels(label_list, background_labels,
+                              check_substring is not None and type in check_substring)
+            for type, label_list in input_labels.items()
+            if _map_labels(label_list, background_labels,
+                           check_substring is not None and type in check_substring) not in [[], {}]
+        }
+
+    return _map_labels(input_labels, background_labels, check_substring)
+
+
+def _check_label_to_background_labels(label: str,
+                                      label_list: List[Union[str, Tuple[str]]],
+                                      substring: bool = False) -> Union[str, None]:
+    if label in label_list:
+        return label
 
-    return _map_labels(input_labels, background_labels)
+    # If the fast membership check does not match, iterate over the background entities.
+    for entity in label_list:
+
+        if isinstance(entity, set) or isinstance(entity, tuple) or isinstance(entity, list):
+            for subentity in entity:
+                if not substring:
+                    if str(subentity) == label: return subentity
+                elif str(subentity) in label or label in str(subentity):
+                    return subentity
+
+        elif substring and (str(entity) in label or label in str(entity)):
+            return entity
+
+    return None
 
 
 def _map_label_list(input_labels: Union[str, Set[str], List[str]],
-                    background_labels: List[str]) -> List[str]:
+                    background_labels: List[str],
+                    check_substrings: bool = False) -> List[str]:
     mapped_list = []
     for label in input_labels:
         if isinstance(label, str):
-            if label in background_labels:
-                mapped_list.append(label)
-        elif isinstance(label, set) or isinstance(label, list):
+            label_bck = _check_label_to_background_labels(label, background_labels, check_substrings)
+            if label_bck is not None:
+                mapped_list.append(label_bck)
+        elif isinstance(label, set) or isinstance(label, tuple) or isinstance(label, list):
             for sublabel in set(label):
-                if sublabel in background_labels:
-                    mapped_list.append(label)
+                label_bck = _check_label_to_background_labels(sublabel, background_labels, check_substrings)
+                if label_bck is not None:
+                    mapped_list.append(label_bck)
         else:
             raise TypeError(
-                f'{EMOJI} The input label {label}  data structure can not be processed for label mapping'
+                f'{EMOJI} The input label "{label}" "{type(label)}" data type can not be processed for label mapping'
             )
     return mapped_list
 
 
 def _map_label_dict(input_labels: Dict[Union[str, set], Union[int, float]],
-                    background_labels: list) -> Dict[str, Union[int, float]]:
+                    background_labels: list,
+                    check_substrings: bool = False) -> Dict[str, Union[int, float]]:
     mapped_dict = {}
+
     for label, v in input_labels.items():
+        if isinstance(label, int) or isinstance(label, float):
+            label = str(label)
+
         if isinstance(label, str):
-            if label in background_labels:
-                mapped_dict[label] = v
-        elif isinstance(label, set) or isinstance(label, list):
+            label_bck = _check_label_to_background_labels(label, background_labels, check_substrings)
+            if label_bck is not None:
+                mapped_dict[label_bck] = v
+        elif isinstance(label, set) or isinstance(label, tuple) or isinstance(label, list):
             for sublabel in set(label):
-                if sublabel in background_labels:
-                    mapped_dict[label] = v
+                label_bck = _check_label_to_background_labels(sublabel, background_labels, check_substrings)
+                if label_bck is not None:
+                    mapped_dict[label_bck] = v
         else:
             raise TypeError(
-                f'{EMOJI} The input label {label}  data structure can not be processed for label mapping'
+                f'{EMOJI} The input label "{label}" "{type(label)}" data type can not be processed for label mapping'
             )
-    return mapped_dict
-
-
-def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
-                background_labels: list) -> Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]:
-    """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes."""
-    if _label_list_data_struct_check(input_labels):
-        return _map_label_list(input_labels, background_labels)
 
-    elif _label_scores_dict_data_struct_check(input_labels):
-        return _map_label_dict(input_labels, background_labels)
-
-    elif _type_dict_label_list_data_struct_check(input_labels):
-        l = []
-        for type, label_list in input_labels.items():
-            l += _map_labels(label_list, background_labels)
-        return l
-
-    elif _type_dict_label_scores_dict_data_struct_check(input_labels):
-        d = {}
-        for type, scores_dict in input_labels.items():
-            d.update(_map_labels(scores_dict, background_labels))
-        return d
-
-    else:
-        raise TypeError(
-            f'{EMOJI} The input labels data structure can not be processed for label mapping'
-        )
+    return mapped_dict
 
 
 """Generate/format data input as a vector/matrix for the diffusion computation matching the kernel rows"""

From 47c8c768cec032f5f7af19044cbcb7c50b61f761 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Tue, 21 Apr 2020 23:04:17 +0200
Subject: [PATCH 12/17] Show mapping statistics feature added

---
 src/diffupy/process_input.py | 71 +++++++++++++++++++++++++++++-------
 1 file changed, 58 insertions(+), 13 deletions(-)

diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index 4e65078..1916351 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -2,6 +2,7 @@
 
 """Main matrix class and processing of input data."""
 
+import logging
 from typing import Dict, Optional, Union, List, Set, Tuple
 
 import numpy as np
@@ -10,7 +11,9 @@
 from .constants import *
 from .matrix import Matrix
 from .utils import from_pickle, from_json, from_dataframe_file, from_nparray_to_df, get_random_value_from_dict, \
-    get_random_key_from_dict, parse_xls_to_df
+    get_random_key_from_dict, parse_xls_to_df, log_dict
+
+log = logging.getLogger(__name__)
 
 """Process input data"""
 
@@ -23,6 +26,7 @@ def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFra
                                                p_value: Optional[float] = None,
                                                threshold: Optional[float] = None,
                                                background_labels: Optional[Union[list, Dict[str, list]]] = None,
+                                               show_statistics: bool = True,
                                                **further_parse_args
                                                ) -> Matrix:
     """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and
@@ -46,17 +50,19 @@ def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFra
     if not background_labels:
         background_labels = list(kernel.rows_labels)
 
-    # Pipeline the input, first preprocessing it, then mapping it to the background labels and finally formatting it.
-    return format_input_for_diffusion(map_labels_input(process_input_data(data_input,
-                                                                          method,
-                                                                          binning,
-                                                                          absolute_value,
-                                                                          p_value,
-                                                                          threshold,
-                                                                          **further_parse_args
-                                                                          ),
-                                                       background_labels,
-                                                       check_substrings=further_parse_args.get('check_substrings')
+    # Pipeline the input, first preprocessing it, then mapping it to the background labels
+    # and finally formatting it with the kernel reference.
+    return format_input_for_diffusion(map_labels_input(input_labels=process_input_data(data_input,
+                                                                                       method,
+                                                                                       binning,
+                                                                                       absolute_value,
+                                                                                       p_value,
+                                                                                       threshold,
+                                                                                       **further_parse_args
+                                                                                       ),
+                                                       background_labels=background_labels,
+                                                       check_substrings=further_parse_args.get('check_substrings'),
+                                                       show_statistics=show_statistics
                                                        ),
                                       kernel
                                       )
@@ -84,6 +90,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra
                                 for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols
                                 for excel: relevant_sheets, irrelevant_sheets
     """
+    log.info("Processing the data input.")
+
     # Preprocess the raw input according its data structure types.
     preprocessed_data = _process_data_input_format(data_input, **further_parse_args)
 
@@ -467,7 +475,10 @@ def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool:
 
 def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]],
                      background_labels: Union[Dict[str, list], list],
-                     check_substrings: Union[List, bool] = None) -> Union[Dict[str, int], list]:
+                     check_substrings: Union[List, bool] = None,
+                     show_statistics: bool = False) -> Union[Dict[str, int], list]:
+    log.info("Mapping the input labels to the background labels reference.")
+
     """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
     if isinstance(background_labels, list):
         return _map_labels_to_background(input_labels,
@@ -491,6 +502,38 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st
             f'{EMOJI} The background mapping labels should be provided as a label list or as a type dict of label list.'
         )
 
+    if show_statistics: log_dict(mapping_statistics(mapped_labels, input_labels))
+
+    return mapped_labels
+
+
+def mapping_statistics(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
+                       mapped_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]) -> Dict:
+    percentage_dict = {}
+    total_mapping = 0
+    total_labels = 0
+
+    if _label_list_data_struct_check(input_labels) or _label_scores_dict_data_struct_check(input_labels):
+        total_mapping = len(input_labels)
+        total_labels = len(mapped_labels)
+
+    elif _type_dict_label_list_data_struct_check(input_labels) or _type_dict_label_scores_dict_data_struct_check(
+            input_labels):
+        for input_type, mapping in input_labels.items():
+            if input_type in mapped_labels:
+                percentage_dict[input_type] = len(mapping) / len(mapped_labels[input_type])
+                total_mapping += len(mapping)
+                total_labels += len(mapped_labels[input_type])
+
+    else:
+        raise TypeError(
+            f'{EMOJI} The input labels data structure can not be processed for label mapping'
+        )
+
+    percentage_dict['General mapping'] = total_mapping / total_labels
+
+    return percentage_dict
+
 
 def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
                 background_labels: list,
@@ -621,6 +664,8 @@ def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict
                                kernel: Matrix,
                                missing_value: int = -1) -> Matrix:
     """Format/generate input vector/matrix according the data structure of the processed_data_input."""
+    log.info("Formatting the processed to the reference kernel Matrix.")
+
     if _label_list_data_struct_check(processed_input):
         return format_categorical_input_vector_from_label_list(rows_labeled=processed_input,
                                                                col_label='scores',

From f1e088487a7c85bb757ae96627e4661530046c81 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 24 Apr 2020 09:59:07 +0200
Subject: [PATCH 13/17] flake8 cleaning in diffupy

---
 src/diffupy/constants.py     |  3 +-
 src/diffupy/process_input.py | 64 +++++++++++++++++-------------------
 src/diffupy/utils.py         | 25 +++++++++-----
 3 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py
index 581ce9e..3984660 100644
--- a/src/diffupy/constants.py
+++ b/src/diffupy/constants.py
@@ -130,7 +130,7 @@ def ensure_output_dirs():
 ENTITY = 'Entity'
 GENE = 'Gene'
 
-NODE_LABELING= [
+NODE_LABELING = [
     NODE,
     LABEL,
     ENTITY,
@@ -145,4 +145,3 @@ def ensure_output_dirs():
 LOG_FC = 'LogFC'
 #: Statistical significance (p-value)
 P_VALUE = 'p-value'
-
diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index 1916351..79bb997 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -445,17 +445,13 @@ def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[s
 
 def _label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
     """Check data structure type Dict[str, int]."""
-    return (isinstance(v, dict) and
-            isinstance(get_random_value_from_dict(v), (int, float))
-            )
+    return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), (int, float))
 
 
 def _type_dict_label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool:
     """Check data structure type Dict[str, Dict[str, int]]."""
-    return (isinstance(v, dict) and
-            isinstance(get_random_value_from_dict(v), dict) and
-            isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), (int, float))
-            )
+    return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), dict) and isinstance(
+        get_random_value_from_dict(get_random_value_from_dict(v)), (int, float))
 
 
 def _label_list_data_struct_check(v: Union[dict, list]) -> bool:
@@ -465,9 +461,7 @@ def _label_list_data_struct_check(v: Union[dict, list]) -> bool:
 
 def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool:
     """Check data structure type Dict[str, list]."""
-    return (isinstance(v, dict) and
-            isinstance(get_random_value_from_dict(v), list)
-            )
+    return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), list)
 
 
 """Mappers from input to network background"""
@@ -481,28 +475,29 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st
 
     """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
     if isinstance(background_labels, list):
-        return _map_labels_to_background(input_labels,
-                                         background_labels,
-                                         check_substring=check_substrings)
+        mapped_labels = _map_labels_to_background(input_labels,
+                                                  background_labels,
+                                                  check_substring=check_substrings)
 
     elif isinstance(background_labels, dict):
-        return {node_type: _map_labels_to_background(input_labels,
-                                                     node_set,
-                                                     background_labels_type=node_type,
-                                                     check_substring=check_substrings)
-                for node_type, node_set
-                in background_labels.items()
-                if _map_labels_to_background(input_labels,
-                                             node_set,
-                                             background_labels_type=node_type,
-                                             check_substring=check_substrings) not in [[], {}]
-                }
+        mapped_labels = {node_type: _map_labels_to_background(input_labels,
+                                                              node_set,
+                                                              background_labels_type=node_type,
+                                                              check_substring=check_substrings)
+                         for node_type, node_set
+                         in background_labels.items()
+                         if _map_labels_to_background(input_labels,
+                                                      node_set,
+                                                      background_labels_type=node_type,
+                                                      check_substring=check_substrings) not in [[], {}]
+                         }
     else:
         raise IOError(
             f'{EMOJI} The background mapping labels should be provided as a label list or as a type dict of label list.'
         )
 
-    if show_statistics: log_dict(mapping_statistics(mapped_labels, input_labels))
+    if show_statistics:
+        log_dict(mapping_statistics(mapped_labels, input_labels))
 
     return mapped_labels
 
@@ -537,8 +532,8 @@ def mapping_statistics(input_labels: Union[list, Dict[str, Dict[str, int]], Dict
 
 def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
                 background_labels: list,
-                check_substrings: bool = False) -> Union[
-    list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]:
+                check_substrings: bool = False
+                ) -> Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]:
     """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes."""
     if _label_list_data_struct_check(input_labels):
         return _map_label_list(input_labels, background_labels, check_substrings)
@@ -547,16 +542,16 @@ def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, i
         return _map_label_dict(input_labels, background_labels, check_substrings)
 
     elif _type_dict_label_list_data_struct_check(input_labels):
-        l = []
+        map_list = []
         for type, label_list in input_labels.items():
-            l += _map_labels(label_list, background_labels, check_substrings)
-        return l
+            map_list += _map_labels(label_list, background_labels, check_substrings)
+        return map_list
 
     elif _type_dict_label_scores_dict_data_struct_check(input_labels):
-        d = {}
+        map_dict = {}
         for type, scores_dict in input_labels.items():
-            d.update(_map_labels(scores_dict, background_labels, check_substrings))
-        return d
+            map_dict.update(_map_labels(scores_dict, background_labels, check_substrings))
+        return map_dict
 
     else:
         raise TypeError(
@@ -600,7 +595,8 @@ def _check_label_to_background_labels(label: str,
         if isinstance(entity, set) or isinstance(entity, tuple) or isinstance(entity, list):
             for subentity in entity:
                 if not substring:
-                    if str(subentity) == label: return subentity
+                    if str(subentity) == label:
+                        return subentity
                 elif str(subentity) in label or label in str(subentity):
                     return subentity
 
diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py
index f8d0ec8..6e275d3 100644
--- a/src/diffupy/utils.py
+++ b/src/diffupy/utils.py
@@ -7,7 +7,6 @@
 import pickle
 import random
 import warnings
-from collections import defaultdict
 from typing import List, Union, Dict, Optional
 
 import networkx as nx
@@ -124,10 +123,10 @@ def get_idx_scores_mapping(scores):
     return {i: score for i, score in enumerate(scores)}
 
 
-def print_dict_dimensions(entities_db, title):
-    """Print dimension of the dictionary."""
+def print_dict_dimensions(entities_db, message='Total number of '):
+    """Print dimension of the dictionary"""
     total = 0
-    print(title)
+
     for k1, v1 in entities_db.items():
         m = ''
         if isinstance(v1, dict):
@@ -138,11 +137,19 @@ def print_dict_dimensions(entities_db, title):
             m += f'{len(v1)} '
             total += len(v1)
 
-        print(f'Total number of {k1}: {m} ')
+        log_dict({k1: m}, message)
 
     print(f'Total: {total} ')
 
 
+def log_dict(dict_to_print: dict, message: str = ''):
+    """Print dictionary as list with a message"""
+
+    for k1, v1 in dict_to_print.items():
+        log.info(f'{message} {k1}: {v1} ')
+        print(f'{message} {k1}: {v1} ')
+
+
 def get_random_key_from_dict(d):
     return random.choice(list(d.keys()))
 
@@ -283,8 +290,8 @@ def parse_xls_sheet_to_df(sheet: opxl.workbook,
     for col in sheet.iter_cols(min_row=min_row):
         col_label = col[0].value
 
-        if ((relevant_cols is not None and col_label in relevant_cols) or
-                (irrelevant_cols is not None and col_label not in irrelevant_cols)):
+        if ((relevant_cols is not None and col_label in relevant_cols) or (
+                irrelevant_cols is not None and col_label not in irrelevant_cols)):
             parsed_sheet_dict[col_label] = [munge_cell(cell.value) for cell in col[1:]]
 
     return pd.DataFrame.from_dict(parsed_sheet_dict)
@@ -305,8 +312,8 @@ def parse_xls_to_df(path: str,
     if len(sheets) > 1:
         return {sheets[ix].lower(): parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols)
                 for ix, sheet in enumerate(wb)
-                if (relevant_sheets is not None and sheets[ix] in relevant_sheets) or
-                (irrelevant_sheets is not None and sheets[ix] in irrelevant_sheets)
+                if (relevant_sheets is not None and sheets[ix] in relevant_sheets) or (
+                    irrelevant_sheets is not None and sheets[ix] in irrelevant_sheets)
                 }
 
     else:

From 798416bd707aae735baa11a9a8ceb25dc1be5095 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 24 Apr 2020 10:15:10 +0200
Subject: [PATCH 14/17] flake8 cleaning in diffupy

---
 src/diffupy/matrix.py        |  1 -
 src/diffupy/process_input.py | 14 ++++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/diffupy/matrix.py b/src/diffupy/matrix.py
index bbda07e..8c52cf4 100644
--- a/src/diffupy/matrix.py
+++ b/src/diffupy/matrix.py
@@ -551,7 +551,6 @@ class MatrixFromNumpyArray(Matrix):
 
     def __init__(self, nparray, name=''):
         """Initialize laplacian."""
-
         df = from_nparray_to_df(nparray)
 
         rows = list(df.rows.values)
diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py
index 79bb997..9bbe50a 100644
--- a/src/diffupy/process_input.py
+++ b/src/diffupy/process_input.py
@@ -29,8 +29,7 @@ def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFra
                                                show_statistics: bool = True,
                                                **further_parse_args
                                                ) -> Matrix:
-    """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and
-    format it for the diffusion computation function.
+    """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and format it for the diffusion computation function.
 
     :param data_input: A miscellaneous data input to be processed/formatted for the diffuPy diffusion computation.
     :param kernel: A pre-computed kernel to perform the label mapping and the matching for the input formatting.
@@ -76,8 +75,7 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra
                        threshold: Optional[float] = 0.5,
                        **further_parse_args
                        ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]:
-    """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures:
-    label list, type_dict label lists, label-scores dict or type_dict label-scores dicts.
+    """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures: label list, type_dict label lists, label-scores dict or type_dict label-scores dicts.
 
     :param data_input: A miscellaneous data input to be processed.
     :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"]
@@ -471,6 +469,7 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st
                      background_labels: Union[Dict[str, list], list],
                      check_substrings: Union[List, bool] = None,
                      show_statistics: bool = False) -> Union[Dict[str, int], list]:
+    """Get the mappings from preprocessed input_labels."""
     log.info("Mapping the input labels to the background labels reference.")
 
     """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
@@ -504,6 +503,7 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st
 
 def mapping_statistics(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]],
                        mapped_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]) -> Dict:
+    """Get the mapping statistics."""
     percentage_dict = {}
     total_mapping = 0
     total_labels = 0
@@ -565,7 +565,7 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]
                               check_substring: Union[List, bool] = None
                               ) -> Union[Dict[str, Dict[str, int]],
                                          Dict[str, int]]:
-    """Map nodes from input dataset to nodes in network to get a set of labelled nodes."""
+    """Map labels from preprocessed input to background_labels to get a set of matched labels."""
     if _type_dict_label_scores_dict_data_struct_check(input_labels) or \
             _type_dict_label_list_data_struct_check(input_labels):
 
@@ -586,6 +586,7 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]
 def _check_label_to_background_labels(label: str,
                                       label_list: List[Union[str, Tuple[str]]],
                                       substring: bool = False) -> Union[str, None]:
+    """Check if the label is in the label list, optionally also checking for substring matches."""
     if label in label_list:
         return label
 
@@ -609,6 +610,7 @@ def _check_label_to_background_labels(label: str,
 def _map_label_list(input_labels: Union[str, Set[str], List[str]],
                     background_labels: List[str],
                     check_substrings: bool = False) -> List[str]:
+    """Map a preprocessed input label list to the background_labels to get a list of matched labels."""
     mapped_list = []
     for label in input_labels:
         if isinstance(label, str):
@@ -630,6 +632,7 @@ def _map_label_list(input_labels: Union[str, Set[str], List[str]],
 def _map_label_dict(input_labels: Dict[Union[str, set], Union[int, float]],
                     background_labels: list,
                     check_substrings: bool = False) -> Dict[str, Union[int, float]]:
+    """Map a preprocessed input label-scores dict to the background_labels to get a dict of matched labels."""
     mapped_dict = {}
 
     for label, v in input_labels.items():
@@ -783,7 +786,6 @@ def format_input_vector_from_label_score_dict(labels_scores_dict: Dict[str, int]
                                               type_k: bool = False
                                               ) -> Matrix:
     """Generate scores input vector from labels scores dict."""
-
     input_mat = Matrix(
         mat=np.transpose(np.array([list(labels_scores_dict.values())])),
         rows_labels=list(labels_scores_dict.keys()),

From b04db50e6b713381ddcc7bd05e5afa0c5e0173a7 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 24 Apr 2020 10:23:19 +0200
Subject: [PATCH 15/17] flake8 cleaning in diffupy

---
 src/diffupy/utils.py | 11 ++++++-----
 tests/test_input.py  |  8 +++-----
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py
index 6e275d3..aa506d1 100644
--- a/src/diffupy/utils.py
+++ b/src/diffupy/utils.py
@@ -124,7 +124,7 @@ def get_idx_scores_mapping(scores):
 
 
 def print_dict_dimensions(entities_db, message='Total number of '):
-    """Print dimension of the dictionary"""
+    """Print dimension of the dictionary."""
     total = 0
 
     for k1, v1 in entities_db.items():
@@ -143,18 +143,19 @@ def print_dict_dimensions(entities_db, message='Total number of '):
 
 
 def log_dict(dict_to_print: dict, message: str = ''):
-    """Print dictionary as list with a message"""
-
+    """Print dictionary as list with a message."""
     for k1, v1 in dict_to_print.items():
         log.info(f'{message} {k1}: {v1} ')
         print(f'{message} {k1}: {v1} ')
 
 
-def get_random_key_from_dict(d):
+def get_random_key_from_dict(d: dict) -> Union[str, int, tuple]:
+    """Return random key from provided dict."""
     return random.choice(list(d.keys()))
 
 
-def get_random_value_from_dict(d):
+def get_random_value_from_dict(d: dict):
+    """Return random value from provided dict."""
     return d[get_random_key_from_dict(d)]
 
 
diff --git a/tests/test_input.py b/tests/test_input.py
index c141c85..f3e5273 100644
--- a/tests/test_input.py
+++ b/tests/test_input.py
@@ -5,7 +5,6 @@
 import logging
 import unittest
 
-import numpy as np
 from diffupy.constants import *
 from diffupy.matrix import Matrix
 from diffupy.process_input import process_input_data, map_labels_input, \
@@ -271,7 +270,6 @@ def test_validate_scores_4(self):
 
     def test_format_input_for_diffusion_label_list(self):
         """Test empty matrix."""
-
         processed_mapped_nodes_list = format_input_for_diffusion(
             map_labels_input({'Metabolite': {'C': -1}, 'Gene': {'A': 2, 'B': 1}, 'mirnas': {'A': 1, 'T': 1}},
                              self.kernel_test_1.rows_labels),
@@ -279,7 +277,7 @@ def test_format_input_for_diffusion_label_list(self):
         )
 
         # TODO: Implement in Matrix equal, now if the col order is mixed it raises error
-        #assert(np.allclose(processed_mapped_nodes_list.mat,
+        # assert(np.allclose(processed_mapped_nodes_list.mat,
         #                    np.array([[-1, 2, 1],
         #                              [-1, 1, -1],
         #                              [-1, -1, -1],
@@ -287,9 +285,9 @@ def test_format_input_for_diffusion_label_list(self):
         #                             )
         #                    )
         #        )
-        #self.assertEqual(processed_mapped_nodes_list.cols_labels,
+        # self.assertEqual(processed_mapped_nodes_list.cols_labels,
         #                 ['Metabolite', 'Gene', 'mirnas']
         #                 )
-        #self.assertEqual(processed_mapped_nodes_list.rows_labels,
+        # self.assertEqual(processed_mapped_nodes_list.rows_labels,
         #                 ['A', 'B', 'C', 'D']
         #                 )

From 65f06f9c562ab1308eecc4f918441a1855281795 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 24 Apr 2020 10:29:27 +0200
Subject: [PATCH 16/17] diffupy cli refactor and output format feature added

---
 src/diffupy/cli.py | 43 ++++++++++++++++++++++++-------------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py
index ec1db95..fcb9f82 100644
--- a/src/diffupy/cli.py
+++ b/src/diffupy/cli.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 
-"""Command line interface for DiffuPy."""
+"""Command line interface for diffuPy."""
 
 import json
 import logging
@@ -12,7 +12,7 @@
 import click
 from diffupy.process_network import get_kernel_from_network_path
 
-from .constants import OUTPUT, METHODS, EMOJI, RAW
+from .constants import OUTPUT, METHODS, EMOJI, RAW, CSV, JSON
 from .diffuse import diffuse as run_diffusion
 from .kernels import regularised_laplacian_kernel
 from .process_input import process_map_and_format_input_data_for_diff
@@ -78,14 +78,14 @@ def kernel(
 
 @main.command()
 @click.option(
-    '-n', '--network',
-    help='Path to the network graph or kernel',
+    '-i', '--input',
+    help='Input data',
     required=True,
     type=click.Path(exists=True, dir_okay=False)
 )
 @click.option(
-    '-i', '--data',
-    help='Input data',
+    '-n', '--network',
+    help='Path to the network graph or kernel',
     required=True,
     type=click.Path(exists=True, dir_okay=False)
 )
@@ -131,8 +131,15 @@ def kernel(
     default=0.05,
     show_default=True,
 )
+@click.option(
+    '-f', '--output_format',
+    help='Output format of the diffusion results.',
+    type=click.Choice([CSV, JSON]),  # only the formats diffuse() knows how to write
+    default=CSV,
+    show_default=True,
+)
 def diffuse(
-        input_data: str,
+        input: str,
         network: str,
         output: str = sys.stdout,
         method: str = RAW,
@@ -140,15 +147,16 @@ def diffuse(
         threshold: float = None,
         absolute_value: bool = True,
         p_value: float = 0.05,
+        output_format: str = CSV
 ):
     """Run a diffusion method over a network or pre-generated kernel."""
     click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}')
 
     kernel = get_kernel_from_network_path(network)
 
-    click.secho(f'Codifying data from {input_data}.')
+    click.secho(f'Processing data input from {input}.')
 
-    input_scores_dict = process_map_and_format_input_data_for_diff(input_data,
+    input_scores_dict = process_map_and_format_input_data_for_diff(input,
                                                                    kernel,
                                                                    method,
                                                                    binarize,
@@ -157,24 +165,21 @@ def diffuse(
                                                                    threshold,
                                                                    )
 
-
-    click.secho(f'Running the diffusion algorithm.')
+    click.secho(f'Computing the diffusion algorithm.')
 
     results = run_diffusion(
-        label_dict,
+        input_scores_dict,
         method,
         k=kernel
     )
 
-    # results = run_diffusion(
-    #     label_dict,
-    #     method,
-    #     graph,
-    # )
+    if output_format == CSV:  # compare by value; `is` only tests object identity
+        results.to_csv(output)
 
-    # json.dump(results, output, indent=2)
+    elif output_format == JSON:
+        json.dump(results, output, indent=2)
 
-    click.secho(f'Finished!')
+    click.secho(f'{EMOJI} Diffusion performed with success. Output located at {output} {EMOJI}')
 
 
 if __name__ == '__main__':

From b138ad9f101872464a20628689feed7fc84e37a9 Mon Sep 17 00:00:00 2001
From: jmarinllao <josepmarinllao@gmail.com>
Date: Fri, 24 Apr 2020 10:36:10 +0200
Subject: [PATCH 17/17] openpyxl dependence

---
 setup.cfg | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.cfg b/setup.cfg
index 72f668b..a978139 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -52,6 +52,7 @@ install_requires =
     scipy
     pybel==0.13.2
     pandas
+    openpyxl
 
 # Random options
 zip_safe = false