From ca4228e062a1efb4b78fc2e6e7d5c0860b7aff9e Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 10 Apr 2020 18:26:30 +0200 Subject: [PATCH 01/17] Process network major recoding --- src/diffupy/constants.py | 20 ++- src/diffupy/matrix.py | 175 +++++++++++++++++++----- src/diffupy/process_network.py | 241 +++++++++++++++++++++++++++++++++ 3 files changed, 394 insertions(+), 42 deletions(-) create mode 100644 src/diffupy/process_network.py diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py index fb1d8e9..48daee2 100644 --- a/src/diffupy/constants.py +++ b/src/diffupy/constants.py @@ -66,22 +66,30 @@ def ensure_output_dirs(): #: bel BEL = 'bel' #: node link json -NODE_LINK_JSON = 'json' +JSON = 'json' #: pickle -BEL_PICKLE = 'pickle' +PICKLE = 'pickle' #: gml GML = 'gml' #: edge list EDGE_LIST = '.lst' -#: DiffuPath available network formats -FORMATS = [ +#: DiffuPath available graph formats +GRAPH_FORMATS = [ CSV, TSV, GRAPHML, BEL, - NODE_LINK_JSON, - BEL_PICKLE, + JSON, + PICKLE, +] + +#: DiffuPath available kernel formats +KERNEL_FORMATS = [ + CSV, + TSV, + JSON, + PICKLE, ] #: Separators diff --git a/src/diffupy/matrix.py b/src/diffupy/matrix.py index d3105d6..dd4d2b5 100644 --- a/src/diffupy/matrix.py +++ b/src/diffupy/matrix.py @@ -7,9 +7,11 @@ import numpy as np import pandas as pd +from diffupy.constants import CSV +from networkx import DiGraph from .utils import get_label_ix_mapping, get_label_list_graph, get_laplacian, decode_labels, get_idx_scores_mapping, \ - get_repeated_labels + get_repeated_labels, from_dataframe_file, from_nparray_to_df log = logging.getLogger(__name__) @@ -23,15 +25,14 @@ class Matrix: """Matrix class.""" def __init__( - self, - mat=None, - rows_labels=None, - cols_labels=None, - graph=None, - quadratic=False, - name='', - init_value=None, - **kwargs + self, + mat=None, + rows_labels=None, + cols_labels=None, + graph=None, + quadratic=False, + name='', + init_value=None, ): """Initialize matrix. @@ -86,15 +87,15 @@ def __str__(self): return f"\nmatrix {self.name} \n {s} \n " - def __iter__(self, **kargs): + def __iter__(self, **attr): """Help method for the iteration of the Matrix.""" self.i = -1 self.j = 0 - if 'get_indices' in kargs: - self.get_indices = kargs['get_indices'] - if 'get_labels' in kargs: - self.get_labels = kargs['get_labels'] + if 'get_indices' in attr: + self.get_indices = attr['get_indices'] + if 'get_labels' in attr: + self.get_labels = attr['get_labels'] return self @@ -463,23 +464,6 @@ def order_rows(self, reverse=True, col_ref_idx=None): return ordered_mat - """Import""" - - def from_csv(self, csv_path): - """Import matrix from csv file using the headers as a Matrix class.""" - m = np.genfromtxt(csv_path, dtype=None, delimiter=',') - return Matrix( - mat=np.array( - [ - [float(x) - for x in a[1:]] - for a in m[1:] - ]), - rows_labels=list(m[1:, 0]), - cols_labels=list(m[0, 1:]), - name=str(os.path.basename(csv_path).replace('.csv', '')) - ) - """Export""" def to_dict(self, ordered=True): @@ -496,19 +480,138 @@ def to_dict(self, ordered=True): return d + def to_df(self, ordered=True): + """Export matrix as a data frame using the headers (row_labels, cols_labels) of the Matrix class.""" + d = self.to_dict(ordered) + + rows_labels = d.pop('rows_labels') + + df = pd.DataFrame(d) + df.rows.values = rows_labels + + return df + def to_csv(self, path, file_name='_export.csv', index=False, ordered=True): """Export matrix to csv file using the headers (row_labels, cols_labels) of the Matrix class.""" # Generate dataframe - df = pd.DataFrame(data=self.to_dict(ordered)) - df.to_csv(os.path.join(path, self.name, file_name), index=index) + self.to_df(ordered).to_csv(os.path.join(path, self.name, file_name), index=index) + + def to_nx_graph(self): + """Export matrix as a Graph using the headers (row_labels, cols_labels) of the Matrix class.""" + if len(self.cols_labels) != len(self.rows_labels) or not self.quadratic: + raise ValueError('The matrix cannot be converted as a graph since it is not quadratic, which ' + 'it is the used representation of a network (usually a kernel) as a Matrix.') + + graph = DiGraph() + + for score, sub_name, obj_name in self.__iter__(get_labels=True, get_indices=False): + if score != 0: + graph.add_edge( + sub_name, obj_name, + ) + + return graph + + +class MatrixFromDict(Matrix): + """Constructor matrix class for Dictionary data structure to Matrix conversion.""" + + def __init__(self, d, name=''): + """Initialize laplacian.""" + rows = list(d.pop('rows_labels')) + cols = list(d.keys()) + + Matrix.__init__(self, mat=np.array(list(d.values())), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromDataFrame(Matrix): + """Constructor matrix class for DataFrame to Matrix conversion.""" + + def __init__(self, df, name=''): + """Initialize laplacian.""" + rows = list(df.rows.values) + cols = list(df.cols.values) + + Matrix.__init__(self, mat=df.to_numpy(), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromNumpyArray(Matrix): + """Constructor matrix class for DataFrame to Matrix conversion.""" + + def __init__(self, nparray, name=''): + """Initialize laplacian.""" + + df = from_nparray_to_df(nparray) + + rows = list(df.rows.values) + cols = list(df.cols.values) + + Matrix.__init__(self, mat=df.to_numpy(), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromCSV(Matrix): + """Constructor matrix class for CSV to Matrix conversion.""" + + def __init__(self, csv_path, fmt=CSV, name=None): + """Initialize laplacian.""" + df = from_dataframe_file(csv_path, fmt) + + if name is None: + name = str(os.path.basename(csv_path).replace('.csv', '')) + + rows = list(df.rows.values) + cols = list(df.cols.values) + + Matrix.__init__(self, mat=df.to_numpy(), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromGraph(Matrix): + """Constructor matrix class for nx.Graph to Matrix conversion.""" + + # TODO : move instances initalization from global argument graph to here + + def __init__(self, graph, node_argument='name', name=''): + # This initialization would make a matrix representing the graph (taking a graph argument as label) + rows = list(get_label_list_graph(graph, node_argument)) + + Matrix.__init__(self, rows_labels=rows, + init_value=1, + quadratic=True, + name=name, + ) class LaplacianMatrix(Matrix): """Laplacian matrix class.""" - def __init__(self, graph, normalized=False, name=''): + def __init__(self, graph, normalized=False, node_argument='name', name=''): """Initialize laplacian.""" l_mat = get_laplacian(graph, normalized) + rows = list(get_label_list_graph(graph, node_argument)) - Matrix.__init__(self, mat=l_mat, quadratic=True, name=name, graph=graph) + Matrix.__init__(self, mat=l_mat, + rows_labels=rows, + quadratic=True, + name=name + ) diff --git a/src/diffupy/process_network.py b/src/diffupy/process_network.py new file mode 100644 index 0000000..2b2257b --- /dev/null +++ b/src/diffupy/process_network.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- + +"""Miscellaneous utils of the package.""" + +import logging +from typing import Tuple + +import numpy as np +import pandas as pd +import pybel +from diffupy.matrix import Matrix, MatrixFromDataFrame, MatrixFromDict, MatrixFromNumpyArray +from diffupy.utils import from_dataframe_file, format_checker, from_pickle, get_label_node, from_json +from networkx import DiGraph, Graph, read_graphml, read_gml, node_link_graph, read_edgelist + +from .constants import * +from .constants import CSV, TSV, GRAPHML, GML, BEL, PICKLE, EMOJI, GRAPH_FORMATS +from .kernels import regularised_laplacian_kernel + +log = logging.getLogger(__name__) + + +"""Process network as undefined format (could represented as a graph or as a kernel)""" + + +def get_kernel_and_graph_from_network_file(path: str) -> Tuple[Matrix, Graph]: + """Load network provided in cli as a kernel and as a graph.""" + graph = None + kernel = None + + if path.endswith(KERNEL_FORMATS): + try: + graph = process_graph_from_file(path) + + except ValueError or TypeError: + kernel = process_kernel_from_file(path) + + elif path.endswith(GRAPH_FORMATS): + graph = process_graph_from_file(path) + + else: + raise IOError( + f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + if kernel is None and graph is not None: + kernel = regularised_laplacian_kernel(graph) + + if kernel is not None and graph is None: + graph = kernel.to_nx_graph() + + return kernel, graph + + +def get_kernel_from_network_file(path: str) -> Matrix: + """Load network provided in cli as a kernel.""" + if path.endswith(KERNEL_FORMATS): + try: + graph = process_graph_from_file(path) + + except ValueError or TypeError: + return process_kernel_from_file(path) + + elif path.endswith(GRAPH_FORMATS): + graph = process_graph_from_file(path) + + else: + raise IOError( + f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + return regularised_laplacian_kernel(graph) + + +def get_graph_from_network_file(path: str) -> Graph: + """Load network provided in cli as a graph.""" + if path.endswith(KERNEL_FORMATS): + try: + return process_graph_from_file(path) + + except ValueError or TypeError: + kernel = process_kernel_from_file(path) + + elif path.endswith(GRAPH_FORMATS): + return process_graph_from_file(path) + + else: + raise IOError( + f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + return kernel.to_nx_graph() + + +"""Process input formats""" + + +def process_graph_from_file(path: str) -> Graph: + """Load network from path.""" + if path.endswith(CSV) or path.endswith(TSV): + graph = get_graph_from_df(path, CSV) + + elif path.endswith(TSV): + graph = get_graph_from_df(path, TSV) + + elif path.endswith(PICKLE): + graph = pybel.from_pickle(path) + + elif path.endswith(GRAPHML): + graph = read_graphml(path) + + elif path.endswith(GML): + graph = read_gml(path) + + elif path.endswith(BEL): + graph = pybel.from_path(path) + + elif path.endswith(EDGE_LIST): + graph = read_edgelist(path) + + elif path.endswith(JSON): + data = from_json(path) + graph = node_link_graph(data) + else: + raise IOError( + f'{EMOJI} The selected graph format is not valid. Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + log.info( + f'{EMOJI} Graph loaded with: \n' + f'{graph.number_of_nodes()} nodes\n' + f'{graph.number_of_edges()} edges\n' + f'{EMOJI}' + ) + + return graph + + +def process_kernel_from_file(path: str) -> Matrix: + """Load kernel from path.""" + if path.endswith(CSV): + raw_kernel = MatrixFromDataFrame(from_dataframe_file(path, CSV)) + + elif path.endswith(TSV): + raw_kernel = MatrixFromDataFrame(from_dataframe_file(path, TSV)) + + elif path.endswith(PICKLE): + raw_kernel = from_pickle(path) + + elif path.endswith(JSON): + raw_kernel = from_json(path) + + else: + raise IOError( + f'{EMOJI} The selected kernel format is not valid. Please ensure you use one of the following formats: ' + f'{KERNEL_FORMATS}' + ) + + # Check imported type of kernel + if isinstance(raw_kernel, Matrix): + kernel = raw_kernel + + elif isinstance(raw_kernel, dict): + kernel = MatrixFromDict(raw_kernel) + + elif isinstance(raw_kernel, pd.DataFrame): + kernel = MatrixFromDataFrame(raw_kernel) + + elif isinstance(raw_kernel, np.ndarray): + kernel = MatrixFromNumpyArray(raw_kernel) + + else: + raise IOError( + f'{EMOJI} The imported kernel type is not valid. Please ensure it is provided as a diffupy ' + f'Matrix, a Dict, NumpyArray or Pandas DataFrame. ' + ) + + log.info( + f'{EMOJI} Kernel loaded with: \n' + f'{len(kernel.rows_labels)} nodes\n' + f'{EMOJI}' + ) + + return kernel + + +def get_simple_graph_from_multigraph(multigraph): + """Convert undirected graph from multigraph.""" + graph = Graph() + for u, v, data in multigraph.edges(data=True): + u = get_label_node(u) + v = get_label_node(v) + + w = data['weight'] if 'weight' in data else 1.0 + if graph.has_edge(u, v): + graph[u][v]['weight'] += w + else: + graph.add_edge(u, v, weight=w) + + return graph + + +def get_graph_from_df(path: str, sep: str) -> DiGraph: + """Return network from dataFrame.""" + format_checker(sep) + + df = from_dataframe_file(path, sep) + + if SOURCE not in df.columns or TARGET not in df.columns: + raise ValueError( + f'Ensure that your file contains columns for {SOURCE} and {TARGET}. The column for {RELATION} is optional' + f'and can be omitted.' + ) + + graph = DiGraph() + + for index, row in df.iterrows(): + + # Get node names from data frame + sub_name = row[SOURCE] + obj_name = row[TARGET] + + if RELATION in df.columns: + + relation = row[RELATION] + + # Store edge in the graph + graph.add_edge( + sub_name, obj_name, + relation=relation, + ) + + else: + graph.add_edge( + sub_name, obj_name, + ) + + return graph From ef5c6e43ccf70058964e534ff64342466f2f4b23 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 10 Apr 2020 18:36:23 +0200 Subject: [PATCH 02/17] Process data input major recoding and tests --- src/diffupy/constants.py | 16 +- src/diffupy/process_data_input.py | 635 ++++++++++++++++++ src/diffupy/process_input.py | 315 --------- tests/constants.py | 2 +- .../datasets/{node.csv => node_type_col.csv} | 0 tests/test_diffusion.py | 2 +- tests/test_input.py | 107 ++- 7 files changed, 735 insertions(+), 342 deletions(-) create mode 100644 src/diffupy/process_data_input.py delete mode 100644 src/diffupy/process_input.py rename tests/resources/datasets/{node.csv => node_type_col.csv} (100%) diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py index 48daee2..354cd91 100644 --- a/src/diffupy/constants.py +++ b/src/diffupy/constants.py @@ -117,11 +117,23 @@ def ensure_output_dirs(): #: Node name NODE = 'Node' +LABEL = 'Label' +ENTITY = 'Entity' +GENE = 'Gene' + +NODE_LABELING= [ + NODE, + LABEL, + ENTITY, + GENE +] + #: Node type NODE_TYPE = 'NodeType' +#: Unspecified score type +SCORE = 'Score' #: Log2 fold change (logFC) LOG_FC = 'LogFC' #: Statistical significance (p-value) P_VALUE = 'p-value' -#: Label -LABEL = 'Label' + diff --git a/src/diffupy/process_data_input.py b/src/diffupy/process_data_input.py new file mode 100644 index 0000000..b198fc4 --- /dev/null +++ b/src/diffupy/process_data_input.py @@ -0,0 +1,635 @@ +# -*- coding: utf-8 -*- + +"""Main matrix class and processing of input data.""" + +from typing import Dict, Optional, Union + +import numpy as np +import pandas as pd + +from .constants import * +from .matrix import Matrix +from .utils import from_pickle, from_json, from_dataframe_file, from_nparray_to_df, get_random_value_from_dict, \ + get_random_key_from_dict + +"""Process input data""" + + +def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix], + kernel: Matrix, + background_labels: Union[list, dict] = None, + method: Optional[str] = 'raw', + binning: Optional[bool] = False, + absolute_value: Optional[bool] = False, + p_value: Optional[float] = None, + threshold: Optional[float] = None, + separator_str: Optional[str] = ', ' + ) -> Matrix: + """Process miscellaneous input data and format it for the diffusion computation function.""" + # If specific label background not provided, get a list from kernel labels. + if not background_labels: + background_labels = list(kernel.rows_labels) + # TODO: Discuss store label classification (mapping or as a column argument) in kernel + + # Pipeline the input, first preprocessing it, then mapping it to the background labels and finally formatting it. + return format_input_for_diffusion(map_labels_input(process_data_input(data_input, + method, + binning, + absolute_value, + p_value, + threshold, + separator_str + ), + background_labels + ), + kernel + ) + + +def process_data_input(data_input: Union[str, list, dict, np.ndarray, pd.DataFrame], + method: str = 'raw', + binning: bool = False, + absolute_value: bool = False, + p_value: float = None, + threshold: Optional[float] = None, + separator_str: Optional[str] = ', ', + ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]: + """Process and pipeline the provided miscellaneous data input in standardized data structures for further processing.""" + # Preprocess the raw input according is format types. + preprocessed_data = _process_data_input_format(data_input, separator_str) + + # If the preprocessed input is a list or a label type dict (Dict[str, list]) of lists return it for categorical input generation. + if _label_list_data_struct_check(preprocessed_data) or _type_label_list_data_struct_check(preprocessed_data): + return preprocessed_data + + # If the preprocessed input is a label type label-scores dict (Dict[str, pd.DataFrame]) pipeline it for scores codifying. + if isinstance(preprocessed_data, dict): + return {label_type: _codify_input_data(preprocessed_data_i, + method, + binning, + absolute_value, + p_value, + threshold + ) + for label_type, preprocessed_data_i in preprocessed_data.items() + } + + # If the preprocessed input is a scores-label dataframe (pd.DataFrame) pipeline it for scores codifying. + return _codify_input_data(preprocessed_data, + method, + binning, + absolute_value, + p_value, + threshold + ) + + +"""Process input formats""" + + +def _process_data_input_format(raw_data_input: Union[str, list, dict, np.ndarray, pd.DataFrame], + separ_str: str = ',') -> Union[pd.DataFrame, list, Dict[str, Union[pd.DataFrame, list]]]: + """Format the input as a label-score dataframe, a list or a labels or a type dict for latter input processing.""" + if isinstance(raw_data_input, str): + # If the data input type is a string, mostly will be a path to the dataset file. + if os.path.isfile(raw_data_input): + return _process_data_input_format(_load_data_input_from_file(raw_data_input)) + elif '/' in raw_data_input and separ_str not in ['/', ' /', '/ ']: + raise IOError( + f'{EMOJI} The file could not have been located in the provided data input path,.' + ) + # If it is not a path, will be treated as a label list with separator. + else: + return _process_data_input_format(raw_data_input.split(raw_data_input)) + + if isinstance(raw_data_input, pd.DataFrame): + return raw_data_input + + elif isinstance(raw_data_input, list) or isinstance(raw_data_input, set): + return list(set(raw_data_input)) + + elif isinstance(raw_data_input, np.ndarray): + return from_nparray_to_df(raw_data_input) + + elif isinstance(raw_data_input, dict): + if _scores_dict_data_struct_check(raw_data_input): + return pd.DataFrame.from_dict(raw_data_input, orient='index') + else: + return {label_type: _process_data_input_format(data_i) for label_type, data_i in raw_data_input.items()} + + elif isinstance(raw_data_input, Matrix): + return raw_data_input.to_df() + + else: + raise TypeError( + f'{EMOJI} The imported kernel type is not valid. Please ensure is provided as a diffupy ' + f'Matrix, a Dict, NumpyArray or Pandas DataFrame. ' + ) + + +def _load_data_input_from_file(path: str) -> Union[pd.DataFrame, list]: + """Load and process the input data according the input file format.""" + if path.endswith(CSV): + return from_dataframe_file(path, CSV) + + elif path.endswith(TSV): + return from_dataframe_file(path, TSV) + + elif path.endswith(PICKLE): + return from_pickle(path) + + elif path.endswith(JSON): + return from_json(path) + + else: + raise IOError( + f'There is a problem with your file. Please ensure the file you submitted is correctly formatted with a' + f'.csv or .tsv file extension.' + ) + + +"""Pipeline input scores""" + + +def _codify_input_data(df: pd.DataFrame, + method: str, + binning: bool, + absolute_value: bool, + p_value: float, + threshold: Optional[float], + ) -> Union[Dict[str, Dict[str, int]], + Dict[str, int]]: + """Process the input scores for the codifying process.""" + # Ensure that node labeling is in the provided dataset. + if not any(n in df.columns for n in NODE_LABELING): + raise ValueError( + f'Ensure that your file contains a column {NODE_LABELING} with node IDs.' + ) + # Standardize the title of the node column labeling column to 'label', for later processing. + elif LABEL not in df.columns: + for l in list(df.columns): + if l in NODE_LABELING: + df = df.rename(columns={l: LABEL}) + break + + # If node type provided in a column, classify in a dictionary the input codification by its node type. + if NODE_TYPE in df.columns: + + node_types = list(set(df[NODE_TYPE])) # Get the node types list set. + codified_by_type_dict = {} + + for node_type in node_types: + # Filter the nodes by the iterable type. + df_by_type = df.loc[df[NODE_TYPE] == node_type] + + # Codify the nodes for the iterable type. + codified_by_type_dict[node_type] = _codify_method_check(df_by_type, + method, + binning, + absolute_value, + p_value, + threshold + ) + return codified_by_type_dict + + else: + # Codify all the nodes of the dataframe. + return _codify_method_check(df, + method, + binning, + absolute_value, + p_value, + threshold + ) + + +def _codify_method_check(df: pd.DataFrame, + method: str, + binning: bool, + absolute_value: bool, + p_value: float, + threshold: Optional[float], + ) -> Dict[str, int]: + """Classify the input data codification according the diffusion method.""" + # Prepare input data for quantitative diffusion scoring methods + if method == RAW or method == Z: + return _codify_quantitative_input_data(df, binning, absolute_value, p_value, threshold) + + # Prepare input data for non-quantitative diffusion methods + elif method == ML or method == GM: + return _codify_non_quantitative_input_data(df, p_value, threshold) + + else: + # TODO: ber_s, ber_p, mc + raise NotImplementedError('This diffusion method has not yet been implemented.') + + +"""Assign binary labels to input for scoring methods that accept non-quantitative values""" + + +def _codify_non_quantitative_input_data( + df: pd.DataFrame, + p_value: float, + threshold: Optional[float] +) -> Dict[str, int]: + """Codify input data to get a set of labelled nodes for scoring methods that accept non-quantitative values.""" + # LogFC provided in dataset and threshold given + if LOG_FC in df.columns and threshold: + + # Label nodes with 1 if | logFC | passes threshold + df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1 + # Label nodes with -1 if | logFC | below threshold + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = -1 + + # If adjusted p-values are provided in dataset, label nodes that are not statistically significant with -1 + if P_VALUE in df.columns: + df.loc[df[P_VALUE] > p_value, SCORE] = -1 + + return df.set_index(NODE)[SCORE].to_dict() + + # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 + df[SCORE] = 1 + + return df.set_index(NODE)[SCORE].to_dict() + + +"""Assign binary labels to input for scoring methods that accept quantitative values""" + + +def _codify_quantitative_input_data( + df: pd.DataFrame, + binning: bool, + absolute_value: bool, + p_value: float, + threshold: Optional[float], +) -> Dict[str, int]: + """Codify input data to get a set of labelled nodes for scoring methods that accept quantitative values.""" + # LogFC provided in dataset and threshold given + if LOG_FC in df.columns and threshold: + + # Binarize labels with 1, 0 and/or -1 + if binning is True: + + # Add binning labels where | logFC | values above threshold are 1 and below are 0 + if absolute_value is True: + return _bin_quantitative_input_by_abs_val(df, threshold, p_value) + + # Add signed labels where | logFC | values above threshold are 1 or -1 (signed) and values below are 0 + + return _bin_quantitative_input_by_threshold(df, threshold, p_value) + + # Labels are 0s or logFC values rather than binary values + else: + # Codify inputs with | logFC | if they pass threshold; otherwise assign label as 0 + if absolute_value is True: + return _codify_quantitative_input_by_abs_val(df, threshold, p_value) + + # Codify inputs with logFC if they pass threshold; otherwise assign label as 0 + return _codify_quantitative_input_by_threshold(df, threshold, p_value) + + # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 + df[SCORE] = 1 + + return df.set_index(NODE)[SCORE].to_dict() + + +def _bin_quantitative_input_by_abs_val( + df: pd.DataFrame, + threshold: float, + p_value: float, +) -> Dict[str, int]: + """Process quantitative inputs and bin labels by absolute value.""" + # Add label 1 if | logFC | is above threshold + df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1 + # Add label 0 if | logFC | below threshold + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 + + # logFC and adjusted p-values are provided in dataset + if P_VALUE in df.columns: + return _remove_non_significant_entities(df, p_value) + + return df.set_index(NODE)[SCORE].to_dict() + + +def _bin_quantitative_input_by_threshold( + df: pd.DataFrame, + threshold: float, + p_value: float, +) -> Dict[str, int]: + """Process quantitative inputs and bin labels by threshold.""" + # Add label 1 if logFC is above threshold + df.loc[df[LOG_FC] >= threshold, SCORE] = 1 + # Add label 0 if | logFC | below threshold + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 + # Replace remaining labels with -1 (i.e. | logFC | above threshold but sign is negative) + df = df.fillna(-1) + + if p_value: + # LogFC values and adjusted p-values are provided in dataset + if P_VALUE in df.columns: + # Disregard entities if logFC adjusted p-value is not significant + return _remove_non_significant_entities(df, p_value) + + return df.set_index(NODE)[SCORE].to_dict() + + +"""Assign logFC as labels for input for scoring methods that accept quantitative values""" + + +def _codify_quantitative_input_by_abs_val( + df: pd.DataFrame, + threshold: float, + p_value: float, +) -> Dict[str, int]: + """Codify nodes with | logFC | if they pass threshold, otherwise label is 0.""" + # Codify nodes with | logFC | if they pass threshold + df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = (df[LOG_FC]).abs() + # Codify nodes with label 0 if it falls below threshold + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 + + # LogFC and adjusted p-values are provided in dataset + if P_VALUE in df.columns: + # Disregard entities if logFC adjusted p-value is not significant + return _remove_non_significant_entities(df, p_value) + + return df.set_index(NODE)[SCORE].to_dict() + + +def _codify_quantitative_input_by_threshold( + df: pd.DataFrame, + threshold: float, + p_value: float, +) -> Dict[str, int]: + """Codify inputs with logFC if they pass threshold value.""" + df.loc[df[LOG_FC] >= threshold, SCORE] = df[LOG_FC] + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 + df.loc[((df[LOG_FC]).abs() >= threshold) & ((df[LOG_FC]) < 0), SCORE] = df[LOG_FC] + + # LogFC values and adjusted p-values are provided in dataset + if P_VALUE in df.columns: + # Disregard entities if logFC adjusted p-value is not significant + return _remove_non_significant_entities(df, p_value) + + return df.set_index(NODE)[SCORE].to_dict() + + +def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[str, int]: + # Label entity 0 if adjusted p-value for logFC is not significant + df.loc[df[P_VALUE] > p_value, SCORE] = 0 + + return df.set_index(NODE)[SCORE].to_dict() + + +"""Data structures format checkers""" + + +def _scores_dict_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type Dict[str, int].""" + return (isinstance(v, dict) and + isinstance(get_random_value_from_dict(v), int) + ) + + +def _type_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type Dict[str, Dict[str, int]].""" + return (isinstance(v, dict) and + isinstance(get_random_value_from_dict(v), dict) and + isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), int) + ) + + +def _label_list_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type list.""" + return isinstance(v, list) + + +def _type_label_list_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type Dict[str, list].""" + return (isinstance(v, dict) and + isinstance(get_random_value_from_dict(v), list) + ) + + +"""Mappers from input to network background""" + + +def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], + background_labels: Union[Dict[str, list], list]) -> Union[Dict[str, int], list]: + """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" + if isinstance(background_labels, list): + return _map_labels_to_background(input_labels, background_labels) + + elif isinstance(background_labels, dict): + return {node_type: _map_labels_to_background(input_labels, node_set, node_type) + for node_type, node_set + in background_labels.items() + if _map_labels_to_background(input_labels, node_set, node_type) not in [[], {}] + } + else: + raise IOError( + f'{EMOJI} The background mapping labels should be provided as a label list or as a type dict of label list.' + ) + + +def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], + background_labels: list, + background_labels_type: str = None + ) -> Union[Dict[str, Dict[str, int]], + Dict[str, int]]: + """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" + if _type_scores_dict_data_struct_check(input_labels) or _type_label_list_data_struct_check(input_labels): + if background_labels_type: + if background_labels_type in input_labels.keys(): + return _map_labels(input_labels[background_labels_type], background_labels) + else: + return { + type: _map_labels(label_list, background_labels) + for type, label_list in input_labels.items() + if _map_labels(label_list, background_labels) not in [[], {}] + } + + return _map_labels(input_labels, background_labels) + + +def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], + background_labels: list) -> Union[Dict[str, int], list]: + """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" + if _label_list_data_struct_check(input_labels): + return list(set(input_labels).intersection(set(background_labels))) + + elif _scores_dict_data_struct_check(input_labels): + return {labels: input_labels[labels] + for labels in background_labels + if labels in input_labels + } + + elif _type_label_list_data_struct_check(input_labels): + l = [] + for type, label_list in input_labels.items(): + l += _map_labels(label_list, background_labels) + return l + + elif _type_scores_dict_data_struct_check(input_labels): + l = {} + for type, scores_dict in input_labels.items(): + l.update(_map_labels(scores_dict, background_labels)) + return l + + else: + raise TypeError( + f'{EMOJI} The input labels data structure can not be processed for label mapping' + ) + + +"""Generate/format data input as a vector/matrix for the diffusion computation matching the kernel rows""" + + +def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], + kernel: Matrix) -> Matrix: + """Format/generate input vector/matrix according the data structure of the processed_data_input.""" + if _label_list_data_struct_check(processed_input): + return format_categorical_input_vector_from_label_list(rows_labeled=processed_input, + col_label='scores', + kernel=kernel + ) + + elif _scores_dict_data_struct_check(processed_input): + return format_input_vector_from_scores_dict(processed_input, kernel) + + elif _type_label_list_data_struct_check(processed_input): + return format_categorical_input_matrix_from_label_list(processed_input, kernel) + + elif _type_scores_dict_data_struct_check(processed_input): + return format_input_matrix_from_scores_dict(processed_input, kernel) + + else: + raise TypeError( + f'{EMOJI} The label/scores mapping data structure can not be processed for the input formatting.' + ) + + +"""Generate categorical (non-quantitative) input vector matrix from raw input dataset labels""" + + +def format_categorical_input_vector_from_label_list(rows_labeled, + col_label, + kernel, + missing_value=-1, + rows_unlabeled=None # TODO: To discuss, to handle + ) -> Matrix: + """Generate categoric input vector from labels.""" + if isinstance(col_label, str): + col_label = [col_label] + + input_mat = Matrix( + rows_labels=list(rows_labeled), + cols_labels=col_label, + init_value=1) + if rows_unlabeled: + input_mat.row_bind( + matrix=Matrix( + rows_labels=list(rows_unlabeled), + cols_labels=col_label, + init_value=0) + ) + + return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel) + + +def format_categorical_input_matrix_from_label_list(rows_labels, + cols_labels: list, + kernel, + missing_value=-1, + rows_unlabeled=None # TODO: To discuss, to handle + ) -> Matrix: + """Generate input vector from labels.""" + if not isinstance(cols_labels, list): + raise NotImplementedError('The column labels should be provided as a list.') + + if len(cols_labels) > 1: + input_mat = format_categorical_input_vector_from_label_list( + rows_labels[0], + cols_labels[0], + kernel, + missing_value, + rows_unlabeled[0] + ) + + for idx, row_label in enumerate(rows_labels[1:]): + input_vector = format_categorical_input_vector_from_label_list( + row_label, + cols_labels[idx + 1], + kernel, + missing_value, + rows_unlabeled[idx + 1], + ) + input_mat.col_bind(matrix=input_vector) + + return input_mat + + elif isinstance(cols_labels, list): + return format_categorical_input_vector_from_label_list( + rows_labels, + cols_labels, + kernel, + missing_value, + rows_unlabeled + ) + + +"""Generate quantitative or binarized/categorical input vector matrix from preprocesed input dataset scores""" + + +def format_input_vector_from_scores_dict(scores_dict: dict, + kernel, + col_label: str = 'scores', + missing_value=-1, + rows_unlabeled=None # TODO: To discuss, to handle + ) -> Matrix: + """Generate scores input vector from labels scores dict.""" + + input_mat = Matrix( + mat=np.array(list(scores_dict.values())), + rows_labels=list(scores_dict.keys()), + cols_labels=[col_label] + ) + + if rows_unlabeled: + input_mat.row_bind( + matrix=Matrix( + rows_labels=list(rows_unlabeled), + cols_labels=col_label, + init_value=0) + ) + + return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel) + + +def format_input_matrix_from_scores_dict(scores_dicts: Union[Dict[str, Dict[str, int]], + Dict[str, int]], + kernel, + rows_unlabeled=None, # TODO: To discuss, to handle + ) -> Matrix: + """Generate input matrix from labels scores dict and/or handle type classification by columns.""" + if _scores_dict_data_struct_check(scores_dicts): + scores_dicts.pop('node_types') + + init_k = get_random_key_from_dict(scores_dicts) + init_v = scores_dicts.pop(init_k) + input_mat = format_input_vector_from_scores_dict(scores_dicts, + kernel, + col_label=init_k, + rows_unlabeled=init_v + ) + + for node_type, scores_dict in scores_dicts.items(): + input_vector = format_input_vector_from_scores_dict(scores_dict, + kernel, + col_label=node_type, + rows_unlabeled=rows_unlabeled + ) + input_mat.col_bind(matrix=input_vector) + + return input_mat + else: + return format_input_vector_from_scores_dict(scores_dicts, kernel) diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py deleted file mode 100644 index bedc100..0000000 --- a/src/diffupy/process_input.py +++ /dev/null @@ -1,315 +0,0 @@ -# -*- coding: utf-8 -*- - -"""Main matrix class and processing of input data.""" -from typing import Dict, List, Optional - -import networkx as nx -import pandas as pd - -from .constants import * -from .matrix import Matrix - -"""Process datasets""" - - -def process_input( - path: str, - method: str, - binning: bool, - absolute_value: bool, - p_value: float, - threshold: Optional[float], -) -> Dict[str, int]: - """Read input file and ensure necessary columns exist.""" - if path.endswith(CSV): - fmt = CSV - - elif path.endswith(TSV): - fmt = TSV - - else: - raise IOError( - f'There is a problem with your file. Please ensure the file you submitted is correctly formatted with a' - f'.csv or .tsv file extension.' - ) - - df = pd.read_csv( - path, - header=0, - sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV] - ) - - # Ensure that column Node is in dataset - if NODE not in df.columns: - raise ValueError( - f'Ensure that your file contains a column {NODE} with node IDs.' - ) - - # If logFC column not in dataFrame, ensure node type column is at least given - elif LOG_FC not in df.columns: - if NODE_TYPE not in df.columns: - raise ValueError( - f'Ensure that your file contains a column, {NODE_TYPE}, indicating node types.' - ) - - return _codify_input_data(df, method, binning, absolute_value, p_value, threshold) - - -"""Codify input according to diffusion scoring method""" - - -def _codify_input_data( - df: pd.DataFrame, - method: str, - binning: bool, - absolute_value: bool, - p_value: float, - threshold: Optional[float], -) -> Dict[str, int]: - """Prepare input data for diffusion.""" - # Prepare input data for quantitative diffusion scoring methods - if method == RAW or method == Z: - return _codify_quantitative_input_data(df, binning, absolute_value, p_value, threshold) - - # Prepare input data for non-quantitative diffusion methods - elif method == ML or method == GM: - return _codify_non_quantitative_input_data(df, p_value, threshold) - - else: - # TODO: ber_s, ber_p, mc - raise NotImplementedError('This diffusion method has not yet been implemented.') - - -"""Assign binary labels to input for scoring methods that accept non-quantitative values""" - - -def _codify_non_quantitative_input_data( - df: pd.DataFrame, - p_value: float, - threshold: Optional[float] -) -> Dict[str, int]: - """Codify input data to get a set of labelled nodes for scoring methods that accept non-quantitative values.""" - # LogFC provided in dataset and threshold given - if LOG_FC in df.columns and threshold: - - # Label nodes with 1 if | logFC | passes threshold - df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = 1 - # Label nodes with -1 if | logFC | below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = -1 - - # If adjusted p-values are provided in dataset, label nodes that are not statistically significant with -1 - if P_VALUE in df.columns: - df.loc[df[P_VALUE] > p_value, LABEL] = -1 - - return df.set_index(NODE)[LABEL].to_dict() - - # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 - df[LABEL] = 1 - - return df.set_index(NODE)[LABEL].to_dict() - - -"""Assign binary labels to input for scoring methods that accept quantitative values""" - - -def _codify_quantitative_input_data( - df: pd.DataFrame, - binning: bool, - absolute_value: bool, - p_value: float, - threshold: Optional[float], -) -> Dict[str, int]: - """Codify input data to get a set of labelled nodes for scoring methods that accept quantitative values.""" - # LogFC provided in dataset and threshold given - if LOG_FC in df.columns and threshold: - - # Binarize labels with 1, 0 and/or -1 - if binning is True: - - # Add binning labels where | logFC | values above threshold are 1 and below are 0 - if absolute_value is True: - return _bin_quantitative_input_by_abs_val(df, threshold, p_value) - - # Add signed labels where | logFC | values above threshold are 1 or -1 (signed) and values below are 0 - - return _bin_quantitative_input_by_threshold(df, threshold, p_value) - - # Labels are 0s or logFC values rather than binary values - else: - # Codify inputs with | logFC | if they pass threshold; otherwise assign label as 0 - if absolute_value is True: - return _codify_quantitative_input_by_abs_val(df, threshold, p_value) - - # Codify inputs with logFC if they pass threshold; otherwise assign label as 0 - return _codify_quantitative_input_by_threshold(df, threshold, p_value) - - # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 - df[LABEL] = 1 - - # TODO handle NODE_TYPE - return df.set_index(NODE)[LABEL].to_dict() - - -def _bin_quantitative_input_by_abs_val( - df: pd.DataFrame, - threshold: float, - p_value: float, -) -> Dict[str, int]: - """Process quantitative inputs and bin labels by absolute value.""" - # Add label 1 if | logFC | is above threshold - df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = 1 - # Add label 0 if | logFC | below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 - - # logFC and adjusted p-values are provided in dataset - if P_VALUE in df.columns: - return _remove_non_significant_entities(df, p_value) - - return df.set_index(NODE)[LABEL].to_dict() - - -def _bin_quantitative_input_by_threshold( - df: pd.DataFrame, - threshold: float, - p_value: float, -) -> Dict[str, int]: - """Process quantitative inputs and bin labels by threshold.""" - # Add label 1 if logFC is above threshold - df.loc[df[LOG_FC] >= threshold, LABEL] = 1 - # Add label 0 if | logFC | below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 - # Replace remaining labels with -1 (i.e. | logFC | above threshold but sign is negative) - df = df.fillna(-1) - - if p_value: - # LogFC values and adjusted p-values are provided in dataset - if P_VALUE in df.columns: - # Disregard entities if logFC adjusted p-value is not significant - return _remove_non_significant_entities(df, p_value) - - return df.set_index(NODE)[LABEL].to_dict() - - -"""Assign logFC as labels for input for scoring methods that accept quantitative values""" - - -def _codify_quantitative_input_by_abs_val( - df: pd.DataFrame, - threshold: float, - p_value: float, -) -> Dict[str, int]: - """Codify nodes with | logFC | if they pass threshold, otherwise label is 0.""" - # Codify nodes with | logFC | if they pass threshold - df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = (df[LOG_FC]).abs() - # Codify nodes with label 0 if it falls below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 - - # LogFC and adjusted p-values are provided in dataset - if P_VALUE in df.columns: - # Disregard entities if logFC adjusted p-value is not significant - return _remove_non_significant_entities(df, p_value) - - return df.set_index(NODE)[LABEL].to_dict() - - -def _codify_quantitative_input_by_threshold( - df: pd.DataFrame, - threshold: float, - p_value: float, -) -> Dict[str, int]: - """Codify inputs with logFC if they pass threshold value.""" - df.loc[df[LOG_FC] >= threshold, LABEL] = df[LOG_FC] - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 - df.loc[((df[LOG_FC]).abs() >= threshold) & ((df[LOG_FC]) < 0), LABEL] = df[LOG_FC] - - # LogFC values and adjusted p-values are provided in dataset - if P_VALUE in df.columns: - # Disregard entities if logFC adjusted p-value is not significant - return _remove_non_significant_entities(df, p_value) - - return df.set_index(NODE)[LABEL].to_dict() - - -def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> pd.DataFrame: - # Label entity 0 if adjusted p-value for logFC is not significant - df.loc[df[P_VALUE] > p_value, LABEL] = 0 - - return df.set_index(NODE)[LABEL].to_dict() - - -"""Map nodes from input to network""" - - -def map_nodes(input_node_dict: Dict[str, int], network: nx.Graph) -> List: - """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" - # List of nodes in network - network_nodes = list(network.nodes) - - return [input_node_dict[node] if node in input_node_dict else None for node in network_nodes] - - -"""Generate input vector from dataset labels""" - - -def generate_categoric_input_vector_from_labels( - rows_labeled, - col_label, - background_mat, - missing_value=-1, - rows_unlabeled=None, -): - """Generate categoric input vector from labels.""" - if isinstance(col_label, str): - col_label = [col_label] - - input_mat = Matrix( - rows_labels=list(rows_labeled), - cols_labels=col_label, - init_value=1) - if rows_unlabeled: - input_mat.row_bind( - matrix=Matrix( - rows_labels=list(rows_unlabeled), - cols_labels=col_label, - init_value=0) - ) - - return input_mat.match_missing_rows(background_mat.rows_labels, missing_value).match_rows(background_mat) - - -def generate_categoric_input_from_labels( - rows_labels, - cols_labels, - background_mat, - missing_value=-1, - rows_unlabeled=None, -): - """Generate input vector from labels.""" - if isinstance(cols_labels, list) and len(cols_labels) > 1: - input_mat = generate_categoric_input_vector_from_labels( - rows_labels[0], - cols_labels[0], - background_mat, - missing_value, - rows_unlabeled[0] - ) - - for idx, row_label in enumerate(rows_labels[1:]): - input_vector = generate_categoric_input_vector_from_labels( - row_label, - cols_labels[idx + 1], - background_mat, - missing_value, - rows_unlabeled[idx + 1], - ) - input_mat.col_bind(matrix=input_vector) - - return input_mat - else: - return generate_categoric_input_vector_from_labels( - rows_labels, - cols_labels, - background_mat, - missing_value, - rows_unlabeled - ) diff --git a/tests/constants.py b/tests/constants.py index 690ab12..d60e38b 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -14,7 +14,7 @@ REGULARISED_LAPLACIAN_KERNEL = os.path.join(RESOURCES_FOLDER, 'regularisedLaplacianKernel.csv') DATASETS_FOLDER = os.path.join(RESOURCES_FOLDER, 'datasets') -NODE_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node.csv') +NODE_TYPE_COL_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_type_col.csv') NODE_LOGFC_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_logfc.csv') NODE_LOGFC_PVAL_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_logfc_pval.csv') INPUT_SCORES = os.path.join(RESOURCES_FOLDER, 'input_scores.csv') diff --git a/tests/resources/datasets/node.csv b/tests/resources/datasets/node_type_col.csv similarity index 100% rename from tests/resources/datasets/node.csv rename to tests/resources/datasets/node_type_col.csv diff --git a/tests/test_diffusion.py b/tests/test_diffusion.py index 4fee51c..4e911df 100644 --- a/tests/test_diffusion.py +++ b/tests/test_diffusion.py @@ -10,7 +10,7 @@ from diffupy.diffuse import diffuse from diffupy.matrix import Matrix -from tests.constants import * +from .constants import * log = logging.getLogger(__name__) diff --git a/tests/test_input.py b/tests/test_input.py index 47bc22c..dcf3704 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -7,11 +7,11 @@ from diffupy.constants import * from diffupy.matrix import Matrix -from diffupy.process_input import process_input, map_nodes -from diffupy.utils import process_network +from diffupy.process_data_input import process_data_input, _map_labels_to_background, map_labels_input +from diffupy.process_network import get_graph_from_df from diffupy.validate_input import _validate_scores -from tests.constants import * +from .constants import * log = logging.getLogger(__name__) @@ -21,24 +21,27 @@ class ValidateTest(unittest.TestCase): def test_quantitative_bin_id(self): """Test codify label_input for quantitative scoring methods- only entity IDs given (binary labels).""" - input = NODE_TEST_PATH - input_labels_dict = process_input( + input = NODE_TYPE_COL_TEST_PATH + input_labels_dict = process_data_input( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=None, ) - self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 1}) + print(input_labels_dict) + self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}) def test_quantitative_bin_fc_sign(self): """Test codify label_input for quantitative scoring methods- logFC given (binary, signed labels).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5, ) + print(input_labels_dict) + self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': -1}) def test_quantitative_bin_fc_abs(self): """Test codify label_input for quantitative scoring methods- logFC given (binary, absolute values).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': 1}) @@ -46,7 +49,7 @@ def test_quantitative_bin_fc_abs(self): def test_quantitative_bin_fcp_sign(self): """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (binary, signed labels).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': -1}) @@ -54,7 +57,7 @@ def test_quantitative_bin_fcp_sign(self): def test_quantitative_bin_fcp_abs(self): """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (binary, absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': 1}) @@ -62,7 +65,7 @@ def test_quantitative_bin_fcp_abs(self): def test_quantitative_fc_sign(self): """Test codify label_input for quantitative scoring methods- logFC given (quantitative, signed labels).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2}) @@ -70,7 +73,7 @@ def test_quantitative_fc_sign(self): def test_quantitative_fc_abs(self): """Test codify label_input for quantitative scoring methods- logFC given (quant., absolute values).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2}) @@ -78,7 +81,7 @@ def test_quantitative_fc_abs(self): def test_quantitative_fcp_sign(self): """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (quant., signed labels).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2}) @@ -86,23 +89,23 @@ def test_quantitative_fcp_sign(self): def test_quantitative_fcp_abs(self): """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (quant., absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2}) def test_non_quantitative_bin_id(self): """Test codify label_input for non-quantitative scoring methods- only entity IDs given (binary labels).""" - input = NODE_TEST_PATH - input_labels_dict = process_input( + input = NODE_TYPE_COL_TEST_PATH + input_labels_dict = process_data_input( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=None, ) - self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 1}) + self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}) def test_non_quantitative_bin_fc_abs(self): """Test codify label_input for non-quantitative scoring methods- logFC given (binary, absolute values (sign)).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': -1, 'D': -1, 'E': 1}) @@ -110,14 +113,72 @@ def test_non_quantitative_bin_fc_abs(self): def test_non_quantitative_bin_fcp_abs(self): """Test codify label_input for non-quant. scoring methods- logFC and adj. p-value given (binary, absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': -1, 'B': 1, 'C': -1, 'D': -1, 'E': 1}) + def test_map_labels_input_label_list_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'], + background_labels=['A', 'B', 'C']) + + self.assertEqual(set(mapping), {'A', 'C', 'B'}) + + def test_map_labels_input_label_list_background_dict(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'], + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + self.assertEqual(mapping, {'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + def test_map_labels_input_type_dict_label_list_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'Gene': ['A', 'B'], 'Metabolite': ['C', 'D']}, + background_labels=['A', 'B', 'C']) + + self.assertEqual(mapping, {'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + def test_map_labels_input_type_dict_label_dict_background_dict(self): + """Test map label_input.""" + # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped. + mapping = map_labels_input(input_labels={'Gene': ['A'], 'Metabolite': ['C', 'B']}, + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']}) + + self.assertEqual(mapping, {'Gene': ['A']}) + + def test_map_labels_input_label_scores_dict_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'A': 1, 'B': 1, 'D': 1, 'E': 1}, + background_labels=['B', 'C', 'D']) + + self.assertEqual(mapping, {'B': 1, 'D': 1}) + + def test_map_labels_input_label_scores_dict_background_dict(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'A': 1, 'B': 1, 'D': 1, 'E': 1}, + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']}) + + self.assertEqual(mapping, {'Metabolite': {'D': 1}, 'Gene': {'A': 1, 'B': 1}}) + + def test_map_labels_input_type_dict_label_scores_dict_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}, + background_labels=['A', 'B', 'C']) + + self.assertEqual(mapping, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1}}) + + def test_map_labels_input_type_dict_label_scores_dict_background_dict(self): + """Test map label_input.""" + # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped. + mapping = map_labels_input(input_labels={'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}, + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + self.assertEqual(mapping, {'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1}}) + def test_network(self): """Test generate graph from csv.""" - graph = process_network(NETWORK_PATH, CSV) + graph = get_graph_from_df(NETWORK_PATH, CSV) graph_nodes = set(graph.nodes()) graph_edges = set(graph.edges()) @@ -138,14 +199,14 @@ def test_network(self): def test_node_mapping(self): """Test mapping of nodes in label_input to nodes in network.""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_data_input( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) - graph = process_network(NETWORK_PATH, CSV) + graph = get_graph_from_df(NETWORK_PATH, CSV) graph_nodes = list(graph.nodes()) - mapped_nodes_list = map_nodes(input_labels_dict, graph_nodes) + mapped_nodes_list = _map_labels_to_background(input_labels_dict, graph_nodes) self.assertEqual(mapped_nodes_list, [0.0, 1.0, 0.0, 0.0, None, 1.0, None, None, None]) From dd8e4e13bd1611a2cb2359ce09736240cd44e31b Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 10 Apr 2020 18:37:27 +0200 Subject: [PATCH 03/17] Diffupy Process data input utils --- src/diffupy/utils.py | 179 +++++++++++-------------------------------- 1 file changed, 45 insertions(+), 134 deletions(-) diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py index d747520..ca4ae7b 100644 --- a/src/diffupy/utils.py +++ b/src/diffupy/utils.py @@ -5,24 +5,54 @@ import json import logging import pickle +import random import warnings -from typing import List, Tuple +from typing import List import networkx as nx import numpy as np import pandas as pd import pybel - -from networkx import DiGraph, read_graphml, read_gml, node_link_graph, read_edgelist +from networkx import Graph from .constants import * -from .constants import CSV, TSV, GRAPHML, GML, BEL, BEL_PICKLE, NODE_LINK_JSON, EMOJI, FORMATS - +from .constants import CSV, TSV, GRAPH_FORMATS log = logging.getLogger(__name__) -def get_laplacian(graph: nx.Graph, normalized: bool = False) -> np.ndarray: +def from_dataframe_file(path: str, fmt: str) -> pd.DataFrame: + """Read network file.""" + format_checker(fmt) + + return pd.read_csv( + path, + header=0, + sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV] + ) + + +def from_json(path: str): + """Read from json file.""" + with open(path) as f: + return json.load(f) + + +def from_pickle(input_path): + """Read from pickle file.""" + with open(input_path, 'rb') as f: + unpickler = pickle.Unpickler(f) + return unpickler.load() + + +def from_nparray_to_df(nparray: np.ndarray) -> pd.DataFrame: + """Convert numpy array to data frame.""" + return pd.DataFrame(data=nparray[1:, 1:], + index=nparray[1:, 0], + columns=nparray[0, 1:]) + + +def get_laplacian(graph: Graph, normalized: bool = False) -> np.ndarray: """Return Laplacian matrix.""" if nx.is_directed(graph): warnings.warn('Since graph is directed, it will be converted to an undirected graph.') @@ -35,7 +65,7 @@ def get_laplacian(graph: nx.Graph, normalized: bool = False) -> np.ndarray: return nx.laplacian_matrix(graph).toarray() -def set_diagonal_matrix(matrix, d): +def set_diagonal_matrix(matrix: np.ndarray, d: list) -> np.ndarray: """Set diagonal matrix.""" for j, row in enumerate(matrix): for i, x in enumerate(row): @@ -157,137 +187,18 @@ def print_dict_dimensions(entities_db, title): print(f'Total: {total} ') -def get_simple_graph_from_multigraph(multigraph): - """Convert undirected graph from multigraph.""" - graph = nx.Graph() - for u, v, data in multigraph.edges(data=True): - u = get_label_node(u) - v = get_label_node(v) - - w = data['weight'] if 'weight' in data else 1.0 - if graph.has_edge(u, v): - graph[u][v]['weight'] += w - else: - graph.add_edge(u, v, weight=w) - - return graph - - -"""Check formats of networks """ - - -def _format_checker(fmt: str) -> None: - """Check column sep.""" - if fmt not in FORMATS: +def format_checker(fmt: str, fmt_list: list = GRAPH_FORMATS) -> None: + """Check formats.""" + if fmt not in fmt_list: raise ValueError( f'The selected sep {fmt} is not valid. Please ensure you use one of the following formats: ' - f'{FORMATS}' + f'{fmt_list}' ) -"""Process networks""" - - -def _read_network_file(path: str, fmt: str) -> pd.DataFrame: - """Read network file.""" - _format_checker(fmt) - - df = pd.read_csv( - path, - header=0, - sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV] - ) - - if SOURCE not in df.columns or TARGET not in df.columns: - raise ValueError( - f'Ensure that your file contains columns for {SOURCE} and {TARGET}. The column for {RELATION} is optional' - f'and can be omitted.' - ) - - return df - - -def process_network(path: str, sep: str) -> DiGraph: - """Return network from dataFrame.""" - _format_checker(sep) - - df = _read_network_file(path, sep) - - graph = DiGraph() - - for index, row in df.iterrows(): - - # Get node names from data frame - sub_name = row[SOURCE] - obj_name = row[TARGET] - - if RELATION in df.columns: - - relation = row[RELATION] - - # Store edge in the graph - graph.add_edge( - sub_name, obj_name, - relation=relation, - ) - - else: - graph.add_edge( - sub_name, obj_name, - ) - - return graph - - -def load_json_file(path: str) -> DiGraph: - """Read json file.""" - with open(path) as f: - return json.load(f) - - -def from_pickle(input_path): - """Read from pickle file.""" - with open(input_path, 'rb') as f: - unpickler = pickle.Unpickler(f) - return unpickler.load() - - -def process_network_from_cli(path: str) -> nx.Graph: - """Load network from path.""" - if path.endswith(CSV): - graph = process_network(path, CSV) - - elif path.endswith(TSV): - graph = process_network(path, TSV) - - elif path.endswith(GRAPHML): - graph = read_graphml(path) - - elif path.endswith(GML): - graph = read_gml(path) - - elif path.endswith(BEL): - graph = pybel.from_path(path) - - elif path.endswith(BEL_PICKLE): - graph = pybel.from_pickle(path) - - elif path.endswith(EDGE_LIST): - graph = read_edgelist(path) - - elif path.endswith(NODE_LINK_JSON): - data = load_json_file(path) - graph = node_link_graph(data) - - else: - raise IOError( - f'{EMOJI} The selected format is not valid. Please ensure you use one of the following formats: ' - f'{FORMATS}' - ) - return graph +def get_random_key_from_dict(d): + return random.choice(list(d.keys())) -def process_kernel_from_cli(path: str): - """Process kernel from cli.""" - # TODO process different kinds of input format kernel - return from_pickle(path) +def get_random_value_from_dict(d): + return d[get_random_key_from_dict(d)] From 78556e3f496a2af244c94c5ae22cff446eb5fd07 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Tue, 14 Apr 2020 18:24:34 +0200 Subject: [PATCH 04/17] Format inputs refactor and tested --- ...process_data_input.py => process_input.py} | 163 +++++++++++------- tests/test_input.py | 89 +++++++--- 2 files changed, 164 insertions(+), 88 deletions(-) rename src/diffupy/{process_data_input.py => process_input.py} (82%) diff --git a/src/diffupy/process_data_input.py b/src/diffupy/process_input.py similarity index 82% rename from src/diffupy/process_data_input.py rename to src/diffupy/process_input.py index b198fc4..ef6875b 100644 --- a/src/diffupy/process_data_input.py +++ b/src/diffupy/process_input.py @@ -46,7 +46,7 @@ def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, ) -def process_data_input(data_input: Union[str, list, dict, np.ndarray, pd.DataFrame], +def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFrame], method: str = 'raw', binning: bool = False, absolute_value: bool = False, @@ -485,22 +485,34 @@ def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, i def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], - kernel: Matrix) -> Matrix: + kernel: Matrix, + missing_value: int = -1) -> Matrix: """Format/generate input vector/matrix according the data structure of the processed_data_input.""" if _label_list_data_struct_check(processed_input): return format_categorical_input_vector_from_label_list(rows_labeled=processed_input, col_label='scores', - kernel=kernel + kernel=kernel, + missing_value=missing_value ) - elif _scores_dict_data_struct_check(processed_input): - return format_input_vector_from_scores_dict(processed_input, kernel) + elif _type_dict_label_list_data_struct_check(processed_input): + return format_categorical_input_matrix_from_label_list(rows_labels=list(processed_input.values()), + cols_labels=list(processed_input.keys()), + kernel=kernel, + missing_value=missing_value + ) - elif _type_label_list_data_struct_check(processed_input): - return format_categorical_input_matrix_from_label_list(processed_input, kernel) + elif _label_scores_dict_data_struct_check(processed_input): + return format_input_vector_from_label_score_dict(labels_scores_dict=processed_input, + kernel=kernel, + missing_value=missing_value + ) - elif _type_scores_dict_data_struct_check(processed_input): - return format_input_matrix_from_scores_dict(processed_input, kernel) + elif _type_dict_label_scores_dict_data_struct_check(processed_input): + return format_input_matrix_from_type_label_score_dict(type_dict_labels_scores_dict=processed_input, + kernel=kernel, + missing_value=missing_value + ) else: raise TypeError( @@ -511,48 +523,57 @@ def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict """Generate categorical (non-quantitative) input vector matrix from raw input dataset labels""" -def format_categorical_input_vector_from_label_list(rows_labeled, - col_label, - kernel, - missing_value=-1, - rows_unlabeled=None # TODO: To discuss, to handle +def format_categorical_input_vector_from_label_list(rows_labeled: Union[set, list], + col_label: Union[str, set, list], + kernel: Matrix, + missing_value: int = -1, + rows_unlabeled=None, + i: int = None ) -> Matrix: """Generate categoric input vector from labels.""" if isinstance(col_label, str): col_label = [col_label] input_mat = Matrix( - rows_labels=list(rows_labeled), + rows_labels=list(set(rows_labeled)), cols_labels=col_label, - init_value=1) + init_value=1 # By default the categorical input value is 1 + ) + if rows_unlabeled: + if i: + rows_unlabeled = rows_unlabeled[i] + input_mat.row_bind( matrix=Matrix( rows_labels=list(rows_unlabeled), cols_labels=col_label, - init_value=0) + init_value=0 # By default the non labeled input value is 0 + ) ) return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel) -def format_categorical_input_matrix_from_label_list(rows_labels, - cols_labels: list, - kernel, - missing_value=-1, - rows_unlabeled=None # TODO: To discuss, to handle +def format_categorical_input_matrix_from_label_list(rows_labels: Union[set, list], + cols_labels: Union[set, list], + kernel: Matrix, + missing_value: int = -1, + rows_unlabeled=None ) -> Matrix: """Generate input vector from labels.""" if not isinstance(cols_labels, list): raise NotImplementedError('The column labels should be provided as a list.') if len(cols_labels) > 1: + input_mat = format_categorical_input_vector_from_label_list( rows_labels[0], cols_labels[0], kernel, missing_value, - rows_unlabeled[0] + rows_unlabeled, + i=0 ) for idx, row_label in enumerate(rows_labels[1:]): @@ -561,75 +582,85 @@ def format_categorical_input_matrix_from_label_list(rows_labels, cols_labels[idx + 1], kernel, missing_value, - rows_unlabeled[idx + 1], + rows_unlabeled, + idx + 1 ) input_mat.col_bind(matrix=input_vector) return input_mat - elif isinstance(cols_labels, list): - return format_categorical_input_vector_from_label_list( - rows_labels, - cols_labels, - kernel, - missing_value, - rows_unlabeled - ) + return format_categorical_input_vector_from_label_list( + rows_labels, + cols_labels, + kernel, + missing_value, + rows_unlabeled + ) """Generate quantitative or binarized/categorical input vector matrix from preprocesed input dataset scores""" -def format_input_vector_from_scores_dict(scores_dict: dict, - kernel, - col_label: str = 'scores', - missing_value=-1, - rows_unlabeled=None # TODO: To discuss, to handle - ) -> Matrix: +def format_input_vector_from_label_score_dict(labels_scores_dict: Dict[str, int], + kernel: Matrix, + col_label: str = 'scores', + missing_value: int = -1, + rows_unlabeled: dict = None, # TODO: To discuss + type_k: bool = False + ) -> Matrix: """Generate scores input vector from labels scores dict.""" input_mat = Matrix( - mat=np.array(list(scores_dict.values())), - rows_labels=list(scores_dict.keys()), + mat=np.transpose(np.array([list(labels_scores_dict.values())])), + rows_labels=list(labels_scores_dict.keys()), cols_labels=[col_label] ) if rows_unlabeled: + if type_k: + rows_unlabeled = rows_unlabeled[col_label] + input_mat.row_bind( matrix=Matrix( - rows_labels=list(rows_unlabeled), - cols_labels=col_label, - init_value=0) + mat=np.transpose(np.array([list(rows_unlabeled.values())])), + rows_labels=list(rows_unlabeled.keys()), + cols_labels=[col_label] + ) ) return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel) -def format_input_matrix_from_scores_dict(scores_dicts: Union[Dict[str, Dict[str, int]], - Dict[str, int]], - kernel, - rows_unlabeled=None, # TODO: To discuss, to handle - ) -> Matrix: +def format_input_matrix_from_type_label_score_dict(type_dict_labels_scores_dict: Union[Dict[str, Dict[str, int]], + Dict[str, int]], + kernel, + missing_value: int = -1, + rows_unlabeled=None, # TODO: To discuss + ) -> Matrix: """Generate input matrix from labels scores dict and/or handle type classification by columns.""" - if _scores_dict_data_struct_check(scores_dicts): - scores_dicts.pop('node_types') - - init_k = get_random_key_from_dict(scores_dicts) - init_v = scores_dicts.pop(init_k) - input_mat = format_input_vector_from_scores_dict(scores_dicts, - kernel, - col_label=init_k, - rows_unlabeled=init_v - ) - - for node_type, scores_dict in scores_dicts.items(): - input_vector = format_input_vector_from_scores_dict(scores_dict, - kernel, - col_label=node_type, - rows_unlabeled=rows_unlabeled - ) + if _type_dict_label_scores_dict_data_struct_check(type_dict_labels_scores_dict): + + init_k = get_random_key_from_dict(type_dict_labels_scores_dict) + init_v = type_dict_labels_scores_dict.pop(init_k) + + input_mat = format_input_vector_from_label_score_dict(init_v, + kernel, + init_k, + missing_value, + rows_unlabeled, + True + ) + + for node_type, scores_dict in type_dict_labels_scores_dict.items(): + input_vector = format_input_vector_from_label_score_dict(scores_dict, + kernel, + node_type, + missing_value, + rows_unlabeled, + True + ) input_mat.col_bind(matrix=input_vector) return input_mat else: - return format_input_vector_from_scores_dict(scores_dicts, kernel) + return format_input_vector_from_label_score_dict(type_dict_labels_scores_dict, kernel) diff --git a/tests/test_input.py b/tests/test_input.py index dcf3704..c141c85 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -5,9 +5,11 @@ import logging import unittest +import numpy as np from diffupy.constants import * from diffupy.matrix import Matrix -from diffupy.process_data_input import process_data_input, _map_labels_to_background, map_labels_input +from diffupy.process_input import process_input_data, map_labels_input, \ + format_input_for_diffusion from diffupy.process_network import get_graph_from_df from diffupy.validate_input import _validate_scores @@ -22,26 +24,23 @@ class ValidateTest(unittest.TestCase): def test_quantitative_bin_id(self): """Test codify label_input for quantitative scoring methods- only entity IDs given (binary labels).""" input = NODE_TYPE_COL_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=None, ) - print(input_labels_dict) self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}) def test_quantitative_bin_fc_sign(self): """Test codify label_input for quantitative scoring methods- logFC given (binary, signed labels).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5, ) - print(input_labels_dict) - self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': -1}) def test_quantitative_bin_fc_abs(self): """Test codify label_input for quantitative scoring methods- logFC given (binary, absolute values).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': 1}) @@ -49,7 +48,7 @@ def test_quantitative_bin_fc_abs(self): def test_quantitative_bin_fcp_sign(self): """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (binary, signed labels).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': -1}) @@ -57,7 +56,7 @@ def test_quantitative_bin_fcp_sign(self): def test_quantitative_bin_fcp_abs(self): """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (binary, absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': 1}) @@ -65,7 +64,7 @@ def test_quantitative_bin_fcp_abs(self): def test_quantitative_fc_sign(self): """Test codify label_input for quantitative scoring methods- logFC given (quantitative, signed labels).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2}) @@ -73,7 +72,7 @@ def test_quantitative_fc_sign(self): def test_quantitative_fc_abs(self): """Test codify label_input for quantitative scoring methods- logFC given (quant., absolute values).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2}) @@ -81,7 +80,7 @@ def test_quantitative_fc_abs(self): def test_quantitative_fcp_sign(self): """Test codify label_input for quantitative scoring methods- logFC and adj. p-value given (quant., signed labels).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2}) @@ -89,7 +88,7 @@ def test_quantitative_fcp_sign(self): def test_quantitative_fcp_abs(self): """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (quant., absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2}) @@ -97,7 +96,7 @@ def test_quantitative_fcp_abs(self): def test_non_quantitative_bin_id(self): """Test codify label_input for non-quantitative scoring methods- only entity IDs given (binary labels).""" input = NODE_TYPE_COL_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=None, ) self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}) @@ -105,7 +104,7 @@ def test_non_quantitative_bin_id(self): def test_non_quantitative_bin_fc_abs(self): """Test codify label_input for non-quantitative scoring methods- logFC given (binary, absolute values (sign)).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': -1, 'D': -1, 'E': 1}) @@ -113,7 +112,7 @@ def test_non_quantitative_bin_fc_abs(self): def test_non_quantitative_bin_fcp_abs(self): """Test codify label_input for non-quant. scoring methods- logFC and adj. p-value given (binary, absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': -1, 'B': 1, 'C': -1, 'D': -1, 'E': 1}) @@ -122,7 +121,7 @@ def test_map_labels_input_label_list_background_list(self): """Test map label_input.""" mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'], background_labels=['A', 'B', 'C']) - + # As set because the order is not relevant. self.assertEqual(set(mapping), {'A', 'C', 'B'}) def test_map_labels_input_label_list_background_dict(self): @@ -141,7 +140,7 @@ def test_map_labels_input_type_dict_label_list_background_list(self): def test_map_labels_input_type_dict_label_dict_background_dict(self): """Test map label_input.""" - # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped. + # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not mapped. mapping = map_labels_input(input_labels={'Gene': ['A'], 'Metabolite': ['C', 'B']}, background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']}) @@ -170,7 +169,7 @@ def test_map_labels_input_type_dict_label_scores_dict_background_list(self): def test_map_labels_input_type_dict_label_scores_dict_background_dict(self): """Test map label_input.""" - # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not maped. + # If the labels are classified in another type ('D' and 'B'), since it do not match with the background it will be not mapped. mapping = map_labels_input(input_labels={'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}, background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']}) @@ -199,16 +198,16 @@ def test_network(self): def test_node_mapping(self): """Test mapping of nodes in label_input to nodes in network.""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_data_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) graph = get_graph_from_df(NETWORK_PATH, CSV) graph_nodes = list(graph.nodes()) - mapped_nodes_list = _map_labels_to_background(input_labels_dict, graph_nodes) + mapped_nodes_list = map_labels_input(input_labels_dict, graph_nodes) - self.assertEqual(mapped_nodes_list, [0.0, 1.0, 0.0, 0.0, None, 1.0, None, None, None]) + self.assertEqual(mapped_nodes_list, {'A': 0.7, 'B': 1.2, 'C': 0.0, 'D': 0.0, 'E': 2.2}) def test_validate_scores_1(self): """Test validate scores 1.""" @@ -248,3 +247,49 @@ def test_validate_scores_4(self): ) with self.assertRaises(ValueError): _validate_scores(matrix) + + kernel_test_1 = Matrix( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + cols_labels=['A', 'B', 'C', 'D'], + rows_labels=['A', 'B', 'C', 'D'], + name='Test Kernel 1' + ) + + kernel_test_2 = Matrix( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + cols_labels=['A', 'B', 'C', 'F'], + rows_labels=['A', 'B', 'C', 'F'], + name='Test Kernel 2' + ) + + kernel_test_3 = Matrix( + [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]], + cols_labels=['A', 'B', 'C', 'D', 'F'], + rows_labels=['A', 'B', 'C', 'D', 'F'], + name='Test Kernel 3' + ) + + def test_format_input_for_diffusion_label_list(self): + """Test empty matrix.""" + + processed_mapped_nodes_list = format_input_for_diffusion( + map_labels_input({'Metabolite': {'C': -1}, 'Gene': {'A': 2, 'B': 1}, 'mirnas': {'A': 1, 'T': 1}}, + self.kernel_test_1.rows_labels), + self.kernel_test_1, + ) + + # TODO: Implement in Matrix equal, now if the col order is mixed it raises error + #assert(np.allclose(processed_mapped_nodes_list.mat, + # np.array([[-1, 2, 1], + # [-1, 1, -1], + # [-1, -1, -1], + # [-1, -1, -1]] + # ) + # ) + # ) + #self.assertEqual(processed_mapped_nodes_list.cols_labels, + # ['Metabolite', 'Gene', 'mirnas'] + # ) + #self.assertEqual(processed_mapped_nodes_list.rows_labels, + # ['A', 'B', 'C', 'D'] + # ) From 10f11b06bdce5412078f84cb666df302e10a01f8 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Wed, 15 Apr 2020 17:07:53 +0200 Subject: [PATCH 05/17] Mapping subsets labels, implemented as _map_label_dict and _map_label_list --- src/diffupy/process_input.py | 74 ++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 20 deletions(-) diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index ef6875b..ca908df 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -383,18 +383,18 @@ def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[s """Data structures format checkers""" -def _scores_dict_data_struct_check(v: Union[dict, list]) -> bool: +def _label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: """Check data structure type Dict[str, int].""" return (isinstance(v, dict) and - isinstance(get_random_value_from_dict(v), int) + isinstance(get_random_value_from_dict(v), (int, float)) ) -def _type_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: +def _type_dict_label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: """Check data structure type Dict[str, Dict[str, int]].""" return (isinstance(v, dict) and isinstance(get_random_value_from_dict(v), dict) and - isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), int) + isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), (int, float)) ) @@ -403,7 +403,7 @@ def _label_list_data_struct_check(v: Union[dict, list]) -> bool: return isinstance(v, list) -def _type_label_list_data_struct_check(v: Union[dict, list]) -> bool: +def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool: """Check data structure type Dict[str, list].""" return (isinstance(v, dict) and isinstance(get_random_value_from_dict(v), list) @@ -415,7 +415,7 @@ def _type_label_list_data_struct_check(v: Union[dict, list]) -> bool: def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], background_labels: Union[Dict[str, list], list]) -> Union[Dict[str, int], list]: - """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" + """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" if isinstance(background_labels, list): return _map_labels_to_background(input_labels, background_labels) @@ -436,8 +436,9 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int] background_labels_type: str = None ) -> Union[Dict[str, Dict[str, int]], Dict[str, int]]: - """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" - if _type_scores_dict_data_struct_check(input_labels) or _type_label_list_data_struct_check(input_labels): + """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" + if _type_dict_label_scores_dict_data_struct_check(input_labels) or _type_dict_label_list_data_struct_check( + input_labels): if background_labels_type: if background_labels_type in input_labels.keys(): return _map_labels(input_labels[background_labels_type], background_labels) @@ -451,29 +452,62 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int] return _map_labels(input_labels, background_labels) +def _map_label_list(input_labels: Union[str, Set[str], List[str]], + background_labels: List[str]) -> List[str]: + mapped_list = [] + for label in input_labels: + if isinstance(label, str): + if label in background_labels: + mapped_list.append(label) + elif isinstance(label, set) or isinstance(label, list): + for sublabel in set(label): + if sublabel in background_labels: + mapped_list.append(label) + else: + raise TypeError( + f'{EMOJI} The input label {label} data structure can not be processed for label mapping' + ) + return mapped_list + + +def _map_label_dict(input_labels: Dict[Union[str, set], Union[int, float]], + background_labels: list) -> Dict[str, Union[int, float]]: + mapped_dict = {} + for label, v in input_labels.items(): + if isinstance(label, str): + if label in background_labels: + mapped_dict[label] = v + elif isinstance(label, set) or isinstance(label, list): + for sublabel in set(label): + if sublabel in background_labels: + mapped_dict[label] = v + else: + raise TypeError( + f'{EMOJI} The input label {label} data structure can not be processed for label mapping' + ) + return mapped_dict + + def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], - background_labels: list) -> Union[Dict[str, int], list]: + background_labels: list) -> Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]: """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" if _label_list_data_struct_check(input_labels): - return list(set(input_labels).intersection(set(background_labels))) + return _map_label_list(input_labels, background_labels) - elif _scores_dict_data_struct_check(input_labels): - return {labels: input_labels[labels] - for labels in background_labels - if labels in input_labels - } + elif _label_scores_dict_data_struct_check(input_labels): + return _map_label_dict(input_labels, background_labels) - elif _type_label_list_data_struct_check(input_labels): + elif _type_dict_label_list_data_struct_check(input_labels): l = [] for type, label_list in input_labels.items(): l += _map_labels(label_list, background_labels) return l - elif _type_scores_dict_data_struct_check(input_labels): - l = {} + elif _type_dict_label_scores_dict_data_struct_check(input_labels): + d = {} for type, scores_dict in input_labels.items(): - l.update(_map_labels(scores_dict, background_labels)) - return l + d.update(_map_labels(scores_dict, background_labels)) + return d else: raise TypeError( From 7c42d5dc83b4179d32fad772f108958393cac18c Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Thu, 16 Apr 2020 14:25:56 +0200 Subject: [PATCH 06/17] Parse xls added to diffuPy utils and as a process input option --- src/diffupy/constants.py | 9 ++ src/diffupy/process_input.py | 11 +- src/diffupy/utils.py | 231 +++++++++++++++++++++++++++-------- 3 files changed, 198 insertions(+), 53 deletions(-) diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py index 354cd91..bf9ad8e 100644 --- a/src/diffupy/constants.py +++ b/src/diffupy/constants.py @@ -59,6 +59,10 @@ def ensure_output_dirs(): #: csv CSV = 'csv' +#: xml +XML = 'xml' +#: xmls +XMLS = 'xmls' #: tsv TSV = 'tsv' #: graphML @@ -74,6 +78,11 @@ def ensure_output_dirs(): #: edge list EDGE_LIST = '.lst' +XLS_FORMATS = [ + XML, + XMLS +] + #: DiffuPath available graph formats GRAPH_FORMATS = [ CSV, diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index ca908df..58f0686 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -127,11 +127,20 @@ def _process_data_input_format(raw_data_input: Union[str, list, dict, np.ndarray ) -def _load_data_input_from_file(path: str) -> Union[pd.DataFrame, list]: +def _load_data_input_from_file(path: str, **further_parse_args) -> Union[pd.DataFrame, list]: """Load and process the input data according the input file format.""" if path.endswith(CSV): return from_dataframe_file(path, CSV) + elif path.endswith(XLS_FORMATS): + return parse_xls_to_df(path, + further_parse_args.get('min_row'), + further_parse_args.get('relevant_sheets'), + further_parse_args.get('irrelevant_sheets'), + further_parse_args.get('relevant_cols'), + further_parse_args.get('irrelevant_cols') + ) + elif path.endswith(TSV): return from_dataframe_file(path, TSV) diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py index ca4ae7b..ff31c87 100644 --- a/src/diffupy/utils.py +++ b/src/diffupy/utils.py @@ -7,10 +7,12 @@ import pickle import random import warnings -from typing import List +from collections import defaultdict +from typing import List, Union, Dict, Optional import networkx as nx import numpy as np +import openpyxl as opxl import pandas as pd import pybel from networkx import Graph @@ -20,36 +22,7 @@ log = logging.getLogger(__name__) - -def from_dataframe_file(path: str, fmt: str) -> pd.DataFrame: - """Read network file.""" - format_checker(fmt) - - return pd.read_csv( - path, - header=0, - sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV] - ) - - -def from_json(path: str): - """Read from json file.""" - with open(path) as f: - return json.load(f) - - -def from_pickle(input_path): - """Read from pickle file.""" - with open(input_path, 'rb') as f: - unpickler = pickle.Unpickler(f) - return unpickler.load() - - -def from_nparray_to_df(nparray: np.ndarray) -> pd.DataFrame: - """Convert numpy array to data frame.""" - return pd.DataFrame(data=nparray[1:, 1:], - index=nparray[1:, 0], - columns=nparray[0, 1:]) +"""Matrix/graph handling utils.""" def get_laplacian(graph: Graph, normalized: bool = False) -> np.ndarray: @@ -151,23 +124,6 @@ def get_idx_scores_mapping(scores): return {i: score for i, score in enumerate(scores)} -def decode_labels(labels): - """Validate labels.""" - labels_decode = [] - - for label in labels: - if not isinstance(label, str): - - if isinstance(label, int): - label = str(label) - else: - label = label.decode('utf-8').replace('"', '') - - labels_decode.append(label) - - return labels_decode - - def print_dict_dimensions(entities_db, title): """Print dimension of the dictionary.""" total = 0 @@ -187,6 +143,17 @@ def print_dict_dimensions(entities_db, title): print(f'Total: {total} ') +def get_random_key_from_dict(d): + return random.choice(list(d.keys())) + + +def get_random_value_from_dict(d): + return d[get_random_key_from_dict(d)] + + +"""File loading utils.""" + + def format_checker(fmt: str, fmt_list: list = GRAPH_FORMATS) -> None: """Check formats.""" if fmt not in fmt_list: @@ -196,9 +163,169 @@ def format_checker(fmt: str, fmt_list: list = GRAPH_FORMATS) -> None: ) -def get_random_key_from_dict(d): - return random.choice(list(d.keys())) +def from_dataframe_file(path: str, fmt: str) -> pd.DataFrame: + """Read network file.""" + format_checker(fmt) + + return pd.read_csv( + path, + header=0, + sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV] + ) -def get_random_value_from_dict(d): - return d[get_random_key_from_dict(d)] +def from_json(path: str): + """Read from json file.""" + with open(path) as f: + return json.load(f) + + +def from_pickle(input_path): + """Read from pickle file.""" + with open(input_path, 'rb') as f: + unpickler = pickle.Unpickler(f) + return unpickler.load() + + +def from_nparray_to_df(nparray: np.ndarray) -> pd.DataFrame: + """Convert numpy array to data frame.""" + return pd.DataFrame(data=nparray[1:, 1:], + index=nparray[1:, 0], + columns=nparray[0, 1:]) + + +"""Data parsing utils.""" + + +def decode_labels(labels): + """Validate labels.""" + labels_decode = [] + + for label in labels: + if not isinstance(label, str): + + if isinstance(label, int): + label = str(label) + else: + label = label.decode('utf-8').replace('"', '') + + labels_decode.append(label) + + return labels_decode + + +def munge_label(label: Union[str, int, float]) -> str: + """Munge label strings.""" + remove_set = ['*', ' ', '|', '-', '"', "'", "↑", "↓", "\n"] + split_set = ['/'] + + label = str(label).lower() + + for symb in remove_set: + if symb in label: + label = label.replace(symb, '') + + for symb in split_set: + if symb in label: + label = tuple(set(label.split(symb))) + if len(label) == 1: + label = label[0] + + return label + + +def munge_label_list(labels: list): + """Munge labels list.""" + return list(set([munge_label(label) for label in labels])) + + +def munge_label_scores_dict(labels: dict) -> Dict[str, Union[list, int, str]]: + """Munge labels dict.""" + return {munge_label(label): v for label, v in labels.items()} + + +def munge_label_type_dict(label_dict: Dict[str, Union[list, int, str, dict]]) -> Dict[str, Union[list, int, str, dict]]: + """Munge labels type dict.""" + type_label_dict = {} + + for type_label, labels in label_dict.items(): + if isinstance(labels, dict): + type_label_dict[type_label] = munge_label_scores_dict(labels) + + elif isinstance(labels, dict): + type_label_dict[type_label] = munge_label_scores_dict(labels) + + return type_label_dict + + +def munge_cell(cell): + """Munge cell.""" + if isinstance(cell, str): + if cell.replace(',', '').replace('.', '').replace('-', '').isnumeric(): + return float(cell) + else: + return munge_label(cell) + + elif isinstance(cell, float) or isinstance(cell, int): + return cell + + else: + raise TypeError('The cell type could not be processed.') + + +def parse_xls_sheet_to_df(sheet: opxl.workbook, + min_row: Optional[int] = 1, + relevant_cols: Optional[list] = None, + irrelevant_cols: Optional[list] = None) -> pd.DataFrame: + """Process/format excel sheets to DataFrame.""" + parsed_sheet_dict = defaultdict(list) + + for col in sheet.iter_cols(min_row=min_row): + col_label = col[0].value + + if relevant_cols is None and irrelevant_cols is None: + relevant_cols = [col_label] + irrelevant_cols = [] + elif relevant_cols is None: + relevant_cols = [] + elif irrelevant_cols is None: + irrelevant_cols = [] + + parsed_sheet_dict[col_label].append([munge_cell(cell.value) + for cell in col[1:] + if (col_label in relevant_cols or col_label not in irrelevant_cols) and + munge_cell(cell.value) != '' + ]) + + return pd.DataFrame.from_dict(parsed_sheet_dict) + + +def parse_xls_to_df(path: str, + min_row: Optional[int] = 1, + relevant_sheets: Optional[list] = None, + irrelevant_sheets: Optional[list] = None, + relevant_cols: Optional[list] = None, + irrelevant_cols: Optional[list] = None, + ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: + """Process excel file as a set (if several excel sheets) or a single dataframe.""" + wb = opxl.load_workbook(filename=path) + + sheets = wb.sheetnames + df_dict = {} + + if relevant_sheets is None and irrelevant_sheets is None: + relevant_sheets = sheets + irrelevant_sheets = [] + elif relevant_sheets is None: + relevant_sheets = [] + elif irrelevant_sheets is None: + irrelevant_sheets = [] + + if len(sheets) > 1: + return {df_dict[sheets[ix].lower()]: parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols) + for ix, sheet in enumerate(wb) + if sheets[ix] in relevant_sheets or sheets[ix] not in irrelevant_sheets + } + + else: + return parse_xls_sheet_to_df(wb[sheets[0]]) From 526781d6e71267d7e86ccefdb98a63849137fbd1 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Sun, 19 Apr 2020 17:37:11 +0200 Subject: [PATCH 07/17] General refactors and documentation in process_input --- src/diffupy/process_input.py | 149 +++++++++++++++++++++-------------- 1 file changed, 90 insertions(+), 59 deletions(-) diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index 58f0686..f5325d0 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -2,7 +2,7 @@ """Main matrix class and processing of input data.""" -from typing import Dict, Optional, Union +from typing import Dict, Optional, Union, List, Set import numpy as np import pandas as pd @@ -10,35 +10,49 @@ from .constants import * from .matrix import Matrix from .utils import from_pickle, from_json, from_dataframe_file, from_nparray_to_df, get_random_value_from_dict, \ - get_random_key_from_dict + get_random_key_from_dict, parse_xls_to_df """Process input data""" def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix], kernel: Matrix, - background_labels: Union[list, dict] = None, - method: Optional[str] = 'raw', + method: str = 'raw', binning: Optional[bool] = False, absolute_value: Optional[bool] = False, p_value: Optional[float] = None, threshold: Optional[float] = None, - separator_str: Optional[str] = ', ' + background_labels: Optional[Union[list, Dict[str, list]]] = None, + **further_parse_args ) -> Matrix: - """Process miscellaneous input data and format it for the diffusion computation function.""" + """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and + format it for the diffusion computation function. + + :param data_input: A miscellaneous data input to be processed/formatted for the diffuPy diffusion computation. + :param kernel: A pre-computed kernel to perform the label mapping and the matching for the input formatting. + :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"]. + :param binning: If logFC provided in dataset, convert logFC to binary. + :param absolute_value: Codify node labels by applying threshold to | logFC | in input. + :param p_value: Statistical significance. + :param threshold: Codify node labels by applying a threshold to logFC in input. + :param background_labels: Labels set to map the input labels, which can provide label classification by type dict. + :param further_parse_args: Arguments to refine the data input parsing, among which: + for string list parsing: separ_str + for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols + for excel: relevant_sheets, irrelevant_sheets + """ # If specific label background not provided, get a list from kernel labels. if not background_labels: background_labels = list(kernel.rows_labels) - # TODO: Discuss store label classification (mapping or as a column argument) in kernel # Pipeline the input, first preprocessing it, then mapping it to the background labels and finally formatting it. - return format_input_for_diffusion(map_labels_input(process_data_input(data_input, + return format_input_for_diffusion(map_labels_input(process_input_data(data_input, method, binning, absolute_value, p_value, threshold, - separator_str + **further_parse_args ), background_labels ), @@ -52,14 +66,27 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra absolute_value: bool = False, p_value: float = None, threshold: Optional[float] = None, - separator_str: Optional[str] = ', ', + **further_parse_args ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]: - """Process and pipeline the provided miscellaneous data input in standardized data structures for further processing.""" - # Preprocess the raw input according is format types. - preprocessed_data = _process_data_input_format(data_input, separator_str) - - # If the preprocessed input is a list or a label type dict (Dict[str, list]) of lists return it for categorical input generation. - if _label_list_data_struct_check(preprocessed_data) or _type_label_list_data_struct_check(preprocessed_data): + """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures: + label list, type_dict label lists, label-scores dict or type_dict label-scores dicts. + + :param data_input: A miscellaneous data input to be processed. + :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"] + :param binning: If logFC provided in dataset, convert logFC to binary. + :param absolute_value: Codify node labels by applying threshold to | logFC | in input. + :param p_value: Statistical significance. + :param threshold: Codify node labels by applying a threshold to logFC in input. + :param further_parse_args: Arguments to refine the data input parsing, among which: + for string list parsing: separ_str + for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols + for excel: relevant_sheets, irrelevant_sheets + """ + # Preprocess the raw input according its data structure types. + preprocessed_data = _process_data_input_format(data_input, **further_parse_args) + + # If the preprocessed input is a list or a label type dict (Dict[str, list]) return it for latter categorical input generation. + if _label_list_data_struct_check(preprocessed_data) or _type_dict_label_list_data_struct_check(preprocessed_data): return preprocessed_data # If the preprocessed input is a label type label-scores dict (Dict[str, pd.DataFrame]) pipeline it for scores codifying. @@ -88,35 +115,39 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra def _process_data_input_format(raw_data_input: Union[str, list, dict, np.ndarray, pd.DataFrame], - separ_str: str = ',') -> Union[pd.DataFrame, list, Dict[str, Union[pd.DataFrame, list]]]: + separ_str: str = ', ', + **further_parse_args) -> Union[pd.DataFrame, list, Dict[str, Union[pd.DataFrame, list]]]: """Format the input as a label-score dataframe, a list or a labels or a type dict for latter input processing.""" if isinstance(raw_data_input, str): # If the data input type is a string, mostly will be a path to the dataset file. if os.path.isfile(raw_data_input): - return _process_data_input_format(_load_data_input_from_file(raw_data_input)) + return _process_data_input_format(_load_data_input_from_file(raw_data_input, **further_parse_args)) elif '/' in raw_data_input and separ_str not in ['/', ' /', '/ ']: raise IOError( f'{EMOJI} The file could not have been located in the provided data input path,.' ) - # If it is not a path, will be treated as a label list with separator. + # If the data input is not identified as a path, it will be treated as a label list with an indicated separator. else: - return _process_data_input_format(raw_data_input.split(raw_data_input)) - - if isinstance(raw_data_input, pd.DataFrame): - return raw_data_input + return _process_data_input_format(raw_data_input.split(separ_str)) elif isinstance(raw_data_input, list) or isinstance(raw_data_input, set): return list(set(raw_data_input)) - elif isinstance(raw_data_input, np.ndarray): - return from_nparray_to_df(raw_data_input) + if isinstance(raw_data_input, pd.DataFrame): + return raw_data_input elif isinstance(raw_data_input, dict): - if _scores_dict_data_struct_check(raw_data_input): + # If the data input type dict is a label-scores dict, codify it as a Panda's dataframe for latter processing. + if _label_scores_dict_data_struct_check(raw_data_input): return pd.DataFrame.from_dict(raw_data_input, orient='index') + # Else it will be treated as a label_type dict, calling recursively the process input format for each type subset (key). else: + # It is assumed that the all the dict values match the same data type. return {label_type: _process_data_input_format(data_i) for label_type, data_i in raw_data_input.items()} + elif isinstance(raw_data_input, np.ndarray): + return from_nparray_to_df(raw_data_input) + elif isinstance(raw_data_input, Matrix): return raw_data_input.to_df() @@ -168,13 +199,13 @@ def _codify_input_data(df: pd.DataFrame, threshold: Optional[float], ) -> Union[Dict[str, Dict[str, int]], Dict[str, int]]: - """Process the input scores for the codifying process.""" + """Process the input scores dataframe for the codifying process.""" # Ensure that node labeling is in the provided dataset. if not any(n in df.columns for n in NODE_LABELING): raise ValueError( f'Ensure that your file contains a column {NODE_LABELING} with node IDs.' ) - # Standardize the title of the node column labeling column to 'label', for later processing. + # Standardize the title of the node column labeling column to 'Label', for later processing. elif LABEL not in df.columns: for l in list(df.columns): if l in NODE_LABELING: @@ -230,10 +261,10 @@ def _codify_method_check(df: pd.DataFrame, else: # TODO: ber_s, ber_p, mc - raise NotImplementedError('This diffusion method has not yet been implemented.') + raise NotImplementedError('This diffusion method has not been yet implemented.') -"""Assign binary labels to input for scoring methods that accept non-quantitative values""" +"""Assign binary scores to input for scoring methods that ONLY accept non-quantitative values""" def _codify_non_quantitative_input_data( @@ -241,7 +272,7 @@ def _codify_non_quantitative_input_data( p_value: float, threshold: Optional[float] ) -> Dict[str, int]: - """Codify input data to get a set of labelled nodes for scoring methods that accept non-quantitative values.""" + """Codify input data to get a set of scored nodes for scoring methods that accept non-quantitative values.""" # LogFC provided in dataset and threshold given if LOG_FC in df.columns and threshold: @@ -250,19 +281,19 @@ def _codify_non_quantitative_input_data( # Label nodes with -1 if | logFC | below threshold df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = -1 - # If adjusted p-values are provided in dataset, label nodes that are not statistically significant with -1 + # If adjusted p-values are provided in dataset, score nodes that are not statistically significant with -1 if P_VALUE in df.columns: df.loc[df[P_VALUE] > p_value, SCORE] = -1 - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() - # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 + # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign scores as 1 df[SCORE] = 1 - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() -"""Assign binary labels to input for scoring methods that accept quantitative values""" +"""Assign binary scores to input for scoring methods that accept quantitative values""" def _codify_quantitative_input_data( @@ -272,34 +303,34 @@ def _codify_quantitative_input_data( p_value: float, threshold: Optional[float], ) -> Dict[str, int]: - """Codify input data to get a set of labelled nodes for scoring methods that accept quantitative values.""" + """Codify input data to get a set of scored nodes for scoring methods that accept quantitative values.""" # LogFC provided in dataset and threshold given if LOG_FC in df.columns and threshold: - # Binarize labels with 1, 0 and/or -1 + # Binarize scores with 1, 0 and/or -1 if binning is True: - # Add binning labels where | logFC | values above threshold are 1 and below are 0 + # Add binning scores where | logFC | values above threshold are 1 and below are 0 if absolute_value is True: return _bin_quantitative_input_by_abs_val(df, threshold, p_value) - # Add signed labels where | logFC | values above threshold are 1 or -1 (signed) and values below are 0 + # Add signed scores where | logFC | values above threshold are 1 or -1 (signed) and values below are 0 return _bin_quantitative_input_by_threshold(df, threshold, p_value) # Labels are 0s or logFC values rather than binary values else: - # Codify inputs with | logFC | if they pass threshold; otherwise assign label as 0 + # Codify inputs with | logFC | if they pass threshold; otherwise assign score as 0 if absolute_value is True: return _codify_quantitative_input_by_abs_val(df, threshold, p_value) - # Codify inputs with logFC if they pass threshold; otherwise assign label as 0 + # Codify inputs with logFC if they pass threshold; otherwise assign score as 0 return _codify_quantitative_input_by_threshold(df, threshold, p_value) - # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 + # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign scores as 1 df[SCORE] = 1 - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() def _bin_quantitative_input_by_abs_val( @@ -307,17 +338,17 @@ def _bin_quantitative_input_by_abs_val( threshold: float, p_value: float, ) -> Dict[str, int]: - """Process quantitative inputs and bin labels by absolute value.""" - # Add label 1 if | logFC | is above threshold + """Process quantitative inputs and bin scores by absolute value.""" + # Add score 1 if | logFC | is above threshold df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1 - # Add label 0 if | logFC | below threshold + # Add score 0 if | logFC | below threshold df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 # logFC and adjusted p-values are provided in dataset if P_VALUE in df.columns: return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() def _bin_quantitative_input_by_threshold( @@ -325,12 +356,12 @@ def _bin_quantitative_input_by_threshold( threshold: float, p_value: float, ) -> Dict[str, int]: - """Process quantitative inputs and bin labels by threshold.""" - # Add label 1 if logFC is above threshold + """Process quantitative inputs and bin scores by threshold.""" + # Add score 1 if logFC is above threshold df.loc[df[LOG_FC] >= threshold, SCORE] = 1 - # Add label 0 if | logFC | below threshold + # Add score 0 if | logFC | below threshold df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 - # Replace remaining labels with -1 (i.e. | logFC | above threshold but sign is negative) + # Replace remaining score with -1 (i.e. | logFC | above threshold but sign is negative) df = df.fillna(-1) if p_value: @@ -339,10 +370,10 @@ def _bin_quantitative_input_by_threshold( # Disregard entities if logFC adjusted p-value is not significant return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() -"""Assign logFC as labels for input for scoring methods that accept quantitative values""" +"""Assign logFC as score for input for scoring methods that accept quantitative values""" def _codify_quantitative_input_by_abs_val( @@ -350,10 +381,10 @@ def _codify_quantitative_input_by_abs_val( threshold: float, p_value: float, ) -> Dict[str, int]: - """Codify nodes with | logFC | if they pass threshold, otherwise label is 0.""" + """Codify nodes with | logFC | if they pass threshold, otherwise score is 0.""" # Codify nodes with | logFC | if they pass threshold df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = (df[LOG_FC]).abs() - # Codify nodes with label 0 if it falls below threshold + # Codify nodes with score 0 if it falls below threshold df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 # LogFC and adjusted p-values are provided in dataset @@ -361,7 +392,7 @@ def _codify_quantitative_input_by_abs_val( # Disregard entities if logFC adjusted p-value is not significant return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() def _codify_quantitative_input_by_threshold( @@ -379,14 +410,14 @@ def _codify_quantitative_input_by_threshold( # Disregard entities if logFC adjusted p-value is not significant return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[str, int]: # Label entity 0 if adjusted p-value for logFC is not significant df.loc[df[P_VALUE] > p_value, SCORE] = 0 - return df.set_index(NODE)[SCORE].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() """Data structures format checkers""" From 9c8388e604240a3eff9e98a4444f1d35655ea47f Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Mon, 20 Apr 2020 13:39:06 +0200 Subject: [PATCH 08/17] General refator in imports and function naming updates in diffuPy package --- src/diffupy/cli.py | 61 ++++++++++++++++++---------------- src/diffupy/diffuse.py | 2 +- src/diffupy/matrix.py | 4 +-- src/diffupy/process_network.py | 6 ++-- 4 files changed, 38 insertions(+), 35 deletions(-) diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py index 7e6b0a7..c5cbdf8 100644 --- a/src/diffupy/cli.py +++ b/src/diffupy/cli.py @@ -10,12 +10,13 @@ import time import click +from diffupy.process_network import get_kernel_from_network_path -from .constants import OUTPUT, METHODS, EMOJI +from .constants import OUTPUT, METHODS, EMOJI, RAW from .diffuse import diffuse as run_diffusion from .kernels import regularised_laplacian_kernel -from .process_input import process_input -from .utils import process_network_from_cli +from .process_input import process_input_data_for_diff +from .process_network import process_graph_from_file logger = logging.getLogger(__name__) @@ -42,9 +43,9 @@ def main(): ) @click.option('-l', '--log', is_flag=True, help='Activate debug mode') def kernel( - network: str, - output: str = OUTPUT, - log: bool = None + graph: str, + output: str = OUTPUT, + log: bool = None ): """Generate a kernel for a given network.""" # Configure logging level @@ -55,16 +56,16 @@ def kernel( logging.basicConfig(level=logging.INFO) logger.setLevel(logging.INFO) - click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}') + click.secho(f'{EMOJI} Loading graph from {graph} {EMOJI}') - graph = process_network_from_cli(network) + graph = process_graph_from_file(graph) - click.secho(f'{EMOJI} Calculating regularized Laplacian kernel. This might take a while... {EMOJI}') + click.secho(f'{EMOJI} Generating regularized Laplacian kernel from graph. This might take a while... {EMOJI}') exe_t_0 = time.time() background_mat = regularised_laplacian_kernel(graph) exe_t_f = time.time() - output_file = os.path.join(output, f'{network.split("/")[-1]}.pickle') + output_file = os.path.join(output, f'{graph.split("/")[-1]}.pickle') # Export numpy array with open(output_file, 'wb') as file: @@ -98,7 +99,7 @@ def kernel( '-m', '--method', help='Diffusion method', type=click.Choice(METHODS), - required=True, + default=RAW, ) @click.option( '-b', '--binarize', @@ -112,6 +113,7 @@ def kernel( @click.option( '-t', '--threshold', help='Codify node labels by applying a threshold to logFC in input.', + default=None, type=float, ) @click.option( @@ -130,36 +132,37 @@ def kernel( show_default=True, ) def diffuse( - network: str, - data: str, - output: str, - method: str, - binarize: bool, - absolute_value: bool, - threshold: float, - p_value: float, + input_data: str, + network: str, + output: str = sys.stdout, + method: str = RAW, + binarize: bool = True, + threshold: float = None, + absolute_value: bool = True, + p_value: float = 0.05, ): """Run a diffusion method over a network or pre-generated kernel.""" click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}') - graph = process_network_from_cli(network) - click.secho( - f'{EMOJI} Graph loaded with: \n' - f'{graph.number_of_nodes()} nodes\n' - f'{graph.number_of_edges()} edges\n' - f'{EMOJI}' - ) + kernel = get_kernel_from_network_path(network) - click.secho(f'Codifying data from {data}.') + click.secho(f'Codifying data from {input_data}.') - input_scores_dict = process_input(data, method, binarize, absolute_value, p_value, threshold) + input_scores_dict = process_input_data_for_diff(input_data, + kernel, + method, + binarize, + absolute_value, + p_value, + threshold, + ) click.secho(f'Running the diffusion algorithm.') results = run_diffusion( input_scores_dict, method, - graph, + k=kernel ) json.dump(results, output, indent=2) diff --git a/src/diffupy/diffuse.py b/src/diffupy/diffuse.py index c6b8202..88befca 100644 --- a/src/diffupy/diffuse.py +++ b/src/diffupy/diffuse.py @@ -28,7 +28,7 @@ def diffuse( ) -> Matrix: """Run diffusion on a network given an input and a diffusion method. - :param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (List) or n-dimensional (Matrix). + :param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (Vector) or n-dimensional (Matrix). :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"] :param graph: A network as a graph. It could be optional if a Kernel is provided :param kwargs: Optional arguments: diff --git a/src/diffupy/matrix.py b/src/diffupy/matrix.py index dd4d2b5..bbda07e 100644 --- a/src/diffupy/matrix.py +++ b/src/diffupy/matrix.py @@ -80,7 +80,7 @@ def __init__( def __str__(self): """Return a string representation of the Matrix.""" - s = f" {self.cols_labels}" + s = f" {self.cols_labels}" for i, row_label in enumerate(self.rows_labels): s += f"\n {row_label} {self.mat[i]} " @@ -589,7 +589,7 @@ def __init__(self, csv_path, fmt=CSV, name=None): class MatrixFromGraph(Matrix): """Constructor matrix class for nx.Graph to Matrix conversion.""" - # TODO : move instances initalization from global argument graph to here + # TODO : move instances initialization from global argument graph to here def __init__(self, graph, node_argument='name', name=''): # This initialization would make a matrix representing the graph (taking a graph argument as label) diff --git a/src/diffupy/process_network.py b/src/diffupy/process_network.py index 2b2257b..4b699c8 100644 --- a/src/diffupy/process_network.py +++ b/src/diffupy/process_network.py @@ -22,7 +22,7 @@ """Process network as undefined format (could represented as a graph or as a kernel)""" -def get_kernel_and_graph_from_network_file(path: str) -> Tuple[Matrix, Graph]: +def get_kernel_and_graph_from_network_path(path: str) -> Tuple[Matrix, Graph]: """Load network provided in cli as a kernel and as a graph.""" graph = None kernel = None @@ -52,7 +52,7 @@ def get_kernel_and_graph_from_network_file(path: str) -> Tuple[Matrix, Graph]: return kernel, graph -def get_kernel_from_network_file(path: str) -> Matrix: +def get_kernel_from_network_path(path: str) -> Matrix: """Load network provided in cli as a kernel.""" if path.endswith(KERNEL_FORMATS): try: @@ -73,7 +73,7 @@ def get_kernel_from_network_file(path: str) -> Matrix: return regularised_laplacian_kernel(graph) -def get_graph_from_network_file(path: str) -> Graph: +def get_graph_from_network_path(path: str) -> Graph: """Load network provided in cli as a graph.""" if path.endswith(KERNEL_FORMATS): try: From 8e01ad1f8c5904534af7ad25b351fe17294478b5 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Mon, 20 Apr 2020 14:07:49 +0200 Subject: [PATCH 09/17] Added feature rename dataframe column titles according (if) provided label_mapping --- src/diffupy/process_input.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index f5325d0..1ef3ee4 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -96,7 +96,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra binning, absolute_value, p_value, - threshold + threshold, + further_parse_args.get('cols_titles_mapping') ) for label_type, preprocessed_data_i in preprocessed_data.items() } @@ -107,7 +108,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra binning, absolute_value, p_value, - threshold + threshold, + further_parse_args.get('cols_titles_mapping') ) @@ -197,6 +199,7 @@ def _codify_input_data(df: pd.DataFrame, absolute_value: bool, p_value: float, threshold: Optional[float], + cols_titles_mapping: Optional[Dict[str:str]] = None ) -> Union[Dict[str, Dict[str, int]], Dict[str, int]]: """Process the input scores dataframe for the codifying process.""" @@ -205,8 +208,15 @@ def _codify_input_data(df: pd.DataFrame, raise ValueError( f'Ensure that your file contains a column {NODE_LABELING} with node IDs.' ) + + # Rename dataframe column titles according (if) provided label_mapping. + if cols_titles_mapping is not None: + for label_to_rename, new_name in cols_titles_mapping.items(): + if label_to_rename in df.columns: + df = df.rename(columns={label_to_rename: new_name}) + # Standardize the title of the node column labeling column to 'Label', for later processing. - elif LABEL not in df.columns: + if LABEL not in df.columns: for l in list(df.columns): if l in NODE_LABELING: df = df.rename(columns={l: LABEL}) From fc345ded76ca67d66ea1197dfaf6346669b6440f Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Tue, 21 Apr 2020 14:24:58 +0200 Subject: [PATCH 10/17] Excel parser refactor after testing --- src/diffupy/utils.py | 32 +++++++------------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py index ff31c87..f8d0ec8 100644 --- a/src/diffupy/utils.py +++ b/src/diffupy/utils.py @@ -278,24 +278,14 @@ def parse_xls_sheet_to_df(sheet: opxl.workbook, relevant_cols: Optional[list] = None, irrelevant_cols: Optional[list] = None) -> pd.DataFrame: """Process/format excel sheets to DataFrame.""" - parsed_sheet_dict = defaultdict(list) + parsed_sheet_dict = {} for col in sheet.iter_cols(min_row=min_row): col_label = col[0].value - if relevant_cols is None and irrelevant_cols is None: - relevant_cols = [col_label] - irrelevant_cols = [] - elif relevant_cols is None: - relevant_cols = [] - elif irrelevant_cols is None: - irrelevant_cols = [] - - parsed_sheet_dict[col_label].append([munge_cell(cell.value) - for cell in col[1:] - if (col_label in relevant_cols or col_label not in irrelevant_cols) and - munge_cell(cell.value) != '' - ]) + if ((relevant_cols is not None and col_label in relevant_cols) or + (irrelevant_cols is not None and col_label not in irrelevant_cols)): + parsed_sheet_dict[col_label] = [munge_cell(cell.value) for cell in col[1:]] return pd.DataFrame.from_dict(parsed_sheet_dict) @@ -311,20 +301,12 @@ def parse_xls_to_df(path: str, wb = opxl.load_workbook(filename=path) sheets = wb.sheetnames - df_dict = {} - - if relevant_sheets is None and irrelevant_sheets is None: - relevant_sheets = sheets - irrelevant_sheets = [] - elif relevant_sheets is None: - relevant_sheets = [] - elif irrelevant_sheets is None: - irrelevant_sheets = [] if len(sheets) > 1: - return {df_dict[sheets[ix].lower()]: parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols) + return {sheets[ix].lower(): parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols) for ix, sheet in enumerate(wb) - if sheets[ix] in relevant_sheets or sheets[ix] not in irrelevant_sheets + if (relevant_sheets is not None and sheets[ix] in relevant_sheets) or + (irrelevant_sheets is not None and sheets[ix] in irrelevant_sheets) } else: From ea597723681ec50c2c5fa7fbba098f91486116a9 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Tue, 21 Apr 2020 17:26:44 +0200 Subject: [PATCH 11/17] Process input refactor and process substrings feature --- src/diffupy/cli.py | 22 ++-- src/diffupy/constants.py | 24 ++-- src/diffupy/process_input.py | 207 ++++++++++++++++++++++------------- 3 files changed, 151 insertions(+), 102 deletions(-) diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py index c5cbdf8..0c49023 100644 --- a/src/diffupy/cli.py +++ b/src/diffupy/cli.py @@ -15,7 +15,7 @@ from .constants import OUTPUT, METHODS, EMOJI, RAW from .diffuse import diffuse as run_diffusion from .kernels import regularised_laplacian_kernel -from .process_input import process_input_data_for_diff +from .process_input import process_map_and_format_input_data_for_diff from .process_network import process_graph_from_file logger = logging.getLogger(__name__) @@ -62,14 +62,14 @@ def kernel( click.secho(f'{EMOJI} Generating regularized Laplacian kernel from graph. This might take a while... {EMOJI}') exe_t_0 = time.time() - background_mat = regularised_laplacian_kernel(graph) + kernel = regularised_laplacian_kernel(graph) exe_t_f = time.time() output_file = os.path.join(output, f'{graph.split("/")[-1]}.pickle') # Export numpy array with open(output_file, 'wb') as file: - pickle.dump(background_mat, file, protocol=4) + pickle.dump(kernel, file, protocol=4) running_time = exe_t_f - exe_t_0 @@ -148,14 +148,14 @@ def diffuse( click.secho(f'Codifying data from {input_data}.') - input_scores_dict = process_input_data_for_diff(input_data, - kernel, - method, - binarize, - absolute_value, - p_value, - threshold, - ) + input_scores_dict = process_map_and_format_input_data_for_diff(input_data, + kernel, + method, + binarize, + absolute_value, + p_value, + threshold, + ) click.secho(f'Running the diffusion algorithm.') diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py index bf9ad8e..581ce9e 100644 --- a/src/diffupy/constants.py +++ b/src/diffupy/constants.py @@ -60,9 +60,9 @@ def ensure_output_dirs(): #: csv CSV = 'csv' #: xml -XML = 'xml' +XLS = 'xls' #: xmls -XMLS = 'xmls' +XLSX = 'xlsx' #: tsv TSV = 'tsv' #: graphML @@ -78,28 +78,28 @@ def ensure_output_dirs(): #: edge list EDGE_LIST = '.lst' -XLS_FORMATS = [ - XML, - XMLS -] +XLS_FORMATS = ( + XLS, + XLSX +) -#: DiffuPath available graph formats -GRAPH_FORMATS = [ +#: Available graph formats +GRAPH_FORMATS = ( CSV, TSV, GRAPHML, BEL, JSON, PICKLE, -] +) -#: DiffuPath available kernel formats -KERNEL_FORMATS = [ +#: Available kernel formats +KERNEL_FORMATS = ( CSV, TSV, JSON, PICKLE, -] +) #: Separators FORMAT_SEPARATOR_MAPPING = { diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index 1ef3ee4..4e65078 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -2,7 +2,7 @@ """Main matrix class and processing of input data.""" -from typing import Dict, Optional, Union, List, Set +from typing import Dict, Optional, Union, List, Set, Tuple import numpy as np import pandas as pd @@ -15,16 +15,16 @@ """Process input data""" -def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix], - kernel: Matrix, - method: str = 'raw', - binning: Optional[bool] = False, - absolute_value: Optional[bool] = False, - p_value: Optional[float] = None, - threshold: Optional[float] = None, - background_labels: Optional[Union[list, Dict[str, list]]] = None, - **further_parse_args - ) -> Matrix: +def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix], + kernel: Matrix, + method: str = 'raw', + binning: Optional[bool] = False, + absolute_value: Optional[bool] = False, + p_value: Optional[float] = None, + threshold: Optional[float] = None, + background_labels: Optional[Union[list, Dict[str, list]]] = None, + **further_parse_args + ) -> Matrix: """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and format it for the diffusion computation function. @@ -40,6 +40,7 @@ def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, for string list parsing: separ_str for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols for excel: relevant_sheets, irrelevant_sheets + for mapping: check_substrings (as a bool if input list or list of labels types if input dict) """ # If specific label background not provided, get a list from kernel labels. if not background_labels: @@ -54,7 +55,8 @@ def process_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, threshold, **further_parse_args ), - background_labels + background_labels, + check_substrings=further_parse_args.get('check_substrings') ), kernel ) @@ -64,8 +66,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra method: str = 'raw', binning: bool = False, absolute_value: bool = False, - p_value: float = None, - threshold: Optional[float] = None, + p_value: float = 0.05, + threshold: Optional[float] = 0.5, **further_parse_args ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]: """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures: @@ -199,22 +201,22 @@ def _codify_input_data(df: pd.DataFrame, absolute_value: bool, p_value: float, threshold: Optional[float], - cols_titles_mapping: Optional[Dict[str:str]] = None + cols_titles_mapping: Optional[Dict[str, str]] = None ) -> Union[Dict[str, Dict[str, int]], Dict[str, int]]: """Process the input scores dataframe for the codifying process.""" - # Ensure that node labeling is in the provided dataset. - if not any(n in df.columns for n in NODE_LABELING): - raise ValueError( - f'Ensure that your file contains a column {NODE_LABELING} with node IDs.' - ) - # Rename dataframe column titles according (if) provided label_mapping. if cols_titles_mapping is not None: for label_to_rename, new_name in cols_titles_mapping.items(): if label_to_rename in df.columns: df = df.rename(columns={label_to_rename: new_name}) + # Ensure that node labeling is in the provided dataset. + if not any(n in df.columns for n in NODE_LABELING): + raise ValueError( + f'Ensure that your file contains a column {NODE_LABELING} with node IDs.' + ) + # Standardize the title of the node column labeling column to 'Label', for later processing. if LABEL not in df.columns: for l in list(df.columns): @@ -464,16 +466,25 @@ def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool: def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], - background_labels: Union[Dict[str, list], list]) -> Union[Dict[str, int], list]: + background_labels: Union[Dict[str, list], list], + check_substrings: Union[List, bool] = None) -> Union[Dict[str, int], list]: """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" if isinstance(background_labels, list): - return _map_labels_to_background(input_labels, background_labels) + return _map_labels_to_background(input_labels, + background_labels, + check_substring=check_substrings) elif isinstance(background_labels, dict): - return {node_type: _map_labels_to_background(input_labels, node_set, node_type) + return {node_type: _map_labels_to_background(input_labels, + node_set, + background_labels_type=node_type, + check_substring=check_substrings) for node_type, node_set in background_labels.items() - if _map_labels_to_background(input_labels, node_set, node_type) not in [[], {}] + if _map_labels_to_background(input_labels, + node_set, + background_labels_type=node_type, + check_substring=check_substrings) not in [[], {}] } else: raise IOError( @@ -481,88 +492,126 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st ) +def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], + background_labels: list, + check_substrings: bool = False) -> Union[ + list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]: + """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" + if _label_list_data_struct_check(input_labels): + return _map_label_list(input_labels, background_labels, check_substrings) + + elif _label_scores_dict_data_struct_check(input_labels): + return _map_label_dict(input_labels, background_labels, check_substrings) + + elif _type_dict_label_list_data_struct_check(input_labels): + l = [] + for type, label_list in input_labels.items(): + l += _map_labels(label_list, background_labels, check_substrings) + return l + + elif _type_dict_label_scores_dict_data_struct_check(input_labels): + d = {} + for type, scores_dict in input_labels.items(): + d.update(_map_labels(scores_dict, background_labels, check_substrings)) + return d + + else: + raise TypeError( + f'{EMOJI} The input labels data structure can not be processed for label mapping' + ) + + def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], background_labels: list, - background_labels_type: str = None + background_labels_type: str = None, + check_substring: Union[List, bool] = None ) -> Union[Dict[str, Dict[str, int]], Dict[str, int]]: """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" - if _type_dict_label_scores_dict_data_struct_check(input_labels) or _type_dict_label_list_data_struct_check( - input_labels): - if background_labels_type: - if background_labels_type in input_labels.keys(): - return _map_labels(input_labels[background_labels_type], background_labels) - else: - return { - type: _map_labels(label_list, background_labels) - for type, label_list in input_labels.items() - if _map_labels(label_list, background_labels) not in [[], {}] - } + if _type_dict_label_scores_dict_data_struct_check(input_labels) or \ + _type_dict_label_list_data_struct_check(input_labels): + + if background_labels_type and background_labels_type in input_labels.keys(): + return _map_labels(input_labels[background_labels_type], background_labels, + check_substring is not None and background_labels_type in check_substring) + return { + type: _map_labels(label_list, background_labels, + check_substring is not None and type in check_substring) + for type, label_list in input_labels.items() + if _map_labels(label_list, background_labels, + check_substring is not None and type in check_substring) not in [[], {}] + } + + return _map_labels(input_labels, background_labels, check_substring) + + +def _check_label_to_background_labels(label: str, + label_list: List[Union[str, Tuple[str]]], + substring: bool = False) -> Union[str, None]: + if label in label_list: + return label - return _map_labels(input_labels, background_labels) + # If the first fast mapping check do not match, perform further mapping iteration + for entity in label_list: + + if isinstance(entity, set) or isinstance(entity, tuple) or isinstance(entity, list): + for subentity in entity: + if not substring: + if str(subentity) == label: return subentity + elif str(subentity) in label or label in str(subentity): + return subentity + + elif substring and (str(entity) in label or label in str(entity)): + return entity + + return None def _map_label_list(input_labels: Union[str, Set[str], List[str]], - background_labels: List[str]) -> List[str]: + background_labels: List[str], + check_substrings: bool = False) -> List[str]: mapped_list = [] for label in input_labels: if isinstance(label, str): - if label in background_labels: - mapped_list.append(label) - elif isinstance(label, set) or isinstance(label, list): + label_bck = _check_label_to_background_labels(label, background_labels, check_substrings) + if label_bck is not None: + mapped_list.append(label_bck) + elif isinstance(label, set) or isinstance(label, tuple) or isinstance(label, list): for sublabel in set(label): - if sublabel in background_labels: - mapped_list.append(label) + label_bck = _check_label_to_background_labels(sublabel, background_labels, check_substrings) + if label_bck is not None: + mapped_list.append(label_bck) else: raise TypeError( - f'{EMOJI} The input label {label} data structure can not be processed for label mapping' + f'{EMOJI} The input label "{label}" "{type(label)}" data type can not be processed for label mapping' ) return mapped_list def _map_label_dict(input_labels: Dict[Union[str, set], Union[int, float]], - background_labels: list) -> Dict[str, Union[int, float]]: + background_labels: list, + check_substrings: bool = False) -> Dict[str, Union[int, float]]: mapped_dict = {} + for label, v in input_labels.items(): + if isinstance(label, int) or isinstance(label, float): + label = str(label) + if isinstance(label, str): - if label in background_labels: - mapped_dict[label] = v - elif isinstance(label, set) or isinstance(label, list): + label_bck = _check_label_to_background_labels(label, background_labels, check_substrings) + if label_bck is not None: + mapped_dict[label_bck] = v + elif isinstance(label, set) or isinstance(label, tuple) or isinstance(label, list): for sublabel in set(label): - if sublabel in background_labels: - mapped_dict[label] = v + label_bck = _check_label_to_background_labels(sublabel, background_labels, check_substrings) + if label_bck is not None: + mapped_dict[label_bck] = v else: raise TypeError( - f'{EMOJI} The input label {label} data structure can not be processed for label mapping' + f'{EMOJI} The input label "{label}" "{type(label)}" data type can not be processed for label mapping' ) - return mapped_dict - - -def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], - background_labels: list) -> Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]: - """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" - if _label_list_data_struct_check(input_labels): - return _map_label_list(input_labels, background_labels) - elif _label_scores_dict_data_struct_check(input_labels): - return _map_label_dict(input_labels, background_labels) - - elif _type_dict_label_list_data_struct_check(input_labels): - l = [] - for type, label_list in input_labels.items(): - l += _map_labels(label_list, background_labels) - return l - - elif _type_dict_label_scores_dict_data_struct_check(input_labels): - d = {} - for type, scores_dict in input_labels.items(): - d.update(_map_labels(scores_dict, background_labels)) - return d - - else: - raise TypeError( - f'{EMOJI} The input labels data structure can not be processed for label mapping' - ) + return mapped_dict """Generate/format data input as a vector/matrix for the diffusion computation matching the kernel rows""" From 47c8c768cec032f5f7af19044cbcb7c50b61f761 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Tue, 21 Apr 2020 23:04:17 +0200 Subject: [PATCH 12/17] Show mapping statistics feature added --- src/diffupy/process_input.py | 71 +++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 13 deletions(-) diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index 4e65078..1916351 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -2,6 +2,7 @@ """Main matrix class and processing of input data.""" +import logging from typing import Dict, Optional, Union, List, Set, Tuple import numpy as np @@ -10,7 +11,9 @@ from .constants import * from .matrix import Matrix from .utils import from_pickle, from_json, from_dataframe_file, from_nparray_to_df, get_random_value_from_dict, \ - get_random_key_from_dict, parse_xls_to_df + get_random_key_from_dict, parse_xls_to_df, log_dict + +log = logging.getLogger(__name__) """Process input data""" @@ -23,6 +26,7 @@ def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFra p_value: Optional[float] = None, threshold: Optional[float] = None, background_labels: Optional[Union[list, Dict[str, list]]] = None, + show_statistics: bool = True, **further_parse_args ) -> Matrix: """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and @@ -46,17 +50,19 @@ def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFra if not background_labels: background_labels = list(kernel.rows_labels) - # Pipeline the input, first preprocessing it, then mapping it to the background labels and finally formatting it. - return format_input_for_diffusion(map_labels_input(process_input_data(data_input, - method, - binning, - absolute_value, - p_value, - threshold, - **further_parse_args - ), - background_labels, - check_substrings=further_parse_args.get('check_substrings') + # Pipeline the input, first preprocessing it, then mapping it to the background labels + # and finally formatting it with the kernel reference. + return format_input_for_diffusion(map_labels_input(input_labels=process_input_data(data_input, + method, + binning, + absolute_value, + p_value, + threshold, + **further_parse_args + ), + background_labels=background_labels, + check_substrings=further_parse_args.get('check_substrings'), + show_statistics=show_statistics ), kernel ) @@ -84,6 +90,8 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols for excel: relevant_sheets, irrelevant_sheets """ + log.info("Processing the data input.") + # Preprocess the raw input according its data structure types. preprocessed_data = _process_data_input_format(data_input, **further_parse_args) @@ -467,7 +475,10 @@ def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool: def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], background_labels: Union[Dict[str, list], list], - check_substrings: Union[List, bool] = None) -> Union[Dict[str, int], list]: + check_substrings: Union[List, bool] = None, + show_statistics: bool = False) -> Union[Dict[str, int], list]: + log.info("Mapping the input labels to the background labels reference.") + """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" if isinstance(background_labels, list): return _map_labels_to_background(input_labels, @@ -491,6 +502,38 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st f'{EMOJI} The background mapping labels should be provided as a label list or as a type dict of label list.' ) + if show_statistics: log_dict(mapping_statistics(mapped_labels, input_labels)) + + return mapped_labels + + +def mapping_statistics(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], + mapped_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]) -> Dict: + percentage_dict = {} + total_mapping = 0 + total_labels = 0 + + if _label_list_data_struct_check(input_labels) or _label_scores_dict_data_struct_check(input_labels): + total_mapping = len(input_labels) + total_labels = len(mapped_labels) + + elif _type_dict_label_list_data_struct_check(input_labels) or _type_dict_label_scores_dict_data_struct_check( + input_labels): + for input_type, mapping in input_labels.items(): + if input_type in mapped_labels: + percentage_dict[input_type] = len(mapping) / len(mapped_labels[input_type]) + total_mapping += len(mapping) + total_labels += len(mapped_labels[input_type]) + + else: + raise TypeError( + f'{EMOJI} The input labels data structure can not be processed for label mapping' + ) + + percentage_dict['General mapping'] = total_mapping / total_labels + + return percentage_dict + def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], background_labels: list, @@ -621,6 +664,8 @@ def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict kernel: Matrix, missing_value: int = -1) -> Matrix: """Format/generate input vector/matrix according the data structure of the processed_data_input.""" + log.info("Formatting the processed to the reference kernel Matrix.") + if _label_list_data_struct_check(processed_input): return format_categorical_input_vector_from_label_list(rows_labeled=processed_input, col_label='scores', From f1e088487a7c85bb757ae96627e4661530046c81 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 24 Apr 2020 09:59:07 +0200 Subject: [PATCH 13/17] flake8 cleaning in diffupy --- src/diffupy/constants.py | 3 +- src/diffupy/process_input.py | 64 +++++++++++++++++------------------- src/diffupy/utils.py | 25 +++++++++----- 3 files changed, 47 insertions(+), 45 deletions(-) diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py index 581ce9e..3984660 100644 --- a/src/diffupy/constants.py +++ b/src/diffupy/constants.py @@ -130,7 +130,7 @@ def ensure_output_dirs(): ENTITY = 'Entity' GENE = 'Gene' -NODE_LABELING= [ +NODE_LABELING = [ NODE, LABEL, ENTITY, @@ -145,4 +145,3 @@ def ensure_output_dirs(): LOG_FC = 'LogFC' #: Statistical significance (p-value) P_VALUE = 'p-value' - diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index 1916351..79bb997 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -445,17 +445,13 @@ def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[s def _label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: """Check data structure type Dict[str, int].""" - return (isinstance(v, dict) and - isinstance(get_random_value_from_dict(v), (int, float)) - ) + return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), (int, float)) def _type_dict_label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: """Check data structure type Dict[str, Dict[str, int]].""" - return (isinstance(v, dict) and - isinstance(get_random_value_from_dict(v), dict) and - isinstance(get_random_value_from_dict(get_random_value_from_dict(v)), (int, float)) - ) + return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), dict) and isinstance( + get_random_value_from_dict(get_random_value_from_dict(v)), (int, float)) def _label_list_data_struct_check(v: Union[dict, list]) -> bool: @@ -465,9 +461,7 @@ def _label_list_data_struct_check(v: Union[dict, list]) -> bool: def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool: """Check data structure type Dict[str, list].""" - return (isinstance(v, dict) and - isinstance(get_random_value_from_dict(v), list) - ) + return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), list) """Mappers from input to network background""" @@ -481,28 +475,29 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" if isinstance(background_labels, list): - return _map_labels_to_background(input_labels, - background_labels, - check_substring=check_substrings) + mapped_labels = _map_labels_to_background(input_labels, + background_labels, + check_substring=check_substrings) elif isinstance(background_labels, dict): - return {node_type: _map_labels_to_background(input_labels, - node_set, - background_labels_type=node_type, - check_substring=check_substrings) - for node_type, node_set - in background_labels.items() - if _map_labels_to_background(input_labels, - node_set, - background_labels_type=node_type, - check_substring=check_substrings) not in [[], {}] - } + mapped_labels = {node_type: _map_labels_to_background(input_labels, + node_set, + background_labels_type=node_type, + check_substring=check_substrings) + for node_type, node_set + in background_labels.items() + if _map_labels_to_background(input_labels, + node_set, + background_labels_type=node_type, + check_substring=check_substrings) not in [[], {}] + } else: raise IOError( f'{EMOJI} The background mapping labels should be provided as a label list or as a type dict of label list.' ) - if show_statistics: log_dict(mapping_statistics(mapped_labels, input_labels)) + if show_statistics: + log_dict(mapping_statistics(mapped_labels, input_labels)) return mapped_labels @@ -537,8 +532,8 @@ def mapping_statistics(input_labels: Union[list, Dict[str, Dict[str, int]], Dict def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], background_labels: list, - check_substrings: bool = False) -> Union[ - list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]: + check_substrings: bool = False + ) -> Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]: """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" if _label_list_data_struct_check(input_labels): return _map_label_list(input_labels, background_labels, check_substrings) @@ -547,16 +542,16 @@ def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, i return _map_label_dict(input_labels, background_labels, check_substrings) elif _type_dict_label_list_data_struct_check(input_labels): - l = [] + map_list = [] for type, label_list in input_labels.items(): - l += _map_labels(label_list, background_labels, check_substrings) - return l + map_list += _map_labels(label_list, background_labels, check_substrings) + return map_list elif _type_dict_label_scores_dict_data_struct_check(input_labels): - d = {} + map_dict = {} for type, scores_dict in input_labels.items(): - d.update(_map_labels(scores_dict, background_labels, check_substrings)) - return d + map_dict.update(_map_labels(scores_dict, background_labels, check_substrings)) + return map_dict else: raise TypeError( @@ -600,7 +595,8 @@ def _check_label_to_background_labels(label: str, if isinstance(entity, set) or isinstance(entity, tuple) or isinstance(entity, list): for subentity in entity: if not substring: - if str(subentity) == label: return subentity + if str(subentity) == label: + return subentity elif str(subentity) in label or label in str(subentity): return subentity diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py index f8d0ec8..6e275d3 100644 --- a/src/diffupy/utils.py +++ b/src/diffupy/utils.py @@ -7,7 +7,6 @@ import pickle import random import warnings -from collections import defaultdict from typing import List, Union, Dict, Optional import networkx as nx @@ -124,10 +123,10 @@ def get_idx_scores_mapping(scores): return {i: score for i, score in enumerate(scores)} -def print_dict_dimensions(entities_db, title): - """Print dimension of the dictionary.""" +def print_dict_dimensions(entities_db, message='Total number of '): + """Print dimension of the dictionary""" total = 0 - print(title) + for k1, v1 in entities_db.items(): m = '' if isinstance(v1, dict): @@ -138,11 +137,19 @@ def print_dict_dimensions(entities_db, title): m += f'{len(v1)} ' total += len(v1) - print(f'Total number of {k1}: {m} ') + log_dict({k1: m}, message) print(f'Total: {total} ') +def log_dict(dict_to_print: dict, message: str = ''): + """Print dictionary as list with a message""" + + for k1, v1 in dict_to_print.items(): + log.info(f'{message} {k1}: {v1} ') + print(f'{message} {k1}: {v1} ') + + def get_random_key_from_dict(d): return random.choice(list(d.keys())) @@ -283,8 +290,8 @@ def parse_xls_sheet_to_df(sheet: opxl.workbook, for col in sheet.iter_cols(min_row=min_row): col_label = col[0].value - if ((relevant_cols is not None and col_label in relevant_cols) or - (irrelevant_cols is not None and col_label not in irrelevant_cols)): + if ((relevant_cols is not None and col_label in relevant_cols) or ( + irrelevant_cols is not None and col_label not in irrelevant_cols)): parsed_sheet_dict[col_label] = [munge_cell(cell.value) for cell in col[1:]] return pd.DataFrame.from_dict(parsed_sheet_dict) @@ -305,8 +312,8 @@ def parse_xls_to_df(path: str, if len(sheets) > 1: return {sheets[ix].lower(): parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols) for ix, sheet in enumerate(wb) - if (relevant_sheets is not None and sheets[ix] in relevant_sheets) or - (irrelevant_sheets is not None and sheets[ix] in irrelevant_sheets) + if (relevant_sheets is not None and sheets[ix] in relevant_sheets) or ( + irrelevant_sheets is not None and sheets[ix] in irrelevant_sheets) } else: From 798416bd707aae735baa11a9a8ceb25dc1be5095 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 24 Apr 2020 10:15:10 +0200 Subject: [PATCH 14/17] flake8 cleaning in diffupy --- src/diffupy/matrix.py | 1 - src/diffupy/process_input.py | 14 ++++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/diffupy/matrix.py b/src/diffupy/matrix.py index bbda07e..8c52cf4 100644 --- a/src/diffupy/matrix.py +++ b/src/diffupy/matrix.py @@ -551,7 +551,6 @@ class MatrixFromNumpyArray(Matrix): def __init__(self, nparray, name=''): """Initialize laplacian.""" - df = from_nparray_to_df(nparray) rows = list(df.rows.values) diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index 79bb997..9bbe50a 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -29,8 +29,7 @@ def process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFra show_statistics: bool = True, **further_parse_args ) -> Matrix: - """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and - format it for the diffusion computation function. + """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and format it for the diffusion computation function. :param data_input: A miscellaneous data input to be processed/formatted for the diffuPy diffusion computation. :param kernel: A pre-computed kernel to perform the label mapping and the matching for the input formatting. @@ -76,8 +75,7 @@ def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFra threshold: Optional[float] = 0.5, **further_parse_args ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]: - """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures: - label list, type_dict label lists, label-scores dict or type_dict label-scores dicts. + """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures: label list, type_dict label lists, label-scores dict or type_dict label-scores dicts. :param data_input: A miscellaneous data input to be processed. :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"] @@ -471,6 +469,7 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st background_labels: Union[Dict[str, list], list], check_substrings: Union[List, bool] = None, show_statistics: bool = False) -> Union[Dict[str, int], list]: + """Get the mappings from preprocessed input_labels.""" log.info("Mapping the input labels to the background labels reference.") """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" @@ -504,6 +503,7 @@ def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[st def mapping_statistics(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], mapped_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]) -> Dict: + """Get the mapping statistics.""" percentage_dict = {} total_mapping = 0 total_labels = 0 @@ -565,7 +565,7 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int] check_substring: Union[List, bool] = None ) -> Union[Dict[str, Dict[str, int]], Dict[str, int]]: - """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" + """Map labels from preprocessed input to background_labels to get a set of matched labels.""" if _type_dict_label_scores_dict_data_struct_check(input_labels) or \ _type_dict_label_list_data_struct_check(input_labels): @@ -586,6 +586,7 @@ def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int] def _check_label_to_background_labels(label: str, label_list: List[Union[str, Tuple[str]]], substring: bool = False) -> Union[str, None]: + """Check if label string in a label list, also check further if substring checking.""" if label in label_list: return label @@ -609,6 +610,7 @@ def _check_label_to_background_labels(label: str, def _map_label_list(input_labels: Union[str, Set[str], List[str]], background_labels: List[str], check_substrings: bool = False) -> List[str]: + """Map labels from preprocessed input to background_labels LIST to get a set of matched labels.""" mapped_list = [] for label in input_labels: if isinstance(label, str): @@ -630,6 +632,7 @@ def _map_label_list(input_labels: Union[str, Set[str], List[str]], def _map_label_dict(input_labels: Dict[Union[str, set], Union[int, float]], background_labels: list, check_substrings: bool = False) -> Dict[str, Union[int, float]]: + """Map labels from preprocessed input to background_labels DICT to get a set of matched labels.""" mapped_dict = {} for label, v in input_labels.items(): @@ -783,7 +786,6 @@ def format_input_vector_from_label_score_dict(labels_scores_dict: Dict[str, int] type_k: bool = False ) -> Matrix: """Generate scores input vector from labels scores dict.""" - input_mat = Matrix( mat=np.transpose(np.array([list(labels_scores_dict.values())])), rows_labels=list(labels_scores_dict.keys()), From b04db50e6b713381ddcc7bd05e5afa0c5e0173a7 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 24 Apr 2020 10:23:19 +0200 Subject: [PATCH 15/17] flake8 cleaning in diffupy --- src/diffupy/utils.py | 11 ++++++----- tests/test_input.py | 8 +++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py index 6e275d3..aa506d1 100644 --- a/src/diffupy/utils.py +++ b/src/diffupy/utils.py @@ -124,7 +124,7 @@ def get_idx_scores_mapping(scores): def print_dict_dimensions(entities_db, message='Total number of '): - """Print dimension of the dictionary""" + """Print dimension of the dictionary.""" total = 0 for k1, v1 in entities_db.items(): @@ -143,18 +143,19 @@ def print_dict_dimensions(entities_db, message='Total number of '): def log_dict(dict_to_print: dict, message: str = ''): - """Print dictionary as list with a message""" - + """Print dictionary as list with a message.""" for k1, v1 in dict_to_print.items(): log.info(f'{message} {k1}: {v1} ') print(f'{message} {k1}: {v1} ') -def get_random_key_from_dict(d): +def get_random_key_from_dict(d: dict) -> [Union[str, int, tuple]]: + """Return random key from provided dict.""" return random.choice(list(d.keys())) -def get_random_value_from_dict(d): +def get_random_value_from_dict(d: dict): + """Return random value from provided dict.""" return d[get_random_key_from_dict(d)] diff --git a/tests/test_input.py b/tests/test_input.py index c141c85..f3e5273 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -5,7 +5,6 @@ import logging import unittest -import numpy as np from diffupy.constants import * from diffupy.matrix import Matrix from diffupy.process_input import process_input_data, map_labels_input, \ @@ -271,7 +270,6 @@ def test_validate_scores_4(self): def test_format_input_for_diffusion_label_list(self): """Test empty matrix.""" - processed_mapped_nodes_list = format_input_for_diffusion( map_labels_input({'Metabolite': {'C': -1}, 'Gene': {'A': 2, 'B': 1}, 'mirnas': {'A': 1, 'T': 1}}, self.kernel_test_1.rows_labels), @@ -279,7 +277,7 @@ def test_format_input_for_diffusion_label_list(self): ) # TODO: Implement in Matrix equal, now if the col order is mixed it raises error - #assert(np.allclose(processed_mapped_nodes_list.mat, + # assert(np.allclose(processed_mapped_nodes_list.mat, # np.array([[-1, 2, 1], # [-1, 1, -1], # [-1, -1, -1], @@ -287,9 +285,9 @@ def test_format_input_for_diffusion_label_list(self): # ) # ) # ) - #self.assertEqual(processed_mapped_nodes_list.cols_labels, + # self.assertEqual(processed_mapped_nodes_list.cols_labels, # ['Metabolite', 'Gene', 'mirnas'] # ) - #self.assertEqual(processed_mapped_nodes_list.rows_labels, + # self.assertEqual(processed_mapped_nodes_list.rows_labels, # ['A', 'B', 'C', 'D'] # ) From 65f06f9c562ab1308eecc4f918441a1855281795 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 24 Apr 2020 10:29:27 +0200 Subject: [PATCH 16/17] diffupy cli refactor and output format feature added --- src/diffupy/cli.py | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py index ec1db95..fcb9f82 100644 --- a/src/diffupy/cli.py +++ b/src/diffupy/cli.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -"""Command line interface for DiffuPy.""" +"""Command line interface for diffuPy.""" import json import logging @@ -12,7 +12,7 @@ import click from diffupy.process_network import get_kernel_from_network_path -from .constants import OUTPUT, METHODS, EMOJI, RAW +from .constants import OUTPUT, METHODS, EMOJI, RAW, CSV, JSON from .diffuse import diffuse as run_diffusion from .kernels import regularised_laplacian_kernel from .process_input import process_map_and_format_input_data_for_diff @@ -78,14 +78,14 @@ def kernel( @main.command() @click.option( - '-n', '--network', - help='Path to the network graph or kernel', + '-i', '--input', + help='Input data', required=True, type=click.Path(exists=True, dir_okay=False) ) @click.option( - '-i', '--data', - help='Input data', + '-n', '--network', + help='Path to the network graph or kernel', required=True, type=click.Path(exists=True, dir_okay=False) ) @@ -131,8 +131,15 @@ def kernel( default=0.05, show_default=True, ) +@click.option( + '-f', '--output_format', + help='Statistical significance (p-value).', + type=float, + default=CSV, + show_default=True, +) def diffuse( - input_data: str, + input: str, network: str, output: str = sys.stdout, method: str = RAW, @@ -140,15 +147,16 @@ def diffuse( threshold: float = None, absolute_value: bool = True, p_value: float = 0.05, + output_format: str = CSV ): """Run a diffusion method over a network or pre-generated kernel.""" click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}') kernel = get_kernel_from_network_path(network) - click.secho(f'Codifying data from {input_data}.') + click.secho(f'Processing data input from {input}.') - input_scores_dict = process_map_and_format_input_data_for_diff(input_data, + input_scores_dict = process_map_and_format_input_data_for_diff(input, kernel, method, binarize, @@ -157,24 +165,21 @@ def diffuse( threshold, ) - - click.secho(f'Running the diffusion algorithm.') + click.secho(f'Computing the diffusion algorithm.') results = run_diffusion( - label_dict, + input_scores_dict, method, k=kernel ) - # results = run_diffusion( - # label_dict, - # method, - # graph, - # ) + if output_format is CSV: + results.to_csv(output) - # json.dump(results, output, indent=2) + elif output_format is JSON: + json.dump(results, output, indent=2) - click.secho(f'Finished!') + click.secho(f'{EMOJI} Diffusion performed with success. Output located at {output} {EMOJI}') if __name__ == '__main__': From b138ad9f101872464a20628689feed7fc84e37a9 Mon Sep 17 00:00:00 2001 From: jmarinllao Date: Fri, 24 Apr 2020 10:36:10 +0200 Subject: [PATCH 17/17] openpyxl dependence --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 72f668b..a978139 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,6 +52,7 @@ install_requires = scipy pybel==0.13.2 pandas + openpyxl # Random options zip_safe = false