diff --git a/setup.cfg b/setup.cfg index 72f668b..a978139 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,6 +52,7 @@ install_requires = scipy pybel==0.13.2 pandas + openpyxl # Random options zip_safe = false diff --git a/src/diffupy/cli.py b/src/diffupy/cli.py index 46e69f5..fcb9f82 100644 --- a/src/diffupy/cli.py +++ b/src/diffupy/cli.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -"""Command line interface for DiffuPy.""" +"""Command line interface for diffuPy.""" import json import logging @@ -10,12 +10,13 @@ import time import click +from diffupy.process_network import get_kernel_from_network_path -from .constants import OUTPUT, METHODS, EMOJI +from .constants import OUTPUT, METHODS, EMOJI, RAW, CSV, JSON from .diffuse import diffuse as run_diffusion from .kernels import regularised_laplacian_kernel -from .process_input import process_input -from .utils import process_network_from_cli +from .process_input import process_map_and_format_input_data_for_diff +from .process_network import process_graph_from_file logger = logging.getLogger(__name__) @@ -42,9 +43,9 @@ def main(): ) @click.option('-l', '--log', is_flag=True, help='Activate debug mode') def kernel( - network: str, - output: str = OUTPUT, - log: bool = None + graph: str, + output: str = OUTPUT, + log: bool = None ): """Generate a kernel for a given network.""" # Configure logging level @@ -55,20 +56,20 @@ def kernel( logging.basicConfig(level=logging.INFO) logger.setLevel(logging.INFO) - click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}') + click.secho(f'{EMOJI} Loading graph from {graph} {EMOJI}') - graph = process_network_from_cli(network) + graph = process_graph_from_file(graph) - click.secho(f'{EMOJI} Calculating regularized Laplacian kernel. This might take a while... {EMOJI}') + click.secho(f'{EMOJI} Generating regularized Laplacian kernel from graph. This might take a while... 
{EMOJI}') exe_t_0 = time.time() - background_mat = regularised_laplacian_kernel(graph) + kernel = regularised_laplacian_kernel(graph) exe_t_f = time.time() - output_file = os.path.join(output, f'{network.split("/")[-1]}.pickle') + output_file = os.path.join(output, f'{graph.split("/")[-1]}.pickle') # Export numpy array with open(output_file, 'wb') as file: - pickle.dump(background_mat, file, protocol=4) + pickle.dump(kernel, file, protocol=4) running_time = exe_t_f - exe_t_0 @@ -77,14 +78,14 @@ def kernel( @main.command() @click.option( - '-n', '--network', - help='Path to the network graph or kernel', + '-i', '--input', + help='Input data', required=True, type=click.Path(exists=True, dir_okay=False) ) @click.option( - '-i', '--data', - help='Input data', + '-n', '--network', + help='Path to the network graph or kernel', required=True, type=click.Path(exists=True, dir_okay=False) ) @@ -98,7 +99,7 @@ def kernel( '-m', '--method', help='Diffusion method', type=click.Choice(METHODS), - required=True, + default=RAW, ) @click.option( '-b', '--binarize', @@ -112,6 +113,7 @@ def kernel( @click.option( '-t', '--threshold', help='Codify node labels by applying a threshold to logFC in input.', + default=None, type=float, ) @click.option( @@ -129,48 +131,55 @@ def kernel( default=0.05, show_default=True, ) +@click.option( + '-f', '--output_format', + help='Statistical significance (p-value).', + type=float, + default=CSV, + show_default=True, +) def diffuse( - network: str, - data: str, - output: str, - method: str, - binarize: bool, - absolute_value: bool, - threshold: float, - p_value: float, + input: str, + network: str, + output: str = sys.stdout, + method: str = RAW, + binarize: bool = True, + threshold: float = None, + absolute_value: bool = True, + p_value: float = 0.05, + output_format: str = CSV ): """Run a diffusion method over a network or pre-generated kernel.""" click.secho(f'{EMOJI} Loading graph from {network} {EMOJI}') - graph = 
process_network_from_cli(network) - click.secho( - f'{EMOJI} Graph loaded with: \n' - f'{graph.number_of_nodes()} nodes\n' - f'{graph.number_of_edges()} edges\n' - f'{EMOJI}' - ) + kernel = get_kernel_from_network_path(network) - click.secho(f'Codifying data from {data}.') + click.secho(f'Processing data input from {input}.') - label_dict = process_input(data, method, binarize, absolute_value, p_value, threshold) + input_scores_dict = process_map_and_format_input_data_for_diff(input, + kernel, + method, + binarize, + absolute_value, + p_value, + threshold, + ) - click.secho(f'Running the diffusion algorithm.') + click.secho(f'Computing the diffusion algorithm.') results = run_diffusion( - label_dict, + input_scores_dict, method, - graph, + k=kernel ) - # results = run_diffusion( - # label_dict, - # method, - # graph, - # ) + if output_format is CSV: + results.to_csv(output) - # json.dump(results, output, indent=2) + elif output_format is JSON: + json.dump(results, output, indent=2) - click.secho(f'Finished!') + click.secho(f'{EMOJI} Diffusion performed with success. 
Output located at {output} {EMOJI}') if __name__ == '__main__': diff --git a/src/diffupy/constants.py b/src/diffupy/constants.py index ac15a41..3984660 100644 --- a/src/diffupy/constants.py +++ b/src/diffupy/constants.py @@ -59,6 +59,10 @@ def ensure_output_dirs(): #: csv CSV = 'csv' +#: xml +XLS = 'xls' +#: xmls +XLSX = 'xlsx' #: tsv TSV = 'tsv' #: graphML @@ -66,23 +70,36 @@ def ensure_output_dirs(): #: bel BEL = 'bel' #: node link json -NODE_LINK_JSON = 'json' +JSON = 'json' #: pickle -BEL_PICKLE = 'pickle' +PICKLE = 'pickle' #: gml GML = 'gml' #: edge list EDGE_LIST = '.lst' -#: DiffuPath available network formats -FORMATS = [ +XLS_FORMATS = ( + XLS, + XLSX +) + +#: Available graph formats +GRAPH_FORMATS = ( CSV, TSV, GRAPHML, BEL, - NODE_LINK_JSON, - BEL_PICKLE, -] + JSON, + PICKLE, +) + +#: Available kernel formats +KERNEL_FORMATS = ( + CSV, + TSV, + JSON, + PICKLE, +) #: Separators FORMAT_SEPARATOR_MAPPING = { @@ -109,9 +126,22 @@ def ensure_output_dirs(): #: Node name NODE = 'Node' +LABEL = 'Label' +ENTITY = 'Entity' +GENE = 'Gene' + +NODE_LABELING = [ + NODE, + LABEL, + ENTITY, + GENE +] + +#: Node type +NODE_TYPE = 'NodeType' +#: Unspecified score type +SCORE = 'Score' #: Log2 fold change (logFC) LOG_FC = 'LogFC' #: Statistical significance (p-value) P_VALUE = 'p-value' -#: Label -LABEL = 'Label' diff --git a/src/diffupy/diffuse.py b/src/diffupy/diffuse.py index 118d39c..53aa306 100644 --- a/src/diffupy/diffuse.py +++ b/src/diffupy/diffuse.py @@ -51,8 +51,8 @@ def diffuse( ) -> Matrix: """Run diffusion on a network given an input and a diffusion method. - :param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (List) or n-dimensional (Matrix). - :param method: Selected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"] + :param input_scores: score collection, supplied as n-dimensional array. Could be 1-dimensional (Vector) or n-dimensional (Matrix). 
+ :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"] :param graph: A network as a graph. It could be optional if a Kernel is provided :param kwargs: Optional arguments: - k: a kernel [matrix] stemming from a graph, thus sparing the graph transformation process diff --git a/src/diffupy/matrix.py b/src/diffupy/matrix.py index 0ff499a..34b3b0e 100644 --- a/src/diffupy/matrix.py +++ b/src/diffupy/matrix.py @@ -7,9 +7,11 @@ import numpy as np import pandas as pd +from diffupy.constants import CSV +from networkx import DiGraph from .utils import get_label_ix_mapping, get_label_list_graph, get_laplacian, decode_labels, get_idx_scores_mapping, \ - get_repeated_labels + get_repeated_labels, from_dataframe_file, from_nparray_to_df log = logging.getLogger(__name__) @@ -23,15 +25,14 @@ class Matrix: """Matrix class.""" def __init__( - self, - mat=None, - rows_labels=None, - cols_labels=None, - graph=None, - quadratic=False, - name='', - init_value=None, - **kwargs + self, + mat=None, + rows_labels=None, + cols_labels=None, + graph=None, + quadratic=False, + name='', + init_value=None, ): """Initialize matrix. 
@@ -79,22 +80,22 @@ def __init__( def __str__(self): """Return a string representation of the Matrix.""" - s = f" {self.cols_labels}" + s = f" {self.cols_labels}" for i, row_label in enumerate(self.rows_labels): s += f"\n {row_label} {self.mat[i]} " return f"\nmatrix {self.name} \n {s} \n " - def __iter__(self, **kargs): + def __iter__(self, **attr): """Help method for the iteration of the Matrix.""" self.i = -1 self.j = 0 - if 'get_indices' in kargs: - self.get_indices = kargs['get_indices'] - if 'get_labels' in kargs: - self.get_labels = kargs['get_labels'] + if 'get_indices' in attr: + self.get_indices = attr['get_indices'] + if 'get_labels' in attr: + self.get_labels = attr['get_labels'] return self @@ -463,23 +464,6 @@ def order_rows(self, reverse=True, col_ref_idx=None): return ordered_mat - """Import""" - - def from_csv(self, csv_path): - """Import matrix from csv file using the headers as a Matrix class.""" - m = np.genfromtxt(csv_path, dtype=None, delimiter=',') - return Matrix( - mat=np.array( - [ - [float(x) - for x in a[1:]] - for a in m[1:] - ]), - rows_labels=list(m[1:, 0]), - cols_labels=list(m[0, 1:]), - name=str(os.path.basename(csv_path).replace('.csv', '')) - ) - """Export""" def to_dict(self, ordered=True): @@ -496,19 +480,137 @@ def to_dict(self, ordered=True): return d + def to_df(self, ordered=True): + """Export matrix as a data frame using the headers (row_labels, cols_labels) of the Matrix class.""" + d = self.to_dict(ordered) + + rows_labels = d.pop('rows_labels') + + df = pd.DataFrame(d) + df.rows.values = rows_labels + + return df + def to_csv(self, path, file_name='_export.csv', index=False, ordered=True): """Export matrix to csv file using the headers (row_labels, cols_labels) of the Matrix class.""" # Generate dataframe - df = pd.DataFrame(data=self.to_dict(ordered)) - df.to_csv(os.path.join(path, self.name, file_name), index=index) + self.to_df(ordered).to_csv(os.path.join(path, self.name, file_name), index=index) + + def 
to_nx_graph(self): + """Export matrix as a Graph using the headers (row_labels, cols_labels) of the Matrix class.""" + if len(self.cols_labels) != len(self.rows_labels) or not self.quadratic: + raise ValueError('The matrix cannot be converted as a graph since it is not quadratic, which ' + 'it is the used representation of a network (usually a kernel) as a Matrix.') + + graph = DiGraph() + + for score, sub_name, obj_name in self.__iter__(get_labels=True, get_indices=False): + if score != 0: + graph.add_edge( + sub_name, obj_name, + ) + + return graph + + +class MatrixFromDict(Matrix): + """Constructor matrix class for Dictionary data structure to Matrix conversion.""" + + def __init__(self, d, name=''): + """Initialize laplacian.""" + rows = list(d.pop('rows_labels')) + cols = list(d.keys()) + + Matrix.__init__(self, mat=np.array(list(d.values())), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromDataFrame(Matrix): + """Constructor matrix class for DataFrame to Matrix conversion.""" + + def __init__(self, df, name=''): + """Initialize laplacian.""" + rows = list(df.rows.values) + cols = list(df.cols.values) + + Matrix.__init__(self, mat=df.to_numpy(), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromNumpyArray(Matrix): + """Constructor matrix class for DataFrame to Matrix conversion.""" + + def __init__(self, nparray, name=''): + """Initialize laplacian.""" + df = from_nparray_to_df(nparray) + + rows = list(df.rows.values) + cols = list(df.cols.values) + + Matrix.__init__(self, mat=df.to_numpy(), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromCSV(Matrix): + """Constructor matrix class for CSV to Matrix conversion.""" + + def __init__(self, csv_path, fmt=CSV, name=None): + """Initialize laplacian.""" + df = from_dataframe_file(csv_path, fmt) + + if name is None: + name = 
str(os.path.basename(csv_path).replace('.csv', '')) + + rows = list(df.rows.values) + cols = list(df.cols.values) + + Matrix.__init__(self, mat=df.to_numpy(), + rows_labels=rows, + cols_labels=cols, + quadratic=len(cols) == len(rows), + name=name + ) + + +class MatrixFromGraph(Matrix): + """Constructor matrix class for nx.Graph to Matrix conversion.""" + + # TODO : move instances initialization from global argument graph to here + + def __init__(self, graph, node_argument='name', name=''): + # This initialization would make a matrix representing the graph (taking a graph argument as label) + rows = list(get_label_list_graph(graph, node_argument)) + + Matrix.__init__(self, rows_labels=rows, + init_value=1, + quadratic=True, + name=name, + ) class LaplacianMatrix(Matrix): """Laplacian matrix class.""" - def __init__(self, graph, normalized=False, name=''): + def __init__(self, graph, normalized=False, node_argument='name', name=''): """Initialize laplacian.""" l_mat = get_laplacian(graph, normalized) + rows = list(get_label_list_graph(graph, node_argument)) - Matrix.__init__(self, mat=l_mat, quadratic=True, name=name, graph=graph) + Matrix.__init__(self, mat=l_mat, + rows_labels=rows, + quadratic=True, + name=name + ) diff --git a/src/diffupy/process_input.py b/src/diffupy/process_input.py index bcee825..604971b 100644 --- a/src/diffupy/process_input.py +++ b/src/diffupy/process_input.py @@ -1,30 +1,196 @@ # -*- coding: utf-8 -*- """Main matrix class and processing of input data.""" -from typing import Dict, Optional + +import logging +from typing import Dict, Optional, Union, List, Set, Tuple + +import numpy as np import pandas as pd from .constants import * from .matrix import Matrix +from .utils import from_pickle, from_json, from_dataframe_file, from_nparray_to_df, get_random_value_from_dict, \ + get_random_key_from_dict, parse_xls_to_df, log_dict + +log = logging.getLogger(__name__) + +"""Process input data""" + + +def 
process_map_and_format_input_data_for_diff(data_input: Union[str, pd.DataFrame, list, dict, np.ndarray, Matrix], + kernel: Matrix, + method: str = 'raw', + binning: Optional[bool] = False, + absolute_value: Optional[bool] = False, + p_value: Optional[float] = None, + threshold: Optional[float] = None, + background_labels: Optional[Union[list, Dict[str, list]]] = None, + show_statistics: bool = True, + **further_parse_args + ) -> Matrix: + """Process miscellaneous data input, perform the mapping to the diffusion background network (as a kernel) and format it for the diffusion computation function. + + :param data_input: A miscellaneous data input to be processed/formatted for the diffuPy diffusion computation. + :param kernel: A pre-computed kernel to perform the label mapping and the matching for the input formatting. + :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"]. + :param binning: If logFC provided in dataset, convert logFC to binary. + :param absolute_value: Codify node labels by applying threshold to | logFC | in input. + :param p_value: Statistical significance. + :param threshold: Codify node labels by applying a threshold to logFC in input. + :param background_labels: Labels set to map the input labels, which can provide label classification by type dict. + :param further_parse_args: Arguments to refine the data input parsing, among which: + for string list parsing: separ_str + for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols + for excel: relevant_sheets, irrelevant_sheets + for mapping: check_substrings (as a bool if input list or list of labels types if input dict) + """ + # If specific label background not provided, get a list from kernel labels. + if not background_labels: + background_labels = list(kernel.rows_labels) + + # Pipeline the input, first preprocessing it, then mapping it to the background labels + # and finally formatting it with the kernel reference. 
+ return format_input_for_diffusion(map_labels_input(input_labels=process_input_data(data_input, + method, + binning, + absolute_value, + p_value, + threshold, + **further_parse_args + ), + background_labels=background_labels, + check_substrings=further_parse_args.get('check_substrings'), + show_statistics=show_statistics + ), + kernel + ) + + +def process_input_data(data_input: Union[str, list, dict, np.ndarray, pd.DataFrame], + method: str = 'raw', + binning: bool = False, + absolute_value: bool = False, + p_value: float = 0.05, + threshold: Optional[float] = 0.5, + **further_parse_args + ) -> Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]]: + """Pipeline the provided miscellaneous data input for further processing, in the following standardized data structures: label list, type_dict label lists, label-scores dict or type_dict label-scores dicts. + + :param data_input: A miscellaneous data input to be processed. + :param method: Elected method ["raw", "ml", "gm", "ber_s", "ber_p", "mc", "z"] + :param binning: If logFC provided in dataset, convert logFC to binary. + :param absolute_value: Codify node labels by applying threshold to | logFC | in input. + :param p_value: Statistical significance. + :param threshold: Codify node labels by applying a threshold to logFC in input. + :param further_parse_args: Arguments to refine the data input parsing, among which: + for string list parsing: separ_str + for excel/csv parsing: min_row, cols_mapping, relevant_cols, irrelevant_cols + for excel: relevant_sheets, irrelevant_sheets + """ + log.info("Processing the data input.") + + # Preprocess the raw input according its data structure types. + preprocessed_data = _process_data_input_format(data_input, **further_parse_args) + + # If the preprocessed input is a list or a label type dict (Dict[str, list]) return it for latter categorical input generation. 
+ if _label_list_data_struct_check(preprocessed_data) or _type_dict_label_list_data_struct_check(preprocessed_data): + return preprocessed_data + + # If the preprocessed input is a label type label-scores dict (Dict[str, pd.DataFrame]) pipeline it for scores codifying. + if isinstance(preprocessed_data, dict): + return {label_type: _codify_input_data(preprocessed_data_i, + method, + binning, + absolute_value, + p_value, + threshold, + further_parse_args.get('cols_titles_mapping') + ) + for label_type, preprocessed_data_i in preprocessed_data.items() + } + + # If the preprocessed input is a scores-label dataframe (pd.DataFrame) pipeline it for scores codifying. + return _codify_input_data(preprocessed_data, + method, + binning, + absolute_value, + p_value, + threshold, + further_parse_args.get('cols_titles_mapping') + ) + + +"""Process input formats""" + + +def _process_data_input_format(raw_data_input: Union[str, list, dict, np.ndarray, pd.DataFrame], + separ_str: str = ', ', + **further_parse_args) -> Union[pd.DataFrame, list, Dict[str, Union[pd.DataFrame, list]]]: + """Format the input as a label-score dataframe, a list or a labels or a type dict for latter input processing.""" + if isinstance(raw_data_input, str): + # If the data input type is a string, mostly will be a path to the dataset file. + if os.path.isfile(raw_data_input): + return _process_data_input_format(_load_data_input_from_file(raw_data_input, **further_parse_args)) + elif '/' in raw_data_input and separ_str not in ['/', ' /', '/ ']: + raise IOError( + f'{EMOJI} The file could not have been located in the provided data input path,.' + ) + # If the data input is not identified as a path, it will be treated as a label list with an indicated separator. 
+ else: + return _process_data_input_format(raw_data_input.split(separ_str)) -"""Process datasets""" + elif isinstance(raw_data_input, list) or isinstance(raw_data_input, set): + return list(set(raw_data_input)) + if isinstance(raw_data_input, pd.DataFrame): + return raw_data_input -def process_input( - path: str, - method: str, - binning: bool, - absolute_value: bool, - p_value: float, - threshold: Optional[float], -) -> Dict[str, int]: - """Read input file and ensure necessary columns exist.""" + elif isinstance(raw_data_input, dict): + # If the data input type dict is a label-scores dict, codify it as a Panda's dataframe for latter processing. + if _label_scores_dict_data_struct_check(raw_data_input): + return pd.DataFrame.from_dict(raw_data_input, orient='index') + # Else it will be treated as a label_type dict, calling recursively the process input format for each type subset (key). + else: + # It is assumed that the all the dict values match the same data type. + return {label_type: _process_data_input_format(data_i) for label_type, data_i in raw_data_input.items()} + + elif isinstance(raw_data_input, np.ndarray): + return from_nparray_to_df(raw_data_input) + + elif isinstance(raw_data_input, Matrix): + return raw_data_input.to_df() + + else: + raise TypeError( + f'{EMOJI} The imported kernel type is not valid. Please ensure is provided as a diffupy ' + f'Matrix, a Dict, NumpyArray or Pandas DataFrame. 
' + ) + + +def _load_data_input_from_file(path: str, **further_parse_args) -> Union[pd.DataFrame, list]: + """Load and process the input data according the input file format.""" if path.endswith(CSV): - fmt = CSV + return from_dataframe_file(path, CSV) + + elif path.endswith(XLS_FORMATS): + return parse_xls_to_df(path, + further_parse_args.get('min_row'), + further_parse_args.get('relevant_sheets'), + further_parse_args.get('irrelevant_sheets'), + further_parse_args.get('relevant_cols'), + further_parse_args.get('irrelevant_cols') + ) elif path.endswith(TSV): - fmt = TSV + return from_dataframe_file(path, TSV) + + elif path.endswith(PICKLE): + return from_pickle(path) + + elif path.endswith(JSON): + return from_json(path) else: raise IOError( @@ -32,33 +198,78 @@ def process_input( f'.csv or .tsv file extension.' ) - df = pd.read_csv( - path, - header=0, - sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV] - ) - # Ensure that minimally column Node is in dataset - if NODE not in df.columns: - raise ValueError( - f'Ensure that your file contains a column {NODE} with node IDs.' - ) +"""Pipeline input scores""" - return _codify_input_data(df, method, binning, absolute_value, p_value, threshold) +def _codify_input_data(df: pd.DataFrame, + method: str, + binning: bool, + absolute_value: bool, + p_value: float, + threshold: Optional[float], + cols_titles_mapping: Optional[Dict[str, str]] = None + ) -> Union[Dict[str, Dict[str, int]], + Dict[str, int]]: + """Process the input scores dataframe for the codifying process.""" + # Rename dataframe column titles according (if) provided label_mapping. + if cols_titles_mapping is not None: + for label_to_rename, new_name in cols_titles_mapping.items(): + if label_to_rename in df.columns: + df = df.rename(columns={label_to_rename: new_name}) -"""Codify input according to diffusion scoring method""" + # Ensure that node labeling is in the provided dataset. 
+ if not any(n in df.columns for n in NODE_LABELING): + raise ValueError( + f'Ensure that your file contains a column {NODE_LABELING} with node IDs.' + ) + # Standardize the title of the node column labeling column to 'Label', for later processing. + if LABEL not in df.columns: + for l in list(df.columns): + if l in NODE_LABELING: + df = df.rename(columns={l: LABEL}) + break + + # If node type provided in a column, classify in a dictionary the input codification by its node type. + if NODE_TYPE in df.columns: + + node_types = list(set(df[NODE_TYPE])) # Get the node types list set. + codified_by_type_dict = {} + + for node_type in node_types: + # Filter the nodes by the iterable type. + df_by_type = df.loc[df[NODE_TYPE] == node_type] + + # Codify the nodes for the iterable type. + codified_by_type_dict[node_type] = _codify_method_check(df_by_type, + method, + binning, + absolute_value, + p_value, + threshold + ) + return codified_by_type_dict -def _codify_input_data( - df: pd.DataFrame, - method: str, - binning: bool, - absolute_value: bool, - p_value: float, - threshold: Optional[float], -) -> Dict[str, int]: - """Prepare input data for diffusion.""" + else: + # Codify all the nodes of the dataframe. 
+ return _codify_method_check(df, + method, + binning, + absolute_value, + p_value, + threshold + ) + + +def _codify_method_check(df: pd.DataFrame, + method: str, + binning: bool, + absolute_value: bool, + p_value: float, + threshold: Optional[float], + ) -> Dict[str, int]: + """Classify the input data codification according the diffusion method.""" # Prepare input data for quantitative diffusion scoring methods if method == RAW or method == Z: return _codify_quantitative_input_data(df, binning, absolute_value, p_value, threshold) @@ -69,10 +280,10 @@ def _codify_input_data( else: # TODO: ber_s, ber_p, mc - raise NotImplementedError('This diffusion method has not yet been implemented.') + raise NotImplementedError('This diffusion method has not been yet implemented.') -"""Assign binary labels to input for scoring methods that accept non-quantitative values""" +"""Assign binary scores to input for scoring methods that ONLY accept non-quantitative values""" def _codify_non_quantitative_input_data( @@ -80,28 +291,28 @@ def _codify_non_quantitative_input_data( p_value: float, threshold: Optional[float] ) -> Dict[str, int]: - """Codify input data to get a set of labelled nodes for scoring methods that accept non-quantitative values.""" + """Codify input data to get a set of scored nodes for scoring methods that accept non-quantitative values.""" # LogFC provided in dataset and threshold given if LOG_FC in df.columns and threshold: # Label nodes with 1 if | logFC | passes threshold - df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = 1 + df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1 # Label nodes with -1 if | logFC | below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = -1 + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = -1 - # If adjusted p-values are provided in dataset, label nodes that are not statistically significant with -1 + # If adjusted p-values are provided in dataset, score nodes that are not statistically significant with -1 if P_VALUE in 
df.columns: - df.loc[df[P_VALUE] > p_value, LABEL] = -1 + df.loc[df[P_VALUE] > p_value, SCORE] = -1 - return df.set_index(NODE)[LABEL].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() - # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 - df[LABEL] = 1 + # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign scores as 1 + df[SCORE] = 1 - return df.set_index(NODE)[LABEL].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() -"""Assign binary labels to input for scoring methods that accept quantitative values""" +"""Assign binary scores to input for scoring methods that accept quantitative values""" def _codify_quantitative_input_data( @@ -111,35 +322,34 @@ def _codify_quantitative_input_data( p_value: float, threshold: Optional[float], ) -> Dict[str, int]: - """Codify input data to get a set of labelled nodes for scoring methods that accept quantitative values.""" + """Codify input data to get a set of scored nodes for scoring methods that accept quantitative values.""" # LogFC provided in dataset and threshold given if LOG_FC in df.columns and threshold: - # Binarize labels with 1, 0 and/or -1 + # Binarize scores with 1, 0 and/or -1 if binning is True: - # Add binning labels where | logFC | values above threshold are 1 and below are 0 + # Add binning scores where | logFC | values above threshold are 1 and below are 0 if absolute_value is True: return _bin_quantitative_input_by_abs_val(df, threshold, p_value) - # Add signed labels where | logFC | values above threshold are 1 or -1 (signed) and values below are 0 + # Add signed scores where | logFC | values above threshold are 1 or -1 (signed) and values below are 0 return _bin_quantitative_input_by_threshold(df, threshold, p_value) # Labels are 0s or logFC values rather than binary values else: - # Codify inputs with | logFC | if they pass threshold; otherwise assign label as 0 + # Codify inputs with | 
logFC | if they pass threshold; otherwise assign score as 0 if absolute_value is True: return _codify_quantitative_input_by_abs_val(df, threshold, p_value) - # Codify inputs with logFC if they pass threshold; otherwise assign label as 0 + # Codify inputs with logFC if they pass threshold; otherwise assign score as 0 return _codify_quantitative_input_by_threshold(df, threshold, p_value) - # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign labels as 1 - df[LABEL] = 1 + # If input dataset exclusively contains IDs and no logFC, or if threshold is not given, then assign scores as 1 + df[SCORE] = 1 - # TODO handle NODE_TYPE - return df.set_index(NODE)[LABEL].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() def _bin_quantitative_input_by_abs_val( @@ -147,17 +357,17 @@ def _bin_quantitative_input_by_abs_val( threshold: float, p_value: float, ) -> Dict[str, int]: - """Process quantitative inputs and bin labels by absolute value.""" - # Add label 1 if | logFC | is above threshold - df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = 1 - # Add label 0 if | logFC | below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 + """Process quantitative inputs and bin scores by absolute value.""" + # Add score 1 if | logFC | is above threshold + df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = 1 + # Add score 0 if | logFC | below threshold + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 # logFC and adjusted p-values are provided in dataset if P_VALUE in df.columns: return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[LABEL].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() def _bin_quantitative_input_by_threshold( @@ -165,12 +375,12 @@ def _bin_quantitative_input_by_threshold( threshold: float, p_value: float, ) -> Dict[str, int]: - """Process quantitative inputs and bin labels by threshold.""" - # Add label 1 if logFC is above threshold - df.loc[df[LOG_FC] >= threshold, LABEL] = 1 - 
# Add label 0 if | logFC | below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 - # Replace remaining labels with -1 (i.e. | logFC | above threshold but sign is negative) + """Process quantitative inputs and bin scores by threshold.""" + # Add score 1 if logFC is above threshold + df.loc[df[LOG_FC] >= threshold, SCORE] = 1 + # Add score 0 if | logFC | below threshold + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 + # Replace remaining score with -1 (i.e. | logFC | above threshold but sign is negative) df = df.fillna(-1) if p_value: @@ -179,10 +389,10 @@ def _bin_quantitative_input_by_threshold( # Disregard entities if logFC adjusted p-value is not significant return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[LABEL].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() -"""Assign logFC as labels for input for scoring methods that accept quantitative values""" +"""Assign logFC as score for input for scoring methods that accept quantitative values""" def _codify_quantitative_input_by_abs_val( @@ -190,18 +400,18 @@ def _codify_quantitative_input_by_abs_val( threshold: float, p_value: float, ) -> Dict[str, int]: - """Codify nodes with | logFC | if they pass threshold, otherwise label is 0.""" + """Codify nodes with | logFC | if they pass threshold, otherwise score is 0.""" # Codify nodes with | logFC | if they pass threshold - df.loc[(df[LOG_FC]).abs() >= threshold, LABEL] = (df[LOG_FC]).abs() - # Codify nodes with label 0 if it falls below threshold - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 + df.loc[(df[LOG_FC]).abs() >= threshold, SCORE] = (df[LOG_FC]).abs() + # Codify nodes with score 0 if it falls below threshold + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 # LogFC and adjusted p-values are provided in dataset if P_VALUE in df.columns: # Disregard entities if logFC adjusted p-value is not significant return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[LABEL].to_dict() 
+ return df.set_index(LABEL)[SCORE].to_dict() def _codify_quantitative_input_by_threshold( @@ -210,87 +420,424 @@ def _codify_quantitative_input_by_threshold( p_value: float, ) -> Dict[str, int]: """Codify inputs with logFC if they pass threshold value.""" - df.loc[df[LOG_FC] >= threshold, LABEL] = df[LOG_FC] - df.loc[(df[LOG_FC]).abs() < threshold, LABEL] = 0 - df.loc[((df[LOG_FC]).abs() >= threshold) & ((df[LOG_FC]) < 0), LABEL] = df[LOG_FC] + df.loc[df[LOG_FC] >= threshold, SCORE] = df[LOG_FC] + df.loc[(df[LOG_FC]).abs() < threshold, SCORE] = 0 + df.loc[((df[LOG_FC]).abs() >= threshold) & ((df[LOG_FC]) < 0), SCORE] = df[LOG_FC] # LogFC values and adjusted p-values are provided in dataset if P_VALUE in df.columns: # Disregard entities if logFC adjusted p-value is not significant return _remove_non_significant_entities(df, p_value) - return df.set_index(NODE)[LABEL].to_dict() + return df.set_index(LABEL)[SCORE].to_dict() -def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> pd.DataFrame: +def _remove_non_significant_entities(df: pd.DataFrame, p_value: float) -> Dict[str, int]: # Label entity 0 if adjusted p-value for logFC is not significant - df.loc[df[P_VALUE] > p_value, LABEL] = 0 + df.loc[df[P_VALUE] > p_value, SCORE] = 0 + + return df.set_index(LABEL)[SCORE].to_dict() + + +"""Data structures format checkers""" + + +def _label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type Dict[str, int].""" + return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), (int, float)) + + +def _type_dict_label_scores_dict_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type Dict[str, Dict[str, int]].""" + return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), dict) and isinstance( + get_random_value_from_dict(get_random_value_from_dict(v)), (int, float)) + + +def _label_list_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type 
list.""" + return isinstance(v, list) + + +def _type_dict_label_list_data_struct_check(v: Union[dict, list]) -> bool: + """Check data structure type Dict[str, list].""" + return isinstance(v, dict) and isinstance(get_random_value_from_dict(v), list) + + +"""Mappers from input to network background""" - return df.set_index(NODE)[LABEL].to_dict() +def map_labels_input(input_labels: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], + background_labels: Union[Dict[str, list], list], + check_substrings: Union[List, bool] = None, + show_statistics: bool = False) -> Union[Dict[str, int], list]: + """Get the mappings from preprocessed input_labels.""" + log.info("Mapping the input labels to the background labels reference.") + + """Map nodes from input dataset to nodes in network to get a set of labelled nodes.""" + if isinstance(background_labels, list): + mapped_labels = _map_labels_to_background(input_labels, + background_labels, + check_substring=check_substrings) + + elif isinstance(background_labels, dict): + mapped_labels = {node_type: _map_labels_to_background(input_labels, + node_set, + background_labels_type=node_type, + check_substring=check_substrings) + for node_type, node_set + in background_labels.items() + if _map_labels_to_background(input_labels, + node_set, + background_labels_type=node_type, + check_substring=check_substrings) not in [[], {}] + } + else: + raise IOError( + f'{EMOJI} The background mapping labels should be provided as a label list or as a type dict of label list.' 
+ ) + + if show_statistics: + log_dict(mapping_statistics(mapped_labels, input_labels)) + + return mapped_labels + + +def mapping_statistics(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], + mapped_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]) -> Dict: + """Get the mapping statistics.""" + percentage_dict = {} + total_mapping = 0 + total_labels = 0 + + if _label_list_data_struct_check(input_labels) or _label_scores_dict_data_struct_check(input_labels): + total_mapping = len(input_labels) + total_labels = len(mapped_labels) + + elif _type_dict_label_list_data_struct_check(input_labels) or _type_dict_label_scores_dict_data_struct_check( + input_labels): + for input_type, mapping in input_labels.items(): + if input_type in mapped_labels: + percentage_dict[input_type] = len(mapping) / len(mapped_labels[input_type]) + total_mapping += len(mapping) + total_labels += len(mapped_labels[input_type]) + + else: + raise TypeError( + f'{EMOJI} The input labels data structure can not be processed for label mapping' + ) + + percentage_dict['General mapping'] = total_mapping / total_labels -"""Generate input vector from dataset labels""" + return percentage_dict -def generate_categoric_input_vector_from_labels( - rows_labeled, - col_label, - background_mat, - missing_value=-1, - rows_unlabeled=None, -): +def _map_labels(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], + background_labels: list, + check_substrings: bool = False + ) -> Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]]: + """Map nodes from input dataset to nodes in network to get a set of labelled and unlabelled nodes.""" + if _label_list_data_struct_check(input_labels): + return _map_label_list(input_labels, background_labels, check_substrings) + + elif _label_scores_dict_data_struct_check(input_labels): + return _map_label_dict(input_labels, background_labels, check_substrings) + + 
elif _type_dict_label_list_data_struct_check(input_labels): + map_list = [] + for type, label_list in input_labels.items(): + map_list += _map_labels(label_list, background_labels, check_substrings) + return map_list + + elif _type_dict_label_scores_dict_data_struct_check(input_labels): + map_dict = {} + for type, scores_dict in input_labels.items(): + map_dict.update(_map_labels(scores_dict, background_labels, check_substrings)) + return map_dict + + else: + raise TypeError( + f'{EMOJI} The input labels data structure can not be processed for label mapping' + ) + + +def _map_labels_to_background(input_labels: Union[list, Dict[str, Dict[str, int]], Dict[str, int], Dict[str, list]], + background_labels: list, + background_labels_type: str = None, + check_substring: Union[List, bool] = None + ) -> Union[Dict[str, Dict[str, int]], + Dict[str, int]]: + """Map labels from preprocessed input to background_labels to get a set of matched labels.""" + if _type_dict_label_scores_dict_data_struct_check(input_labels) or \ + _type_dict_label_list_data_struct_check(input_labels): + + if background_labels_type and background_labels_type in input_labels.keys(): + return _map_labels(input_labels[background_labels_type], background_labels, + check_substring is not None and background_labels_type in check_substring) + return { + type: _map_labels(label_list, background_labels, + check_substring is not None and type in check_substring) + for type, label_list in input_labels.items() + if _map_labels(label_list, background_labels, + check_substring is not None and type in check_substring) not in [[], {}] + } + + return _map_labels(input_labels, background_labels, check_substring) + + +def _check_label_to_background_labels(label: str, + label_list: List[Union[str, Tuple[str]]], + substring: bool = False) -> Union[str, None]: + """Check if label string in a label list, also check further if substring checking.""" + if label in label_list: + return label + + # If the first fast mapping 
check do not match, perform further mapping iteration + for entity in label_list: + + if isinstance(entity, set) or isinstance(entity, tuple) or isinstance(entity, list): + for subentity in entity: + if not substring: + if str(subentity) == label: + return subentity + elif str(subentity) in label or label in str(subentity): + return subentity + + elif substring and (str(entity) in label or label in str(entity)): + return entity + + return None + + +def _map_label_list(input_labels: Union[str, Set[str], List[str]], + background_labels: List[str], + check_substrings: bool = False) -> List[str]: + """Map labels from preprocessed input to background_labels LIST to get a set of matched labels.""" + mapped_list = [] + for label in input_labels: + if isinstance(label, str): + label_bck = _check_label_to_background_labels(label, background_labels, check_substrings) + if label_bck is not None: + mapped_list.append(label_bck) + elif isinstance(label, set) or isinstance(label, tuple) or isinstance(label, list): + for sublabel in set(label): + label_bck = _check_label_to_background_labels(sublabel, background_labels, check_substrings) + if label_bck is not None: + mapped_list.append(label_bck) + else: + raise TypeError( + f'{EMOJI} The input label "{label}" "{type(label)}" data type can not be processed for label mapping' + ) + return mapped_list + + +def _map_label_dict(input_labels: Dict[Union[str, set], Union[int, float]], + background_labels: list, + check_substrings: bool = False) -> Dict[str, Union[int, float]]: + """Map labels from preprocessed input to background_labels DICT to get a set of matched labels.""" + mapped_dict = {} + + for label, v in input_labels.items(): + if isinstance(label, int) or isinstance(label, float): + label = str(label) + + if isinstance(label, str): + label_bck = _check_label_to_background_labels(label, background_labels, check_substrings) + if label_bck is not None: + mapped_dict[label_bck] = v + elif isinstance(label, set) or 
isinstance(label, tuple) or isinstance(label, list): + for sublabel in set(label): + label_bck = _check_label_to_background_labels(sublabel, background_labels, check_substrings) + if label_bck is not None: + mapped_dict[label_bck] = v + else: + raise TypeError( + f'{EMOJI} The input label "{label}" "{type(label)}" data type can not be processed for label mapping' + ) + + return mapped_dict + + +"""Generate/format data input as a vector/matrix for the diffusion computation matching the kernel rows""" + + +def format_input_for_diffusion(processed_input: Union[list, Dict[str, int], Dict[str, Dict[str, int]], Dict[str, list]], + kernel: Matrix, + missing_value: int = -1) -> Matrix: + """Format/generate input vector/matrix according the data structure of the processed_data_input.""" + log.info("Formatting the processed to the reference kernel Matrix.") + + if _label_list_data_struct_check(processed_input): + return format_categorical_input_vector_from_label_list(rows_labeled=processed_input, + col_label='scores', + kernel=kernel, + missing_value=missing_value + ) + + elif _type_dict_label_list_data_struct_check(processed_input): + return format_categorical_input_matrix_from_label_list(rows_labels=list(processed_input.values()), + cols_labels=list(processed_input.keys()), + kernel=kernel, + missing_value=missing_value + ) + + elif _label_scores_dict_data_struct_check(processed_input): + return format_input_vector_from_label_score_dict(labels_scores_dict=processed_input, + kernel=kernel, + missing_value=missing_value + ) + + elif _type_dict_label_scores_dict_data_struct_check(processed_input): + return format_input_matrix_from_type_label_score_dict(type_dict_labels_scores_dict=processed_input, + kernel=kernel, + missing_value=missing_value + ) + + else: + raise TypeError( + f'{EMOJI} The label/scores mapping data structure can not be processed for the input formatting.' 
+ ) + + +"""Generate categorical (non-quantitative) input vector matrix from raw input dataset labels""" + + +def format_categorical_input_vector_from_label_list(rows_labeled: Union[set, list], + col_label: Union[str, set, list], + kernel: Matrix, + missing_value: int = -1, + rows_unlabeled=None, + i: int = None + ) -> Matrix: """Generate categoric input vector from labels.""" if isinstance(col_label, str): col_label = [col_label] input_mat = Matrix( - rows_labels=list(rows_labeled), + rows_labels=list(set(rows_labeled)), cols_labels=col_label, - init_value=1) + init_value=1 # By default the categorical input value is 1 + ) + if rows_unlabeled: + if i: + rows_unlabeled = rows_unlabeled[i] + input_mat.row_bind( matrix=Matrix( rows_labels=list(rows_unlabeled), cols_labels=col_label, - init_value=0) + init_value=0 # By default the non labeled input value is 0 + ) ) - return input_mat.match_missing_rows(background_mat.rows_labels, missing_value).match_rows(background_mat) + return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel) -def generate_categoric_input_from_labels( - rows_labels, - cols_labels, - background_mat, - missing_value=-1, - rows_unlabeled=None, -): +def format_categorical_input_matrix_from_label_list(rows_labels: Union[set, list], + cols_labels: Union[set, list], + kernel: Matrix, + missing_value: int = -1, + rows_unlabeled=None + ) -> Matrix: """Generate input vector from labels.""" - if isinstance(cols_labels, list) and len(cols_labels) > 1: - input_mat = generate_categoric_input_vector_from_labels( + if not isinstance(cols_labels, list): + raise NotImplementedError('The column labels should be provided as a list.') + + if len(cols_labels) > 1: + + input_mat = format_categorical_input_vector_from_label_list( rows_labels[0], cols_labels[0], - background_mat, + kernel, missing_value, - rows_unlabeled[0] + rows_unlabeled, + i=0 ) for idx, row_label in enumerate(rows_labels[1:]): - input_vector = 
generate_categoric_input_vector_from_labels( + input_vector = format_categorical_input_vector_from_label_list( row_label, cols_labels[idx + 1], - background_mat, + kernel, missing_value, - rows_unlabeled[idx + 1], + rows_unlabeled, + idx + 1 ) input_mat.col_bind(matrix=input_vector) return input_mat - else: - return generate_categoric_input_vector_from_labels( - rows_labels, - cols_labels, - background_mat, - missing_value, - rows_unlabeled + + return format_categorical_input_vector_from_label_list( + rows_labels, + cols_labels, + kernel, + missing_value, + rows_unlabeled + ) + + +"""Generate quantitative or binarized/categorical input vector matrix from preprocesed input dataset scores""" + + +def format_input_vector_from_label_score_dict(labels_scores_dict: Dict[str, int], + kernel: Matrix, + col_label: str = 'scores', + missing_value: int = -1, + rows_unlabeled: dict = None, # TODO: To discuss + type_k: bool = False + ) -> Matrix: + """Generate scores input vector from labels scores dict.""" + input_mat = Matrix( + mat=np.transpose(np.array([list(labels_scores_dict.values())])), + rows_labels=list(labels_scores_dict.keys()), + cols_labels=[col_label] + ) + + if rows_unlabeled: + if type_k: + rows_unlabeled = rows_unlabeled[col_label] + + input_mat.row_bind( + matrix=Matrix( + mat=np.transpose(np.array([list(rows_unlabeled.values())])), + rows_labels=list(rows_unlabeled.keys()), + cols_labels=[col_label] + ) ) + + return input_mat.match_missing_rows(kernel.rows_labels, missing_value).match_rows(kernel) + + +def format_input_matrix_from_type_label_score_dict(type_dict_labels_scores_dict: Union[Dict[str, Dict[str, int]], + Dict[str, int]], + kernel, + missing_value: int = -1, + rows_unlabeled=None, # TODO: To discuss + ) -> Matrix: + """Generate input matrix from labels scores dict and/or handle type classification by columns.""" + if _type_dict_label_scores_dict_data_struct_check(type_dict_labels_scores_dict): + + init_k = 
get_random_key_from_dict(type_dict_labels_scores_dict) + init_v = type_dict_labels_scores_dict.pop(init_k) + + input_mat = format_input_vector_from_label_score_dict(init_v, + kernel, + init_k, + missing_value, + rows_unlabeled, + True + ) + + for node_type, scores_dict in type_dict_labels_scores_dict.items(): + input_vector = format_input_vector_from_label_score_dict(scores_dict, + kernel, + node_type, + missing_value, + rows_unlabeled, + True + ) + input_mat.col_bind(matrix=input_vector) + + return input_mat + else: + return format_input_vector_from_label_score_dict(type_dict_labels_scores_dict, kernel) diff --git a/src/diffupy/process_network.py b/src/diffupy/process_network.py new file mode 100644 index 0000000..4b699c8 --- /dev/null +++ b/src/diffupy/process_network.py @@ -0,0 +1,241 @@ +# -*- coding: utf-8 -*- + +"""Miscellaneous utils of the package.""" + +import logging +from typing import Tuple + +import numpy as np +import pandas as pd +import pybel +from diffupy.matrix import Matrix, MatrixFromDataFrame, MatrixFromDict, MatrixFromNumpyArray +from diffupy.utils import from_dataframe_file, format_checker, from_pickle, get_label_node, from_json +from networkx import DiGraph, Graph, read_graphml, read_gml, node_link_graph, read_edgelist + +from .constants import * +from .constants import CSV, TSV, GRAPHML, GML, BEL, PICKLE, EMOJI, GRAPH_FORMATS +from .kernels import regularised_laplacian_kernel + +log = logging.getLogger(__name__) + + +"""Process network as undefined format (could represented as a graph or as a kernel)""" + + +def get_kernel_and_graph_from_network_path(path: str) -> Tuple[Matrix, Graph]: + """Load network provided in cli as a kernel and as a graph.""" + graph = None + kernel = None + + if path.endswith(KERNEL_FORMATS): + try: + graph = process_graph_from_file(path) + + except ValueError or TypeError: + kernel = process_kernel_from_file(path) + + elif path.endswith(GRAPH_FORMATS): + graph = process_graph_from_file(path) + + else: + raise 
IOError( + f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + if kernel is None and graph is not None: + kernel = regularised_laplacian_kernel(graph) + + if kernel is not None and graph is None: + graph = kernel.to_nx_graph() + + return kernel, graph + + +def get_kernel_from_network_path(path: str) -> Matrix: + """Load network provided in cli as a kernel.""" + if path.endswith(KERNEL_FORMATS): + try: + graph = process_graph_from_file(path) + + except ValueError or TypeError: + return process_kernel_from_file(path) + + elif path.endswith(GRAPH_FORMATS): + graph = process_graph_from_file(path) + + else: + raise IOError( + f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + return regularised_laplacian_kernel(graph) + + +def get_graph_from_network_path(path: str) -> Graph: + """Load network provided in cli as a graph.""" + if path.endswith(KERNEL_FORMATS): + try: + return process_graph_from_file(path) + + except ValueError or TypeError: + kernel = process_kernel_from_file(path) + + elif path.endswith(GRAPH_FORMATS): + return process_graph_from_file(path) + + else: + raise IOError( + f'{EMOJI} The selected network format is not valid neither as a graph or as a kernel. 
Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + return kernel.to_nx_graph() + + +"""Process input formats""" + + +def process_graph_from_file(path: str) -> Graph: + """Load network from path.""" + if path.endswith(CSV) or path.endswith(TSV): + graph = get_graph_from_df(path, CSV) + + elif path.endswith(TSV): + graph = get_graph_from_df(path, TSV) + + elif path.endswith(PICKLE): + graph = pybel.from_pickle(path) + + elif path.endswith(GRAPHML): + graph = read_graphml(path) + + elif path.endswith(GML): + graph = read_gml(path) + + elif path.endswith(BEL): + graph = pybel.from_path(path) + + elif path.endswith(EDGE_LIST): + graph = read_edgelist(path) + + elif path.endswith(JSON): + data = from_json(path) + graph = node_link_graph(data) + else: + raise IOError( + f'{EMOJI} The selected graph format is not valid. Please ensure you use one of the following formats: ' + f'{GRAPH_FORMATS}' + ) + + log.info( + f'{EMOJI} Graph loaded with: \n' + f'{graph.number_of_nodes()} nodes\n' + f'{graph.number_of_edges()} edges\n' + f'{EMOJI}' + ) + + return graph + + +def process_kernel_from_file(path: str) -> Matrix: + """Load kernel from path.""" + if path.endswith(CSV): + raw_kernel = MatrixFromDataFrame(from_dataframe_file(path, CSV)) + + elif path.endswith(TSV): + raw_kernel = MatrixFromDataFrame(from_dataframe_file(path, TSV)) + + elif path.endswith(PICKLE): + raw_kernel = from_pickle(path) + + elif path.endswith(JSON): + raw_kernel = from_json(path) + + else: + raise IOError( + f'{EMOJI} The selected kernel format is not valid. 
Please ensure you use one of the following formats: ' + f'{KERNEL_FORMATS}' + ) + + # Check imported type of kernel + if isinstance(raw_kernel, Matrix): + kernel = raw_kernel + + elif isinstance(raw_kernel, dict): + kernel = MatrixFromDict(raw_kernel) + + elif isinstance(raw_kernel, pd.DataFrame): + kernel = MatrixFromDataFrame(raw_kernel) + + elif isinstance(raw_kernel, np.ndarray): + kernel = MatrixFromNumpyArray(raw_kernel) + + else: + raise IOError( + f'{EMOJI} The imported kernel type is not valid. Please ensure it is provided as a diffupy ' + f'Matrix, a Dict, NumpyArray or Pandas DataFrame. ' + ) + + log.info( + f'{EMOJI} Kernel loaded with: \n' + f'{len(kernel.rows_labels)} nodes\n' + f'{EMOJI}' + ) + + return kernel + + +def get_simple_graph_from_multigraph(multigraph): + """Convert undirected graph from multigraph.""" + graph = Graph() + for u, v, data in multigraph.edges(data=True): + u = get_label_node(u) + v = get_label_node(v) + + w = data['weight'] if 'weight' in data else 1.0 + if graph.has_edge(u, v): + graph[u][v]['weight'] += w + else: + graph.add_edge(u, v, weight=w) + + return graph + + +def get_graph_from_df(path: str, sep: str) -> DiGraph: + """Return network from dataFrame.""" + format_checker(sep) + + df = from_dataframe_file(path, sep) + + if SOURCE not in df.columns or TARGET not in df.columns: + raise ValueError( + f'Ensure that your file contains columns for {SOURCE} and {TARGET}. The column for {RELATION} is optional' + f'and can be omitted.' 
+ ) + + graph = DiGraph() + + for index, row in df.iterrows(): + + # Get node names from data frame + sub_name = row[SOURCE] + obj_name = row[TARGET] + + if RELATION in df.columns: + + relation = row[RELATION] + + # Store edge in the graph + graph.add_edge( + sub_name, obj_name, + relation=relation, + ) + + else: + graph.add_edge( + sub_name, obj_name, + ) + + return graph diff --git a/src/diffupy/utils.py b/src/diffupy/utils.py index d747520..aa506d1 100644 --- a/src/diffupy/utils.py +++ b/src/diffupy/utils.py @@ -5,24 +5,26 @@ import json import logging import pickle +import random import warnings -from typing import List, Tuple +from typing import List, Union, Dict, Optional import networkx as nx import numpy as np +import openpyxl as opxl import pandas as pd import pybel - -from networkx import DiGraph, read_graphml, read_gml, node_link_graph, read_edgelist +from networkx import Graph from .constants import * -from .constants import CSV, TSV, GRAPHML, GML, BEL, BEL_PICKLE, NODE_LINK_JSON, EMOJI, FORMATS - +from .constants import CSV, TSV, GRAPH_FORMATS log = logging.getLogger(__name__) +"""Matrix/graph handling utils.""" + -def get_laplacian(graph: nx.Graph, normalized: bool = False) -> np.ndarray: +def get_laplacian(graph: Graph, normalized: bool = False) -> np.ndarray: """Return Laplacian matrix.""" if nx.is_directed(graph): warnings.warn('Since graph is directed, it will be converted to an undirected graph.') @@ -35,7 +37,7 @@ def get_laplacian(graph: nx.Graph, normalized: bool = False) -> np.ndarray: return nx.laplacian_matrix(graph).toarray() -def set_diagonal_matrix(matrix, d): +def set_diagonal_matrix(matrix: np.ndarray, d: list) -> np.ndarray: """Set diagonal matrix.""" for j, row in enumerate(matrix): for i, x in enumerate(row): @@ -121,27 +123,10 @@ def get_idx_scores_mapping(scores): return {i: score for i, score in enumerate(scores)} -def decode_labels(labels): - """Validate labels.""" - labels_decode = [] - - for label in labels: - if not 
isinstance(label, str): - - if isinstance(label, int): - label = str(label) - else: - label = label.decode('utf-8').replace('"', '') - - labels_decode.append(label) - - return labels_decode - - -def print_dict_dimensions(entities_db, title): +def print_dict_dimensions(entities_db, message='Total number of '): """Print dimension of the dictionary.""" total = 0 - print(title) + for k1, v1 in entities_db.items(): m = '' if isinstance(v1, dict): @@ -152,142 +137,185 @@ def print_dict_dimensions(entities_db, title): m += f'{len(v1)} ' total += len(v1) - print(f'Total number of {k1}: {m} ') + log_dict({k1: m}, message) print(f'Total: {total} ') -def get_simple_graph_from_multigraph(multigraph): - """Convert undirected graph from multigraph.""" - graph = nx.Graph() - for u, v, data in multigraph.edges(data=True): - u = get_label_node(u) - v = get_label_node(v) +def log_dict(dict_to_print: dict, message: str = ''): + """Print dictionary as list with a message.""" + for k1, v1 in dict_to_print.items(): + log.info(f'{message} {k1}: {v1} ') + print(f'{message} {k1}: {v1} ') - w = data['weight'] if 'weight' in data else 1.0 - if graph.has_edge(u, v): - graph[u][v]['weight'] += w - else: - graph.add_edge(u, v, weight=w) - return graph +def get_random_key_from_dict(d: dict) -> [Union[str, int, tuple]]: + """Return random key from provided dict.""" + return random.choice(list(d.keys())) + +def get_random_value_from_dict(d: dict): + """Return random value from provided dict.""" + return d[get_random_key_from_dict(d)] -"""Check formats of networks """ +"""File loading utils.""" -def _format_checker(fmt: str) -> None: - """Check column sep.""" - if fmt not in FORMATS: + +def format_checker(fmt: str, fmt_list: list = GRAPH_FORMATS) -> None: + """Check formats.""" + if fmt not in fmt_list: raise ValueError( f'The selected sep {fmt} is not valid. 
Please ensure you use one of the following formats: ' - f'{FORMATS}' + f'{fmt_list}' ) -"""Process networks""" - - -def _read_network_file(path: str, fmt: str) -> pd.DataFrame: +def from_dataframe_file(path: str, fmt: str) -> pd.DataFrame: """Read network file.""" - _format_checker(fmt) + format_checker(fmt) - df = pd.read_csv( + return pd.read_csv( path, header=0, sep=FORMAT_SEPARATOR_MAPPING[CSV] if fmt == CSV else FORMAT_SEPARATOR_MAPPING[TSV] ) - if SOURCE not in df.columns or TARGET not in df.columns: - raise ValueError( - f'Ensure that your file contains columns for {SOURCE} and {TARGET}. The column for {RELATION} is optional' - f'and can be omitted.' - ) - return df +def from_json(path: str): + """Read from json file.""" + with open(path) as f: + return json.load(f) -def process_network(path: str, sep: str) -> DiGraph: - """Return network from dataFrame.""" - _format_checker(sep) +def from_pickle(input_path): + """Read from pickle file.""" + with open(input_path, 'rb') as f: + unpickler = pickle.Unpickler(f) + return unpickler.load() - df = _read_network_file(path, sep) - graph = DiGraph() +def from_nparray_to_df(nparray: np.ndarray) -> pd.DataFrame: + """Convert numpy array to data frame.""" + return pd.DataFrame(data=nparray[1:, 1:], + index=nparray[1:, 0], + columns=nparray[0, 1:]) - for index, row in df.iterrows(): - # Get node names from data frame - sub_name = row[SOURCE] - obj_name = row[TARGET] +"""Data parsing utils.""" - if RELATION in df.columns: - relation = row[RELATION] +def decode_labels(labels): + """Validate labels.""" + labels_decode = [] - # Store edge in the graph - graph.add_edge( - sub_name, obj_name, - relation=relation, - ) + for label in labels: + if not isinstance(label, str): - else: - graph.add_edge( - sub_name, obj_name, - ) + if isinstance(label, int): + label = str(label) + else: + label = label.decode('utf-8').replace('"', '') - return graph + labels_decode.append(label) + return labels_decode -def load_json_file(path: str) -> 
DiGraph: - """Read json file.""" - with open(path) as f: - return json.load(f) +def munge_label(label: Union[str, int, float]) -> str: + """Munge label strings.""" + remove_set = ['*', ' ', '|', '-', '"', "'", "↑", "↓", "\n"] + split_set = ['/'] -def from_pickle(input_path): - """Read from pickle file.""" - with open(input_path, 'rb') as f: - unpickler = pickle.Unpickler(f) - return unpickler.load() + label = str(label).lower() + for symb in remove_set: + if symb in label: + label = label.replace(symb, '') -def process_network_from_cli(path: str) -> nx.Graph: - """Load network from path.""" - if path.endswith(CSV): - graph = process_network(path, CSV) + for symb in split_set: + if symb in label: + label = tuple(set(label.split(symb))) + if len(label) == 1: + label = label[0] - elif path.endswith(TSV): - graph = process_network(path, TSV) + return label - elif path.endswith(GRAPHML): - graph = read_graphml(path) - elif path.endswith(GML): - graph = read_gml(path) +def munge_label_list(labels: list): + """Munge labels list.""" + return list(set([munge_label(label) for label in labels])) - elif path.endswith(BEL): - graph = pybel.from_path(path) - elif path.endswith(BEL_PICKLE): - graph = pybel.from_pickle(path) +def munge_label_scores_dict(labels: dict) -> Dict[str, Union[list, int, str]]: + """Munge labels dict.""" + return {munge_label(label): v for label, v in labels.items()} - elif path.endswith(EDGE_LIST): - graph = read_edgelist(path) - elif path.endswith(NODE_LINK_JSON): - data = load_json_file(path) - graph = node_link_graph(data) +def munge_label_type_dict(label_dict: Dict[str, Union[list, int, str, dict]]) -> Dict[str, Union[list, int, str, dict]]: + """Munge labels type dict.""" + type_label_dict = {} + + for type_label, labels in label_dict.items(): + if isinstance(labels, dict): + type_label_dict[type_label] = munge_label_scores_dict(labels) + + elif isinstance(labels, dict): + type_label_dict[type_label] = munge_label_scores_dict(labels) + + return 
type_label_dict + + +def munge_cell(cell): + """Munge cell.""" + if isinstance(cell, str): + if cell.replace(',', '').replace('.', '').replace('-', '').isnumeric(): + return float(cell) + else: + return munge_label(cell) + + elif isinstance(cell, float) or isinstance(cell, int): + return cell else: - raise IOError( - f'{EMOJI} The selected format is not valid. Please ensure you use one of the following formats: ' - f'{FORMATS}' - ) - return graph + raise TypeError('The cell type could not be processed.') + +def parse_xls_sheet_to_df(sheet: opxl.workbook, + min_row: Optional[int] = 1, + relevant_cols: Optional[list] = None, + irrelevant_cols: Optional[list] = None) -> pd.DataFrame: + """Process/format excel sheets to DataFrame.""" + parsed_sheet_dict = {} -def process_kernel_from_cli(path: str): - """Process kernel from cli.""" - # TODO process different kinds of input format kernel - return from_pickle(path) + for col in sheet.iter_cols(min_row=min_row): + col_label = col[0].value + + if ((relevant_cols is not None and col_label in relevant_cols) or ( + irrelevant_cols is not None and col_label not in irrelevant_cols)): + parsed_sheet_dict[col_label] = [munge_cell(cell.value) for cell in col[1:]] + + return pd.DataFrame.from_dict(parsed_sheet_dict) + + +def parse_xls_to_df(path: str, + min_row: Optional[int] = 1, + relevant_sheets: Optional[list] = None, + irrelevant_sheets: Optional[list] = None, + relevant_cols: Optional[list] = None, + irrelevant_cols: Optional[list] = None, + ) -> Union[pd.DataFrame, Dict[str, pd.DataFrame]]: + """Process excel file as a set (if several excel sheets) or a single dataframe.""" + wb = opxl.load_workbook(filename=path) + + sheets = wb.sheetnames + + if len(sheets) > 1: + return {sheets[ix].lower(): parse_xls_sheet_to_df(sheet, min_row, relevant_cols, irrelevant_cols) + for ix, sheet in enumerate(wb) + if (relevant_sheets is not None and sheets[ix] in relevant_sheets) or ( + irrelevant_sheets is not None and sheets[ix] in 
irrelevant_sheets) + } + + else: + return parse_xls_sheet_to_df(wb[sheets[0]]) diff --git a/tests/constants.py b/tests/constants.py index 690ab12..d60e38b 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -14,7 +14,7 @@ REGULARISED_LAPLACIAN_KERNEL = os.path.join(RESOURCES_FOLDER, 'regularisedLaplacianKernel.csv') DATASETS_FOLDER = os.path.join(RESOURCES_FOLDER, 'datasets') -NODE_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node.csv') +NODE_TYPE_COL_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_type_col.csv') NODE_LOGFC_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_logfc.csv') NODE_LOGFC_PVAL_TEST_PATH = os.path.join(DATASETS_FOLDER, 'node_logfc_pval.csv') INPUT_SCORES = os.path.join(RESOURCES_FOLDER, 'input_scores.csv') diff --git a/tests/resources/datasets/node.csv b/tests/resources/datasets/node_type_col.csv similarity index 100% rename from tests/resources/datasets/node.csv rename to tests/resources/datasets/node_type_col.csv diff --git a/tests/test_diffusion.py b/tests/test_diffusion.py index 4fee51c..4e911df 100644 --- a/tests/test_diffusion.py +++ b/tests/test_diffusion.py @@ -10,7 +10,7 @@ from diffupy.diffuse import diffuse from diffupy.matrix import Matrix -from tests.constants import * +from .constants import * log = logging.getLogger(__name__) diff --git a/tests/test_input.py b/tests/test_input.py index 47bc22c..f3e5273 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -7,11 +7,12 @@ from diffupy.constants import * from diffupy.matrix import Matrix -from diffupy.process_input import process_input, map_nodes -from diffupy.utils import process_network +from diffupy.process_input import process_input_data, map_labels_input, \ + format_input_for_diffusion +from diffupy.process_network import get_graph_from_df from diffupy.validate_input import _validate_scores -from tests.constants import * +from .constants import * log = logging.getLogger(__name__) @@ -21,16 +22,16 @@ class ValidateTest(unittest.TestCase): def 
test_quantitative_bin_id(self): """Test codify label_input for quantitative scoring methods- only entity IDs given (binary labels).""" - input = NODE_TEST_PATH - input_labels_dict = process_input( + input = NODE_TYPE_COL_TEST_PATH + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=None, ) - self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 1}) + self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}) def test_quantitative_bin_fc_sign(self): """Test codify label_input for quantitative scoring methods- logFC given (binary, signed labels).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': -1}) @@ -38,7 +39,7 @@ def test_quantitative_bin_fc_sign(self): def test_quantitative_bin_fc_abs(self): """Test codify label_input for quantitative scoring methods- logFC given (binary, absolute values).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 0, 'D': 0, 'E': 1}) @@ -46,7 +47,7 @@ def test_quantitative_bin_fc_abs(self): def test_quantitative_bin_fcp_sign(self): """Test codify label_input for quantitative scoring methods- logFC and adj. 
p-value given (binary, signed labels).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': -1}) @@ -54,7 +55,7 @@ def test_quantitative_bin_fcp_sign(self): def test_quantitative_bin_fcp_abs(self): """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (binary, absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1, 'C': 0, 'D': 0, 'E': 1}) @@ -62,7 +63,7 @@ def test_quantitative_bin_fcp_abs(self): def test_quantitative_fc_sign(self): """Test codify label_input for quantitative scoring methods- logFC given (quantitative, signed labels).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2}) @@ -70,7 +71,7 @@ def test_quantitative_fc_sign(self): def test_quantitative_fc_abs(self): """Test codify label_input for quantitative scoring methods- logFC given (quant., absolute values).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0.7, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2}) @@ -78,7 +79,7 @@ def test_quantitative_fc_abs(self): def test_quantitative_fcp_sign(self): """Test codify label_input for quantitative scoring methods- logFC and adj. 
p-value given (quant., signed labels).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=False, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': -2.2}) @@ -86,23 +87,23 @@ def test_quantitative_fcp_sign(self): def test_quantitative_fcp_abs(self): """Test codify label_input for quant. scoring methods- logFC and adj. p-value given (quant., absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 0, 'B': 1.2, 'C': 0, 'D': 0, 'E': 2.2}) def test_non_quantitative_bin_id(self): """Test codify label_input for non-quantitative scoring methods- only entity IDs given (binary labels).""" - input = NODE_TEST_PATH - input_labels_dict = process_input( + input = NODE_TYPE_COL_TEST_PATH + input_labels_dict = process_input_data( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=None, ) - self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': 1, 'D': 1, 'E': 1}) + self.assertEqual(input_labels_dict, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}) def test_non_quantitative_bin_fc_abs(self): """Test codify label_input for non-quantitative scoring methods- logFC given (binary, absolute values (sign)).""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': 1, 'B': 1, 'C': -1, 'D': -1, 'E': 1}) @@ -110,14 +111,72 @@ def test_non_quantitative_bin_fc_abs(self): def test_non_quantitative_bin_fcp_abs(self): """Test codify label_input for non-quant. scoring methods- logFC and adj. 
p-value given (binary, absolute values).""" input = NODE_LOGFC_PVAL_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=ML, binning=True, absolute_value=True, p_value=0.05, threshold=0.5, ) self.assertEqual(input_labels_dict, {'A': -1, 'B': 1, 'C': -1, 'D': -1, 'E': 1}) + def test_map_labels_input_label_list_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'], + background_labels=['A', 'B', 'C']) + # As set because the order is not relevant. + self.assertEqual(set(mapping), {'A', 'C', 'B'}) + + def test_map_labels_input_label_list_background_dict(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels=['A', 'B', 'C', 'D'], + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + self.assertEqual(mapping, {'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + def test_map_labels_input_type_dict_label_list_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'Gene': ['A', 'B'], 'Metabolite': ['C', 'D']}, + background_labels=['A', 'B', 'C']) + + self.assertEqual(mapping, {'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + def test_map_labels_input_type_dict_label_dict_background_dict(self): + """Test map label_input.""" + # If the labels are classified in another type ('D' and 'B'), since they do not match the background they will not be mapped. 
+ mapping = map_labels_input(input_labels={'Gene': ['A'], 'Metabolite': ['C', 'B']}, + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']}) + + self.assertEqual(mapping, {'Gene': ['A']}) + + def test_map_labels_input_label_scores_dict_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'A': 1, 'B': 1, 'D': 1, 'E': 1}, + background_labels=['B', 'C', 'D']) + + self.assertEqual(mapping, {'B': 1, 'D': 1}) + + def test_map_labels_input_label_scores_dict_background_dict(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'A': 1, 'B': 1, 'D': 1, 'E': 1}, + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['D']}) + + self.assertEqual(mapping, {'Metabolite': {'D': 1}, 'Gene': {'A': 1, 'B': 1}}) + + def test_map_labels_input_type_dict_label_scores_dict_background_list(self): + """Test map label_input.""" + mapping = map_labels_input(input_labels={'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}, + background_labels=['A', 'B', 'C']) + + self.assertEqual(mapping, {'Metabolite': {'C': 1}, 'Gene': {'A': 1, 'B': 1}}) + + def test_map_labels_input_type_dict_label_scores_dict_background_dict(self): + """Test map label_input.""" + # If the labels are classified in another type ('D' and 'B'), since they do not match the background they will not be mapped. 
+ mapping = map_labels_input(input_labels={'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1, 'D': 1, 'E': 1}}, + background_labels={'Gene': ['A', 'B'], 'Metabolite': ['C']}) + + self.assertEqual(mapping, {'Metabolite': {'C': -1}, 'Gene': {'A': 1, 'B': 1}}) + def test_network(self): """Test generate graph from csv.""" - graph = process_network(NETWORK_PATH, CSV) + graph = get_graph_from_df(NETWORK_PATH, CSV) graph_nodes = set(graph.nodes()) graph_edges = set(graph.edges()) @@ -138,16 +197,16 @@ def test_network(self): def test_node_mapping(self): """Test mapping of nodes in label_input to nodes in network.""" input = NODE_LOGFC_TEST_PATH - input_labels_dict = process_input( + input_labels_dict = process_input_data( input, method=RAW, binning=False, absolute_value=True, p_value=0.05, threshold=0.5, ) - graph = process_network(NETWORK_PATH, CSV) + graph = get_graph_from_df(NETWORK_PATH, CSV) graph_nodes = list(graph.nodes()) - mapped_nodes_list = map_nodes(input_labels_dict, graph_nodes) + mapped_nodes_list = map_labels_input(input_labels_dict, graph_nodes) - self.assertEqual(mapped_nodes_list, [0.0, 1.0, 0.0, 0.0, None, 1.0, None, None, None]) + self.assertEqual(mapped_nodes_list, {'A': 0.7, 'B': 1.2, 'C': 0.0, 'D': 0.0, 'E': 2.2}) def test_validate_scores_1(self): """Test validate scores 1.""" @@ -187,3 +246,48 @@ def test_validate_scores_4(self): ) with self.assertRaises(ValueError): _validate_scores(matrix) + + kernel_test_1 = Matrix( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + cols_labels=['A', 'B', 'C', 'D'], + rows_labels=['A', 'B', 'C', 'D'], + name='Test Kernel 1' + ) + + kernel_test_2 = Matrix( + [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], + cols_labels=['A', 'B', 'C', 'F'], + rows_labels=['A', 'B', 'C', 'F'], + name='Test Kernel 2' + ) + + kernel_test_3 = Matrix( + [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 2, 3, 4, 5]], + cols_labels=['A', 'B', 'C', 'D', 'F'], + rows_labels=['A', 'B', 'C', 'D', 
'F'], + name='Test Kernel 3' + ) + + def test_format_input_for_diffusion_label_list(self): + """Test empty matrix.""" + processed_mapped_nodes_list = format_input_for_diffusion( + map_labels_input({'Metabolite': {'C': -1}, 'Gene': {'A': 2, 'B': 1}, 'mirnas': {'A': 1, 'T': 1}}, + self.kernel_test_1.rows_labels), + self.kernel_test_1, + ) + + # TODO: Implement in Matrix equal, now if the col order is mixed it raises error + # assert(np.allclose(processed_mapped_nodes_list.mat, + # np.array([[-1, 2, 1], + # [-1, 1, -1], + # [-1, -1, -1], + # [-1, -1, -1]] + # ) + # ) + # ) + # self.assertEqual(processed_mapped_nodes_list.cols_labels, + # ['Metabolite', 'Gene', 'mirnas'] + # ) + # self.assertEqual(processed_mapped_nodes_list.rows_labels, + # ['A', 'B', 'C', 'D'] + # )