diff --git a/RELEASE.md b/RELEASE.md index 2084d3d..ee15200 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,5 +1,6 @@ # Upcoming release +* Add dynotears (`from_numpy_dynamic`, an algorithm for structure learning on Dynamic Bayesian Networks) * Add a count data type to the data generator using a zero-inflated Poisson * Set bounds/max class imbalance for binary features for the data generators * Add non-linear data generators for multiple data types diff --git a/causalnex/structure/__init__.py b/causalnex/structure/__init__.py index 0315d40..0ab5413 100644 --- a/causalnex/structure/__init__.py +++ b/causalnex/structure/__init__.py @@ -30,6 +30,6 @@ ``causalnex.structure`` provides functionality to define or learn structure. """ -__all__ = ["StructureModel", "notears", "data_generators"] +__all__ = ["StructureModel", "notears", "dynotears", "data_generators"] from .structuremodel import StructureModel diff --git a/causalnex/structure/data_generators/__init__.py b/causalnex/structure/data_generators/__init__.py index b1f71d4..ced528e 100644 --- a/causalnex/structure/data_generators/__init__.py +++ b/causalnex/structure/data_generators/__init__.py @@ -40,14 +40,20 @@ "generate_continuous_data", "generate_continuous_dataframe", "generate_count_dataframe", + "gen_stationary_dyn_net_and_df", + "generate_dataframe_dynamic", + "generate_structure_dynamic", ] from .core import generate_structure, nonlinear_sem_generator, sem_generator from .wrappers import ( + gen_stationary_dyn_net_and_df, generate_binary_data, generate_binary_dataframe, generate_categorical_dataframe, generate_continuous_data, generate_continuous_dataframe, generate_count_dataframe, + generate_dataframe_dynamic, + generate_structure_dynamic, ) diff --git a/causalnex/structure/data_generators/core.py b/causalnex/structure/data_generators/core.py index c88dd51..384dc2e 100644 --- a/causalnex/structure/data_generators/core.py +++ b/causalnex/structure/data_generators/core.py @@ -39,11 +39,11 @@ import pandas as pd from sklearn.gaussian_process.kernels import RBF, Kernel +from causalnex.structure import StructureModel from causalnex.structure.categorical_variable_mapper import ( VariableFeatureMapper, validate_schema, ) -from causalnex.structure.structuremodel import StructureModel # dict mapping distributions names to their functions __distribution_mapper = { @@ -117,7 +117,10 @@ def generate_structure( edge_flags = np.tril(np.ones([num_nodes, num_nodes]), k=-1) else: - raise ValueError("unknown graph type") + raise ValueError( + "Unknown graph type {t}. ".format(t=graph_type) + + "Available types are ['erdos-renyi', 'barabasi-albert', 'full']" + ) # randomly permute edges - required because we limited ourselves to lower diagonal previously perms = np.random.permutation(np.eye(num_nodes, num_nodes)) diff --git a/causalnex/structure/data_generators/wrappers.py b/causalnex/structure/data_generators/wrappers.py index 03e4a85..94ba092 100644 --- a/causalnex/structure/data_generators/wrappers.py +++ b/causalnex/structure/data_generators/wrappers.py @@ -29,14 +29,21 @@ """ Module of methods to sample variables of a single data type. 
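+
+The dynamic (time-series) generators added below can be used together as follows
+(an illustrative sketch only; arguments as documented on each function):
+
+    >>> g = generate_structure_dynamic(num_nodes=5, p=2, degree_intra=2, degree_inter=1)
+    >>> df = generate_dataframe_dynamic(g, n_samples=100)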
""" -from typing import Optional +import warnings +from typing import List, Optional, Tuple import networkx as nx import numpy as np import pandas as pd +from scipy.sparse import csr_matrix from sklearn.gaussian_process.kernels import Kernel -from causalnex.structure.data_generators import nonlinear_sem_generator, sem_generator +from causalnex.structure.data_generators import ( + generate_structure, + nonlinear_sem_generator, + sem_generator, +) +from causalnex.structure.structuremodel import StructureModel def generate_continuous_data( @@ -362,3 +369,344 @@ def generate_categorical_dataframe( noise_std=noise_scale, seed=seed, ) + + +def generate_structure_dynamic( # pylint: disable=too-many-arguments + num_nodes: int, + p: int, + degree_intra: float, + degree_inter: float, + graph_type_intra: str = "erdos-renyi", + graph_type_inter: str = "erdos-renyi", + w_min_intra: float = 0.5, + w_max_intra: float = 0.5, + w_min_inter: float = 0.5, + w_max_inter: float = 0.5, + w_decay: float = 1.0, +) -> StructureModel: + """ + Generates a dynamic DAG at random. + + Args: + num_nodes: Number of nodes + p: maximum lag to be considered in the structure + degree_intra: expected degree on nodes from the current state + degree_inter: expected degree on nodes from the lagged nodes + graph_type_intra: + - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1) + - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes + - full: constructs a fully-connected graph - degree has no effect + graph_type_inter: + - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1) + - full: connect all past nodes to all present nodes + w_min_intra: minimum weight for intra-slice nodes + w_max_intra: maximum weight for intra-slice nodes + w_min_inter: minimum weight for inter-slice nodes + w_max_inter: maximum weight for inter-slice nodes + w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay + + Raises: + ValueError: if graph type unknown or `num_nodes < 2` + + Returns: + StructureModel containing all simulated nodes and edges (intra- and inter-slice) + """ + sm_intra = generate_structure( + num_nodes=num_nodes, + degree=degree_intra, + graph_type=graph_type_intra, + w_min=w_min_intra, + w_max=w_max_intra, + ) + sm_inter = _generate_inter_structure( + num_nodes=num_nodes, + p=p, + degree=degree_inter, + graph_type=graph_type_inter, + w_min=w_min_inter, + w_max=w_max_inter, + w_decay=w_decay, + ) + res = StructureModel() + res.add_nodes_from(sm_inter.nodes) + res.add_nodes_from(["{var}_lag0".format(var=u) for u in sm_intra.nodes]) + res.add_weighted_edges_from(sm_inter.edges.data("weight")) + res.add_weighted_edges_from( + [ + ("{var}_lag0".format(var=u), "{var}_lag0".format(var=v), w) + for u, v, w in sm_intra.edges.data("weight") + ] + ) + return res + + +def _generate_inter_structure( + num_nodes: int, + p: int, + degree: float, + graph_type: str, + w_min: float, + w_max: float, + w_decay: float = 1.0, + neg: float = 0.5, +) -> StructureModel: + """Simulate random DAG between two time slices. 
+
+    Args:
+        num_nodes: number of nodes per slice
+        p: number of slices that influence the current slice
+        degree: expected in-degree of the current time slice
+        graph_type: {'erdos-renyi', 'full'}
+        w_min: minimum weight for inter-slice nodes
+        w_max: maximum weight for inter-slice nodes
+        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay
+        neg: the proportion of edge weights expected to be negative. By default, 50% of the edges are expected
+            to have negative weight (`neg == 0.5`).
+
+    Returns:
+        G_inter: weighted, bipartite DAG for inter-slice connections
+
+    Raises:
+        ValueError: if the graph type is not known, or if `w_min > w_max`
+    """
+    if w_min > w_max:
+        raise ValueError(
+            "Absolute minimum weight must be less than or equal to maximum weight: {} > {}".format(
+                w_min, w_max
+            )
+        )
+
+    if graph_type == "erdos-renyi":
+        prob = degree / num_nodes
+        b = (np.random.rand(p * num_nodes, num_nodes) < prob).astype(float)
+    elif graph_type == "full":  # ignore degree, only for experimental use
+        b = np.ones([p * num_nodes, num_nodes])
+    else:
+        raise ValueError(
+            "Unknown inter-slice graph type `{n}`".format(n=graph_type)
+            + ". Valid types are 'erdos-renyi' and 'full'"
+        )
+    u = []
+    for i in range(p):
+        u_i = np.random.uniform(low=w_min, high=w_max, size=[num_nodes, num_nodes]) / (
+            w_decay ** i
+        )
+        u_i[np.random.rand(num_nodes, num_nodes) < neg] *= -1
+        u.append(u_i)
+
+    u = np.concatenate(u, axis=0) if u else np.empty(b.shape)
+    a = (b != 0).astype(float) * u
+
+    df = pd.DataFrame(
+        a,
+        index=[
+            "{var}_lag{l_val}".format(var=var, l_val=l_val)
+            for l_val in range(1, p + 1)
+            for var in range(num_nodes)
+        ],
+        columns=[
+            "{var}_lag{l_val}".format(var=var, l_val=0) for var in range(num_nodes)
+        ],
+    )
+    idxs, cols = list(df.index), list(df.columns)
+    for i in idxs:
+        df[i] = 0
+    for i in cols:
+        df.loc[i, :] = 0
+
+    g_inter = StructureModel(df)
+    return g_inter
+
+
+def generate_dataframe_dynamic(  # pylint: disable=R0914
+    g: StructureModel,
+    n_samples: int = 1000,
+    burn_in: int = 100,
+    sem_type: str = "linear-gauss",
+    noise_scale: float = 1.0,
+    drift: np.ndarray = None,
+) -> pd.DataFrame:
+    """Simulate samples from a dynamic SEM with the specified type of noise.
+    Args:
+        g: Dynamic DAG
+        n_samples: number of samples
+        burn_in: number of samples to discard
+        sem_type: {linear-gauss,linear-exp,linear-gumbel}
+        noise_scale: scale parameter of noise distribution in linear SEM
+        drift: array of drift terms for each node; if None, the drift is 0
+    Returns:
+        DataFrame with one row per sampled time point. The first d columns hold X_t (the intra-slice
+        variables) and the remaining d*p columns hold the lagged values [X_{t-1}, ..., X_{t-p}]
+    Raises:
+        ValueError: if sem_type isn't linear-gauss/linear-exp/linear-gumbel
+    """
+    s_types = ("linear-gauss", "linear-exp", "linear-gumbel")
+    if sem_type not in s_types:
+        raise ValueError(
+            "unknown sem type {st}. Available types are: {sts}".format(
+                st=sem_type, sts=s_types
+            )
+        )
+    intra_nodes = sorted(el for el in g.nodes if "_lag0" in el)
+    inter_nodes = sorted(el for el in g.nodes if "_lag0" not in el)
+    w_mat = nx.to_numpy_array(g, nodelist=intra_nodes)
+    a_mat = nx.to_numpy_array(g, nodelist=intra_nodes + inter_nodes)[
+        len(intra_nodes) :, : len(intra_nodes)
+    ]
+    g_intra = nx.DiGraph(w_mat)
+    g_inter = nx.bipartite.from_biadjacency_matrix(
+        csr_matrix(a_mat), create_using=nx.DiGraph
+    )
+    d = w_mat.shape[0]
+    p = a_mat.shape[0] // d
+    total_length = n_samples + burn_in
+    X = np.zeros([total_length, d])
+    Xlags = np.zeros([total_length, p * d])
+    ordered_vertices = list(nx.topological_sort(g_intra))
+    if drift is None:
+        drift = np.zeros(d)
+    for t in range(total_length):
+        for j in ordered_vertices:
+            parents = list(g_intra.predecessors(j))
+            parents_prev = list(g_inter.predecessors(j + p * d))
+            X[t, j] = (
+                drift[j]
+                + X[t, parents].dot(w_mat[parents, j])
+                + Xlags[t, parents_prev].dot(a_mat[parents_prev, j])
+            )
+            if sem_type == "linear-gauss":
+                X[t, j] = X[t, j] + np.random.normal(scale=noise_scale)
+            elif sem_type == "linear-exp":
+                X[t, j] = X[t, j] + np.random.exponential(scale=noise_scale)
+            elif sem_type == "linear-gumbel":
+                X[t, j] = X[t, j] + np.random.gumbel(scale=noise_scale)
+
+        if (t + 1) < total_length:
+            Xlags[t + 1, :] = np.concatenate([X[t, :], Xlags[t, :]])[: d * p]
+    return pd.concat(
+        [
+            pd.DataFrame(X[-n_samples:], columns=intra_nodes),
+            pd.DataFrame(Xlags[-n_samples:], columns=inter_nodes),
+        ],
+        axis=1,
+    )
+
+
+def gen_stationary_dyn_net_and_df(  # pylint: disable=R0913, R0914
+    num_nodes: int = 10,
+    n_samples: int = 100,
+    p: int = 1,
+    degree_intra: float = 3,
+    degree_inter: float = 3,
+    graph_type_intra: str = "erdos-renyi",
+    graph_type_inter: str = "erdos-renyi",
+    w_min_intra: float = 0.5,
+    w_max_intra: float = 0.5,
+    w_min_inter: float = 0.5,
+    w_max_inter: float = 0.5,
+    w_decay: float = 1.0,
+    sem_type: str = "linear-gauss",
+    noise_scale: float = 1,
+    max_data_gen_trials: int = 1000,
+) -> Tuple[StructureModel, pd.DataFrame, List[str], List[str]]:
+    """
+    Generates a dynamic structure model as well as a dataframe representing a time series realisation of that model.
+    The simulation is repeated until the resulting network passes a stationarity check,
+    or until `max_data_gen_trials` attempts have been made.
+    Args:
+        num_nodes: number of nodes in the intra-slice structure
+        n_samples: number of points to sample from the model, as a time series
+        p: lag value for the dynamic structure
+        degree_intra: expected degree for intra-slice nodes
+        degree_inter: expected degree for inter-slice nodes
+        graph_type_intra:
+            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
+            - barabasi-albert: constructs a scale-free graph from an initial connected graph of (degree / 2) nodes
+            - full: constructs a fully-connected graph - degree has no effect
+        graph_type_inter:
+            - erdos-renyi: constructs a graph such that the probability of any given edge is degree / (num_nodes - 1)
+            - full: connect all past nodes to all present nodes
+        w_min_intra: minimum weight on intra-slice adjacency matrix
+        w_max_intra: maximum weight on intra-slice adjacency matrix
+        w_min_inter: minimum weight on inter-slice adjacency matrix
+        w_max_inter: maximum weight on inter-slice adjacency matrix
+        w_decay: exponent of weights decay for slices that are farther apart. Default is 1.0, which implies no decay
+        sem_type: {linear-gauss,linear-exp,linear-gumbel}
+        noise_scale: scale parameter of noise distribution in linear SEM
+        max_data_gen_trials: maximum number of attempts until obtaining a seemingly stationary model
+    Returns:
+        Tuple with:
+        - the model created, as a StructureModel
+        - DataFrame representing the time series created from the model
+        - Intra-slice node names
+        - Inter-slice node names
+    """
+
+    with np.errstate(over="raise", invalid="raise"):
+        burn_in = max(n_samples // 10, 50)
+
+        simulate_flag = True
+        g, intra_nodes, inter_nodes = None, None, None
+
+        while simulate_flag:
+            max_data_gen_trials -= 1
+            if max_data_gen_trials <= 0:
+                simulate_flag = False
+
+            try:
+                simulate_graphs_flag = True
+                while simulate_graphs_flag:
+
+                    g = generate_structure_dynamic(
+                        num_nodes=num_nodes,
+                        p=p,
+                        degree_intra=degree_intra,
+                        degree_inter=degree_inter,
+                        graph_type_intra=graph_type_intra,
+                        graph_type_inter=graph_type_inter,
+                        w_min_intra=w_min_intra,
+                        w_max_intra=w_max_intra,
+                        w_min_inter=w_min_inter,
+                        w_max_inter=w_max_inter,
+                        w_decay=w_decay,
+                    )
+                    intra_nodes = sorted([el for el in g.nodes if "_lag0" in el])
+                    inter_nodes = sorted([el for el in g.nodes if "_lag0" not in el])
+                    # Exclude empty graphs from consideration unless input degree is 0
+                    if (
+                        (
+                            [(u, v) for u, v in g.edges if u in intra_nodes]
+                            and [(u, v) for u, v in g.edges if u in inter_nodes]
+                        )
+                        or degree_intra == 0
+                        or degree_inter == 0
+                    ):
+                        simulate_graphs_flag = False
+
+                # generate single time series
+                df = (
+                    generate_dataframe_dynamic(
+                        g,
+                        n_samples=n_samples + burn_in,
+                        sem_type=sem_type,
+                        noise_scale=noise_scale,
+                    )
+                    .loc[burn_in:, intra_nodes + inter_nodes]
+                    .reset_index(drop=True)
+                )
+
+                if df.isna().any(axis=None):
+                    continue
+            except (OverflowError, FloatingPointError):
+                continue
+            if (df.abs().max().max() < 1e3) or (max_data_gen_trials <= 0):
+                simulate_flag = False
+        if max_data_gen_trials <= 0:
+            warnings.warn(
+                "Could not simulate data, returning constant dataframe", UserWarning
+            )
+
+            df = pd.DataFrame(
+                np.ones((n_samples, num_nodes * (1 + p))),
+                columns=intra_nodes + inter_nodes,
+            )
+    return g, df, intra_nodes, inter_nodes
diff --git a/causalnex/structure/dynotears.py b/causalnex/structure/dynotears.py
new file mode 100644
index 0000000..e7d1202
--- /dev/null
+++ b/causalnex/structure/dynotears.py
@@ -0,0 +1,494 @@
+# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
+# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
+# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
+# (either separately or in combination, "QuantumBlack Trademarks") are
+# trademarks of QuantumBlack. The License does not grant you any right or
+# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
+# Trademarks or any confusingly similar mark as a trademark for your product,
+# or use the QuantumBlack Trademarks in any other manner that might cause
+# confusion in the marketplace, including but not limited to in advertising,
+# on websites, or on software.
+#
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Tools to learn a Dynamic Bayesian Network which describes the conditional dependencies between
+variables in a time-series dataset.
+"""
+
+import warnings
+from typing import Dict, List, Tuple, Union
+
+import numpy as np
+import pandas as pd
+import scipy.linalg as slin
+import scipy.optimize as sopt
+
+from causalnex.structure import StructureModel
+
+from .transformers import DynamicDataTransformer
+
+
+def from_pandas_dynamic(  # pylint: disable=too-many-arguments
+    time_series: Union[pd.DataFrame, List[pd.DataFrame]],
+    p: int,
+    lambda_w: float = 0.1,
+    lambda_a: float = 0.1,
+    max_iter: int = 100,
+    h_tol: float = 1e-8,
+    w_threshold: float = 0.0,
+    tabu_edges: List[Tuple[int, int, int]] = None,
+    tabu_parent_nodes: List[int] = None,
+    tabu_child_nodes: List[int] = None,
+) -> StructureModel:
+    """
+    Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between variables in
+    data. The input data is a time series or a list of realisations of the same time series.
+    The optimisation is to minimise a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted
+    adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to a constraint function
+    h(W), where h_value(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that
+    encapsulates how acyclic the graph is (less = more acyclic).
+
+    Based on "DYNOTEARS: Structure Learning from Time-Series Data".
+    https://arxiv.org/abs/2002.00498
+    @inproceedings{pamfil2020dynotears,
+        title={DYNOTEARS: Structure Learning from Time-Series Data},
+        author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer,
+        Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon},
+        booktitle={International Conference on Artificial Intelligence and Statistics},
+        pages={1595--1605},
+        year={2020},
+    }
+    Args:
+        time_series: pd.DataFrame or List of pd.DataFrame instances.
+            If a list is provided, each element is a realisation of the same time series (i.e. time series governed
+            by the same processes).
+            The columns of the data frame represent the variables in the model, and the *index represents the time
+            index*. Successive events must therefore be indexed by consecutive integers.
+        p: Number of past interactions we allow the model to create. The state of a variable at time `t` is affected by
+            past variables up to `t-p`, as well as by other variables at `t`.
+        lambda_w: parameter for l1 regularisation of intra-slice edges
+        lambda_a: parameter for l1 regularisation of inter-slice edges
+        max_iter: max number of dual ascent steps during optimisation.
+        h_tol: exit if h(W) < h_tol (as opposed to strict definition of 0).
+        w_threshold: fixed threshold for absolute edge weights.
+        tabu_edges: list of edges (lag, from, to) not to be included in the graph. `lag == 0` implies that the edge is
+            forbidden in the INTRA graph (W), while lag > 0 implies a zero INTER-slice weight.
+        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
+        tabu_child_nodes: list of nodes banned from being a child of any other nodes.
+
+    Returns:
+        StructureModel representing the model learnt. The nodes are named `{var}_lag{l}`, where `var` is the
+        original variable name as given in the input data frames and `l`, in {0, 1, ..., p}, is the corresponding
+        time lag.
+    """
+    time_series = [time_series] if not isinstance(time_series, list) else time_series
+
+    X, Xlags = DynamicDataTransformer(p=p).fit_transform(time_series, return_df=False)
+
+    col_idx = {c: i for i, c in enumerate(time_series[0].columns)}
+    idx_col = {i: c for c, i in col_idx.items()}
+
+    if tabu_edges:
+        tabu_edges = [(lag, col_idx[u], col_idx[v]) for lag, u, v in tabu_edges]
+    if tabu_parent_nodes:
+        tabu_parent_nodes = [col_idx[n] for n in tabu_parent_nodes]
+    if tabu_child_nodes:
+        tabu_child_nodes = [col_idx[n] for n in tabu_child_nodes]
+
+    g = from_numpy_dynamic(
+        X,
+        Xlags,
+        lambda_w,
+        lambda_a,
+        max_iter,
+        h_tol,
+        w_threshold,
+        tabu_edges,
+        tabu_parent_nodes,
+        tabu_child_nodes,
+    )
+
+    sm = StructureModel()
+    sm.add_nodes_from(
+        [
+            "{var}_lag{l_val}".format(var=var, l_val=l_val)
+            for var in col_idx.keys()
+            for l_val in range(p + 1)
+        ]
+    )
+    sm.add_weighted_edges_from(
+        [
+            (
+                _format_name_from_pandas(idx_col, u),
+                _format_name_from_pandas(idx_col, v),
+                w,
+            )
+            for u, v, w in g.edges.data("weight")
+        ],
+        origin="learned",
+    )
+
+    return sm
+
+
+def _format_name_from_pandas(idx_col: Dict[int, str], from_numpy_node: str) -> str:
+    """
+    Helper function for `from_pandas_dynamic`. Converts a node from the `from_numpy_dynamic` format to the
+    `from_pandas_dynamic` format.
+    Args:
+        idx_col: map from column index to variable name
+        from_numpy_node: a node of the structure model output by `from_numpy_dynamic`.
+    Returns:
+        the node in `from_pandas_dynamic` format
+    """
+    idx, lag_val = from_numpy_node.split("_lag")
+    return "{var}_lag{l_val}".format(var=idx_col[int(idx)], l_val=lag_val)
+
+
+def from_numpy_dynamic(  # pylint: disable=too-many-arguments
+    X: np.ndarray,
+    Xlags: np.ndarray,
+    lambda_w: float = 0.1,
+    lambda_a: float = 0.1,
+    max_iter: int = 100,
+    h_tol: float = 1e-8,
+    w_threshold: float = 0.0,
+    tabu_edges: List[Tuple[int, int, int]] = None,
+    tabu_parent_nodes: List[int] = None,
+    tabu_child_nodes: List[int] = None,
+) -> StructureModel:
+    """
+    Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between variables in
+    data. The input data is time series data present in numpy arrays X and Xlags.
+
+    The optimisation is to minimise a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted
+    adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to a constraint function
+    h(W), where h_value(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that
+    encapsulates how acyclic the graph is (less = more acyclic).
+
+    Based on "DYNOTEARS: Structure Learning from Time-Series Data".
+    https://arxiv.org/abs/2002.00498
+    @inproceedings{pamfil2020dynotears,
+        title={DYNOTEARS: Structure Learning from Time-Series Data},
+        author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer,
+        Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon},
+        booktitle={International Conference on Artificial Intelligence and Statistics},
+        pages={1595--1605},
+        year={2020},
+    }
+
+    Args:
+        X (np.ndarray): 2d input data, axis=1 is data columns, axis=0 is data rows. Each column represents one
+            variable, and each row represents x(m,t) i.e. the mth time series at time t.
+        Xlags (np.ndarray): shifted data of X with lag orders stacking horizontally. Xlags=[shift(X,1)|...|shift(X,p)]
+        lambda_w (float): l1 regularization parameter of intra-weights W
+        lambda_a (float): l1 regularization parameter of inter-weights A
+        max_iter: max number of dual ascent steps during optimisation
+        h_tol (float): exit if h(W) < h_tol (as opposed to strict definition of 0)
+        w_threshold: fixed threshold for absolute edge weights.
+        tabu_edges: list of edges (lag, from, to) not to be included in the graph. `lag == 0` implies that the edge is
+            forbidden in the INTRA graph (W), while lag > 0 implies a zero INTER-slice weight.
+        tabu_parent_nodes: list of nodes banned from being a parent of any other nodes.
+        tabu_child_nodes: list of nodes banned from being a child of any other nodes.
+    Returns:
+        StructureModel representing the model learnt, built from the d x d intra-slice weight matrix W and the
+        pd x d inter-slice weight matrix A
+
+    Raises:
+        ValueError: If X or Xlags does not contain data, or dimensions of X and Xlags do not conform
+    """
+    _, d_vars = X.shape
+    p_orders = Xlags.shape[1] // d_vars
+
+    bnds_w = 2 * [
+        (0, 0)
+        if i == j
+        else (0, 0)
+        if tabu_edges is not None and (0, i, j) in tabu_edges
+        else (0, 0)
+        if tabu_parent_nodes is not None and i in tabu_parent_nodes
+        else (0, 0)
+        if tabu_child_nodes is not None and j in tabu_child_nodes
+        else (0, None)
+        for i in range(d_vars)
+        for j in range(d_vars)
+    ]
+
+    bnds_a = []
+    for k in range(1, p_orders + 1):
+        bnds_a.extend(
+            2
+            * [
+                (0, 0)
+                if tabu_edges is not None and (k, i, j) in tabu_edges
+                else (0, 0)
+                if tabu_parent_nodes is not None and i in tabu_parent_nodes
+                else (0, 0)
+                if tabu_child_nodes is not None and j in tabu_child_nodes
+                else (0, None)
+                for i in range(d_vars)
+                for j in range(d_vars)
+            ]
+        )
+
+    bnds = bnds_w + bnds_a
+    w_est, a_est = _learn_dynamic_structure(
+        X, Xlags, bnds, lambda_w, lambda_a, max_iter, h_tol
+    )
+
+    w_est[np.abs(w_est) < w_threshold] = 0
+    a_est[np.abs(a_est) < w_threshold] = 0
+    sm = _matrices_to_structure_model(w_est, a_est)
+    return sm
+
+
+def _matrices_to_structure_model(
+    w_est: np.ndarray, a_est: np.ndarray
+) -> StructureModel:
+    """
+    Converts the matrices output by dynotears (W and A) into a StructureModel.
+    We use the following convention:
+    - {var}_lag{l}, where l is the lag value (i.e. how many time steps in the past the edge originates)
+    - for an intra-slice node, `l == 0`
+    Args:
+        w_est: Intra-slice weight matrix
+        a_est: Inter-slice weight matrix
+
+    Returns:
+        StructureModel representing the structure learnt
+
+    """
+    sm = StructureModel()
+    lag_cols = [
+        "{var}_lag{l_val}".format(var=var, l_val=l_val)
+        for l_val in range(1 + (a_est.shape[0] // a_est.shape[1]))
+        for var in range(a_est.shape[1])
+    ]
+    sm.add_nodes_from(lag_cols)
+    sm.add_edges_from(
+        [
+            (lag_cols[i], lag_cols[j], dict(weight=w_est[i, j]))
+            for i in range(w_est.shape[0])
+            for j in range(w_est.shape[1])
+            if w_est[i, j] != 0
+        ]
+    )
+    sm.add_edges_from(
+        [
+            (lag_cols[i + w_est.shape[0]], lag_cols[j], dict(weight=a_est[i, j]))
+            for i in range(a_est.shape[0])
+            for j in range(a_est.shape[1])
+            if a_est[i, j] != 0
+        ]
+    )
+    return sm
+
+
+def _reshape_wa(
+    wa_vec: np.ndarray, d_vars: int, p_orders: int
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Helper function for `_learn_dynamic_structure`. Transforms the adjacency vector into matrix form.
+
+    Args:
+        wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights
+        d_vars (int): number of variables in the model
+        p_orders (int): number of past lags used
+    Returns:
+        intra- and inter-slice adjacency matrices
+    """
+
+    w_tilde = wa_vec.reshape([2 * (p_orders + 1) * d_vars, d_vars])
+    w_plus = w_tilde[:d_vars, :]
+    w_minus = w_tilde[d_vars : 2 * d_vars, :]
+    w_mat = w_plus - w_minus
+    a_plus = (
+        w_tilde[2 * d_vars :]
+        .reshape(2 * p_orders, d_vars ** 2)[::2]
+        .reshape(d_vars * p_orders, d_vars)
+    )
+    a_minus = (
+        w_tilde[2 * d_vars :]
+        .reshape(2 * p_orders, d_vars ** 2)[1::2]
+        .reshape(d_vars * p_orders, d_vars)
+    )
+    a_mat = a_plus - a_minus
+    return w_mat, a_mat
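+
+
+# Illustrative sketch of the encoding handled by `_reshape_wa` (values assumed):
+# for d_vars=2 and p_orders=1, `wa_vec` stacks w_plus, then w_minus, then
+# interleaved a_plus/a_minus blocks per lag, and the recovered matrices are
+# W = w_plus - w_minus and A = a_plus - a_minus:
+#
+#     >>> wa_vec = np.arange(16, dtype=float)  # 2 * (p + 1) * d**2 entries
+#     >>> w_mat, a_mat = _reshape_wa(wa_vec, d_vars=2, p_orders=1)
+#     >>> w_mat.shape, a_mat.shape
+#     ((2, 2), (2, 2))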
+
+
+def _learn_dynamic_structure(
+    X: np.ndarray,
+    Xlags: np.ndarray,
+    bnds: List[Tuple[float, float]],
+    lambda_w: float = 0.1,
+    lambda_a: float = 0.1,
+    max_iter: int = 100,
+    h_tol: float = 1e-8,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Learn the graph structure of a Dynamic Bayesian Network describing conditional dependencies between data variables.
+
+    The optimisation is to minimise a score function F(W, A) over the graph's contemporaneous (intra-slice) weighted
+    adjacency matrix, W, and lagged (inter-slice) weighted adjacency matrix, A, subject to a constraint function
+    h(W), where h_value(W) == 0 characterises an acyclic graph. h(W) > 0 is a continuous, differentiable function that
+    encapsulates how acyclic the graph is (less = more acyclic).
+
+    Based on "DYNOTEARS: Structure Learning from Time-Series Data".
+    https://arxiv.org/abs/2002.00498
+    @inproceedings{pamfil2020dynotears,
+        title={DYNOTEARS: Structure Learning from Time-Series Data},
+        author={Pamfil, Roxana and Sriwattanaworachai, Nisara and Desai, Shaan and Pilgerstorfer,
+        Philip and Georgatzis, Konstantinos and Beaumont, Paul and Aragam, Bryon},
+        booktitle={International Conference on Artificial Intelligence and Statistics},
+        pages={1595--1605},
+        year={2020},
+    }
+
+    Args:
+        X (np.ndarray): 2d input data, axis=1 is data columns, axis=0 is data rows. Each column represents one
+            variable, and each row represents x(m,t) i.e. the mth time series at time t.
+        Xlags (np.ndarray): shifted data of X with lag orders stacking horizontally. Xlags=[shift(X,1)|...|shift(X,p)]
+        bnds: Box constraints of L-BFGS-B to ban self-loops in W, enforce non-negativity of w_plus, w_minus, a_plus,
+            a_minus, and help with stationarity in A
+        lambda_w (float): l1 regularization parameter of intra-weights W
+        lambda_a (float): l1 regularization parameter of inter-weights A
+        max_iter (int): max number of dual ascent steps during optimisation
+        h_tol (float): exit if h(W) < h_tol (as opposed to strict definition of 0)
+
+    Returns:
+        W (np.ndarray): d x d estimated weighted adjacency matrix of intra slices
+        A (np.ndarray): pd x d estimated weighted adjacency matrix of inter slices
+
+    Raises:
+        ValueError: If X or Xlags does not contain data, or dimensions of X and Xlags do not conform
+    """
+    if X.size == 0:
+        raise ValueError("Input data X is empty, cannot learn any structure")
+    if Xlags.size == 0:
+        raise ValueError("Input data Xlags is empty, cannot learn any structure")
+    if X.shape[0] != Xlags.shape[0]:
+        raise ValueError("Input data X and Xlags must have the same number of rows")
+    if Xlags.shape[1] % X.shape[1] != 0:
+        raise ValueError(
+            "Number of columns of Xlags must be a multiple of number of columns of X"
+        )
+
+    n, d_vars = X.shape
+    p_orders = Xlags.shape[1] // d_vars
+
+    def _h(wa_vec: np.ndarray) -> float:
+        """
+        Constraint function of dynotears.
+
+        Args:
+            wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights
+
+        Returns:
+            float: DAGness of the intra-slice adjacency matrix W (0 == DAG, >0 == cyclic)
+        """
+
+        _w_mat, _ = _reshape_wa(wa_vec, d_vars, p_orders)
+        return np.trace(slin.expm(_w_mat * _w_mat)) - d_vars
+
+    def _func(wa_vec: np.ndarray) -> float:
+        """
+        Objective function that dynotears tries to minimise.
+
+        Args:
+            wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights
+
+        Returns:
+            float: objective
+        """
+
+        _w_mat, _a_mat = _reshape_wa(wa_vec, d_vars, p_orders)
+        loss = (
+            0.5
+            / n
+            * np.square(
+                np.linalg.norm(
+                    X.dot(np.eye(d_vars, d_vars) - _w_mat) - Xlags.dot(_a_mat), "fro"
+                )
+            )
+        )
+        _h_value = _h(wa_vec)
+        l1_penalty = lambda_w * (wa_vec[: 2 * d_vars ** 2].sum()) + lambda_a * (
+            wa_vec[2 * d_vars ** 2 :].sum()
+        )
+        return loss + 0.5 * rho * _h_value * _h_value + alpha * _h_value + l1_penalty
+
+    def _grad(wa_vec: np.ndarray) -> np.ndarray:
+        """
+        Gradient function used to compute the next step in dynotears.
+
+        Args:
+            wa_vec (np.ndarray): current adjacency vector with intra- and inter-slice weights
+
+        Returns:
+            gradient vector
+        """
+
+        _w_mat, _a_mat = _reshape_wa(wa_vec, d_vars, p_orders)
+        e_mat = slin.expm(_w_mat * _w_mat)
+        loss_grad_w = (
+            -1.0
+            / n
+            * (X.T.dot(X.dot(np.eye(d_vars, d_vars) - _w_mat) - Xlags.dot(_a_mat)))
+        )
+        obj_grad_w = (
+            loss_grad_w
+            + (rho * (np.trace(e_mat) - d_vars) + alpha) * e_mat.T * _w_mat * 2
+        )
+        obj_grad_a = (
+            -1.0
+            / n
+            * (Xlags.T.dot(X.dot(np.eye(d_vars, d_vars) - _w_mat) - Xlags.dot(_a_mat)))
+        )
+
+        grad_vec_w = np.append(
+            obj_grad_w, -obj_grad_w, axis=0
+        ).flatten() + lambda_w * np.ones(2 * d_vars ** 2)
+        grad_vec_a = obj_grad_a.reshape(p_orders, d_vars ** 2)
+        grad_vec_a = np.hstack(
+            (grad_vec_a, -grad_vec_a)
+        ).flatten() + lambda_a * np.ones(2 * p_orders * d_vars ** 2)
+        return np.append(grad_vec_w, grad_vec_a, axis=0)
+
+    # initialise matrix, weights and constraints
+    wa_est = np.zeros(2 * (p_orders + 1) * d_vars ** 2)
+    wa_new = np.zeros(2 * (p_orders + 1) * d_vars ** 2)
+    rho, alpha, h_value, h_new = 1.0, 0.0, np.inf, np.inf
+
+    for n_iter in range(max_iter):
+        while rho < 1e20:
+            wa_new = sopt.minimize(
+                _func, wa_est, method="L-BFGS-B", jac=_grad, bounds=bnds
+            ).x
+            h_new = _h(wa_new)
+            if h_new > 0.25 * h_value:
+                rho *= 10
+            else:
+                break
+        wa_est = wa_new
+        h_value = h_new
+        alpha += rho * h_value
+        if h_value <= h_tol:
+            break
+        if h_value > h_tol and n_iter == max_iter - 1:
+            warnings.warn("Failed to converge. Consider increasing max_iter.")
+    return _reshape_wa(wa_est, d_vars, p_orders)
diff --git a/causalnex/structure/transformers.py b/causalnex/structure/transformers.py
new file mode 100644
index 0000000..09d5aa8
--- /dev/null
+++ b/causalnex/structure/transformers.py
@@ -0,0 +1,290 @@
+# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
+# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
+# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
+# (either separately or in combination, "QuantumBlack Trademarks") are
+# trademarks of QuantumBlack. The License does not grant you any right or
+# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
+# Trademarks or any confusingly similar mark as a trademark for your product,
+# or use the QuantumBlack Trademarks in any other manner that might cause
+# confusion in the marketplace, including but not limited to in advertising,
+# on websites, or on software.
+#
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Collection of sklearn style transformers designed to assist with causal structure learning.
+"""
+
+from copy import deepcopy
+from typing import List, Tuple, Union
+
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.exceptions import NotFittedError
+
+
+class DynamicDataTransformer(BaseEstimator, TransformerMixin):
+    """
+    Format a time series dataframe or list of dataframes into a format that matches the structure learned by
+    `from_pandas_dynamic`. This is done to allow for Bayesian Network probability fitting.
+
+    Example of utilisation:
+    >>> ddt = DynamicDataTransformer(p=p).fit(time_series, return_df=False)
+    >>> X, Xlags = ddt.transform(time_series)
+
+    >>> ddt = DynamicDataTransformer(p=p).fit(time_series, return_df=True)
+    >>> df = ddt.transform(time_series)
+    """
+
+    def __init__(self, p: int):
+        """
+        Initialise the transformer.
+        Args:
+            p: Number of past interactions we allow the model to create. The state of a variable at time `t` is
+                affected by the variables at the same time stamp and by the variables at `t-1`, `t-2`, ... `t-p`.
+        """
+        self.p = p
+        self.columns = None
+        self.return_df = None
+
+    def fit(
+        self,
+        time_series: Union[pd.DataFrame, List[pd.DataFrame]],
+        return_df: bool = True,
+    ) -> "DynamicDataTransformer":
+        """
+        Fits the time series. This consists of memorizing:
+            - Column names and column positions
+            - whether a dataframe or a tuple of arrays should be returned by `transform` (details below)
+        Args:
+            time_series: pd.DataFrame or List of pd.DataFrame instances.
+                If a list is provided, each element is a realisation of the same time series
+                (i.e. time series governed by the same processes).
+                The columns of the data frame represent the variables in the model, and the *index represents
+                the time index*.
+                Successive events must therefore be indexed by consecutive integers.
+
+            return_df: Whether the `transform` method should return a pandas.DataFrame or a tuple with (X, Xlags)
+                (details in the documentation of the `transform` method)
+
+        Returns:
+            self
+
+        """
+        time_series = time_series if isinstance(time_series, list) else [time_series]
+        self._check_input_from_pandas(time_series)
+        self.columns = list(time_series[0].columns)
+        self.return_df = return_df
+        return self
+
+    def transform(
+        self, time_series: Union[pd.DataFrame, List[pd.DataFrame]]
+    ) -> Union[pd.DataFrame, Tuple[np.ndarray, np.ndarray]]:
+        """
+        Applies the transformation to format the dataframe properly.
+        Args:
+            time_series: pd.DataFrame or List of pd.DataFrame instances. Details in the `fit` documentation
+
+        Returns:
+            - If `self.return_df=True`, returns a pandas.DataFrame in the following format:
+
+                A_lag0 B_lag0 C_lag0 ... A_lag1 B_lag1 C_lag1 ... A_lag`p` B_lag`p` C_lag`p`
+                X      X      X          X      X      X          X        X        X
+                X      X      X          X      X      X          X        X        X
+                X      X      X          X      X      X          X        X        X
+            `lag0` denotes the current variable state and lag`k` denotes the states `k` time stamps in the past.
+
+            - If `self.return_df=False`, returns a tuple of two numpy.ndarrays: X and Xlags
+                X (np.ndarray): 2d input data, axis=1 is data columns, axis=0 is data rows.
+                    Each column represents one variable,
+                    and each row represents x(m,t) i.e. the mth time series at time t.
+                Xlags (np.ndarray):
+                    Shifted data of X with lag orders stacking horizontally. Xlags=[shift(X,1)|...|shift(X,p)]
+        Raises:
+            NotFittedError: if `transform` is called before `fit`
+        """
+        if self.columns is None:
+            raise NotFittedError(
+                "This DynamicDataTransformer is not fitted yet. "
+                "Call `fit` before using this method"
+            )
+
+        time_series = time_series if isinstance(time_series, list) else [time_series]
+
+        self._check_input_from_pandas(time_series)
+
+        time_series = [t[self.columns] for t in time_series]
+        ts_realisations = self._cut_dataframes_on_discontinuity_points(time_series)
+        X, Xlags = self._convert_realisations_into_dynotears_format(
+            ts_realisations, self.p
+        )
+
+        if self.return_df:
+            res = self._concat_lags(X, Xlags)
+            return res
+        return X, Xlags
+
+    def _concat_lags(self, X: np.ndarray, Xlags: np.ndarray) -> pd.DataFrame:
+        df_x = pd.DataFrame(
+            X, columns=["{col}_lag0".format(col=col) for col in self.columns]
+        )
+        df_xlags = pd.DataFrame(
+            Xlags,
+            columns=[
+                "{col}_lag{l_}".format(col=col, l_=l_)
+                for l_ in range(1, self.p + 1)
+                for col in self.columns
+            ],
+        )
+        return pd.concat([df_x, df_xlags], axis=1)
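+
+    # Illustrative sketch of the layout `_concat_lags` produces (column names
+    # and values assumed): for p=1 and a numeric, integer-indexed df with
+    # columns ["a", "b"],
+    #
+    #     >>> ddt = DynamicDataTransformer(p=1).fit(df)
+    #     >>> list(ddt.transform(df).columns)
+    #     ['a_lag0', 'b_lag0', 'a_lag1', 'b_lag1']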
+
+    def _check_input_from_pandas(self, time_series: List[pd.DataFrame]):
+        """
+        Check that the input of `from_pandas_dynamic` is valid.
+        Args:
+            time_series: List of pd.DataFrame instances,
+                each element being a realisation of the same time series
+
+        Raises:
+            ValueError: if an empty list of time_series is provided
+            ValueError: if dataframes contain non-numeric data
+            TypeError: if elements provided are not pandas dataframes
+            ValueError: if dataframes contain different columns
+            ValueError: if a dataframe index is not in increasing order
+            TypeError: if a dataframe index is not integer-typed
+        """
+        if not time_series:
+            raise ValueError(
+                "Provided empty list of time_series. At least one DataFrame must be provided"
+            )
+
+        df = deepcopy(time_series[0])
+
+        for t in time_series:
+            if not isinstance(t, pd.DataFrame):
+                raise TypeError(
+                    "Time series entries must be instances of `pd.DataFrame`"
+                )
+
+            non_numeric_cols = t.select_dtypes(exclude="number").columns
+
+            if not non_numeric_cols.empty:
+                raise ValueError(
+                    "All columns must have numeric data. Consider mapping the "
+                    "following columns to int: {non_numeric_cols}".format(
+                        non_numeric_cols=list(non_numeric_cols)
+                    )
+                )
+
+            if (not np.all(df.columns == t.columns)) or (
+                not np.all(df.dtypes == t.dtypes)
+            ):
+                raise ValueError("All inputs must have the same columns and same types")
+
+            if not np.all(t.index == t.index.sort_values()):
+                raise ValueError(
+                    "Index for dataframe must be provided in increasing order"
+                )
+
+            if t.index.dtype != int:
+                raise TypeError("Index must be integers")
+
+            if self.columns is not None:
+                missing_cols = [c for c in self.columns if c not in t.columns]
+                if missing_cols:
+                    raise ValueError(
+                        "We should provide all necessary columns in the time series."
+                        " Columns not provided: {col}".format(col=missing_cols)
+                    )
+
+    @staticmethod
+    def _cut_dataframes_on_discontinuity_points(
+        time_series: List[pd.DataFrame],
+    ) -> List[np.ndarray]:
+        """
+        Helper function for `from_pandas_dynamic`.
+        Receives a list of dataframes and splits each one at its points of discontinuity, determined from the
+        index, into separate realisations.
+
+        For example, given the following dataframe:
+            index   variable_1  variable_2
+            1       X           X
+            2       X           X
+            3       X           X
+            4       X           X
+            8       X           X   <- discontinuity point
+            9       X           X
+            10      X           X
+
+        We cut this dataset in two:
+
+            index   variable_1  variable_2
+            1       X           X
+            2       X           X
+            3       X           X
+            4       X           X
+
+        and:
+            index   variable_1  variable_2
+            8       X           X
+            9       X           X
+            10      X           X
+
+
+        Args:
+            time_series: list of dataframes representing various realisations of the same time series
+
+        Returns:
+            List of np.ndarrays representing the pieces of the input datasets with no discontinuity
+
+        """
+        time_series_realisations = []
+        for t in time_series:
+            cutting_points = np.where(np.diff(t.index) > 1)[0]
+            cutting_points = [0] + list(cutting_points + 1) + [len(t)]
+            for start, end in zip(cutting_points[:-1], cutting_points[1:]):
+                time_series_realisations.append(t.iloc[start:end, :].values)
+        return time_series_realisations
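+
+    # Shape sketch for the conversion below (illustrative values): with p=2 and
+    # a single realisation of T=10 rows and d=3 columns,
+    #
+    #     >>> X, Xlags = DynamicDataTransformer._convert_realisations_into_dynotears_format(
+    #     ...     [realisation], p=2
+    #     ... )
+    #     >>> X.shape, Xlags.shape
+    #     ((8, 3), (8, 6))  # (T - p, d) and (T - p, p * d)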
+
+    @staticmethod
+    def _convert_realisations_into_dynotears_format(
+        realisations: List[np.ndarray], p: int
+    ) -> Tuple[np.ndarray, np.ndarray]:
+        """
+        Given a list of realisations of a time series, converts it to the format expected by the dynotears algorithm.
+        Each element of `realisations` is one realisation of the time series,
+        where the time dimension is represented by the rows.
+            - The higher the row, the higher the time index
+            - The data is complete, meaning that the difference between two consecutive time stamps is equal to one
+        Args:
+            realisations: a list of realisations of a time series
+            p: the number of lagged columns to create
+
+        Returns:
+            X and Y as in the SVAR model and DYNOTEARS paper. I.e. X represents X(m,t) and Y the concatenated
+            lagged values [X(m,t-1) | X(m,t-2) | ... | X(m,t-p)]
+        """
+        X = np.concatenate([realisation[p:] for realisation in realisations], axis=0)
+        y_lag_list = [
+            np.concatenate([realisation[p - i - 1 : -i - 1] for i in range(p)], axis=1)
+            for realisation in realisations
+        ]
+        y_lag = np.concatenate(y_lag_list, axis=0)
+
+        return X, y_lag
diff --git a/tests/conftest.py b/tests/conftest.py
index 3da7a2b..307495c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -38,6 +38,8 @@
 from causalnex.structure.notears import from_pandas
 
+# Ignoring limit of 1000 lines per module, since this module contains test sets.
+# pylint: disable=C0302
 @pytest.fixture
 def train_model() -> StructureModel:
     """
@@ -264,7 +266,6 @@ def train_data_discrete_cpds_k2(train_data_discrete) -> Dict[str, np.ndarray]:
 
 
 def create_cpds(data, pc=0):
-
     df = data.copy(deep=True)  # type: pd.DataFrame
 
     df_vals = {col: list(df[col].unique()) for col in df.columns}
@@ -344,7 +345,6 @@ def create_cpds(data, pc=0):
 
 @pytest.fixture
 def train_data_idx_marginals(train_data_idx_cpds):
-
     return create_marginals(
         train_data_idx_cpds,
         {
@@ -359,7 +359,6 @@ def train_data_idx_marginals(train_data_idx_cpds):
 
 @pytest.fixture
 def train_data_discrete_marginals(train_data_discrete_cpds):
-
     return create_marginals(
         train_data_discrete_cpds,
         {
@@ -489,6 +488,534 @@ def bn(train_data_idx, train_data_discrete) -> BayesianNetwork:
     ).fit_node_states_and_cpds(train_data_discrete)
 
 
+@pytest.fixture()
+def data_dynotears_p1() -> Dict[str, np.ndarray]:
+    """
+    Training data for testing Dynamic Bayesian Networks. Returns a time series with 50 time points and 5 columns.
+    This data was simulated with the following configurations.
+    Configurations:
+        - data points 50,
+        - num.
variables: 5, + - p (lag amount): 1, + - graph type (intra-slice graph): 'erdos-renyi', + - graph type (inter-slice graph): 'erdos-renyi', + - SEM type: 'linear-gauss', + - weight range, intra-slice graph: (0.5, 2.0), + - weight range, inter-slice graph: (0.3, 0.5), + - expected degree, inter-slice graph: 3, + - noise scale (gaussian noise): 1.0, + - w decay: 1.1 + Returns: + dictionary with keys W (intra-weights), A (inter-weights), X and Y (inputs of from_numpy_dynamic) + """ + data = { + "W": np.array( + [ + [0.0, -0.55, 0.0, 1.48, 0.0], + [0.0, 0.0, -0.99, 0.0, 0.0], + [0.0, 0.0, 0.0, -1.13, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [0.0, -1.91, -0.64, -1.31, 0.0], + ] + ), + "A": np.array( + [ + [-0.35, -0.32, -0.33, 0.37, 0.0], + [-0.41, -0.42, -0.36, 0.33, -0.35], + [0.46, 0.0, 0.44, -0.36, -0.38], + [0.0, 0.0, -0.45, 0.0, 0.43], + [0.31, 0.4, 0.0, 0.0, -0.44], + ] + ), + "X": np.array( + [ + [-8.7, -3.1, -5.1, 2.8, -2.0], + [0.3, -5.8, 2.9, -12.6, 5.1], + [4.8, 17.4, -4.2, 17.6, -6.8], + [-12.5, -20.6, -1.9, -18.5, 8.3], + [14.4, 14.1, 7.7, 6.8, -3.0], + [-8.1, 0.1, -8.6, 9.0, -3.3], + [-3.3, -11.4, 1.3, -18.4, 8.0], + [9.4, 14.4, 3.7, 11.4, -6.5], + [-9.7, -7.0, -3.5, -3.2, -0.0], + [1.7, -2.7, 5.8, -14.8, 4.5], + [3.9, 17.2, -2.4, 19.2, -9.8], + [-12.5, -22.2, 1.2, -23.2, 7.8], + [17.2, 18.2, 9.2, 11.6, -6.4], + [-12.2, -7.9, -2.8, -2.6, -1.2], + [5.8, -4.9, 9.0, -12.6, 3.9], + [6.0, 13.5, 2.3, 15.1, -8.4], + [-10.1, -12.8, -1.4, -13.4, 4.5], + [9.7, 10.3, 5.5, 7.1, -3.5], + [-7.2, -1.4, -4.5, 0.2, -1.3], + [1.8, -3.7, 1.6, -4.8, 3.2], + [2.6, 6.7, -2.1, 9.7, -2.7], + [-4.6, -7.6, -1.7, -4.4, 2.6], + [4.6, -0.1, 4.5, -4.1, 1.9], + [3.3, 5.1, -0.7, 11.0, -4.1], + [-4.3, -12.7, -0.7, -9.5, 5.9], + [8.0, 6.0, 5.6, 0.8, -1.1], + [-2.4, 4.9, -3.9, 11.3, -4.7], + [-4.6, -12.0, -0.3, -13.4, 6.5], + [8.1, 15.9, -0.8, 16.0, -6.5], + [-11.5, -14.8, -5.0, -7.5, 4.4], + [10.0, 3.3, 7.1, -2.8, 1.7], + [-0.7, 3.8, -1.8, 9.2, -4.0], + [-1.6, -13.7, 5.3, -13.0, 4.7], + [11.0, 8.2, 7.7, 6.7, -4.0], + [-4.0, -2.4, -3.0, 1.8, -1.7], + [1.1, -4.8, 2.9, -5.9, 3.0], + [3.4, 7.5, -1.5, 9.0, -2.4], + [-6.2, -3.4, -4.8, -2.2, 1.0], + [0.8, -0.3, 1.8, -2.5, 1.0], + [0.4, 1.7, -1.6, 1.9, -0.5], + [-2.3, -4.5, 2.9, -7.4, 2.3], + [3.9, 9.3, 2.4, 4.1, -3.8], + [-4.8, -0.6, -3.6, 3.0, -2.1], + [-1.8, -7.5, 2.9, -13.2, 4.7], + [5.6, 19.0, -3.7, 18.1, -7.8], + [-13.5, -19.3, -2.7, -17.7, 7.5], + [15.5, 10.6, 9.9, 5.7, -3.1], + [-6.8, 3.3, -6.8, 9.6, -4.8], + [-5.2, -15.3, 3.9, -21.6, 7.9], + [11.1, 22.0, -0.3, 20.7, -8.4], + ] + ), + "Y": np.array( + [ + [10.8, 14.6, 5.8, 8.1, -4.5], + [-8.7, -3.1, -5.1, 2.8, -2.0], + [0.3, -5.8, 2.9, -12.6, 5.1], + [4.8, 17.4, -4.2, 17.6, -6.8], + [-12.5, -20.6, -1.9, -18.5, 8.3], + [14.4, 14.1, 7.7, 6.8, -3.0], + [-8.1, 0.1, -8.6, 9.0, -3.3], + [-3.3, -11.4, 1.3, -18.4, 8.0], + [9.4, 14.4, 3.7, 11.4, -6.5], + [-9.7, -7.0, -3.5, -3.2, -0.0], + [1.7, -2.7, 5.8, -14.8, 4.5], + [3.9, 17.2, -2.4, 19.2, -9.8], + [-12.5, -22.2, 1.2, -23.2, 7.8], + [17.2, 18.2, 9.2, 11.6, -6.4], + [-12.2, -7.9, -2.8, -2.6, -1.2], + [5.8, -4.9, 9.0, -12.6, 3.9], + [6.0, 13.5, 2.3, 15.1, -8.4], + [-10.1, -12.8, -1.4, -13.4, 4.5], + [9.7, 10.3, 5.5, 7.1, -3.5], + [-7.2, -1.4, -4.5, 0.2, -1.3], + [1.8, -3.7, 1.6, -4.8, 3.2], + [2.6, 6.7, -2.1, 9.7, -2.7], + [-4.6, -7.6, -1.7, -4.4, 2.6], + [4.6, -0.1, 4.5, -4.1, 1.9], + [3.3, 5.1, -0.7, 11.0, -4.1], + [-4.3, -12.7, -0.7, -9.5, 5.9], + [8.0, 6.0, 5.6, 0.8, -1.1], + [-2.4, 4.9, -3.9, 11.3, -4.7], + [-4.6, -12.0, -0.3, -13.4, 6.5], + [8.1, 15.9, -0.8, 16.0, 
-6.5], + [-11.5, -14.8, -5.0, -7.5, 4.4], + [10.0, 3.3, 7.1, -2.8, 1.7], + [-0.7, 3.8, -1.8, 9.2, -4.0], + [-1.6, -13.7, 5.3, -13.0, 4.7], + [11.0, 8.2, 7.7, 6.7, -4.0], + [-4.0, -2.4, -3.0, 1.8, -1.7], + [1.1, -4.8, 2.9, -5.9, 3.0], + [3.4, 7.5, -1.5, 9.0, -2.4], + [-6.2, -3.4, -4.8, -2.2, 1.0], + [0.8, -0.3, 1.8, -2.5, 1.0], + [0.4, 1.7, -1.6, 1.9, -0.5], + [-2.3, -4.5, 2.9, -7.4, 2.3], + [3.9, 9.3, 2.4, 4.1, -3.8], + [-4.8, -0.6, -3.6, 3.0, -2.1], + [-1.8, -7.5, 2.9, -13.2, 4.7], + [5.6, 19.0, -3.7, 18.1, -7.8], + [-13.5, -19.3, -2.7, -17.7, 7.5], + [15.5, 10.6, 9.9, 5.7, -3.1], + [-6.8, 3.3, -6.8, 9.6, -4.8], + [-5.2, -15.3, 3.9, -21.6, 7.9], + ] + ), + } + return data + + +@pytest.fixture() +def data_dynotears_p2() -> Dict[str, np.ndarray]: + """ + Training data for testing Dynamic Bayesian Networks. Return a time series with 50 time points, with 5 columns + This data was simulated with te following configurations + Configurations: + - data points 50, + - num. variables: 5, + - p (lag amount): 2, + - graph type (intra-slice graph): 'erdos-renyi', + - graph type (inter-slice graph): 'erdos-renyi', + - SEM type: 'linear-gauss', + - weight range, intra-slice graph: (0.5, 2.0), + - weight range, inter-slice graph: (0.3, 0.5), + - expected degree, inter-slice graph: 3, + - noise scale (gaussian noise): 1.0, + - w decay: 1.1 + Returns: + dictionary with keys W (intra-weights) ,A (inter-weights), X and Y (inputs of from_numpy_dynamic) + """ + data = { + "W": np.array( + [ + [0.0, 0.0, 0.0, 0.0, -1.08], + [1.16, 0.0, 0.0, -0.81, 0.89], + [-1.83, 0.58, 0.0, 0.61, -1.31], + [1.03, 0.0, 0.0, 0.0, -0.97], + [0.0, 0.0, 0.0, 0.0, 0.0], + ] + ), + "A": np.array( + [ + [0.31, 0.0, 0.0, 0.32, 0.0], + [0.0, 0.36, 0.0, 0.0, 0.5], + [-0.43, 0.0, 0.0, -0.49, -0.39], + [0.39, 0.0, 0.39, 0.0, 0.38], + [0.0, 0.37, 0.0, 0.43, 0.0], + [0.0, -0.37, 0.34, 0.0, 0.0], + [-0.3, 0.0, -0.42, 0.0, 0.45], + [0.0, -0.34, 0.0, 0.0, 0.0], + [0.0, 0.31, 0.0, 0.29, 0.36], + [-0.43, 0.37, 0.0, 0.0, -0.34], + ] + ), + "X": np.array( + [ + [3.1, 0.9, 1.6, 2.9, -6.5], + [-5.6, -4.1, 3.9, 4.7, -3.4], + [-6.3, -3.5, 2.2, 1.4, 0.4], + [-0.3, 0.2, -0.5, -1.5, 2.0], + [2.4, 1.6, -0.9, -0.7, -0.9], + [0.6, -0.1, 0.2, 1.5, -2.7], + [-2.4, -1.6, 2.2, 1.3, -1.6], + [-4.6, 0.5, 1.4, -2.6, 6.9], + [-1.3, -0.1, -1.3, -1.7, 3.7], + [0.5, 4.7, -2.0, -4.1, 6.8], + [8.8, 4.9, -2.7, -1.1, -0.3], + [3.8, 3.1, -0.4, 0.1, 1.5], + [2.0, -0.9, 0.1, 3.9, -1.6], + [-3.8, -0.9, 2.7, 1.3, 2.4], + [-4.4, 3.2, 2.1, -3.3, 9.9], + [0.2, 4.8, -1.6, -3.7, 9.7], + [5.1, 2.8, -6.4, -1.5, 3.7], + [8.1, 4.3, -2.4, -0.1, -1.6], + [2.1, 0.9, 1.5, 2.9, -4.1], + [-4.1, -3.9, 0.2, 1.1, 4.1], + [0.3, -3.0, 0.2, 3.0, -4.7], + [-1.2, 2.8, 2.7, -0.9, -0.9], + [-0.8, 1.8, 1.7, -0.7, 2.4], + [-0.2, -2.2, -1.8, 2.6, -0.3], + [-0.9, 2.1, 0.8, -2.3, 5.9], + [1.5, 3.6, 0.4, -0.1, -0.1], + [-3.0, 1.3, -1.6, -3.2, 9.5], + [6.8, -0.1, -3.5, 2.5, -3.2], + [3.5, 1.8, -1.7, -2.1, 0.5], + [5.1, 1.0, 1.6, 3.6, -6.5], + [-2.8, -3.0, 1.4, 1.8, -3.5], + [-5.9, -4.8, 3.2, 4.0, -3.6], + [-6.7, -2.2, 1.3, -2.7, 4.8], + [0.4, 0.0, -1.8, -0.9, 0.4], + [3.2, 1.7, -2.9, -3.6, 3.1], + [5.7, 0.9, -2.9, -0.3, -1.4], + [3.4, -2.0, -0.5, 3.5, -9.1], + [-4.3, -4.4, 2.9, 2.6, -5.2], + [-9.7, -6.4, 3.4, 1.9, -1.4], + [-5.2, -1.4, 1.6, -2.3, 1.0], + [-1.1, 1.1, -1.6, -3.5, 4.1], + [0.9, 2.0, -1.6, -2.6, 3.3], + [4.3, 4.8, -0.7, -0.7, 0.3], + [4.9, -0.4, -1.3, 1.8, -3.9], + [1.3, -2.9, -0.8, 2.0, -1.3], + [-0.1, -0.8, 2.9, 3.1, -6.7], + [-5.8, 0.5, 4.0, -1.2, 2.6], + [-1.8, -2.4, -2.0, -2.4, 3.5], + [3.3, 
0.7, -3.0, -0.6, 0.0], + [4.5, -0.1, 0.4, 2.9, -10.8], + ] + ), + "Y": np.array( + [ + [6.1, 0.5, -2.0, 0.9, -3.0, 2.9, 1.5, -0.9, -0.2, 0.8], + [3.1, 0.9, 1.6, 2.9, -6.5, 6.1, 0.5, -2.0, 0.9, -3.0], + [-5.6, -4.1, 3.9, 4.7, -3.4, 3.1, 0.9, 1.6, 2.9, -6.5], + [-6.3, -3.5, 2.2, 1.4, 0.4, -5.6, -4.1, 3.9, 4.7, -3.4], + [-0.3, 0.2, -0.5, -1.5, 2.0, -6.3, -3.5, 2.2, 1.4, 0.4], + [2.4, 1.6, -0.9, -0.7, -0.9, -0.3, 0.2, -0.5, -1.5, 2.0], + [0.6, -0.1, 0.2, 1.5, -2.7, 2.4, 1.6, -0.9, -0.7, -0.9], + [-2.4, -1.6, 2.2, 1.3, -1.6, 0.6, -0.1, 0.2, 1.5, -2.7], + [-4.6, 0.5, 1.4, -2.6, 6.9, -2.4, -1.6, 2.2, 1.3, -1.6], + [-1.3, -0.1, -1.3, -1.7, 3.7, -4.6, 0.5, 1.4, -2.6, 6.9], + [0.5, 4.7, -2.0, -4.1, 6.8, -1.3, -0.1, -1.3, -1.7, 3.7], + [8.8, 4.9, -2.7, -1.1, -0.3, 0.5, 4.7, -2.0, -4.1, 6.8], + [3.8, 3.1, -0.4, 0.1, 1.5, 8.8, 4.9, -2.7, -1.1, -0.3], + [2.0, -0.9, 0.1, 3.9, -1.6, 3.8, 3.1, -0.4, 0.1, 1.5], + [-3.8, -0.9, 2.7, 1.3, 2.4, 2.0, -0.9, 0.1, 3.9, -1.6], + [-4.4, 3.2, 2.1, -3.3, 9.9, -3.8, -0.9, 2.7, 1.3, 2.4], + [0.2, 4.8, -1.6, -3.7, 9.7, -4.4, 3.2, 2.1, -3.3, 9.9], + [5.1, 2.8, -6.4, -1.5, 3.7, 0.2, 4.8, -1.6, -3.7, 9.7], + [8.1, 4.3, -2.4, -0.1, -1.6, 5.1, 2.8, -6.4, -1.5, 3.7], + [2.1, 0.9, 1.5, 2.9, -4.1, 8.1, 4.3, -2.4, -0.1, -1.6], + [-4.1, -3.9, 0.2, 1.1, 4.1, 2.1, 0.9, 1.5, 2.9, -4.1], + [0.3, -3.0, 0.2, 3.0, -4.7, -4.1, -3.9, 0.2, 1.1, 4.1], + [-1.2, 2.8, 2.7, -0.9, -0.9, 0.3, -3.0, 0.2, 3.0, -4.7], + [-0.8, 1.8, 1.7, -0.7, 2.4, -1.2, 2.8, 2.7, -0.9, -0.9], + [-0.2, -2.2, -1.8, 2.6, -0.3, -0.8, 1.8, 1.7, -0.7, 2.4], + [-0.9, 2.1, 0.8, -2.3, 5.9, -0.2, -2.2, -1.8, 2.6, -0.3], + [1.5, 3.6, 0.4, -0.1, -0.1, -0.9, 2.1, 0.8, -2.3, 5.9], + [-3.0, 1.3, -1.6, -3.2, 9.5, 1.5, 3.6, 0.4, -0.1, -0.1], + [6.8, -0.1, -3.5, 2.5, -3.2, -3.0, 1.3, -1.6, -3.2, 9.5], + [3.5, 1.8, -1.7, -2.1, 0.5, 6.8, -0.1, -3.5, 2.5, -3.2], + [5.1, 1.0, 1.6, 3.6, -6.5, 3.5, 1.8, -1.7, -2.1, 0.5], + [-2.8, -3.0, 1.4, 1.8, -3.5, 5.1, 1.0, 1.6, 3.6, -6.5], + [-5.9, -4.8, 3.2, 4.0, -3.6, -2.8, -3.0, 1.4, 1.8, -3.5], + [-6.7, -2.2, 1.3, -2.7, 4.8, -5.9, -4.8, 3.2, 4.0, -3.6], + [0.4, 0.0, -1.8, -0.9, 0.4, -6.7, -2.2, 1.3, -2.7, 4.8], + [3.2, 1.7, -2.9, -3.6, 3.1, 0.4, 0.0, -1.8, -0.9, 0.4], + [5.7, 0.9, -2.9, -0.3, -1.4, 3.2, 1.7, -2.9, -3.6, 3.1], + [3.4, -2.0, -0.5, 3.5, -9.1, 5.7, 0.9, -2.9, -0.3, -1.4], + [-4.3, -4.4, 2.9, 2.6, -5.2, 3.4, -2.0, -0.5, 3.5, -9.1], + [-9.7, -6.4, 3.4, 1.9, -1.4, -4.3, -4.4, 2.9, 2.6, -5.2], + [-5.2, -1.4, 1.6, -2.3, 1.0, -9.7, -6.4, 3.4, 1.9, -1.4], + [-1.1, 1.1, -1.6, -3.5, 4.1, -5.2, -1.4, 1.6, -2.3, 1.0], + [0.9, 2.0, -1.6, -2.6, 3.3, -1.1, 1.1, -1.6, -3.5, 4.1], + [4.3, 4.8, -0.7, -0.7, 0.3, 0.9, 2.0, -1.6, -2.6, 3.3], + [4.9, -0.4, -1.3, 1.8, -3.9, 4.3, 4.8, -0.7, -0.7, 0.3], + [1.3, -2.9, -0.8, 2.0, -1.3, 4.9, -0.4, -1.3, 1.8, -3.9], + [-0.1, -0.8, 2.9, 3.1, -6.7, 1.3, -2.9, -0.8, 2.0, -1.3], + [-5.8, 0.5, 4.0, -1.2, 2.6, -0.1, -0.8, 2.9, 3.1, -6.7], + [-1.8, -2.4, -2.0, -2.4, 3.5, -5.8, 0.5, 4.0, -1.2, 2.6], + [3.3, 0.7, -3.0, -0.6, 0.0, -1.8, -2.4, -2.0, -2.4, 3.5], + ] + ), + } + return data + + +@pytest.fixture() +def data_dynotears_p3() -> Dict[str, np.ndarray]: + """ + Training data for testing Dynamic Bayesian Networks. Return a time series with 50 time points, with 5 columns + This data was simulated with te following configurations. + Configurations: + - data points 50, + - num. 
variables: 5, + - p (lag amount): 3, + - graph type (intra-slice graph): 'erdos-renyi', + - graph type (inter-slice graph): 'erdos-renyi', + - SEM type: 'linear-gauss', + - weight range, intra-slice graph: (0.5, 2.0), + - weight range, inter-slice graph: (0.3, 0.5), + - expected degree, inter-slice graph: 3, + - noise scale (gaussian noise): 1.0, + - w decay: 1.1 + Returns: + dictionary with keys W (intra-weights), A (inter-weights), X and Y (inputs of from_numpy_dynamic) + """ + data = { + "W": np.array( + [ + [0.0, 0.0, -1.18, 0.0, -0.92], + [0.0, 0.0, -1.71, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, -1.15], + [0.0, 0.8, 0.0, 0.0, -1.65], + [0.0, 0.0, 0.0, 0.0, 0.0], + ] + ), + "A": np.array( + [ + [-0.42, -0.4, 0.35, 0.0, 0.0], + [0.0, 0.0, 0.0, 0.0, 0.0], + [-0.38, 0.0, -0.48, -0.48, 0.44], + [0.0, -0.36, -0.31, 0.35, 0.0], + [0.0, 0.0, -0.35, 0.0, -0.45], + [0.27, 0.0, 0.0, -0.28, -0.43], + [-0.44, 0.35, 0.0, 0.0, -0.44], + [0.0, 0.0, 0.0, 0.0, 0.0], + [-0.42, -0.39, -0.27, 0.3, -0.29], + [0.0, -0.43, 0.0, -0.41, 0.4], + [0.0, 0.32, 0.38, -0.37, 0.37], + [0.0, 0.37, 0.0, 0.0, 0.41], + [0.0, 0.34, 0.0, 0.27, -0.28], + [-0.31, 0.0, 0.0, -0.34, 0.0], + [0.27, -0.38, 0.0, 0.37, 0.36], + ] + ), + "X": np.array( + [ + [-21.5, 26.1, -56.8, -15.4, 82.4], + [14.4, 20.3, -51.3, -18.8, -4.6], + [16.5, -17.0, 48.2, 18.7, -100.2], + [5.1, -38.8, 70.5, 5.1, 8.6], + [-19.4, 20.5, -47.8, -4.3, 86.9], + [9.8, 28.7, -69.5, -18.3, 8.2], + [9.5, -7.1, 42.4, 13.3, -107.0], + [2.8, -42.5, 83.7, 1.5, 3.1], + [-25.7, 18.6, -42.4, -8.5, 112.6], + [11.5, 34.3, -94.2, -20.5, 33.3], + [19.0, -9.7, 39.5, 18.2, -137.5], + [7.9, -60.4, 122.3, 8.1, -33.3], + [-32.5, 6.0, -19.4, -11.0, 130.2], + [3.5, 48.0, -127.1, -33.7, 81.8], + [28.4, 7.6, 1.0, 16.3, -146.2], + [20.0, -71.8, 153.1, 17.9, -93.3], + [-34.3, -13.0, 20.5, -4.4, 130.2], + [-9.4, 58.4, -150.8, -41.9, 141.6], + [26.7, 32.9, -49.8, 7.9, -134.1], + [34.3, -74.1, 161.6, 26.1, -152.1], + [-34.2, -35.1, 75.5, 8.0, 105.1], + [-21.9, 54.5, -151.3, -49.8, 199.9], + [19.2, 54.9, -98.1, -8.1, -95.3], + [47.4, -72.0, 158.9, 22.4, -192.6], + [-25.7, -60.3, 129.5, 21.9, 54.6], + [-26.9, 42.2, -136.6, -48.4, 243.1], + [12.6, 79.6, -153.1, -11.6, -57.4], + [54.1, -66.7, 153.3, 17.7, -241.5], + [-26.1, -79.5, 185.0, 25.9, 19.5], + [-34.8, 36.3, -133.8, -54.2, 311.0], + [10.6, 105.3, -219.7, -19.1, -15.3], + [73.9, -58.5, 138.7, 25.1, -307.4], + [-17.7, -107.1, 257.4, 47.9, -58.4], + [-51.3, 17.0, -93.4, -59.1, 354.7], + [-13.4, 140.4, -287.4, -43.0, 83.7], + [85.8, -28.4, 71.1, 11.6, -310.8], + [4.5, -130.3, 310.9, 68.7, -171.8], + [-51.1, -25.3, -12.3, -46.8, 361.3], + [-34.2, 158.2, -341.9, -48.7, 191.1], + [92.6, 7.7, -9.7, -2.9, -314.2], + [20.2, -143.1, 363.2, 83.7, -294.6], + [-54.6, -69.6, 82.3, -36.4, 356.7], + [-60.4, 176.2, -390.3, -61.8, 319.4], + [97.2, 53.8, -116.7, -18.9, -289.0], + [46.0, -152.8, 400.1, 103.7, -440.8], + [-52.1, -124.2, 206.7, -16.7, 307.3], + [-93.8, 183.7, -415.9, -74.5, 465.6], + [92.4, 118.4, -265.0, -47.7, -202.0], + [76.5, -140.2, 396.2, 116.8, -592.8], + [-36.6, -195.8, 366.6, 8.3, 202.5], + ] + ), + "Y_1": np.array( + [ + [-4.9, -31.2, 50.6, -5.4, 42.7, 11.0, -26.3, 66.7, 21.5, -82.3], + [-21.5, 26.1, -56.8, -15.4, 82.4, -4.9, -31.2, 50.6, -5.4, 42.7], + [14.4, 20.3, -51.3, -18.8, -4.6, -21.5, 26.1, -56.8, -15.4, 82.4], + [16.5, -17.0, 48.2, 18.7, -100.2, 14.4, 20.3, -51.3, -18.8, -4.6], + [5.1, -38.8, 70.5, 5.1, 8.6, 16.5, -17.0, 48.2, 18.7, -100.2], + [-19.4, 20.5, -47.8, -4.3, 86.9, 5.1, -38.8, 70.5, 5.1, 8.6], + [9.8, 28.7, -69.5, 
-18.3, 8.2, -19.4, 20.5, -47.8, -4.3, 86.9], + [9.5, -7.1, 42.4, 13.3, -107.0, 9.8, 28.7, -69.5, -18.3, 8.2], + [2.8, -42.5, 83.7, 1.5, 3.1, 9.5, -7.1, 42.4, 13.3, -107.0], + [-25.7, 18.6, -42.4, -8.5, 112.6, 2.8, -42.5, 83.7, 1.5, 3.1], + [11.5, 34.3, -94.2, -20.5, 33.3, -25.7, 18.6, -42.4, -8.5, 112.6], + [19.0, -9.7, 39.5, 18.2, -137.5, 11.5, 34.3, -94.2, -20.5, 33.3], + [7.9, -60.4, 122.3, 8.1, -33.3, 19.0, -9.7, 39.5, 18.2, -137.5], + [-32.5, 6.0, -19.4, -11.0, 130.2, 7.9, -60.4, 122.3, 8.1, -33.3], + [3.5, 48.0, -127.1, -33.7, 81.8, -32.5, 6.0, -19.4, -11.0, 130.2], + [28.4, 7.6, 1.0, 16.3, -146.2, 3.5, 48.0, -127.1, -33.7, 81.8], + [20.0, -71.8, 153.1, 17.9, -93.3, 28.4, 7.6, 1.0, 16.3, -146.2], + [-34.3, -13.0, 20.5, -4.4, 130.2, 20.0, -71.8, 153.1, 17.9, -93.3], + [-9.4, 58.4, -150.8, -41.9, 141.6, -34.3, -13.0, 20.5, -4.4, 130.2], + [26.7, 32.9, -49.8, 7.9, -134.1, -9.4, 58.4, -150.8, -41.9, 141.6], + [34.3, -74.1, 161.6, 26.1, -152.1, 26.7, 32.9, -49.8, 7.9, -134.1], + [-34.2, -35.1, 75.5, 8.0, 105.1, 34.3, -74.1, 161.6, 26.1, -152.1], + [-21.9, 54.5, -151.3, -49.8, 199.9, -34.2, -35.1, 75.5, 8.0, 105.1], + [19.2, 54.9, -98.1, -8.1, -95.3, -21.9, 54.5, -151.3, -49.8, 199.9], + [47.4, -72.0, 158.9, 22.4, -192.6, 19.2, 54.9, -98.1, -8.1, -95.3], + [-25.7, -60.3, 129.5, 21.9, 54.6, 47.4, -72.0, 158.9, 22.4, -192.6], + [-26.9, 42.2, -136.6, -48.4, 243.1, -25.7, -60.3, 129.5, 21.9, 54.6], + [12.6, 79.6, -153.1, -11.6, -57.4, -26.9, 42.2, -136.6, -48.4, 243.1], + [54.1, -66.7, 153.3, 17.7, -241.5, 12.6, 79.6, -153.1, -11.6, -57.4], + [-26.1, -79.5, 185.0, 25.9, 19.5, 54.1, -66.7, 153.3, 17.7, -241.5], + [-34.8, 36.3, -133.8, -54.2, 311.0, -26.1, -79.5, 185.0, 25.9, 19.5], + [10.6, 105.3, -219.7, -19.1, -15.3, -34.8, 36.3, -133.8, -54.2, 311.0], + [73.9, -58.5, 138.7, 25.1, -307.4, 10.6, 105.3, -219.7, -19.1, -15.3], + [-17.7, -107.1, 257.4, 47.9, -58.4, 73.9, -58.5, 138.7, 25.1, -307.4], + [-51.3, 17.0, -93.4, -59.1, 354.7, -17.7, -107.1, 257.4, 47.9, -58.4], + [-13.4, 140.4, -287.4, -43.0, 83.7, -51.3, 17.0, -93.4, -59.1, 354.7], + [85.8, -28.4, 71.1, 11.6, -310.8, -13.4, 140.4, -287.4, -43.0, 83.7], + [4.5, -130.3, 310.9, 68.7, -171.8, 85.8, -28.4, 71.1, 11.6, -310.8], + [-51.1, -25.3, -12.3, -46.8, 361.3, 4.5, -130.3, 310.9, 68.7, -171.8], + [-34.2, 158.2, -341.9, -48.7, 191.1, -51.1, -25.3, -12.3, -46.8, 361.3], + [92.6, 7.7, -9.7, -2.9, -314.2, -34.2, 158.2, -341.9, -48.7, 191.1], + [20.2, -143.1, 363.2, 83.7, -294.6, 92.6, 7.7, -9.7, -2.9, -314.2], + [-54.6, -69.6, 82.3, -36.4, 356.7, 20.2, -143.1, 363.2, 83.7, -294.6], + [-60.4, 176.2, -390.3, -61.8, 319.4, -54.6, -69.6, 82.3, -36.4, 356.7], + [97.2, 53.8, -116.7, -18.9, -289.0, -60.4, 176.2, -390.3, -61.8, 319.4], + [46.0, -152.8, 400.1, 103.7, -440.8, 97.2, 53.8, -116.7, -18.9, -289.0], + [-52.1, -124.2, 206.7, -16.7, 307.3, 46, -152.8, 400.1, 103.7, -440.8], + [-93.8, 183.7, -415.9, -74.5, 465.6, -52, -124.2, 206.7, -16.7, 307.3], + [92.4, 118.4, -265.0, -47.7, -202, -93.8, 183.7, -415.9, -74.5, 465.6], + [76.5, -140.2, 396.2, 116.8, -592.8, 92.4, 118.4, -265, -47.7, -202], + ] + ), + "Y_2": np.array( + [ + [22.4, 6.9, -22.7, -6.3, -40.2], + [11.0, -26.3, 66.7, 21.5, -82.3], + [-4.9, -31.2, 50.6, -5.4, 42.7], + [-21.5, 26.1, -56.8, -15.4, 82.4], + [14.4, 20.3, -51.3, -18.8, -4.6], + [16.5, -17.0, 48.2, 18.7, -100.2], + [5.1, -38.8, 70.5, 5.1, 8.6], + [-19.4, 20.5, -47.8, -4.3, 86.9], + [9.8, 28.7, -69.5, -18.3, 8.2], + [9.5, -7.1, 42.4, 13.3, -107.0], + [2.8, -42.5, 83.7, 1.5, 3.1], + [-25.7, 18.6, -42.4, -8.5, 112.6], + 
[11.5, 34.3, -94.2, -20.5, 33.3], + [19.0, -9.7, 39.5, 18.2, -137.5], + [7.9, -60.4, 122.3, 8.1, -33.3], + [-32.5, 6.0, -19.4, -11.0, 130.2], + [3.5, 48.0, -127.1, -33.7, 81.8], + [28.4, 7.6, 1.0, 16.3, -146.2], + [20.0, -71.8, 153.1, 17.9, -93.3], + [-34.3, -13.0, 20.5, -4.4, 130.2], + [-9.4, 58.4, -150.8, -41.9, 141.6], + [26.7, 32.9, -49.8, 7.9, -134.1], + [34.3, -74.1, 161.6, 26.1, -152.1], + [-34.2, -35.1, 75.5, 8.0, 105.1], + [-21.9, 54.5, -151.3, -49.8, 199.9], + [19.2, 54.9, -98.1, -8.1, -95.3], + [47.4, -72.0, 158.9, 22.4, -192.6], + [-25.7, -60.3, 129.5, 21.9, 54.6], + [-26.9, 42.2, -136.6, -48.4, 243.1], + [12.6, 79.6, -153.1, -11.6, -57.4], + [54.1, -66.7, 153.3, 17.7, -241.5], + [-26.1, -79.5, 185.0, 25.9, 19.5], + [-34.8, 36.3, -133.8, -54.2, 311.0], + [10.6, 105.3, -219.7, -19.1, -15.3], + [73.9, -58.5, 138.7, 25.1, -307.4], + [-17.7, -107.1, 257.4, 47.9, -58.4], + [-51.3, 17.0, -93.4, -59.1, 354.7], + [-13.4, 140.4, -287.4, -43.0, 83.7], + [85.8, -28.4, 71.1, 11.6, -310.8], + [4.5, -130.3, 310.9, 68.7, -171.8], + [-51.1, -25.3, -12.3, -46.8, 361.3], + [-34.2, 158.2, -341.9, -48.7, 191.1], + [92.6, 7.7, -9.7, -2.9, -314.2], + [20.2, -143.1, 363.2, 83.7, -294.6], + [-54.6, -69.6, 82.3, -36.4, 356.7], + [-60.4, 176.2, -390.3, -61.8, 319.4], + [97.2, 53.8, -116.7, -18.9, -289.0], + [46.0, -152.8, 400.1, 103.7, -440.8], + [-52.1, -124.2, 206.7, -16.7, 307.3], + [-93.8, 183.7, -415.9, -74.5, 465.6], + ] + ), + } + + data["Y"] = np.array( + [list(y1) + list(y2) for y1, y2 in zip(data["Y_1"], data["Y_2"])] + ) + del data["Y_1"] + del data["Y_2"] + return data + + @pytest.fixture def adjacency_mat_num_stability() -> np.ndarray: """ diff --git a/tests/structure/data_generators/test_core.py b/tests/structure/data_generators/test_core.py index 96371cf..e1dad0d 100644 --- a/tests/structure/data_generators/test_core.py +++ b/tests/structure/data_generators/test_core.py @@ -85,7 +85,11 @@ def test_bad_graph_type(self): """ Test that a value other than "erdos-renyi", "barabasi-albert", "full" throws ValueError """ graph_type = "invalid" degree, d_nodes = 4, 10 - with pytest.raises(ValueError, match="unknown graph type"): + with pytest.raises( + ValueError, + match="Unknown graph type invalid. Available types" + r" are \['erdos-renyi', 'barabasi-albert', 'full'\]", + ): generate_structure(d_nodes, degree, graph_type) @pytest.mark.parametrize("num_nodes,degree", [(5, 2), (10, 3), (15, 5)]) diff --git a/tests/structure/data_generators/test_wrappers.py b/tests/structure/data_generators/test_wrappers.py index 36013f0..486db8c 100644 --- a/tests/structure/data_generators/test_wrappers.py +++ b/tests/structure/data_generators/test_wrappers.py @@ -26,6 +26,7 @@ # # See the License for the specific language governing permissions and # limitations under the License. 
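A note on the fixture layout used throughout these tests: `X` holds the current observations, and `Y` (the `Xlags` input of `from_numpy_dynamic`) holds the `p` lagged copies of the series concatenated column-wise, lag 1 first. The following minimal helper is illustrative only (`build_lagged_data` is hypothetical, not part of this change); unlike the fixtures above, which keep pre-sample values, it simply drops the rows without a full lag history:

```python
import numpy as np


def build_lagged_data(ts: np.ndarray, p: int):
    """Slice a (T, d) time series into X (current values) and Xlags
    (p lagged copies concatenated column-wise, lag 1 block first)."""
    n_rows, _ = ts.shape
    x = ts[p:]  # rows t = p .. T-1
    xlags = np.concatenate(
        [ts[p - lag : n_rows - lag] for lag in range(1, p + 1)], axis=1
    )
    return x, xlags
```
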
+import re
 import string
 from itertools import product
 
@@ -37,13 +38,16 @@
 from causalnex.structure import StructureModel
 from causalnex.structure.data_generators import (
+    gen_stationary_dyn_net_and_df,
     generate_binary_data,
     generate_binary_dataframe,
     generate_categorical_dataframe,
     generate_continuous_data,
     generate_continuous_dataframe,
     generate_count_dataframe,
+    generate_dataframe_dynamic,
     generate_structure,
+    generate_structure_dynamic,
 )
 from tests.structure.data_generators.test_core import calculate_proba
 
@@ -698,3 +702,173 @@ def test_dataframe(self, graph, intercept, seed, kernel, zero_inflation_factor):
         )
 
         assert np.array_equal(data, df[list(graph.nodes())].values)
+
+
+class TestGenerateStructureDynamic:
+    @pytest.mark.parametrize("num_nodes", (10, 20))
+    @pytest.mark.parametrize("p", [1, 10])
+    @pytest.mark.parametrize("degree_intra, degree_inter", [(3, 0), (0, 3), (1, 1)])
+    def test_all_nodes_in_structure(self, num_nodes, p, degree_intra, degree_inter):
+        """both intra- and inter-slice nodes should be in the structure"""
+        g = generate_structure_dynamic(num_nodes, p, degree_intra, degree_inter)
+        assert np.all(
+            [
+                "{var}_lag{l_val}".format(var=var, l_val=l_val) in g.nodes
+                for l_val in range(p + 1)
+                for var in range(num_nodes)
+            ]
+        )
+
+    def test_naming_nodes(self):
+        """Nodes should have the format {var}_lag{l}"""
+        g = generate_structure_dynamic(5, 3, 3, 4)
+        pattern = re.compile(r"[0-5]_lag[0-3]")
+        for node in g.nodes:
+            match = pattern.match(node)
+            assert match and (match.group() == node)
+
+    def test_degree_zero_implies_no_edges(self):
+        """If the degree is zero, zero edges are generated.
+        We test this is true for intra edges (ending in 'lag0') and inter edges
+        """
+        g = generate_structure_dynamic(15, 3, 0, 4)  # no intra edges
+        lags = [(u.split("_lag")[1], v.split("_lag")[1]) for u, v in g.edges]
+        assert np.all([el[0] != "0" for el in lags])
+        g = generate_structure_dynamic(15, 3, 4, 0)
+        lags = [(u.split("_lag")[1], v.split("_lag")[1]) for u, v in g.edges]
+        assert np.all([el == ("0", "0") for el in lags])  # only intra edges
+        g = generate_structure_dynamic(15, 3, 0, 0)  # no edges
+        assert len(g.edges) == 0
+
+    def test_edges_have_weights(self):
+        """all edges must have weight values as floats or ints"""
+        g = generate_structure_dynamic(10, 3, 4, 4)
+        ws = [w for _, _, w in g.edges(data="weight")]
+        assert np.all([isinstance(w, (float, int)) for w in ws])
+
+    def test_raise_error_if_wrong_graph_type(self):
+        """if the graph_type chosen is not among the options available, raise error"""
+        with pytest.raises(
+            ValueError,
+            match=r"Unknown graph type some_type\. "
+            r"Available types are \['erdos-renyi', 'barabasi-albert', 'full'\]",
+        ):
+            generate_structure_dynamic(10, 10, 10, 10, graph_type_intra="some_type")
+        with pytest.raises(
+            ValueError,
+            match=r"Unknown inter-slice graph type `some_type`\. "
+            "Valid types are 'erdos-renyi' and 'full'",
+        ):
+            generate_structure_dynamic(10, 10, 10, 10, graph_type_inter="some_type")
+
+    def test_raise_error_if_min_greater_than_max(self):
+        """if min > max, raise an error"""
+        with pytest.raises(
+            ValueError,
+            match="Absolute minimum weight must be "
+            r"less than or equal to maximum weight\: 3 \> 2",
+        ):
+            generate_structure_dynamic(10, 10, 10, 10, w_min_inter=3, w_max_inter=2)
+
+    @pytest.mark.parametrize("num_nodes", (10, 20))
+    @pytest.mark.parametrize("p", [1, 10])
+    def test_full_graph_type(self, num_nodes, p):
+        """all the connections from past variables to current variables should be there if using `full` graph_type"""
+        g = generate_structure_dynamic(num_nodes, p, 4, 4, graph_type_inter="full")
+        lagged_edges = sorted((u, v) for u, v in g.edges if int(u.split("_lag")[1]) > 0)
+        assert lagged_edges == sorted(
+            ("{v}_lag{l_}".format(v=v_f, l_=l_), "{v}_lag0".format(v=v_t))
+            for l_ in range(1, p + 1)
+            for v_f in range(num_nodes)  # var from
+            for v_t in range(num_nodes)  # var to
+        )
+
+
+class TestGenerateDataframeDynamic:
+    @pytest.mark.parametrize(
+        "sem_type", ["linear-gauss", "linear-exp", "linear-gumbel"]
+    )
+    def test_returns_dataframe(self, sem_type):
+        """ Return value is a pd.DataFrame - test over all sem_types """
+        graph_type, degree, d_nodes = "erdos-renyi", 4, 10
+        sm = generate_structure_dynamic(d_nodes, 2, degree, degree, graph_type)
+        data = generate_dataframe_dynamic(sm, sem_type=sem_type, n_samples=10)
+        assert isinstance(data, pd.DataFrame)
+
+    def test_bad_sem_type(self):
+        """ Test that invalid sem-type other than "linear-gauss", "linear-exp", "linear-gumbel" is not accepted """
+        graph_type, degree, d_nodes = "erdos-renyi", 4, 10
+        sm = generate_structure_dynamic(d_nodes, 2, degree, degree, graph_type)
+        with pytest.raises(
+            ValueError,
+            match="unknown sem type invalid. Available types are:"
+            r" \('linear-gauss', 'linear-exp', 'linear-gumbel'\)",
+        ):
+            generate_dataframe_dynamic(sm, sem_type="invalid", n_samples=10)
+
+    @pytest.mark.parametrize("p", [0, 1, 2])
+    def test_labels_correct(self, p):
+        graph_type, degree, d_nodes = "erdos-renyi", 4, 10
+        sm = generate_structure_dynamic(d_nodes, p, degree, degree, graph_type)
+        data = generate_dataframe_dynamic(sm, sem_type="linear-gauss", n_samples=10)
+        intra_nodes = sorted([el for el in sm.nodes if "_lag0" in el])
+        inter_nodes = sorted([el for el in sm.nodes if "_lag0" not in el])
+        assert sorted(data.columns) == sorted(list(inter_nodes) + list(intra_nodes))
+
+
+class TestGenerateStationaryDynamicStructureAndSamples:
+    def test_wta(self):
+        """if no stationary network is found within max_data_gen_trials, warn and return constant data"""
+        with pytest.warns(
+            UserWarning, match="Could not simulate data, returning constant dataframe"
+        ):
+            gen_stationary_dyn_net_and_df(
+                w_min_inter=1, w_max_inter=2, max_data_gen_trials=2
+            )
+
+    @pytest.mark.parametrize("seed", [2, 3, 5])
+    def test_seems_stationary(self, seed):
+        np.random.seed(seed)
+        _, df, _, _ = gen_stationary_dyn_net_and_df(
+            w_min_inter=0.1, w_max_inter=0.2, max_data_gen_trials=2
+        )
+        assert np.all(df.max() - df.min() < 10)
+
+    def test_error_if_wmin_greater_than_wmax(self):
+        with pytest.raises(
+            ValueError,
+            match="Absolute minimum weight must be less than or equal to maximum weight: 2 > 1",
+        ):
+            gen_stationary_dyn_net_and_df(
+                w_min_inter=2, w_max_inter=1, max_data_gen_trials=2
+            )
+
+    def test_dense_networks(self):
+        """Dense networks are more likely to be non-stationary. We check that the simulator is still able
+        to provide a stationary time series in that case.
+
+        If df contains only ones it means that the generator failed to obtain a stationary structure"""
+        np.random.seed(4)
+        _, df, _, _ = gen_stationary_dyn_net_and_df(
+            n_samples=1000,
+            p=1,
+            w_min_inter=0.2,
+            w_max_inter=0.5,
+            max_data_gen_trials=10,
+            degree_intra=4,
+            degree_inter=7,
+        )
+        assert np.any(np.ones(df.shape) != df)
+
+    def test_fail_to_find_stationary_network(self):
+        """if it fails to find a suitable network, it returns a dataset of ones"""
+        np.random.seed(5)
+        _, df, _, _ = gen_stationary_dyn_net_and_df(
+            n_samples=1000,
+            p=1,
+            w_min_inter=0.6,
+            w_max_inter=0.6,
+            max_data_gen_trials=20,
+            degree_intra=4,
+            degree_inter=7,
+        )
+        assert np.any(np.ones(df.shape) == df)
diff --git a/tests/structure/test_dynotears.py b/tests/structure/test_dynotears.py
new file mode 100644
index 0000000..bfb8ceb
--- /dev/null
+++ b/tests/structure/test_dynotears.py
@@ -0,0 +1,738 @@
+# Copyright 2019-2020 QuantumBlack Visual Analytics Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
+# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS
+# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN
+# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo
+# (either separately or in combination, "QuantumBlack Trademarks") are
+# trademarks of QuantumBlack. The License does not grant you any right or
+# license to the QuantumBlack Trademarks. You may not use the QuantumBlack
+# Trademarks or any confusingly similar mark as a trademark for your product,
+# or use the QuantumBlack Trademarks in any other manner that might cause
+# confusion in the marketplace, including but not limited to in advertising,
+# on websites, or on software.
+#
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+import pytest
+
+from causalnex.structure.dynotears import from_numpy_dynamic, from_pandas_dynamic
+
+
+class TestFromNumpyDynotears:
+    """Test behaviour of `learn_dynamic_structure` through `from_numpy_dynamic`"""
+
+    def test_empty_data_raises_error(self):
+        """
+        Providing an empty data set should result in a Value Error explaining that data must not be empty.
+        This error is useful to catch and handle gracefully, because otherwise the user would experience
+        misleading division by zero, or unpacking errors.
+        """
+
+        with pytest.raises(
+            ValueError, match="Input data X is empty, cannot learn any structure"
+        ):
+            from_numpy_dynamic(np.empty([0, 5]), np.zeros([5, 5]))
+        with pytest.raises(
+            ValueError, match="Input data Xlags is empty, cannot learn any structure"
+        ):
+            from_numpy_dynamic(np.zeros([5, 5]), np.empty([0, 5]))
+
+    def test_nrows_data_mismatch_raises_error(self):
+        """
+        Providing input data and lagged data with a different number of rows should result in a Value Error.
+        """
+
+        with pytest.raises(
+            ValueError, match="Input data X and Xlags must have the same number of rows"
+        ):
+            from_numpy_dynamic(np.zeros([5, 5]), np.zeros([6, 5]))
+
+    def test_ncols_lagged_data_not_multiple_raises_error(self):
+        """
+        If the number of columns of the lagged data is not a multiple of the number of columns
+        of the input data, this should result in a Value Error.
+        """
+
+        with pytest.raises(
+            ValueError,
+            match="Number of columns of Xlags must be a multiple of number of columns of X",
+        ):
+            from_numpy_dynamic(np.zeros([5, 5]), np.zeros([5, 6]))
+
+    def test_single_iter_gets_converged_fail_warnings(self, data_dynotears_p1):
+        """
+        With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
+        """
+
+        with pytest.warns(
+            UserWarning, match=r"Failed to converge\. Consider increasing max_iter."
+        ):
+            from_numpy_dynamic(
+                data_dynotears_p1["X"], data_dynotears_p1["Y"], max_iter=1
+            )
+
+    def test_naming_nodes(self, data_dynotears_p3):
+        """
+        Nodes should have the format {var}_lag{l}
+        """
+        sm = from_numpy_dynamic(data_dynotears_p3["X"], data_dynotears_p3["Y"])
+        pattern = re.compile(r"[0-5]_lag[0-3]")
+
+        for node in sm.nodes:
+            match = pattern.match(node)
+            assert match
+            assert match.group() == node
+
+    def test_inter_edges(self, data_dynotears_p3):
+        """
+        inter-slice edges must be {var}_lag{l} -> {var'}_lag0 , l > 0
+        """
+
+        sm = from_numpy_dynamic(data_dynotears_p3["X"], data_dynotears_p3["Y"])
+
+        for start, end in sm.edges:
+            if int(start[-1]) > 0:
+                assert int(end[-1]) == 0
+
+    def test_expected_structure_learned_p1(self, data_dynotears_p1):
+        """
+        Given a small data set with p=1, find all the intra-slice edges and the majority of the inter-slice ones
+        """
+
+        sm = from_numpy_dynamic(
+            data_dynotears_p1["X"], data_dynotears_p1["Y"], w_threshold=0.2
+        )
+        w_edges = [
+            ("{i}_lag0".format(i=i), "{j}_lag0".format(j=j))
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p1["W"][i, j] != 0
+        ]
+        a_edges = [
+            ("{i_1}_lag{i_2}".format(i_1=i % 5, i_2=1 + i // 5), "{j}_lag0".format(j=j))
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p1["A"][i, j] != 0
+        ]
+
+        edges_in_sm_and_a = [el for el in sm.edges if el in a_edges]
+        sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]]
+
+        assert sorted([el for el in sm.edges if "lag0" in el[0]]) == sorted(w_edges)
+        assert len(edges_in_sm_and_a) / len(a_edges) > 0.6
+        assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.9
+
+    def test_expected_structure_learned_p2(self, data_dynotears_p2):
+        """
+        Given a small data set with p=2, all the intra-slice edges must be correct, and 90%+ of them found.
+        The majority of the inter edges must be found too
+        """
+
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=0.25
+        )
+        w_edges = [
+            ("{i}_lag0".format(i=i), "{j}_lag0".format(j=j))
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p2["W"][i, j] != 0
+        ]
+        a_edges = [
+            ("{i_1}_lag{i_2}".format(i_1=i % 5, i_2=1 + i // 5), "{j}_lag0".format(j=j))
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p2["A"][i, j] != 0
+        ]
+
+        edges_in_sm_and_a = [el for el in sm.edges if el in a_edges]
+        sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]]
+        sm_intra_edges = [el for el in sm.edges if "lag0" in el[0]]
+
+        assert len([el for el in sm_intra_edges if el not in w_edges]) == 0
+        assert (
+            len([el for el in w_edges if el not in sm_intra_edges]) / len(w_edges)
+            <= 1.0
+        )
+        assert len(edges_in_sm_and_a) / len(a_edges) > 0.5
+        assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.5
+
+    def test_tabu_parents(self, data_dynotears_p2):
+        """
+        If tabu relationships are set, the corresponding edges must not exist
+        """
+
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"],
+            data_dynotears_p2["Y"],
+            tabu_parent_nodes=[1],
+        )
+        assert not [el for el in sm.edges if "1_lag" in el[0]]
+
+    def test_tabu_children(self, data_dynotears_p2):
+        """
+        If tabu relationships are set, the corresponding edges must not exist
+        """
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"],
+            data_dynotears_p2["Y"],
+            tabu_child_nodes=[4],
+        )
+        assert not ([el for el in sm.edges if "4_lag" in el[1]])
+
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"],
+            data_dynotears_p2["Y"],
+            tabu_child_nodes=[1],
+        )
+        assert not ([el for el in sm.edges if "1_lag" in el[1]])
+
+    def test_tabu_edges(self, data_dynotears_p2):
+        """
+        Tabu edges must not be in the edges learnt
+        """
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"],
+            data_dynotears_p2["Y"],
+            tabu_edges=[(0, 2, 4), (0, 0, 3), (1, 1, 4), (1, 3, 4)],
+        )
+
+        assert ("2_lag0", "4_lag0") not in sm.edges
+        assert ("0_lag0", "3_lag0") not in sm.edges
+        assert ("1_lag1", "4_lag0") not in sm.edges
+        assert ("3_lag1", "4_lag0") not in sm.edges
+
+    def test_multiple_tabu(self, data_dynotears_p2):
+        """
+        If tabu relationships are set, the corresponding edges must not exist
+        """
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"],
+            data_dynotears_p2["Y"],
+            tabu_edges=[(0, 1, 4), (0, 0, 3), (1, 1, 4), (1, 3, 4)],
+            tabu_child_nodes=[0, 1],
+            tabu_parent_nodes=[3],
+        )
+
+        assert ("1_lag0", "4_lag0") not in sm.edges
+        assert ("0_lag0", "3_lag0") not in sm.edges
+        assert ("1_lag1", "4_lag0") not in sm.edges
+        assert ("3_lag1", "4_lag0") not in sm.edges
+        assert not ([el for el in sm.edges if "0_lag" in el[1]])
+        assert not ([el for el in sm.edges if "1_lag" in el[1]])
+        assert not ([el for el in sm.edges if "3_lag" in el[0]])
+
+    def test_all_columns_in_structure(self, data_dynotears_p2):
+        """Every column that is in the data should become a node in the learned structure"""
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"],
+            data_dynotears_p2["Y"],
+        )
+        assert sorted(sm.nodes) == [
+            "{var}_lag{l_val}".format(var=var, l_val=l_val)
+            for var in range(5)
+            for l_val in range(3)
+        ]
+
+    def test_isolated_nodes_exist(self, data_dynotears_p2):
+        """Isolated nodes should still be in the learned structure"""
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=1
+        )
+        assert len(sm.edges) == 2
+        assert len(sm.nodes) == 15
+
+    def test_edges_contain_weight(self, data_dynotears_p2):
+        """Edges must contain the 'weight' from the adjacency matrix"""
+        sm = from_numpy_dynamic(data_dynotears_p2["X"], data_dynotears_p2["Y"])
+        assert np.all(
+            [
+                isinstance(w, (float, int, np.number))
+                for u, v, w in sm.edges(data="weight")
+            ]
+        )
+
+    def test_certain_relationships_get_near_certain_weight(self):
+        """If a == b always, there should be an edge a->b or b->a with coefficient close to one"""
+
+        np.random.seed(17)
+        data = pd.DataFrame(
+            [[np.sqrt(el), np.sqrt(el)] for el in np.random.choice(100, size=500)],
+            columns=["a", "b"],
+        )
+        sm = from_numpy_dynamic(data.values[1:], data.values[:-1], w_threshold=0.1)
+        edge = (
+            sm.get_edge_data("1_lag0", "0_lag0") or sm.get_edge_data("0_lag0", "1_lag0")
+        )["weight"]
+
+        assert 0.99 < edge <= 1.01
+
+    def test_inverse_relationships_get_negative_weight(self):
+        """If a == -b always, there should be an edge a->b or b->a with coefficient close to minus one"""
+
+        np.random.seed(17)
+        data = pd.DataFrame(
+            [[el, -el] for el in np.random.choice(100, size=500)], columns=["a", "b"]
+        )
+        sm = from_numpy_dynamic(data.values[1:], data.values[:-1], w_threshold=0.1)
+        edge = (
+            sm.get_edge_data("1_lag0", "0_lag0") or sm.get_edge_data("0_lag0", "1_lag0")
+        )["weight"]
+        assert -1.01 < edge <= -0.99
+
+    def test_no_cycles(self, data_dynotears_p2):
+        """
+        The learned structure should be acyclic
+        """
+
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=0.05
+        )
+        assert nx.algorithms.is_directed_acyclic_graph(sm)
+
+    def test_tabu_edges_on_non_existing_edges_do_nothing(self, data_dynotears_p2):
+        """If tabu edges do not exist in the original unconstrained network then nothing changes"""
+        sm = from_numpy_dynamic(
+            data_dynotears_p2["X"], data_dynotears_p2["Y"], w_threshold=0.2
+        )
+
+        sm_2 = from_numpy_dynamic(
+            data_dynotears_p2["X"],
+            data_dynotears_p2["Y"],
+            w_threshold=0.2,
+            tabu_edges=[(0, 0, 0), (0, 0, 1), (0, 0, 2), (0, 0, 3)],
+        )
+        assert set(sm_2.edges) == set(sm.edges)
+
+
+class TestFromPandasDynotears:
+    """Test behaviour of `learn_dynamic_structure` through `from_pandas_dynamic`"""
+
+    def test_empty_data_raises_error(self):
+        """
+        Providing an empty data set should result in a Value Error explaining that data must not be empty.
+        This error is useful to catch and handle gracefully, because otherwise the user would experience
+        misleading division by zero, or unpacking errors.
+        """
+        with pytest.raises(
+            ValueError, match="Input data X is empty, cannot learn any structure"
+        ):
+            from_pandas_dynamic(pd.DataFrame(np.empty([2, 5])), p=2)
+
+        with pytest.raises(
+            ValueError, match="Input data X is empty, cannot learn any structure"
+        ):
+            from_pandas_dynamic(pd.DataFrame(np.empty([1, 5])), p=1)
+
+        with pytest.raises(
+            ValueError, match="Input data X is empty, cannot learn any structure"
+        ):
+            from_pandas_dynamic(pd.DataFrame(np.empty([0, 5])), p=1)
+
+    def test_single_iter_gets_converged_fail_warnings(self, data_dynotears_p1):
+        """
+        With a single iteration on this dataset, learn_structure fails to converge and should give warnings.
+        """
+
+        with pytest.warns(
+            UserWarning, match=r"Failed to converge\. Consider increasing max_iter."
+        ):
+            from_pandas_dynamic(pd.DataFrame(data_dynotears_p1["X"]), p=1, max_iter=1)
+
+    def test_naming_nodes(self, data_dynotears_p3):
+        """
+        Nodes should have the format {var}_lag{l}
+        """
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+        )
+        pattern = re.compile(r"[abcde]_lag[0-3]")
+
+        for node in sm.nodes:
+            match = pattern.match(node)
+            assert match
+            assert match.group() == node
+
+    def test_inter_edges(self, data_dynotears_p3):
+        """
+        inter-slice edges must be {var}_lag{l} -> {var'}_lag0 , l > 0
+        """
+
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+        )
+        for start, end in sm.edges:
+            if int(start[-1]) > 0:
+                assert int(end[-1]) == 0
+
+    def test_expected_structure_learned_p1(self, data_dynotears_p1):
+        """
+        Given a small data set with p=1, find all the intra-slice edges and the majority of the inter-slice ones
+        """
+        df = pd.DataFrame(data_dynotears_p1["X"], columns=["a", "b", "c", "d", "e"])
+        df.loc[-1, :] = data_dynotears_p1["Y"][0, :]
+        df = df.sort_index()
+
+        sm = from_pandas_dynamic(
+            df,
+            p=1,
+            w_threshold=0.2,
+        )
+        map_ = dict(zip(range(5), ["a", "b", "c", "d", "e"]))
+        w_edges = [
+            ("{i}_lag0".format(i=map_[i]), "{j}_lag0".format(j=map_[j]))
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p1["W"][i, j] != 0
+        ]
+        a_edges = [
+            (
+                "{i_1}_lag{i_2}".format(i_1=map_[i % 5], i_2=1 + i // 5),
+                "{j}_lag0".format(j=map_[j]),
+            )
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p1["A"][i, j] != 0
+        ]
+
+        edges_in_sm_and_a = [el for el in sm.edges if el in a_edges]
+        sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]]
+        assert sorted(el for el in sm.edges if "lag0" in el[0]) == sorted(w_edges)
+        assert len(edges_in_sm_and_a) / len(a_edges) > 0.6
+        assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.9
+
+    def test_expected_structure_learned_p2(self, data_dynotears_p2):
+        """
+        Given a small data set with p=2, all the intra-slice edges must be correct, and 90%+ of them found.
+        The majority of the inter edges must be found too
+        """
+        df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"])
+        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
+        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]
+
+        df = df.sort_index()
+
+        sm = from_pandas_dynamic(
+            df,
+            p=2,
+            w_threshold=0.25,
+        )
+        map_ = dict(zip(range(5), ["a", "b", "c", "d", "e"]))
+        w_edges = [
+            ("{i}_lag0".format(i=map_[i]), "{j}_lag0".format(j=map_[j]))
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p2["W"][i, j] != 0
+        ]
+        a_edges = [
+            (
+                "{i_1}_lag{i_2}".format(i_1=map_[i % 5], i_2=1 + i // 5),
+                "{j}_lag0".format(j=map_[j]),
+            )
+            for i in range(5)
+            for j in range(5)
+            if data_dynotears_p2["A"][i, j] != 0
+        ]
+
+        edges_in_sm_and_a = [el for el in sm.edges if el in a_edges]
+        sm_inter_edges = [el for el in sm.edges if "lag0" not in el[0]]
+        sm_intra_edges = [el for el in sm.edges if "lag0" in el[0]]
+
+        assert len([el for el in sm_intra_edges if el not in w_edges]) == 0
+        assert (
+            len([el for el in w_edges if el not in sm_intra_edges]) / len(w_edges)
+            <= 1.0
+        )
+        assert len(edges_in_sm_and_a) / len(a_edges) > 0.5
+        assert len(edges_in_sm_and_a) / len(sm_inter_edges) > 0.5
+
+    def test_tabu_parents(self, data_dynotears_p3):
+        """
+        If tabu relationships are set, the corresponding edges must not exist
+        """
+
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+            tabu_parent_nodes=["a"],
+        )
+        assert not [el for el in sm.edges if "a_lag" in el[0]]
+
+    def test_tabu_children(self, data_dynotears_p3):
+        """
+        If tabu relationships are set, the corresponding edges must not exist
+        """
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+            tabu_child_nodes=["c", "d"],
+        )
+        assert not ([el for el in sm.edges if "c_lag" in el[1]])
+        assert not ([el for el in sm.edges if "d_lag" in el[1]])
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+            tabu_child_nodes=["a", "b"],
+        )
+        assert not ([el for el in sm.edges if "a_lag" in el[1]])
+        assert not ([el for el in sm.edges if "b_lag" in el[1]])
+
+    def test_tabu_edges(self, data_dynotears_p3):
+        """
+        Tabu edges must not be in the edges learnt
+        """
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+            tabu_edges=[(0, "c", "e"), (0, "a", "d"), (1, "b", "e"), (1, "d", "e")],
+        )
+
+        assert ("c_lag0", "e_lag0") not in sm.edges
+        assert ("a_lag0", "d_lag0") not in sm.edges
+        assert ("b_lag1", "e_lag0") not in sm.edges
+        assert ("d_lag1", "e_lag0") not in sm.edges
+
+    def test_multiple_tabu(self, data_dynotears_p3):
+        """
+        If tabu relationships are set, the corresponding edges must not exist
+        """
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+            tabu_edges=[(0, "a", "e"), (0, "a", "d"), (1, "b", "e"), (1, "d", "e")],
+            tabu_child_nodes=["a", "b"],
+            tabu_parent_nodes=["d"],
+        )
+
+        assert ("a_lag0", "e_lag0") not in sm.edges
+        assert ("a_lag0", "d_lag0") not in sm.edges
+        assert ("b_lag1", "e_lag0") not in sm.edges
+        assert ("d_lag1", "e_lag0") not in sm.edges
+        assert not ([el for el in sm.edges if "a_lag" in el[1]])
+        assert not ([el for el in sm.edges if "b_lag" in el[1]])
+        assert not ([el for el in sm.edges if "d_lag" in el[0]])
+
+    def test_all_columns_in_structure(self, data_dynotears_p2):
+        """Every column that is in the data should become a node in the learned structure"""
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]),
+            p=2,
+            w_threshold=0.4,
+        )
+        assert sorted(sm.nodes) == [
+            "{var}_lag{l_val}".format(var=var, l_val=l_val)
+            for var in ["a", "b", "c", "d", "e"]
+            for l_val in range(3)
+        ]
+
+    def test_isolated_nodes_exist(self, data_dynotears_p2):
+        """Isolated nodes should still be in the learned structure"""
+        df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"])
+        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
+        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]
+        df = df.sort_index()
+
+        sm = from_pandas_dynamic(df, p=2, w_threshold=1)
+        assert len(sm.edges) == 2
+        assert len(sm.nodes) == 15
+
+    def test_edges_contain_weight(self, data_dynotears_p3):
+        """Edges must contain the 'weight' from the adjacency matrix"""
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"]),
+            p=3,
+        )
+        assert np.all(
+            [
+                isinstance(w, (float, int, np.number))
+                for u, v, w in sm.edges(data="weight")
+            ]
+        )
+
+    def test_certain_relationships_get_near_certain_weight(self):
+        """If a == b always, there should be an edge a->b or b->a with coefficient close to one"""
+
+        np.random.seed(17)
+        data = pd.DataFrame(
+            [[np.sqrt(el), np.sqrt(el)] for el in np.random.choice(100, size=500)],
+            columns=["a", "b"],
+        )
+        sm = from_pandas_dynamic(data, p=1, w_threshold=0.1)
+        edge = (
+            sm.get_edge_data("b_lag0", "a_lag0") or sm.get_edge_data("a_lag0", "b_lag0")
+        )["weight"]
+
+        assert 0.99 < edge <= 1.01
+
+    def test_inverse_relationships_get_negative_weight(self):
+        """If a == -b always, there should be an edge a->b or b->a with coefficient close to minus one"""
+
+        np.random.seed(17)
+        data = pd.DataFrame(
+            [[el, -el] for el in np.random.choice(100, size=500)], columns=["a", "b"]
+        )
+        sm = from_pandas_dynamic(data, p=1, w_threshold=0.1)
+        edge = (
+            sm.get_edge_data("b_lag0", "a_lag0") or sm.get_edge_data("a_lag0", "b_lag0")
+        )["weight"]
+        assert -1.01 < edge <= -0.99
+
+    def test_no_cycles(self, data_dynotears_p2):
+        """
+        The learned structure should be acyclic
+        """
+        sm = from_pandas_dynamic(
+            pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"]),
+            p=2,
+            w_threshold=0.05,
+        )
+        assert nx.algorithms.is_directed_acyclic_graph(sm)
+
+    def test_tabu_edges_on_non_existing_edges_do_nothing(self, data_dynotears_p2):
+        """If tabu edges do not exist in the original unconstrained network then nothing changes"""
+        df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"])
+        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
+        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]
+        df = df.sort_index()
+
+        sm = from_pandas_dynamic(
+            df,
+            p=2,
+            w_threshold=0.2,
+        )
+        sm_2 = from_pandas_dynamic(
+            df,
+            p=2,
+            w_threshold=0.2,
+            tabu_edges=[(0, "a", "a"), (0, "a", "b"), (0, "a", "c"), (0, "a", "d")],
+        )
+        assert set(sm_2.edges) == set(sm.edges)
+
+    def test_list_of_dfs_as_input(self, data_dynotears_p2):
+        """
+        The result when given a list of dataframes should be the same as for a single dataframe.
+        Also, stacking two dataframes should give the same result as well
+        """
+        df = pd.DataFrame(data_dynotears_p2["X"], columns=["a", "b", "c", "d", "e"])
+        df.loc[-1, :] = data_dynotears_p2["Y"][0, :5]
+        df.loc[-2, :] = data_dynotears_p2["Y"][0, 5:10]
+
+        df = df.sort_index()
+        df_ = df.copy()
+        df_.index = range(100, 152)
+        df = pd.concat([df, df_])
+        sm = from_pandas_dynamic(df, p=2, w_threshold=0.05)
+        sm_1 = from_pandas_dynamic([df], p=2, w_threshold=0.05)
+        sm_2 = from_pandas_dynamic([df, df], p=2, w_threshold=0.05)
+
+        assert list(sm_2.edges) == list(sm_1.edges)
+        assert list(sm.edges) == list(sm_1.edges)
+
+        weights = np.array([w for _, _, w in sm.edges(data="weight")])
+        weights_1 = np.array([w for _, _, w in sm_1.edges(data="weight")])
+        weights_2 = np.array([w for _, _, w in sm_2.edges(data="weight")])
+        assert np.max(np.abs(weights - weights_1)) < 0.001
+        assert np.max(np.abs(weights - weights_2)) < 0.001
+
+    def test_discontinuity(self):
+        """
+        The result when there is a point of discontinuity must be the same as if we cut the df in two
+        (at the discontinuity point) and provide the two datasets as input.
+
+        This is because, internally, the algorithm cuts the dfs into continuous datasets
+        """
+        np.random.seed(12)
+        df = pd.DataFrame(np.random.random([100, 5]), columns=["a", "b", "c", "d", "e"])
+        df_2 = pd.DataFrame(
+            np.random.random([100, 5]),
+            columns=["a", "b", "c", "d", "e"],
+            index=np.arange(200, 300),
+        )
+
+        sm = from_pandas_dynamic(pd.concat([df, df_2], axis=0), p=2, w_threshold=0.05)
+        sm_1 = from_pandas_dynamic([df, df_2], p=2, w_threshold=0.05)
+
+        assert [(u, v, round(w, 3)) for u, v, w in sm_1.edges(data="weight")] == [
+            (u, v, round(w, 3)) for u, v, w in sm.edges(data="weight")
+        ]
+
+    def test_incorrect_input_format(self):
+        with pytest.raises(
+            ValueError,
+            match="Provided empty list of time_series."
+            " At least one DataFrame must be provided",
+        ):
+            from_pandas_dynamic([], 1)
+
+        with pytest.raises(
+            ValueError,
+            match=r"All columns must have numeric data\. 
" + r"Consider mapping the following columns to int: \['a'\]", + ): + from_pandas_dynamic(pd.DataFrame([["1"]], columns=["a"]), 1) + + with pytest.raises( + TypeError, + match="Time series entries must be instances of `pd.DataFrame`", + ): + from_pandas_dynamic([np.array([1, 2])], 1) + + with pytest.raises( + ValueError, + match="Index for dataframe must be provided in increasing order", + ): + df = pd.DataFrame(np.random.random([5, 5]), index=[3, 1, 2, 5, 0]) + from_pandas_dynamic(df, 1) + + with pytest.raises( + ValueError, + match="All inputs must have the same columns and same types", + ): + df = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "e"], + ) + df_2 = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "f"], + ) + from_pandas_dynamic([df, df_2], 1) + + with pytest.raises( + ValueError, + match="All inputs must have the same columns and same types", + ): + df = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "e"], + ) + df_2 = pd.DataFrame( + np.random.random([5, 5]), + columns=["a", "b", "c", "d", "e"], + ) + df_2["a"] = df_2["a"].astype(int) + from_pandas_dynamic([df, df_2], 1) + + with pytest.raises( + TypeError, + match="Index must be integers", + ): + df = pd.DataFrame(np.random.random([5, 5]), index=[0, 1, 2, 3.0, 4]) + from_pandas_dynamic(df, 1) diff --git a/tests/structure/test_transformers.py b/tests/structure/test_transformers.py new file mode 100644 index 0000000..4831a05 --- /dev/null +++ b/tests/structure/test_transformers.py @@ -0,0 +1,153 @@ +# Copyright 2019-2020 QuantumBlack Visual Analytics Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND +# NONINFRINGEMENT. IN NO EVENT WILL THE LICENSOR OR OTHER CONTRIBUTORS +# BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF, OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +# +# The QuantumBlack Visual Analytics Limited ("QuantumBlack") name and logo +# (either separately or in combination, "QuantumBlack Trademarks") are +# trademarks of QuantumBlack. The License does not grant you any right or +# license to the QuantumBlack Trademarks. You may not use the QuantumBlack +# Trademarks or any confusingly similar mark as a trademark for your product, +# or use the QuantumBlack Trademarks in any other manner that might cause +# confusion in the marketplace, including but not limited to in advertising, +# on websites, or on software. +# +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import re
+
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.exceptions import NotFittedError
+
+from causalnex.structure.transformers import DynamicDataTransformer
+
+
+class TestDynamicDataTransformer:
+    def test_naming_nodes(self, data_dynotears_p3):
+        """
+        Nodes should have the format {var}_lag{l}
+        """
+        df = pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"])
+        df_dyno = DynamicDataTransformer(p=3).fit_transform(df)
+
+        pattern = re.compile(r"[abcde]_lag[0-3]")
+        for node in df_dyno.columns:
+            match = pattern.match(node)
+            assert match
+            assert match.group() == node
+
+    def test_all_nodes_in_df(self, data_dynotears_p3):
+        """
+        Every variable should appear as a column once per lag, from lag 0 up to p
+        """
+        df = pd.DataFrame(data_dynotears_p3["X"], columns=["a", "b", "c", "d", "e"])
+        df_dyno = DynamicDataTransformer(p=3).fit_transform(df)
+
+        assert list(df_dyno.columns) == [
+            el + "_lag" + str(i) for i in range(4) for el in ["a", "b", "c", "d", "e"]
+        ]
+
+    def test_incorrect_input_format(self):
+        with pytest.raises(
+            ValueError,
+            match="Provided empty list of time_series."
+            " At least one DataFrame must be provided",
+        ):
+            DynamicDataTransformer(p=3).fit_transform([])
+
+        with pytest.raises(
+            ValueError,
+            match=r"All columns must have numeric data\. "
+            r"Consider mapping the following columns to int: \['a'\]",
+        ):
+            DynamicDataTransformer(p=1).fit_transform(
+                pd.DataFrame([["1"]], columns=["a"])
+            )
+
+        with pytest.raises(
+            TypeError,
+            match="Time series entries must be instances of `pd.DataFrame`",
+        ):
+            DynamicDataTransformer(p=1).fit_transform([np.array([1, 2])])
+
+        with pytest.raises(
+            ValueError,
+            match="Index for dataframe must be provided in increasing order",
+        ):
+            df = pd.DataFrame(np.random.random([5, 5]), index=[3, 1, 2, 5, 0])
+            DynamicDataTransformer(p=1).fit_transform(df)
+
+        with pytest.raises(
+            ValueError,
+            match="All inputs must have the same columns and same types",
+        ):
+            df = pd.DataFrame(
+                np.random.random([5, 5]),
+                columns=["a", "b", "c", "d", "e"],
+            )
+            df_2 = pd.DataFrame(
+                np.random.random([5, 5]),
+                columns=["a", "b", "c", "d", "f"],
+            )
+            DynamicDataTransformer(p=1).fit_transform([df, df_2])
+
+        with pytest.raises(
+            ValueError,
+            match="All inputs must have the same columns and same types",
+        ):
+            cols = ["a", "b", "c", "d", "e"]
+            df = pd.DataFrame(np.random.random([5, 5]), columns=cols)
+            df_2 = pd.DataFrame(np.random.random([5, 5]), columns=cols)
+            df_2["a"] = df_2["a"].astype(int)
+            DynamicDataTransformer(p=1).fit_transform([df, df_2])
+
+        with pytest.raises(
+            TypeError,
+            match="Index must be integers",
+        ):
+            df = pd.DataFrame(np.random.random([5, 5]), index=[0, 1, 2, 3.0, 4])
+            DynamicDataTransformer(p=1).fit_transform(df)
+
+    def test_not_fitted_transform(self):
+        """if transform is called before fit, raise an error"""
+        with pytest.raises(
+            NotFittedError,
+            match=r"This DynamicDataTransformer is not fitted yet\."
+            " Call `fit` before using this method",
+        ):
+            df = pd.DataFrame(np.random.random([5, 5]))
+            DynamicDataTransformer(p=1).transform(df)
+
+    def test_transform_wrong_input(self):
+        """If the transform df does not have all necessary columns, raise an error"""
+        with pytest.raises(
+            ValueError,
+            match="We should provide all necessary columns in "
+            r"the time series\. Columns not provided: \[2, 3\]",
+        ):
+            df = pd.DataFrame(np.random.random([5, 5]))
+            ddt = DynamicDataTransformer(p=1).fit(df)
+            ddt.transform(df.drop([2, 3], axis=1))
+
+    def test_return_df_true_equivalent_to_false(self):
+        """Check that the df from `return_df=True` is
+        equivalent to the result if `return_df=False`"""
+        df = pd.DataFrame(np.random.random([50, 10]))
+        df_dyno = DynamicDataTransformer(p=3).fit_transform(df, return_df=True)
+        X, Xlags = DynamicDataTransformer(p=3).fit_transform(df, return_df=False)
+        assert np.all(df_dyno.values[:, :10] == X)
+        assert np.all(df_dyno.values[:, 10:] == Xlags)
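Taken together, the tests above pin down a simple end-to-end workflow for the new dynotears API. A hedged sketch of that workflow follows; the data and parameter values are arbitrary and not part of the change itself:

```python
import numpy as np
import pandas as pd

from causalnex.structure.dynotears import from_pandas_dynamic

np.random.seed(17)

# Learn a dynamic structure directly from a plain time-series DataFrame;
# p is the maximum lag considered, w_threshold prunes small edge weights.
df = pd.DataFrame(np.random.random([500, 3]), columns=["a", "b", "c"])
sm = from_pandas_dynamic(df, p=2, w_threshold=0.1)

# Nodes follow the {var}_lag{l} convention, and inter-slice edges always
# point from a lagged node to a lag-0 node.
print(sorted(sm.nodes))
print(list(sm.edges(data="weight")))
```
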