From 7862b3c56dbc36f72b3c7f87d9b39e1ae78b4ddc Mon Sep 17 00:00:00 2001 From: FrizzoDavide <105226148+FrizzoDavide@users.noreply.github.com> Date: Sun, 7 Jan 2024 17:41:17 +0100 Subject: [PATCH] Update the MkDocs for the Library Documentation (#33) * Update mkdocs.yml * Update requirements.txt * Update extraction.md file so that it appears in the doc * Update md files to show the source code * Update docstrings in the py files * Update selection.py * Update mkdocs.yml * Update selection.py and .md * Update cast.py * Update denoising.py * Update entropy.py * Update outliers.py * Added extraction_frequency.py * Added scalers.md * Update mkdocs.yml * Update scalers.py * Transfer scaler.md to the docs/transformation/features folder * Added slicing.md and split.md * Adding operations.md and rolling_windows.md * Update operations.py and rolling_windows.py * Update operations.py and rolling_windows.py * Added transformation.md * Update transformation.py * Update mkdocs.yml * Update transformation.py --------- Co-authored-by: Lemeda98 Co-authored-by: Luciano Lorenti --- ceruleo/dataset/analysis/correlation.py | 23 +- ceruleo/dataset/analysis/distribution.py | 32 +- .../dataset/analysis/numerical_features.py | 121 +++--- ceruleo/dataset/analysis/sample_rate.py | 18 +- ceruleo/dataset/catalog/CMAPSS.py | 12 +- ceruleo/dataset/catalog/CMAPSS2.py | 2 - ceruleo/dataset/catalog/PHMDataset2018.py | 49 +-- ceruleo/dataset/ts_dataset.py | 139 +++++-- ceruleo/dataset/utils.py | 18 +- ceruleo/graphics/analysis.py | 5 +- ceruleo/graphics/duration.py | 88 +--- ceruleo/graphics/results.py | 106 ++--- ceruleo/iterators/batcher.py | 75 ++-- ceruleo/iterators/iterators.py | 148 +++---- ceruleo/iterators/sample_weight.py | 38 +- ceruleo/iterators/shufflers.py | 64 ++- ceruleo/iterators/utils.py | 18 +- ceruleo/models/baseline.py | 35 +- ceruleo/models/keras/callbacks.py | 10 +- ceruleo/models/keras/dataset.py | 40 +- ceruleo/models/keras/layers.py | 13 +- ceruleo/models/keras/losses.py | 35 +- ceruleo/models/sklearn.py | 76 ++-- ceruleo/results/results.py | 258 ++++++------ ceruleo/transformation/features/cast.py | 19 +- ceruleo/transformation/features/denoising.py | 185 +++++---- ceruleo/transformation/features/entropy.py | 29 +- ceruleo/transformation/features/extraction.py | 287 +++++++------ .../features/extraction_frequency.py | 90 +++-- ceruleo/transformation/features/hurst.py | 172 ++++---- ceruleo/transformation/features/imputers.py | 259 ++++++------ ceruleo/transformation/features/operations.py | 28 +- ceruleo/transformation/features/outliers.py | 199 +++++++-- ceruleo/transformation/features/resamplers.py | 82 ++-- .../features/rolling_windows.py | 34 +- ceruleo/transformation/features/scalers.py | 246 +++++++----- ceruleo/transformation/features/selection.py | 173 +++++++- ceruleo/transformation/features/slicing.py | 16 +- ceruleo/transformation/features/split.py | 33 +- .../transformation/features/transformation.py | 376 ++++++++---------- docs/models/keras/index.md | 2 +- docs/transformation/features/cast.md | 2 +- docs/transformation/features/denoising.md | 2 +- docs/transformation/features/entropy.md | 2 +- docs/transformation/features/extraction.md | 5 + .../features/extraction_frequency.md | 6 + docs/transformation/features/hurst.md | 6 + docs/transformation/features/imputers.md | 2 +- docs/transformation/features/operations.md | 6 + docs/transformation/features/outliers.md | 2 +- docs/transformation/features/resamplers.md | 2 +- .../features/rolling_windows.md | 6 + 
docs/transformation/features/scalers.md | 6 + docs/transformation/features/selection.md | 4 +- docs/transformation/features/slicing.md | 6 + docs/transformation/features/split.md | 6 + .../transformation/features/transformation.md | 6 + mkdocs.yml | 15 +- requirements.txt | 3 +- 59 files changed, 2085 insertions(+), 1655 deletions(-) create mode 100644 docs/transformation/features/extraction_frequency.md create mode 100644 docs/transformation/features/hurst.md create mode 100644 docs/transformation/features/operations.md create mode 100644 docs/transformation/features/rolling_windows.md create mode 100644 docs/transformation/features/scalers.md create mode 100644 docs/transformation/features/slicing.md create mode 100644 docs/transformation/features/split.md create mode 100644 docs/transformation/features/transformation.md diff --git a/ceruleo/dataset/analysis/correlation.py b/ceruleo/dataset/analysis/correlation.py index baa8b437..e99a0632 100644 --- a/ceruleo/dataset/analysis/correlation.py +++ b/ceruleo/dataset/analysis/correlation.py @@ -1,5 +1,3 @@ - - from itertools import combinations from typing import List, Optional, Tuple @@ -18,22 +16,21 @@ def correlation_analysis( Compute the correlation between all the features given an Iterable of executions. Parameters: - dataset: Dataset of time series corr_threshold: Threshold to consider two features of a single execution highly correlated features: List of features to consider when computing the correlations Returns: + A DataFrame indexed with the column names with the following columns: - pd.DataFrame: A DataFrame indexed with the column names with the following columns: + - Mean Correlation + - Std Correlation + - Percentage of lives with a high correlation + - Abs mean correlation + - Std mean correlation + - Max correlation + - Min correlation - - Mean Correlation - - Std Correlation - - Percentage of lives with a high correlation - - Abs mean correlation - - Std mean correlation - - Max correlation - - Min correlation """ if features is None: features = sorted(list(dataset.common_features())) @@ -41,7 +38,7 @@ def correlation_analysis( features = sorted(list(set(features).intersection(dataset.common_features()))) features = dataset.get_features_of_life(0)[features].corr().columns correlated_features = [] - + for ex in iterate_over_features(dataset): ex = ex[features] corr_m = ex.corr().fillna(0) @@ -73,4 +70,4 @@ def percentage_above_treshold(x): ) output["Max correlation"] = df.groupby(by=["Feature 1", "Feature 2"]).max() output["Min correlation"] = df.groupby(by=["Feature 1", "Feature 2"]).min() - return output \ No newline at end of file + return output diff --git a/ceruleo/dataset/analysis/distribution.py b/ceruleo/dataset/analysis/distribution.py index e0f59d6d..c3c793ac 100644 --- a/ceruleo/dataset/analysis/distribution.py +++ b/ceruleo/dataset/analysis/distribution.py @@ -19,7 +19,6 @@ def histogram_per_life( bins_to_use: np.ndarray, normalize: bool = True, ) -> List[np.ndarray]: - try: d = life[feature] h, _ = np.histogram(d, bins=bins_to_use) @@ -49,27 +48,22 @@ def features_divergeces( columns: Optional[List[str]] = None, show_progress: bool = False, ) -> pd.DataFrame: - """Compute the divergence between features + """ + Compute the divergence between features Parameters: - ds: The dataset number_of_bins: Number of bins columns: Which columns to use Returns: - df: A DataFrame in which cach row contains the - distances between a feature of two run-to-failure cycle - with the following columns: - - - Life 1: Run-to-failure 
cycle 1 - - Life 2: Run-to-failure cycle 2 - - W: Wasserstein - - KL: KL Divergence - - feature: The feature name - - - + A DataFrame in which each row contains the distances between a feature of two run-to-failure cycle with the following columns: + + - Life 1: Run-to-failure cycle 1 + - Life 2: Run-to-failure cycle 2 + - W: Wasserstein + - KL: KL Divergence + - feature: The feature name """ if columns is None: columns = ds.numeric_features() @@ -88,14 +82,16 @@ def features_divergeces( histograms[feature].append( histogram_per_life(life, feature, features_bins[feature]) ) - + df_data = [] for feature in columns: data = {} - for ((i, h1), (j, h2)) in itertools.combinations(enumerate(histograms[feature]), 2): + for (i, h1), (j, h2) in itertools.combinations( + enumerate(histograms[feature]), 2 + ): kl = (np.mean(kl_div(h1, h2)) + np.mean(kl_div(h2, h1))) / 2 wd = wasserstein_distance(h1, h2) df_data.append((i, j, wd, kl, feature)) df = pd.DataFrame(df_data, columns=["Life 1", "Life 2", "W", "KL", "feature"]) - return df \ No newline at end of file + return df diff --git a/ceruleo/dataset/analysis/numerical_features.py b/ceruleo/dataset/analysis/numerical_features.py index e87141a5..bf2cf20b 100644 --- a/ceruleo/dataset/analysis/numerical_features.py +++ b/ceruleo/dataset/analysis/numerical_features.py @@ -14,33 +14,31 @@ from ceruleo.dataset.utils import iterate_over_features_and_target -def entropy(s: np.ndarray)-> float: - """Approximate entropy +def entropy(s: np.ndarray) -> float: + """ + Approximate entropy The approximate entropy quantifies the amount of regularity and the unpredictability of fluctuations over time-series data. Parameters: - s: A single feature Returns: - - ae: Approximate entropy of feature s + Approximate entropy of feature s """ return ant.app_entropy(s) -def correlation(s: np.ndarray, y:Optional[np.ndarray]=None): - """Correlation of the feature with the target +def correlation(s: np.ndarray, y: Optional[np.ndarray] = None) -> float: + """ + Correlation of the feature with the target Parameters: - s: A single feature y: The RUL target Returns: - - c: Correlation between the feature ant the RUL target + Correlation between the feature and the RUL target """ N = s.shape[0] if not (s[0] == s).all(): @@ -51,31 +49,29 @@ def correlation(s: np.ndarray, y:Optional[np.ndarray]=None): return corr -def autocorrelation(s: np.ndarray)-> float: - """Autocorrelation of a feature +def autocorrelation(s: np.ndarray) -> float: + """ + Autocorrelation of a feature Parameters: - s: A single feature Returns: - - ac: Autocorrelation of the feature + Autocorrelation of the feature """ diff = np.diff(s) - return np.sum(diff ** 2) / s.shape[0] + return np.sum(diff**2) / s.shape[0] def monotonicity(s: np.ndarray) -> float: - """Monotonicity of a feature + """ + Monotonicity of a feature, the two extreme values are 0 if the feature is constant and 1 if it is strictly monotonic. 
Parameters: - s: A single feature Returns: - - f: Monotonicity of the feature + Monotonicity of the feature """ N = s.shape[0] diff = np.diff(s) @@ -83,62 +79,61 @@ def monotonicity(s: np.ndarray) -> float: def n_unique(s: np.ndarray) -> int: - """Number of unique values in the array + """ + Number of unique values in the array Parameters: - s: A single feature Returns: - - n: Number of unique values + Number of unique values """ return len(np.unique(s)) + def null(s: np.ndarray) -> float: - """Null proportion for a given feature + """ + Null proportion for a given feature Parameters: s: A feature Returns: - - n: Null proportion + Null proportion """ return np.mean(~np.isfinite(s)) -def mutual_information(x:np.ndarray, y:np.ndarray): + +def mutual_information(x: np.ndarray, y: np.ndarray) -> float: """Mutual information between a feature and the target [Reference](Remaining Useful Life Prediction Using Ranking Mutual Information Based Monotonic Health Indicator) Parameters: - x: A single feature y: RUL Target - Returns: - float: Mutual information between x and y + Returns: + Mutual information between x and y """ x = x.reshape(-1, 1) x = np.nan_to_num(x) return mutual_info_regression(x, y) + metrics = { "std": lambda x, y: np.std(x), - "correlation": lambda x,y: correlation(x, y), - "autocorrelation": lambda x,y:autocorrelation(x), - "monotonicity": lambda x,y:monotonicity(x), - "number_of_unique_elements": lambda x,y:n_unique(x), - 'mutual_information': mutual_information, - 'null': lambda x, y: null(x), - 'entropy': lambda x, y: entropy(x) + "correlation": lambda x, y: correlation(x, y), + "autocorrelation": lambda x, y: autocorrelation(x), + "monotonicity": lambda x, y: monotonicity(x), + "number_of_unique_elements": lambda x, y: n_unique(x), + "mutual_information": mutual_information, + "null": lambda x, y: null(x), + "entropy": lambda x, y: entropy(x), } - - def analysis_single_time_series( X: np.ndarray, y: np.ndarray, @@ -146,18 +141,18 @@ def analysis_single_time_series( data: Optional[Dict] = None, what_to_compute: List[str] = [], ) -> dict: - """Compute the analysis for a single run-to-failure cycle + """ + Compute the analysis for a single run-to-failure cycle Parameters: - - X: Features + X: Input Features y: RUL Target column_names: Column names of the features data: Initial data what_to_compute: Features to compute Returns: - dict: Computed info + Dictionary containing the computed info """ if data is None: @@ -171,12 +166,11 @@ def analysis_single_time_series( m = metrics[what](x_ts, y) - data[column_name][what].append(m) return data -def merge_analysis(data: dict): +def merge_analysis(data: dict) -> pd.DataFrame: data_df = defaultdict(lambda: defaultdict(list)) for column_name in data.keys(): for what in data[column_name]: @@ -195,25 +189,26 @@ def analysis( show_progress: bool = False, what_to_compute: List[str] = [], ) -> pd.DataFrame: - """Compute analysis of numerical features + """ + Compute analysis of numerical features Parameters: - - transformed_dataset: A transformed dataset with a features and target + dataset: A transformed dataset with features and target show_progress: Wether to show the progress when computing the features - what_to_compute: Elements available to compute - - std - - correlation - - autocorrelation, - - monotonicity - - number_of_unique_elements - - mutual_information - - null - - entropy + what_to_compute: Elements available to compute: + + - std + - Correlation + - Autocorrelation + - Monotonicity + - Number of unique elements + - Mutual 
information + - Null + - Entropy - Returns: - df: Dataframe with the columns specified by what_to_compute + Returns: + Dataframe with the columns specified by what_to_compute """ if len(what_to_compute) == 0: @@ -229,7 +224,5 @@ def analysis( column_names = dataset.numeric_features() for X, y in iterate_over_features_and_target(dataset): y = np.squeeze(y) - data = analysis_single_time_series( - X, y, column_names, data, what_to_compute - ) - return merge_analysis(data) \ No newline at end of file + data = analysis_single_time_series(X, y, column_names, data, what_to_compute) + return merge_analysis(data) diff --git a/ceruleo/dataset/analysis/sample_rate.py b/ceruleo/dataset/analysis/sample_rate.py index c65101e8..ca79226a 100644 --- a/ceruleo/dataset/analysis/sample_rate.py +++ b/ceruleo/dataset/analysis/sample_rate.py @@ -9,18 +9,17 @@ def sample_rate(ds: AbstractTimeSeriesDataset, unit: str = "s") -> np.ndarray: - """Obtain an array of time difference between two consecutive samples + """ + Obtain an array of time difference between two consecutive samples. - If the index it's a timestamp, the time difference will be converted to the provided - unit + If the index it's a timestamp, the time difference will be converted to the provided unit Parameters: ds: The dataset unit: Unit to convert the timestamps differences Returns: - - sample_rates: np.ndarray + Array of time differences """ time_diff = [] @@ -35,16 +34,15 @@ def sample_rate(ds: AbstractTimeSeriesDataset, unit: str = "s") -> np.ndarray: def sample_rate_summary( ds: AbstractTimeSeriesDataset, unit: Optional[str] = "s" ) -> pd.DataFrame: - """Obtain the main and standard deviation of the sample rate of the dataset + """ + Obtain the mean, mode and standard deviation of the sample rate of the dataset Parameters: - ds: The dataset unit: Unit to convert the time differences Returns: - df: Dataframe with the following columns - Mean sample rate, Std sample rate, Mode sample rate + A Dataframe with the following columns: Mean sample rate, Std sample rate, Mode sample rate """ sr = sample_rate(ds, unit) return pd.DataFrame( @@ -54,4 +52,4 @@ def sample_rate_summary( "Mode sample rate": pd.Series(sr).mode().values[0], }, index=["Dataset"], - ) \ No newline at end of file + ) diff --git a/ceruleo/dataset/catalog/CMAPSS.py b/ceruleo/dataset/catalog/CMAPSS.py index b35fb93b..e665c418 100644 --- a/ceruleo/dataset/catalog/CMAPSS.py +++ b/ceruleo/dataset/catalog/CMAPSS.py @@ -19,7 +19,7 @@ # Features used by -# Multiobjective Deep Belief Networks Ensemble forRemaining Useful Life Estimation in +# Multiobjective Deep Belief Networks Ensemble for Remaining Useful Life Estimation in # Prognostics Chong Zhang, Pin Lim, A. K. 
Qin,Senior Member, IEEE, and Kay Chen Tan,Fellow, IEEE sensor_indices = np.array([2, 3, 4, 7, 8, 9, 11, 12, 13, 14, 15, 17, 20, 21]) + (4 - 1) @@ -44,7 +44,6 @@ def obtain_raw_files(raw_data_path: Path = DATASET_PATH, ): """Download and unzip the raw files Parameters: - raw_data_path: Path where to store the dataset """ raw_data_path = raw_data_path / "files" @@ -136,17 +135,14 @@ class CMAPSSDataset(AbstractLivesDataset): Example: - - ``` py + ``` train_dataset = CMAPSSDataset(train=True, models='FD001') - validation_dataset = CMAPSSDataset(train=False, models='FD001') ``` Parameters: - - train: Wether to obtain the train data provided - models: Names of the models + train: Weather to obtain the train data provided, by default True + models: Names of the models, by default None (all models) """ def __init__( self, train: bool = True, models: Optional[Union[str, List[str]]] = None diff --git a/ceruleo/dataset/catalog/CMAPSS2.py b/ceruleo/dataset/catalog/CMAPSS2.py index 2a4f40e8..0ab4d1ce 100644 --- a/ceruleo/dataset/catalog/CMAPSS2.py +++ b/ceruleo/dataset/catalog/CMAPSS2.py @@ -142,9 +142,7 @@ class CMAPSS2Dataset(AbstractLivesDataset): [Dataset reference](https://data.phmsociety.org/2021-phm-conference-data-challenge/) Parameters: - train: Wether to obtain the train data provided - models: Names of the models """ def __init__( self, diff --git a/ceruleo/dataset/catalog/PHMDataset2018.py b/ceruleo/dataset/catalog/PHMDataset2018.py index 4da3af11..9d996489 100644 --- a/ceruleo/dataset/catalog/PHMDataset2018.py +++ b/ceruleo/dataset/catalog/PHMDataset2018.py @@ -19,7 +19,6 @@ logger = logging.getLogger(__name__) - COMPRESSED_FILE = "phm_data_challenge_2018.tar.gz" FOLDER = "phm_data_challenge_2018" @@ -44,25 +43,23 @@ def track_progress(members): download(path) logger.info("Decompressing dataset...") with tarfile.open(path / OUTPUT, "r") as tarball: + def is_within_directory(directory, target): - abs_directory = os.path.abspath(directory) abs_target = os.path.abspath(target) - + prefix = os.path.commonprefix([abs_directory, abs_target]) - + return prefix == abs_directory - + def safe_extract(tar, path=".", members=None, *, numeric_owner=False): - for member in tar.getmembers(): member_path = os.path.join(path, member.name) if not is_within_directory(path, member_path): raise Exception("Attempted Path Traversal in Tar File") - - tar.extractall(path, members, numeric_owner=numeric_owner) - - + + tar.extractall(path, members, numeric_owner=numeric_owner) + safe_extract(tarball, path=path, members=track_progress(tarball)) shutil.move(str(path / "phm_data_challenge_2018" / "train"), str(path / "train")) shutil.move(str(path / "phm_data_challenge_2018" / "test"), str(path / "test")) @@ -73,15 +70,14 @@ def safe_extract(tar, path=".", members=None, *, numeric_owner=False): class FailureType(Enum): """Failure types availables for the dataset. 
- Possible values are - - ```py + Possible values are: + ``` FailureType.FlowCoolPressureDroppedBelowLimit FailureType.FlowcoolPressureTooHighCheckFlowcoolPump FailureType.FlowcoolLeak ``` - """ + FlowCoolPressureDroppedBelowLimit = "FlowCool Pressure Dropped Below Limit" FlowcoolPressureTooHighCheckFlowcoolPump = ( "Flowcool Pressure Too High Check Flowcool Pump" @@ -105,14 +101,11 @@ def merge_data_with_faults( """Merge the raw sensor data with the fault information Parameters: - data_file: Path where the raw sensor data is located fault_data_file: Path where the fault information is located Returns: - - df: Dataframe indexed by time with the raw sensors and faults - The dataframe contains also a fault_number column + A Dataframe indexed by time with the raw sensors and faults. The dataframe contains also a fault_number column """ data = pd.read_csv(data_file).set_index("time") @@ -167,9 +160,9 @@ def prepare_dataset(dataset_path: Path): class PHMDataset2018(AbstractLivesDataset): """PHM 2018 Dataset - The 2018 PHM dataset is a public dataset released by Seagate which contains the execution of 20 different - ion milling machines. They distinguish three different failure causes and provide 22 features, - including user-defined variables and sensors. + The 2018 PHM dataset is a public dataset released by Seagate which contains the execution of 20 different + ion milling machines. They distinguish three different failure causes and provide 22 features, + including user-defined variables and sensors. Three faults are present in the dataset @@ -180,22 +173,18 @@ class PHMDataset2018(AbstractLivesDataset): [Dataset reference](https://phmsociety.org/conference/annual-conference-of-the-phm-society/annual-conference-of-the-prognostics-and-health-management-society-2018-b/phm-data-challenge-6/) Example: - - ```py + ``` dataset = PHMDataset2018( - failure_types=FailureType.FlowCoolPressureDroppedBelowLimit, - tools=['01_M02'] + failure_types=FailureType.FlowCoolPressureDroppedBelowLimit,tools=['01_M02'] ) ``` - - Parameters: - failure_types: List of failure types tools: List of tools path: Path where the dataset is located """ + def __init__( self, failure_types: Union[FailureType, List[FailureType]] = [l for l in FailureType], @@ -245,8 +234,8 @@ def _load_life(self, filename: str) -> pd.DataFrame: def get_time_series(self, i: int) -> pd.DataFrame: df = self._load_life(self.lives.iloc[i]["Filename"]) - df.index = pd.to_timedelta(df.index, unit='s') - df = df[df['FIXTURESHUTTERPOSITION'] == 1] + df.index = pd.to_timedelta(df.index, unit="s") + df = df[df["FIXTURESHUTTERPOSITION"] == 1] df["RUL"] = np.arange(df.shape[0] - 1, -1, -1) return df diff --git a/ceruleo/dataset/ts_dataset.py b/ceruleo/dataset/ts_dataset.py index 6b5df65e..bd6c45f5 100644 --- a/ceruleo/dataset/ts_dataset.py +++ b/ceruleo/dataset/ts_dataset.py @@ -1,12 +1,13 @@ - from collections.abc import Iterable from re import S from typing import Any, List, Tuple, Union import numpy as np import pandas as pd + try: import tensorflow as tf + TENSORFLOW_ENABLED = True except: TENSORFLOW_ENABLED = False @@ -14,6 +15,7 @@ from tqdm.auto import tqdm from abc import abstractmethod, abstractproperty + class DatasetIterator: def __init__(self, dataset): self.dataset = dataset @@ -59,20 +61,20 @@ def duration(self, life: pd.DataFrame) -> float: """Obtain the duration of the time-series Parameters: - i: Index of the life + life: The input life Returns: - duration: Duration of the life + Duration of the life """ v = life.index return v.max() - 
v.min() def durations(self, show_progress: bool = False) -> List[float]: - """Obtain the length of each life + """ + Obtain the length of each life Return: - - durations: List of durations + List of durations """ if self._durations is None: if show_progress: @@ -86,15 +88,16 @@ def durations(self, show_progress: bool = False) -> List[float]: def __call__(self, i): return self[i] - def get_features_of_life(self, i:int ) -> pd.DataFrame: + def get_features_of_life(self, i: int) -> pd.DataFrame: return self[i] - def __getitem__(self, i: Union[int, Iterable]): + def __getitem__( + self, i: Union[int, Iterable] + ) -> Union[pd.DataFrame, "FoldedDataset"]: """Obtain a time-series or an splice of the dataset using a FoldedDataset Parameters: - - i: If the parameter is an in it will return a pd.DataFrame with the i-th time-series. + i: If the parameter is an int it will return a pd.DataFrame with the i-th time-series. If the parameter is a list of int it will return a FoldedDataset with the time-series whose id are present in the list @@ -102,8 +105,8 @@ def __getitem__(self, i: Union[int, Iterable]): ValueError: When the list does not contain integer parameters Returns: - pd.DataFrame: the i-th time-series - FoldedDataset: The dataset with the lives specified by the list + The i-th time-series + An instance of class FoldedDataset containing the dataset with the lives specified by the list """ if isinstance(i, slice): i = range( @@ -126,12 +129,12 @@ def __getitem__(self, i: Union[int, Iterable]): def shape(self) -> Tuple[int, int]: return (self.n_time_series, 1) - def __len__(self): + def __len__(self) -> int: """ - Return: - - n: The number of time-series in the dataset + Compute the number of lifes in the dataset + Return: + Number of time-series in the dataset """ return self.n_time_series @@ -145,12 +148,12 @@ def to_pandas( Create a dataset with the time-series concatenated Parameters: - - proportion_of_lives: Proportion of lives to use. 
+ proportion_of_lives: Proportion of lives to use, by default 1 + subsample_proportion: Proportion of samples to use, by default 1 + show_progress: Whether to show progress when concatenating the lives, by default False Returns: - - df: a DataFrame with all the lives concatenated + A DataFrame with all the lives concatenated """ if show_progress: bar = tqdm @@ -165,7 +168,6 @@ def to_pandas( ) for i in bar(range(self.n_time_series)): - if proportion_of_lives < 1.0 and np.random.rand() > proportion_of_lives: continue @@ -198,13 +200,35 @@ def _compute_common_features( def common_features( self, show_progress: bool = False, proportion_of_lives: float = 1.0 ) -> List[str]: + """ + Compute the common features of the dataset among the different lives + + Parameters: + proportion_of_lives: Proportion of lives to use, by default 1 + show_progress: Whether to show progress when computing the common features, by default False + + Returns: + A list with the common features + """ if self._common_features is None: self._common_features = self._compute_common_features( proportion_of_lives, show_progress=show_progress ) return self._common_features - def map(self, transformer, cache_size: int = None): + def map( + self, transformer: "TransformedDataset", cache_size: int = None + ) -> "TransformedDataset": + """ + Apply a transformation to the dataset + + Parameters: + transformer: The transformation to apply + cache_size: The size of the cache to use, by default None + + Returns: + The transformed dataset as an instance of class TransformedDataset + """ from ceruleo.dataset.transformed import TransformedDataset return TransformedDataset(self, transformer, cache_size=cache_size) @@ -213,12 +237,10 @@ def numeric_features(self, show_progress: bool = False) -> List[str]: """Obtain the list of the common numeric features in the dataset Parameters: - - show_progress : Whether to show progress when computing the common features, by default False + show_progress: Whether to show progress when computing the common features, by default False Returns: - - l: List of columns + List of columns containing the common numeric features """ features = self.common_features(show_progress=show_progress) @@ -230,6 +252,14 @@ def numeric_features(self, show_progress: bool = False) -> List[str]: ) def categorical_features(self, show_progress: bool = False) -> List[str]: + """Obtain the list of the common categorical features in the dataset + + Parameters: + show_progress: Whether to show progress when computing the common features + + Returns: + List of columns containing the common numeric features + """ features = self.common_features(show_progress=show_progress) df = self.get_time_series(0) return list( @@ -240,6 +270,10 @@ def categorical_features(self, show_progress: bool = False) -> List[str]: class FoldedDataset(AbstractTimeSeriesDataset): + """ + Dataset containing a subset of the time-series. 
An instanc of this class can be obtained by slicing an AbstractTimeSeriesDataset with a list of indexes + """ + def __init__(self, dataset: AbstractTimeSeriesDataset, indices: list): super().__init__() self.dataset = dataset @@ -252,22 +286,62 @@ def __getattribute__(self, __name: str) -> Any: return self.dataset.__getattribute__(__name) @property - def n_time_series(self): + def n_time_series(self) -> int: + """ + Compute the number of lifes in the folded dataset + + Return: + Number of time-series in the folded dataset + + """ return len(self.indices) - def get_time_series(self, i: int): + def get_time_series(self, i: int) -> pd.DataFrame: + """ + Obtain the i-th time-series in the folded dataset + + Parameters: + i: Index of the life + + Returns: + The i-th time-series + """ return self.dataset[self.indices[i]] - def _original_index(self, i: int): + def _original_index(self, i: int) -> int: + """ + Obtain the index of the i-th time-series in the original dataset + + Parameters: + i: Index of the life + + Returns: + The index of the i-th time-series in the original dataset + """ if isinstance(self.dataset, FoldedDataset): return self.dataset._original_index(self.indices[i]) else: return self.indices[i] - def original_indices(self): + def original_indices(self) -> List[int]: + """ + Obtain the original indices for all the time-series in the FoldedDataset + + Returns: + The original indices for all the time-series in the FoldedDataset + """ return [self._original_index(i) for i in range(len(self.indices))] def number_of_samples_of_time_series(self, i: int) -> int: + """ + Compute the number of samples of the i-th time-series in the FoldedDataset + + Parameters: + i: Index of the life + + Returns: + Number of samples of the i-th time-series in the FoldedDataset + """ return self[i][0].shape[0] def __reduce_ex__(self, __protocol) -> Union[str, Tuple[Any, ...]]: @@ -275,7 +349,8 @@ def __reduce_ex__(self, __protocol) -> Union[str, Tuple[Any, ...]]: class AbstractLivesDataset(AbstractTimeSeriesDataset): - """Base class for RUL estimation dataset + """ + Base class for RUL estimation dataset Three abstract methods must be implemented to start using CERULEo with your data: @@ -284,10 +359,10 @@ class AbstractLivesDataset(AbstractTimeSeriesDataset): * `n_time_series(self) -> int`: The property return the total number of lives present in the dataset * `rul_column(self) -> str`: The property should return the name of the RUL column """ + @abstractproperty def rul_column(self) -> str: raise NotImplementedError def duration(self, life: pd.DataFrame) -> float: return life[self.rul_column].max() - diff --git a/ceruleo/dataset/utils.py b/ceruleo/dataset/utils.py index 9c63b809..ce3dbc06 100644 --- a/ceruleo/dataset/utils.py +++ b/ceruleo/dataset/utils.py @@ -35,20 +35,22 @@ def iterate_over_target(ds: Union[TransformedDataset, AbstractLivesDataset]): Parameters: ds: The dataset - + Returns: it: The iterator """ if isinstance(ds, TransformedDataset): return map(lambda x: x[1], ds) - elif hasattr(ds, 'rul_column'): + elif hasattr(ds, "rul_column"): return map(lambda x: x[ds.rul_column], ds) else: - raise ValueError('Invalid dataset type used') - + raise ValueError("Invalid dataset type used") + -def iterate_over_features_and_target(ds: Union[TransformedDataset, AbstractLivesDataset]): +def iterate_over_features_and_target( + ds: Union[TransformedDataset, AbstractLivesDataset] +): """Helper function to iterate over the features and RUL target in a dataset Example: @@ -60,14 +62,14 @@ def 
iterate_over_features_and_target(ds: Union[TransformedDataset, AbstractLives Parameters: ds: The dataset - + Returns: it: The iterator """ if isinstance(ds, TransformedDataset): return map(lambda x: (x[0], x[1]), ds) - elif hasattr(ds, 'rul_column'): + elif hasattr(ds, "rul_column"): return map(lambda x: (x, x[ds.rul_column]), ds) else: - raise ValueError('Invalid dataset type used') \ No newline at end of file + raise ValueError("Invalid dataset type used") diff --git a/ceruleo/graphics/analysis.py b/ceruleo/graphics/analysis.py index 39ebd6c9..e7254e0c 100644 --- a/ceruleo/graphics/analysis.py +++ b/ceruleo/graphics/analysis.py @@ -12,18 +12,17 @@ def plot_correlation_analysis( features: Optional[List[str]] = None, ax: Optional[matplotlib.axes.Axes] = None, **kwargs, -): +) -> matplotlib.axes.Axes: """Plot the correlated features in a dataset Parameters: - dataset: The dataset corr_threshold: Minimum threshold to consider that the correlation is high features: List of features ax: The axis where to draw Returns: - ax: the axis + The plot axis """ if features is not None: diff --git a/ceruleo/graphics/duration.py b/ceruleo/graphics/duration.py index 73ca9b44..facb764a 100644 --- a/ceruleo/graphics/duration.py +++ b/ceruleo/graphics/duration.py @@ -40,37 +40,31 @@ def durations_histogram( ) -> matplotlib.axes.Axes: """Generate an histogram from the lives durations of the dataset - Example: - + ''' durations_histogram( [train_dataset,validation_dataset], label=['Train','Validation'], xlabel='Unit Cycles', units='cycles', figsize=(17, 5)); + ''' Parameters: - datasets: Dataset from which take the lives durations - xlabel: Label of the x axis - label: Label of each dataset to use as label in the boxplot - bins: Number of bins to compute in the histogram - units: Units of time of the lives. Useful to generate labels - vlines: Vertical lines to be added to the plot - - Each element of the list should be the x position in the first element of the tuple, - and the second elmenet of the tuple should be the label of the line - ax: Axis where to draw the plot. - If missing a new figure will be created - add_mean: Whether to add a vertical line with the mean value - add_median: whether to add a vertical line with the median value - transform: A function to transform each duration - threshold: Includes duration less than the threshold + xlabel: Label of the x axis, by default Cycle Duration + label: Label of each dataset to use as label in the boxplot, by default 1 + bins: Number of bins to compute in the histogram, by default 15 + units: Units of time of the lives. Useful to generate labels, by default m + vlines: Vertical lines to add to the figure in the form [(x_coordinate, label)] + ax: Axis where to draw the plot. 
If missing a new figure will be created + add_mean: Whether to add a vertical line with the mean value, by default True + add_median: whether to add a vertical line with the median value, by default True + transform: A function to transform each duration, by default identity transform + threshold: Includes duration less than the threshold, by default np.inf Returns: - - ax: The axis in which the histogram was created + The axis in which the histogram was created """ if isinstance(datasets, list): @@ -114,30 +108,7 @@ def histogram_from_durations( alpha=1.0, **kwargs, ) -> matplotlib.axes.Axes: - """Generate an histogram from the lives durations - - Parameters: - - durations: Duration of each run-to-failure cycle - xlabel: Label of the x axis - label: Label of each dataset to use as label in the boxplot - bins: Number of bins to compute in the histogram - units: Units of time of the lives. Useful to generate labels - vlines: Vertical lines to be added to the plot - - Each element of the list should be the x position in the first element of the tuple, - and the second elmenet of the tuple should be the label of the line - ax: Axis where to draw the plot. - If missing a new figure will be created - add_mean: Whether to add a vertical line with the mean value - add_median: whether to add a vertical line with the median value - transform: A function to transform each duration - threshold: Includes duration less than the threshold - - Returns: - - ax: The axis in which the histogram was created - """ + if ax is None: _, ax = plt.subplots(1, 1, **kwargs) @@ -193,24 +164,17 @@ def durations_boxplot( figsize=(17, 5)) Parameters: - datasets: Dataset from which take the lives durations xlabel: Label of each dataset to use as label in the boxplot ylabel: Label of the y axis - ax: Axis where to draw the plot. - If missing a new figure will be created - hlines: Horizontal lines to be added to the plot - - Each element of the list should be the y position in the first element of the tuple, - and the second element of the tuple should be the label of the line + ax: Axis where to draw the plot.If missing a new figure will be created + hlines: Horizontal lines to add to the figure in the form [(y_coordinate, label)] units: Units of time of the lives. 
Useful to generate labels transform: A function to transform each duration maxy: Maximum y value of the plot - Returns: - - ax: Axis where plot has been drawn + Axis where plot has been drawn """ if isinstance(datasets, list): assert isinstance(xlabel, list) @@ -245,23 +209,7 @@ def boxplot_from_durations( maxy: Optional[float] = None, **kwargs, )-> matplotlib.axes.Axes: - """Generate an histogram from a list of durations - - Parameters: - - durations: Durations of run-to-cycle-failure - xlabel: Label of each dataset to use as label in the boxplot - ylabel: Label of the y axis - ax: Axes where the figure will be plotted - hlines: Horizontal lines to add to the figure in the form - [(y_coordinate, label)] - units: Units of the y axis - maxy: Maximum y-axis value - - Returns: - ax: Axes where the figure has been drawn - - """ + if isinstance(durations[0], list): assert isinstance(xlabel, list) assert len(durations) == len(xlabel) diff --git a/ceruleo/graphics/results.py b/ceruleo/graphics/results.py index 3d1bd388..73caca99 100644 --- a/ceruleo/graphics/results.py +++ b/ceruleo/graphics/results.py @@ -30,7 +30,6 @@ def plot_lives(ds: TransformedDataset): Plot each life Parameters: - ds: A transformed dataset """ fig, ax = plt.subplots() @@ -152,12 +151,12 @@ def boxplot_errors_wrt_RUL( x_axis_label: Optional[str] = None, ax=None, **kwargs, -): - """Boxplots of difference between true and predicted RUL over Cross-validated results +) -> matplotlib.axes.Axes: + """ + Boxplots of difference between true and predicted RUL over Cross-validated results Parameters: - results_dict: Dictionary with the results of the fitted models nbins: Number of bins to divide the y_axis_label: Optional string to be added to the y axis @@ -166,11 +165,10 @@ def boxplot_errors_wrt_RUL( If an axis is not provided, it will create one. 
Keyword arguments: - **kwargs Return: - ax + The axis in which the plot has been made """ if ax is None: fig, ax = plt.subplots(**kwargs) @@ -197,21 +195,19 @@ def _cv_barplot_errors_wrt_RUL_multiple_models( color_palette: str = "hls", bar_width: float=1/1.5, **kwargs, -): - """Plot the barplots given the errors +) -> Tuple[matplotlib.figure.Figure ,matplotlib.axes.Axes]: + """ + Plot the barplots given the errors Parameters: - bin_edges: np.ndarray: - model_results: Dictionary with the results ax: Axis y_axis_label: Y Label x_axis_label: X Label Returns: - - Tuple[fig, axis] + The plot axis """ if ax is None: fig, ax = plt.subplots(**kwargs) @@ -285,10 +281,10 @@ def barplot_errors_wrt_RUL( color_palette: str = "hls", **kwargs, ): - """Barlots of difference between true and predicted RUL + """ + Barlots of difference between true and predicted RUL Parameters: - results_dict: Dictionary with the results for each model nbins: Number of bins in wich divide the RUL target y_axis_label: Y label @@ -311,15 +307,20 @@ def barplot_errors_wrt_RUL( def _cv_shadedline_plot_errors_wrt_RUL_multiple_models( - bin_edges, - model_results, - ax=None, - y_axis_label=None, - x_axis_label=None, + bin_edges: np.array, + model_results: dict, + ax: Optional[matplotlib.axes.Axes]=None, + y_axis_label: Optional[str] =None, + x_axis_label: Optional[str] =None, **kwargs, ): """Plot a shaded regions for each model + Parameters: + bin_edges: Bin Edges + model_results: Dictionary with the results for the model + y_axis_label: Y label + x_axis_label: X label """ if ax is None: fig, ax = plt.subplots(**kwargs) @@ -398,22 +399,23 @@ def _cv_shadedline_plot_errors_wrt_RUL_multiple_models( def shadedline_plot_errors_wrt_RUL( results_dict: dict, nbins: int, - y_axis_label=None, - x_axis_label=None, - ax=None, + y_axis_label: Optional[str] =None, + x_axis_label: Optional[str] =None, + ax: Optional[matplotlib.axes.Axes] =None, **kwargs, -): - """Shaded line +) -> matplotlib.axes.Axes: + """ + Shaded line Parameters: - results_dict: _description_ - nbins:_description_ - y_axis_label: _description_, by default None - x_axis_label:_description_, by default None - ax: _description_, by default None + results_dict: Dictionary with the results for the model + nbins: Number of bins + y_axis_label: Y label + x_axis_label: X label + ax: Plot axis Returns: - ax: The axis + The plot axis """ if ax is None: @@ -464,7 +466,8 @@ def plot_unexpected_breaks( add_shade: bool = True, **kwargs, ) -> matplotlib.axes.Axes: - """Plot the risk of unexpected breaks with respect to the maintenance window + """ + Plot the risk of unexpected breaks with respect to the maintenance window Parameters: results_dict: Dictionary with the results @@ -474,8 +477,7 @@ def plot_unexpected_breaks( units: Units to use in the xlabel, by default "" Returns: - - ax: The axis in which the plot was made + The axis in which the plot was made """ if ax is None: fig, ax = plt.subplots(**kwargs) @@ -555,17 +557,18 @@ def label_formatter(x): def plot_life( life: FittedLife, - ax=None, + ax: Optional[matplotlib.axes.Axes]=None, units: Optional[str] = "", markersize: float = 0.7, add_fitted: bool = False, - plot_target:bool = True, - add_regressed:bool = True, + plot_target: bool = True, + add_regressed: bool = True, start_x:int= 0, label:str = '', **kwargs, -): - """Plot a single life +) -> matplotlib.axes.Axes: + """ + Plot a single life Parameters: life: A fitted life @@ -580,8 +583,7 @@ def plot_life( label: Returns: - - ax: Axis + The plot axis """ if ax is None: _, ax = 
plt.subplots(1, 1, **kwargs) @@ -630,24 +632,23 @@ def plot_life( def plot_predictions_grid( results: Union[PredictionResult, List[PredictionResult]], ncols: int = 3, - alpha=1.0, + alpha: float =1.0, xlabel: Optional[str] = None, ylabel: Optional[str] = None, **kwargs, ): - """Plot a matrix of predictions + """ + Plot a matrix of predictions Parameters: - results: Dictionary with the results - ncols: Number of colmns in the plot, by default 3 - alpha: Opacity of the predicted curves, by default 1.0 - xlabel: Xlabel, by default None - ylabel: YLabel, by default None + ncols: Number of colmns in the plot + alpha: Opacity of the predicted curves + xlabel: Xlabel + ylabel: YLabel Return: - - ax: The axis on which the plot has been made + The axis on which the plot has been made """ def linear_to_subindices(i, ncols): @@ -700,10 +701,10 @@ def plot_predictions( model_name:str = '', **kwargs, ) -> matplotlib.axes.Axes: - """Plots the predicted and the true remaining useful lives + """ + Plots the predicted and the true remaining useful lives Parameters: - result: A PredictionResult object or a tuple with (y_true, y_predicted) ax: Axis to plot. If it is missing a new figure will be created units: Units of time to be used in the axis labels @@ -714,8 +715,7 @@ def plot_predictions( Returns: - - ax: The axis on which the plot has been made + The axis on which the plot has been made """ if ax is None: _, ax = plt.subplots(1, 1, **kwargs) diff --git a/ceruleo/iterators/batcher.py b/ceruleo/iterators/batcher.py index 6689fa35..944a8857 100644 --- a/ceruleo/iterators/batcher.py +++ b/ceruleo/iterators/batcher.py @@ -22,24 +22,10 @@ class Batcher: - """WindowedIterator Batcher - - Example: - - ``` py - batcher = Batcher.new(transformed_dataset, - window=150, - batch_size=64, - step=1, - horizon=1) - X, y, data = next(batcher) - X.shape - - (64, 150, n_features) - ``` + """ + WindowedIterator Batcher Parameters: - iterator: Dataset iterator batch_size: int @@ -66,17 +52,17 @@ def new( sample_weight: SampleWeight = NotWeighted(), right_closed: bool = True, padding: bool = False, - ): - """Batcher constructor from a dataset + ) -> "Batcher": + """ + Batcher constructor from a dataset - The method constructs WindowedDatasetIterator from the dataset and + The method constructs a WindowedDatasetIterator from the dataset and then a Batcher from the iterator. Most of the parameters come from the WindowedDatasetIterator, Example: - - ``` py + ``` batcher = Batcher.new(transformed_dataset, window=150, batch_size=64, @@ -86,23 +72,20 @@ def new( X.shape (64, 150, n_features) - ``` + ``` Parameters: - dataset: Dataset from which the batcher will be created batch_size: Batch size step: strides - horizon: Size of the horizon to predict. By default 1 + horizon: Size of the horizon to predict. 
shuffle: AbstractShuffler sample_weight: SampleWeight right_closed: bool - padding: wheter to pad data if there are not enough points - to fill the window + padding: wheter to pad data if there are not enough points to fill the window Returns: - - batcher: A new constructed batcher + A new constructed batcher """ iterator = WindowedDatasetIterator( dataset, @@ -118,11 +101,11 @@ def new( return b def __len__(self) -> int: - """Number of batches + """ + Number of batches Returns: - - batches: Number of batches in the iterator + Number of batches in the iterator """ if len(self.iterator) is None: return None @@ -136,47 +119,45 @@ def __iter__(self): @property def n_features(self) -> int: - """Number of features of the transformed dataset + """ + Number of features of the transformed dataset - This is a helper method to obtain the transformed - dataset information from the WindowedDatasetIterator + This is a helper method to obtain the transformed dataset information from the WindowedDatasetIterator Returns: - - features: Number of features of the transformed dataset + Number of features of the transformed dataset """ return self.iterator.n_features @property def window_size(self) -> int: - """Lookback window size + """ + Lookback window size - This is a helper method to obtain the WindowedDatasetIterator - information + This is a helper method to obtain the WindowedDatasetIterator information Returns: - - window: Lookback window size + Lookback window size """ return self.iterator.window_size @property def output_shape(self) -> int: - """Number of values returned as target by each sample + """ + Number of values returned as target by each sample Returns: - - output_size: Number of values returned as target by each sample + Number of values returned as target by each sample """ return self.iterator.output_size @property def input_shape(self) -> Tuple[int, int]: - """Tuple containing (window_size, n_features) + """ + Tuple containing (window_size, n_features) Returns: - - window_size, n_features + (window_size, n_features) """ return self.iterator.input_shape diff --git a/ceruleo/iterators/iterators.py b/ceruleo/iterators/iterators.py index 85674466..8ce8ba2f 100644 --- a/ceruleo/iterators/iterators.py +++ b/ceruleo/iterators/iterators.py @@ -19,7 +19,11 @@ import numpy as np import pandas as pd from ceruleo.dataset.transformed import TransformedDataset -from ceruleo.iterators.sample_weight import AbstractSampleWeights, NotWeighted, SampleWeight +from ceruleo.iterators.sample_weight import ( + AbstractSampleWeights, + NotWeighted, + SampleWeight, +) from ceruleo.iterators.shufflers import AbstractShuffler, NotShuffled from tqdm.auto import tqdm import functools @@ -28,27 +32,27 @@ logger = logging.getLogger(__name__) - def seq_to_seq_signal_generator( - signal_X:np.ndarray, - signal_Y:np.ndarray, + signal_X: np.ndarray, + signal_Y: np.ndarray, i: int, window_size: int, output_size: int = 1, right_closed: bool = True, ) -> Tuple[np.ndarray, np.ndarray]: - """Generator for sequence to sequence models + """ + Generator for sequence to sequence models Parameters: signal_X: The input signal signal_Y: The output signal i: Current index - window_size: indow size - output_size: Output sequence length, by default 1 - right_closed: Wether the lsat input of the windwo is included or not, by default True + window_size: Window size + output_size: Output sequence length + right_closed: Weather the last input of the window is included or not Returns: - Input and ouput sequences + A Tuple with the Input and 
output sequences """ initial = max(i - window_size + 1, 0) @@ -73,23 +77,22 @@ def windowed_signal_generator( window_size: int, output_size: int = 1, right_closed: bool = True, -): +) -> Tuple[np.ndarray, float]: """ Return a lookback window and the value to predict. Parameters: - data: Matrix of size (life_length, n_features) with the information of the life target: Target feature of size (life_length) i: Position of the value to predict window_size: Size of the lookback window output_size: Number of points of the target - right_closed: Wether the las sample of the window should be included or not + right_closed: Wether the last sample of the window should be included or not Returns: + A tuple containing the lookback window and the value to predict. - tuple (np.array, float) """ initial = max(i - window_size + 1, 0) is_df = isinstance(data, pd.DataFrame) @@ -110,19 +113,19 @@ def windowed_signal_generator( signal_y_1 = np.expand_dims(signal_y_1, axis=1) else: if is_df: - signal_y_1 = target.iloc[i : min(i + output_size, target.shape[0]), :].values + signal_y_1 = target.iloc[ + i : min(i + output_size, target.shape[0]), : + ].values else: signal_y_1 = target[i : min(i + output_size, target.shape[0]), :] if signal_y_1.shape[0] < output_size: - padding = np.zeros( ((output_size - signal_y_1.shape[0]), signal_y_1.shape[1]) ) signal_y_1 = np.concatenate((signal_y_1, padding), axis=0) if signal_X_1.shape[0] < window_size: - signal_X_1 = np.vstack( ( np.zeros((window_size - signal_X_1.shape[0], signal_X_1.shape[1])), @@ -134,23 +137,16 @@ def windowed_signal_generator( class IterationType(Enum): - """Iteration type - - Possible values are - - - SEQ_TO_SEQ = 1: + """ + Iteration type - The seq to seq iterator will return as a target a window of a same size - as the input aligned with it - + Possible values are: - - FORECAST = 2 - - The forecast iterator produces as target the values of the Y transformers + - SEQ_TO_SEQ = 1: The seq to seq iterator will return as a target a window of a same size as the input aligned with it + - FORECAST = 2: The forecast iterator produces as target the values of the Y transformers that start where the X data ends. - - """ + SEQ_TO_SEQ = 1 FORECAST = 2 @@ -168,12 +164,12 @@ def valid_sample( class RelativePosition: - """Relative position selector base class + """ + Relative position selector base class - The relative position selectors allow specifying - the iteration starts and end relative to the beginning - or the end of the run-to-cycle failure + The relative position selectors allow specifying the iteration starts and end relative to the beginning or the end of the run-to-cycle failure """ + def __init__(self, i: int): self.i = i @@ -183,14 +179,12 @@ def get(self, time_series_length: int): class RelativeToEnd(RelativePosition): - """Specify positions relative to the end of the run-to-failure cycle + """ + Specify positions relative to the end of the run-to-failure cycle Example: - - An iterator that iterate each run-to-failure cycle starting - in the last 500 samples of each cycle. - - ``` py + An iterator that iterates each run-to-failure cycle starting in the last 500 samples of each cycle. 
+ ``` iterator = WindowedDatasetIterator( transformed_ds, window_size=3, @@ -199,20 +193,18 @@ class RelativeToEnd(RelativePosition): horizon=1) ``` """ + def get(self, time_series_length: int): return max(time_series_length - self.i, 0) - class RelativeToStart(RelativePosition): - """Specify positions relative to the start of the run-to-failure cycle + """ + Specify positions relative to the start of the run-to-failure cycle Example: - - An iterator that iterate each run-to-failure cycle skipping the first - 200 samples of each cycle. - - ``` py + An iterator that iterate each run-to-failure cycle skipping the first 200 samples of each cycle. + ``` iterator = WindowedDatasetIterator( transformed_ds, window_size=3, @@ -221,36 +213,31 @@ class RelativeToStart(RelativePosition): horizon=1) ``` """ + def get(self, time_series_length: int): return self.i class WindowedDatasetIterator: - """Iterate a dataset using windows + """ + Iterate a dataset using windows Parameters: - dataset: The transformed dataset window_size: Size of the lookback window - step: Separation between two consecutive size - If step == window_size there are not overlapping - between two consecutive windows - horizon: Horizon to be predicted. - If this value is 3, for each window, 3 elements - of the target are expected to be predicted + step: Separation between two consecutive size. If step == window_size there is no overlapping between two consecutive windows + horizon: Horizon to be predicted. If this value is 3, for each window, 3 elements of the target are expected to be predicted shuffler: How the data should be shuffled sample_weight: Which are the sample weight for each sample - right_closed: Wether the last point of the window should be included or not - padding: Wether to pad elements if the samples are not enough to fill the window - Usually this happens at the beginning of the window - iteration_type: Specify its the underlying model its a forecasting in which - an scalar is predicted, or a sequence to sequence model similar - to an autoencoder + right_closed: Weather the last point of the window should be included or not + padding: Wether to pad elements if the samples are not enough to fill the window. 
Usually this happens at the beginning of the window + iteration_type: Specify if the underlying model it's a forecasting model in which a scalar is predicted, or a sequence to sequence model similar to an autoencoder start_index: Initial index of each run-tu-failure cycle end_index: Final index of each run-to-failure cycle - valid_sample: A callable that returns wether a sample is valid or not - last_point: Wether to add the last point + valid_sample: A callable that returns weather a sample is valid or not + last_point: Weather to add the last point """ + def __init__( self, dataset: TransformedDataset, @@ -265,8 +252,7 @@ def __init__( start_index: Union[int, RelativePosition] = 0, end_index: Optional[Union[int, RelativePosition]] = None, valid_sample: Callable[[int, int, int, int, int], bool] = valid_sample, - last_point: bool = True - + last_point: bool = True, ): self.last_point = last_point if isinstance(start_index, int): @@ -284,9 +270,6 @@ def __init__( self.shuffler.initialize(self) self.iteration_type = iteration_type - - - if self.iteration_type == IterationType.FORECAST: self.slicing_function = windowed_signal_generator else: @@ -310,14 +293,11 @@ def __init__( valid_sample, self.padding, self.window_size ) - - def __len__(self): """ Return the length of the iterator - If it not was iterated once, it will compute the length by iterating - from the entire dataset + If it was not iterated once, it will compute the length by iterating from the entire dataset """ if self.length is None: self.length = sum(1 for _ in self) @@ -350,16 +330,18 @@ def __next__(self): ) return curr_X, curr_y, [self.sample_weight(y, timestamp, metadata)] - def get_data(self, flatten: bool = True, show_progress: bool = False): - """Obtain a + def get_data( + self, flatten: bool = True, show_progress: bool = False + ) -> Tuple[np.ndarray, np.array, np.array]: + """ + Obtain data, target and sample weights as numpy arrays. Parameters: - flatten: Wether to flatten data show_progress: Wether to show progress Returns: - X, y, sw: Data, target and sample weights + Data, target and sample weights """ N_points = len(self) @@ -394,23 +376,19 @@ def get_data(self, flatten: bool = True, show_progress: bool = False): @property def n_features(self) -> int: - """Number of features of the transformed dataset - This is a helper method to obtain the transformed - dataset information from the WindowedDatasetIterator - Returns - ------- - int + """Number of features of the transformed dataset. 
This is a helper method to obtain the transformed dataset information from the WindowedDatasetIterator + + Returns: Number of features of the transformed dataset """ return self.dataset.transformer.n_features @property def shape(self) -> Tuple[int, int]: - """Tuple containing (window_size, n_features) + """ + Tuple containing (window_size, n_features) - Returns - ------- - Tuple[int, int] + Returns: Tuple containing (window_size, n_features) """ return (self.window_size, self.n_features) diff --git a/ceruleo/iterators/sample_weight.py b/ceruleo/iterators/sample_weight.py index 70917ea5..ac36d997 100644 --- a/ceruleo/iterators/sample_weight.py +++ b/ceruleo/iterators/sample_weight.py @@ -6,23 +6,28 @@ import numpy as np - class AbstractSampleWeights: - """The base class for the sample weight provider """ + The base class for the sample weight provider + """ + def __call__(self, y, i: int, metadata): raise NotImplementedError class NotWeighted(AbstractSampleWeights): - """Simplest sample weight provvider + """ + Simplest sample weight provider Provide 1 as a sample weight for every sample """ + def __call__(self, y, i: int, metadata): return 1 -"""The Sample Weight type is a callable with the following signature + +""" +The Sample Weight type is a callable with the following signature fun(y, i:int, metadata) @@ -32,30 +37,35 @@ def __call__(self, y, i: int, metadata): class RULInverseWeighted(AbstractSampleWeights): - r"""Weight each sample inverse to the RUL - - ```math - w_i = \frac{1}{RUL_{i} + 1} - ``` """ + Weight each sample by the inverse of the RUL + """ + def __call__(self, y, i: int, metadata): return 1 / (y[i, 0] + 1) class InverseToLengthWeighted(AbstractSampleWeights): - """Weights samples according to the duration of the run-to-failure cycle they bolng + """ + Weights samples according to the duration of the run-to-failure cycle they belong to. All points in the run-to-cycle are weighted equally inverse to the cycle duration + """ + def __call__(self, y, i: int, metadata): return 1 / y[0] class ExponentialDecay(AbstractSampleWeights): - def __init__(self, *, near_0_at:float): + """ + Weight samples with an exponential decay function based on the RUL. + + """ + + def __init__(self, *, near_0_at: float): super().__init__() - self.alpha = -(near_0_at)**2/np.log(0.000001) + self.alpha = -((near_0_at) ** 2) / np.log(0.000001) def __call__(self, y, i: int, metadata): - return (1 + np.exp(-(y[i, 0]**2) / self.alpha))**2 - + return (1 + np.exp(-(y[i, 0] ** 2) / self.alpha)) ** 2 diff --git a/ceruleo/iterators/shufflers.py b/ceruleo/iterators/shufflers.py index be75f9e4..63538aa9 100644 --- a/ceruleo/iterators/shufflers.py +++ b/ceruleo/iterators/shufflers.py @@ -1,15 +1,16 @@ -"""Shufflers are helpers classes used by the iterator to change +""" +Shufflers are helpers classes used by the iterator to change the order of the run-to-failure cycles and the timestamps inside each cycle There are six types of shuffles - NotShuffled: All the cycles are processed in order -- AllShuffled: -- IntraTimeSeriesShuffler -- InverseOrder: -- TimeSeriesOrderIntraSignalShuffling -- TimeSeriesOrderShuffling +- AllShuffled: Everything is shuffled +- IntraTimeSeriesShuffler: Each point of the time series is shuffled, but the TS are kept in order +- InverseOrder: The data points will be fed in RUL decreasing order +- TimeSeriesOrderIntraSignalShuffling: Each point in the ts is shuffled, and the ts order are shuffled also. 
+- TimeSeriesOrderShuffling: Time series are shuffled, but each point inside the time series kept its order """ from typing import Callable, Optional, Tuple @@ -18,7 +19,9 @@ class AbstractShuffler: - """A Shuffler is used by the iterator to interleave samples of different run-to-fail cycles""" + """ + A Shuffler is used by the iterator to interleave samples of different run-to-fail cycles + """ class Iterator: def __init__(self, shuffler, iterator): @@ -36,28 +39,26 @@ def iterator(self, iterator: "WindowedDatasetIterator"): return AbstractShuffler.Iterator(self, iterator) def at_end(self) -> bool: - """Determines wether the iterator reached to its end + """ + Determines weather the iterator reached to its end Returns: - bool: Wether the iterator is at its end """ return self.current_time_series == self.wditerator.dataset.n_time_series def next_element(self) -> Tuple[int, int]: - """Iterating function + """ + Iterating function The method takes the current time serie, and the current time stamp and calls the advance method Returns: - ts_index, timestamp: Index of the current run-to-failure cycle and - current timestamp of that cycle + Index of the current run-to-failure cycle andcurrent timestamp of that cycle Raises: - - StopIteration: When the iteration reached to an end - + When the iteration reaches the end """ if self.at_end(): @@ -69,31 +70,30 @@ def next_element(self) -> Tuple[int, int]: return ts_index, timestamp def start(self, iterator: "WindowedDatasetIterator"): - """start the shuffler given an iterator + """ + Start the shuffler given an iterator Parameters: - - iterator: An Windowed iterator + iterator: An Windowed Iterator """ self.initialize(iterator) def time_series(self) -> int: - """Current time series + """ + Current time series Returns: - - current_time_series: Current Time Series + Current Time Series """ return self.current_time_series def initialize(self, iterator: "WindowedDatasetIterator"): - """Initialize the current shuffler + """ + Initialize the current shuffler - This method is in charge of initializing everything to - allow the correct iteration + This method is in charge of initializing everything to allow the correct iteration Parameters: - iterator: WindowedDatasetIterator """ @@ -133,11 +133,11 @@ def number_samples_of_time_series(self, time_series_index: int) -> int: return self._samples_per_time_series[time_series_index] def number_of_samples_of_current_time_series(self) -> int: - """Obtain the number of samples for the current time series + """ + Obtain the number of samples for the current time series Returns: - - int: Number of samples + Number of samples """ @@ -196,7 +196,6 @@ def initialize(self, iterator: "WindowedDatasetIterator"): self.n_time_series = iterator.dataset.n_time_series def timestamp(self) -> int: - ret = self.timestamps[self.current_timestamp_index] return ret @@ -239,7 +238,6 @@ def timestamp(self) -> int: return ret def advance(self): - if self.current_timestamp == self.current_time_series_size() - 1: self.time_series_changed() else: @@ -264,7 +262,7 @@ class TimeSeriesOrderIntraSignalShuffling(AbstractShuffler): Each point in the ts is shuffled, and the ts order are shuffled also. !!! 
note - + Iteration 1: | TS 1 | TS 1 | TS 1 | TS 2 | TS 2 | TS 2 | 3 | 2 | 1 | 1 | 3 | 2 Iteration 2: | TS 2 | TS 2 | TS 2 | TS 1 | TS 1 | TS 1 @@ -378,7 +376,6 @@ def time_series(self) -> int: while ( self.number_samples_of_time_series(l) == self.timestamps_per_ts_indices[l] ): - l = np.random.randint(self.wditerator.dataset.n_time_series) self.current_time_series = l @@ -423,8 +420,7 @@ def load_time_series(self, time_series_index: int): class NotShuffled(AbstractShuffler): """ - Nothing is shuffled. Each sample of each run-to-failure cycle - are iterated in order + Nothing is shuffled. Each sample of each run-to-failure cycle are iterated in order Iteration 1: | Life 1 | Life 1 | Life 1 | Life 2 | Life 3 | Life 3 | 1 | 2 | 3 | 1 | 2 | 1 diff --git a/ceruleo/iterators/utils.py b/ceruleo/iterators/utils.py index 08fe782d..2efd0253 100644 --- a/ceruleo/iterators/utils.py +++ b/ceruleo/iterators/utils.py @@ -1,5 +1,3 @@ - - from numpy.lib.arraysetops import isin from ceruleo.dataset.transformed import TransformedDataset from ceruleo.dataset.ts_dataset import AbstractTimeSeriesDataset @@ -11,7 +9,8 @@ from ceruleo.iterators.iterators import WindowedDatasetIterator try: - import tensorflow as tf + import tensorflow as tf + TENSORFLOW = True except: TENSORFLOW = False @@ -19,7 +18,7 @@ def true_values( dataset: Union[WindowedDatasetIterator, Batcher, AbstractTimeSeriesDataset], - target_column: Optional[str] = None + target_column: Optional[str] = None, ) -> np.array: """Obtain the true RUL of the dataset after the transformation @@ -28,17 +27,20 @@ def true_values( dataset: Iterator of the dataset Returns: - + true_RUL: target values after the transformation """ from ceruleo.transformation.functional.transformers import TransformerIdentity + if isinstance(dataset, Batcher): dataset = dataset.iterator - elif isinstance(dataset, AbstractTimeSeriesDataset) and not isinstance(dataset, TransformedDataset): + elif isinstance(dataset, AbstractTimeSeriesDataset) and not isinstance( + dataset, TransformedDataset + ): if target_column is None: - if not hasattr(dataset, 'rul_column'): - raise ValueError('Please provide a target column to access') + if not hasattr(dataset, "rul_column"): + raise ValueError("Please provide a target column to access") else: target_column = dataset.rul_column return np.squeeze(np.concatenate([y for y in iterate_over_target(dataset)])) diff --git a/ceruleo/models/baseline.py b/ceruleo/models/baseline.py index 75d465d0..ef919073 100644 --- a/ceruleo/models/baseline.py +++ b/ceruleo/models/baseline.py @@ -9,13 +9,11 @@ class BaselineModel: - """Predict the RUL using the mean of the median value of the duration - of the dataset + """ + Predict the RUL using the mean or the median value of the duration of the dataset Parameters: - - mode: Method for computing the duration of the dataset - Possible values are: 'mean' and 'median' + mode: Method for computing the duration of the dataset. 
Possible values are: 'mean' and 'median' """ def __init__(self, mode: str = "mean", RUL_threshold: Optional[float] = None): @@ -23,7 +21,8 @@ def __init__(self, mode: str = "mean", RUL_threshold: Optional[float] = None): self.RUL_threshold = RUL_threshold def fit(self, ds: Union[TransformedDataset, AbstractLivesDataset]): - """Compute the mean or median RUL using the given dataset + """ + Compute the mean or median RUL using the given dataset Parameters: ds: Dataset from which obtain the true RUL @@ -42,16 +41,15 @@ def fit(self, ds: Union[TransformedDataset, AbstractLivesDataset]): elif self.mode == "median": self.fitted_RUL = np.median(true) - def predict(self, ds: TransformedDataset): - """Predict the whole life using the fitted values + def predict(self, ds: TransformedDataset) -> np.ndarray: + """ + Predict the whole life using the fitted values Parameters: - ds: Dataset iterator from which obtain the true RUL Returns: - - d: Predicted target + Predicted RUL """ output = [] for y in iterate_over_target(ds): @@ -62,10 +60,10 @@ def predict(self, ds: TransformedDataset): class FixedValueBaselineModel: - """A model that predicts always the same duration for each run-to-failure cycle + """ + A model that predicts always the same duration for each run-to-failure cycle Parameters: - value: Fixed RUL """ @@ -75,16 +73,17 @@ def __init__(self, *, value: float): def fit(self, *args): return self - def predict(self, ds: TransformedDataset, RUL_threshold: Optional[float] = None): - """Predict the whole life using the fixed values + def predict( + self, ds: TransformedDataset, RUL_threshold: Optional[float] = None + ) -> np.ndarray: + """ + Predict the whole life using the fixed values Parameters: - ds: Dataset iterator from which obtain the true RUL Returns: - - true_RUL: Predicted target + Predicted RUL """ output = [] for y in iterate_over_target(ds): diff --git a/ceruleo/models/keras/callbacks.py b/ceruleo/models/keras/callbacks.py index 584d0db3..b3c9627f 100644 --- a/ceruleo/models/keras/callbacks.py +++ b/ceruleo/models/keras/callbacks.py @@ -12,10 +12,10 @@ class PredictionCallback(Callback): - """Generate a plot after each epoch with the predictions + """ + Generate a plot after each epoch with the predictions Parameters: - model: The model used predict output_path: Path of the output image dataset: The dataset that want to be plotted @@ -25,10 +25,9 @@ def __init__( self, output_path: Path, dataset: tf.data.Dataset, - units: str='', + units: str = "", filename_suffix: str = "", ): - super().__init__() self.output_path = output_path self.dataset = dataset @@ -43,7 +42,7 @@ def on_epoch_end(self, epoch, logs={}): y_pred = self.model.predict(self.dataset) y_true = true_values(self.dataset) ax = plot_predictions( - PredictionResult('Model', y_true, y_pred), + PredictionResult("Model", y_true, y_pred), figsize=(17, 5), units=self.units, ) @@ -52,4 +51,3 @@ def on_epoch_end(self, epoch, logs={}): ax.figure.savefig(self.output_path, dpi=ax.figure.dpi) plt.close(ax.figure) - diff --git a/ceruleo/models/keras/dataset.py b/ceruleo/models/keras/dataset.py index f2aea18f..06d08318 100644 --- a/ceruleo/models/keras/dataset.py +++ b/ceruleo/models/keras/dataset.py @@ -3,18 +3,18 @@ from ceruleo.iterators.iterators import WindowedDatasetIterator import numpy as np + def tf_regression_dataset(iterator: WindowedDatasetIterator) -> tf.data.Dataset: - """Create a forecast tf.data.Dataset from the iterator + """ + Create a forecast tf.data.Dataset from the iterator - The dataset is is constructed from a 
generator + The dataset is constructed from a generator Parameters: - iterator: The data iterator Returns: - - d: A tensorlfow dataset + A tensorflow dataset """ n_features = iterator.n_features @@ -25,9 +25,7 @@ def generator_function(): a = tf.data.Dataset.from_generator( generator_function, output_signature=( - tf.TensorSpec( - shape=(iterator.window_size, n_features), dtype=tf.float32 - ), + tf.TensorSpec(shape=(iterator.window_size, n_features), dtype=tf.float32), tf.TensorSpec(shape=(iterator.horizon, 1), dtype=tf.float32), tf.TensorSpec(shape=(1), dtype=tf.float32), ), @@ -37,18 +35,17 @@ def generator_function(): def tf_seq_to_seq_dataset(iterator: WindowedDatasetIterator) -> tf.data.Dataset: - """Create a sequence to sequence tf.data.Dataset from the iterator + """ + Create a sequence to sequence tf.data.Dataset from the iterator - The dataset is is constructed from a generator + The dataset is constructed from a generator Parameters: - iterator: The data iterator Returns: - - d: A tensorlfow dataset - """ + A tensorflow dataset + """ n_features = iterator.n_features def generator_function(): @@ -58,10 +55,10 @@ def generator_function(): a = tf.data.Dataset.from_generator( generator_function, output_signature=( + tf.TensorSpec(shape=(iterator.window_size, n_features), dtype=tf.float32), tf.TensorSpec( - shape=(iterator.window_size, n_features), dtype=tf.float32 + shape=(iterator.window_size, iterator.horizon), dtype=tf.float32 ), - tf.TensorSpec(shape=(iterator.window_size, iterator.horizon), dtype=tf.float32), tf.TensorSpec(shape=(1), dtype=tf.float32), ), ) @@ -70,18 +67,17 @@ def generator_function(): def tf_autoencoder_dataset(iterator: WindowedDatasetIterator) -> tf.data.Dataset: - """Create an autoencoder tf.data.Dataset from the iterator + """ + Create an autoencoder tf.data.Dataset from the iterator - The dataset is is constructed from a generator + The dataset is constructed from a generator Parameters: - iterator: The data iterator Returns: - - d: A tensorlfow dataset - """ + A tensorflow dataset + """ n_features = iterator.n_features def gen_train(): diff --git a/ceruleo/models/keras/layers.py b/ceruleo/models/keras/layers.py index 6ccf54b2..ec621e3d 100644 --- a/ceruleo/models/keras/layers.py +++ b/ceruleo/models/keras/layers.py @@ -23,14 +23,14 @@ def RemoveDimension(axis=0): class ConcreteDropout(tf.keras.layers.Layer): - """Concrete Dropout layer class from https://arxiv.org/abs/1705.07832. + """ + Concrete Dropout layer class from https://arxiv.org/abs/1705.07832. Dropout Feature Ranking for Deep Learning Models Chun-Hao Chang Ladislav Rampasek Anna Goldenberg Parameters: - dropout_regularizer: Positive float, satisfying $dropout_regularizer = 2 / (\tau * N)$ with model precision $\tau$ (inverse observation noise) and N the number of instances in the dataset. 
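A small worked illustration of that relation (the precision and dataset size below are assumed values, not defaults of the layer):

```python
# dropout_regularizer = 2 / (tau * N), with tau the model precision
# (inverse observation noise) and N the number of training instances.
tau = 1.0        # assumed model precision
N = 50_000       # assumed number of instances in the dataset
dropout_regularizer = 2.0 / (tau * N)   # -> 4e-05
```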
@@ -111,7 +111,8 @@ def call(self, inputs, training=True): class ResidualShrinkageBlock(tf.keras.layers.Layer): - """ResidualShrinkageBlock + """ + ResidualShrinkageBlock """ def build(self, input_shape): @@ -194,6 +195,12 @@ def get_config(self): class LASSOLayer(Layer): + """ + LASSO Layer + + Parameters: + l1: L1 regularization parameter + """ def __init__(self, l1:float): super(LASSOLayer, self).__init__() self.l1 = l1 diff --git a/ceruleo/models/keras/losses.py b/ceruleo/models/keras/losses.py index 35522e72..bf35b19d 100644 --- a/ceruleo/models/keras/losses.py +++ b/ceruleo/models/keras/losses.py @@ -6,8 +6,13 @@ from tensorflow.python.keras.losses import LossFunctionWrapper -def root_mean_squared_error(y_true, y_pred): - """Root mean squared error""" +def root_mean_squared_error(y_true: np.array, y_pred: np.array) -> float: + """Root mean squared error + + Parameters: + y_true: True RUL values + y_pred: Predicted RUL values + """ return K.sqrt(K.mean(K.square(y_pred - y_true), axis=0)) @@ -22,8 +27,9 @@ def asymmetric_loss_pm( alpha_r, gamma_r, relative_weight: bool = True, -): - """Customizable Asymmetric Loss Functions for Machine Learning-based Predictive Maintenance +) -> float: + """ + Customizable Asymmetric Loss Functions for Machine Learning-based Predictive Maintenance Ehrig, L., Atzberger, D., Hagedorn, B., Klimke, J., & Döllner, J. (2020, October). @@ -34,7 +40,6 @@ def asymmetric_loss_pm( [Reference](https://ieeexplore.ieee.org/document/9287246) Parameters: - y_true: True RUL values y_pred: Predicted RUL values theta_l: Linear to exponential change point for overpredictions (Positive) @@ -46,7 +51,6 @@ def asymmetric_loss_pm( relative_weight: Wether to use weigthing relative to the RUL Returns: - l: the loss computed """ @@ -82,9 +86,9 @@ def asymmetric_loss_pm( return a - class AsymmetricLossPM(LossFunctionWrapper): - """Customizable Asymmetric Loss Functions for Machine Learning-based Predictive Maintenance + """ + Customizable Asymmetric Loss Functions for Machine Learning-based Predictive Maintenance Ehrig, L., Atzberger, D., Hagedorn, B., Klimke, J., & Döllner, J. (2020, October). 
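For orientation, a minimal compile-time sketch (placeholder model and an assumed C value) of how the callables in this module plug into Keras: relative_mae, defined a few hunks below, is a factory returning a loss, while root_mean_squared_error can be passed directly as a metric.

```python
import tensorflow as tf

from ceruleo.models.keras.losses import relative_mae, root_mean_squared_error

# Placeholder model: a single linear unit, only to show the wiring.
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
model.compile(
    optimizer="adam",
    loss=relative_mae(C=0.9),            # factory -> (y_true, y_pred) callable
    metrics=[root_mean_squared_error],   # plain (y_true, y_pred) callable
)
```

The asymmetric loss above is wired in the same way once its per-side hyperparameters (theta, alpha, gamma) are chosen.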
@@ -130,15 +134,14 @@ def __init__( def relative_mae(C: float = 0.9): - """MAE weighted by the relative error + """ + MAE weighted by the relative error Parameters: - C: Minimal value for the RUL Returns: - - callable: The loss function + The loss function """ mae = tf.keras.losses.MeanAbsoluteError() @@ -153,15 +156,14 @@ def concrete_relative_mae(y_true, y_pred): def relative_mse(C: float = 0.9): - """MSE weighted by the relative error + """ + MSE weighted by the relative error Parameters: - C: Minimal value for the RUL Returns: - - callable: The loss function + The loss function """ mse = tf.keras.losses.MeanSquaredError() @@ -173,4 +175,3 @@ def concrete_relative_mse(y_true, y_pred): return mse(y_true, y_pred, sample_weight=sw) return concrete_relative_mse - diff --git a/ceruleo/models/sklearn.py b/ceruleo/models/sklearn.py index cfa1c4d3..bfdc6d5e 100644 --- a/ceruleo/models/sklearn.py +++ b/ceruleo/models/sklearn.py @@ -1,6 +1,6 @@ """This module provides interoperability -Scikit learn models can be used wit the ceruleo Transformers +Scikit learned models can be used wit the ceruleo Transformers The `TimeSeriesWindowTransformer` is a scikit-learn transformers that takes @@ -34,18 +34,19 @@ from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin from sklearn.metrics import mean_squared_error from sklearn.metrics._scorer import get_scorer +from typing import List logger = logging.getLogger(__name__) class EstimatorWrapper(TransformerMixin, BaseEstimator): - """Wrapper around sklearn estimators to allow calling the fit and predict + """ + Wrapper around sklearn estimators to allow calling the fit and predict The transformer keeps the X and y together. This wrapper divide the X,y and call the fit(X,y) and predict(X,y) of the estimator Parameters: - estimator: A scikit-learn estimator """ @@ -63,11 +64,11 @@ def predict(self, Xy, **transform_params): class TimeSeriesWindowTransformer(TransformerMixin, BaseEstimator): - """A scikit-learn transformer for obtaining a windowed time-series from the run-to-cycle failures + """ + A scikit-learn transformer for obtaining a windowed time-series from the run-to-cycle failures Parameters: - - transformer: + transformer: A scikit-learn transformer window_size: Window size of the iterator step: Stride of the iterators horizon: Horizon of the predictions @@ -98,6 +99,12 @@ def __init__( self.padding = padding def fit(self, dataset: AbstractTimeSeriesDataset): + """ + Fit the transformer with the given dataset + + Parameters: + dataset: Dataset to fit the transformer + """ self.transformer.fit(dataset) return self @@ -120,20 +127,22 @@ def transform(self, dataset: AbstractTimeSeriesDataset): def true_values(self, dataset: AbstractTimeSeriesDataset): X, y, sw = self._iterator(dataset).get_data() return y.ravel() - + def get_params(self, deep=None): params = super().get_params(deep) if deep: - params['ts_window_transformer__transformer'] = self.transformer.get_params(deep) + params["ts_window_transformer__transformer"] = self.transformer.get_params( + deep + ) return params class CeruleoRegressor(RegressorMixin, BaseEstimator): - """A regressor wrapper similar to sklearn.compose.TransformedTargetRegressor + """ + A regressor wrapper similar to sklearn.compose.TransformedTargetRegressor Parameters: - features_transformer: The transformer regressor: A scikit-learn regressor """ @@ -173,17 +182,19 @@ def set_params(self, **kwargs): class CeruleoMetricWrapper: - """A wrapper around sklearn metrics + """ + A wrapper around sklearn metrics 
Example: - + ''' grid_search = GridSearchCV( estimator=regressor_gs, param_grid={ 'regressor': [RandomForestRegressor(max_depth=5)] }, scoring=CeruleoMetricWrapper('neg_mean_absolute_error') - ) + ) + ''' """ def __init__(self, scoring): @@ -201,22 +212,18 @@ def train_model( model, train_iterator: WindowedDatasetIterator, val_windowed_iterator: Optional[WindowedDatasetIterator] = None, - **fit_kwargs + **fit_kwargs, ): - """Fit the model with the given dataset iterator - + """ + Fit the model with the given dataset iterator Parameters: - - train_iterator: - + train_iterator: Training Iterator Keyword arguments: - fit_kwargs: Arguments for the fit method Returns: - - model: SKLearn model + A SKLearn model """ X, y, sample_weight = train_iterator.get_data() @@ -234,32 +241,32 @@ def train_model( def predict(model, dataset_iterator: WindowedDatasetIterator): - """Get the predictions for the given iterator + """ + Get the predictions for the given iterator Parameters: - dataset_iterator: Dataset iterator from which obtain data to predict Returns: - - array: Array with the predictiosn + Array with the predictiosn """ X, _, _ = dataset_iterator.get_data() return model.predict(X) -def fit_batch(model, train_batcher: Batcher, val_batcher: Batcher, n_epochs=15): - """Fit the model using the given batcher +def fit_batch( + model, train_batcher: Batcher, val_batcher: Batcher, n_epochs=15 +) -> Tuple["Model", List]: + """ + Fit the model using the given batcher Parameters: - model: SKLearn Model train_batcher: Train dataset batcher val_batcher: Validation dataset batcher n_epochs: Number of epochs, by default 15 Returns: - model: the model history: history of errors """ @@ -273,18 +280,17 @@ def fit_batch(model, train_batcher: Batcher, val_batcher: Batcher, n_epochs=15): return model, history -def predict_batch(model, dataset_batcher: Batcher): - """Predict the values using the given batcher +def predict_batch(model, dataset_batcher: Batcher) -> np.ndarray: + """ + Predict the values using the given batcher Parameters: - model: SKLearn model dataset_batcher: The batcher Returns: - - RUL_predicted: Predictions array + RUL Prediction array """ y_pred = [] for X, y in dataset_batcher: diff --git a/ceruleo/results/results.py b/ceruleo/results/results.py index 5397f6ec..d3c8d476 100644 --- a/ceruleo/results/results.py +++ b/ceruleo/results/results.py @@ -26,11 +26,13 @@ """ import logging from dataclasses import dataclass, field -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Tuple, Union import numpy as np -from ceruleo.results.picewise_regression import (PiecewesieLinearFunction, - PiecewiseLinearRegression) +from ceruleo.results.picewise_regression import ( + PiecewesieLinearFunction, + PiecewiseLinearRegression, +) from sklearn.metrics import mean_absolute_error as mae from sklearn.metrics import mean_absolute_percentage_error as mape from sklearn.metrics import mean_squared_error as mse @@ -41,8 +43,10 @@ @dataclass class MetricsResult: - """An object that store regression metrics and times """ + An object that store regression metrics and times + """ + mae: float mse: float fitting_time: float = 0 @@ -51,19 +55,20 @@ class MetricsResult: @dataclass class PredictionResult: - """A prediction result is composed by a name """ + A prediction result is composed by a name + """ + name: str true_RUL: np.ndarray predicted_RUL: np.ndarray - metrics: MetricsResult + metrics: MetricsResult def compute_metrics(self): self.metrics.mae = mae(self.true_RUL, 
self.predicted_RUL) self.metrics.mse = mse(self.true_RUL, self.predicted_RUL) - - def __init__(self, name:str, true_RUL: np.ndarray, predicted_RUL: np.ndarray): + def __init__(self, name: str, true_RUL: np.ndarray, predicted_RUL: np.ndarray): self.metrics = MetricsResult(0, 0) self.name = name self.true_RUL = np.squeeze(true_RUL) @@ -93,17 +98,18 @@ def compute_rul_line(rul: float, n: int, tt: Optional[np.array] = None): class CVResults: """ - Compute the error histogram + Compute the error histogram - Compute the error with respect to the RUL considering the results of different - folds + Compute the error with respect to the RUL considering the results of different + folds - Parameters: - y_true: List with the true values of each hold-out set of a cross validation - y_pred: List with the predictions of each hold-out set of a cross validation - nbins: Number of bins to compute the histogram + Parameters: + y_true: List with the true values of each hold-out set of a cross validation + y_pred: List with the predictions of each hold-out set of a cross validation + nbins: Number of bins to compute the histogram """ + def __init__( self, y_true: List[List], @@ -111,7 +117,6 @@ def __init__( nbins: int = 5, bin_edges: Optional[np.array] = None, ): - if bin_edges is None: max_value = np.max([np.max(y) for y in y_true]) bin_edges = np.linspace(0, max_value, nbins + 1) @@ -130,7 +135,6 @@ def _add_fold_result(self, fold: int, y_pred: np.array, y_true: np.array): y_true = np.squeeze(y_true) for j in range(len(self.bin_edges) - 1): - mask = (y_true >= self.bin_edges[j]) & (y_true <= self.bin_edges[j + 1]) indices = np.where(mask)[0] @@ -181,7 +185,6 @@ def models_cv_results( bin_edges = np.linspace(0, max_y_value, nbins + 1) model_results = {} for model_name in results_dict.keys(): - model_results[model_name] = model_cv_results( results_dict[model_name], bin_edges=bin_edges ) @@ -193,12 +196,11 @@ class FittedLife: """Represent a Fitted run-to-cycle failure Parameters: - y_true: The true RUL target y_pred: The predicted target time: Time feature fit_line_not_increasing: Wether the fitted line can increase or not. 
- RUL_threshold: Indicates the thresholding value used during de fit + RUL_threshold: Indicates the thresholding value used during the fit """ @@ -219,7 +221,6 @@ def __init__( if isinstance(time, np.ndarray): self.time = time else: - self.time = np.array(np.linspace(0, y_true[0], n=len(y_true))) else: @@ -243,18 +244,17 @@ def __init__( self.y_true_fitted = p(self.time) @staticmethod - def compute_time_feature(y_true: np.array, RUL_threshold: Optional[float] = None) -> Tuple[float, np.ndarray]: + def compute_time_feature( + y_true: np.array, RUL_threshold: Optional[float] = None + ) -> Tuple[float, np.ndarray]: """Compute the time feature based on the target Parameters: - y_true: RUL target RUL_threshold: - Returns - ------- - - Degradind start time and time + Returns: + Degrading start time and time """ degrading_start = FittedLife._degrading_start(y_true, RUL_threshold) time = FittedLife._compute_time(y_true, degrading_start) @@ -264,17 +264,16 @@ def compute_time_feature(y_true: np.array, RUL_threshold: Optional[float] = None def _degrading_start( y_true: np.array, RUL_threshold: Optional[float] = None ) -> float: - """Obtain the index when the life value is lower than the RUL_threshold + """ + Obtain the index when the life value is lower than the RUL_threshold Parameters: - y_true: Array of true values of the RUL of the life RUL_threshold: float Return: - - degrading_start: if RUL_threshold is None, the degradint start if the first index. + If RUL_threshold is None, the degrading start if the first index. Otherwise it is the first index in which y_true < RUL_threshold """ degrading_start = 0 @@ -284,31 +283,30 @@ def _degrading_start( degrading_start = degrading_start_i[0][0] else: d = np.diff(y_true) == 0 - while (degrading_start< len(d)) and (d[degrading_start]): + while (degrading_start < len(d)) and (d[degrading_start]): degrading_start += 1 return degrading_start @staticmethod def _compute_time(y_true: np.array, degrading_start: int) -> np.array: - """Compute the passage of time from the true RUL + """ + Compute the passage of time from the true RUL The passage of time is computed as the cumulative sum of the first difference of the true labels. In case there are tresholded values, the time steps of the thresholded zone is assumed to be as the median values - of the time steps computed of the zones of the life in which we have information. + of the time steps computed in the zones of the life in which we have information. 
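As a conceptual illustration only (synthetic RUL vector; this is not the method's exact code path, which also handles thresholded segments), the time axis can be recovered from a decreasing RUL target through its first difference:

```python
import numpy as np

y_true = np.array([10.0, 8.0, 6.0, 4.0, 2.0, 0.0])  # assumed RUL target, sampled every 2 time units
steps = np.abs(np.diff(y_true))                      # elapsed time between consecutive samples
time = np.concatenate([[0.0], np.cumsum(steps)])     # -> [0, 2, 4, 6, 8, 10]
```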
Parameters: - y_true: The true RUL labels - degrading_start : The index in which the true RUL values starts to be lower than the treshold + degrading_start: The index in which the true RUL values starts to be lower than the treshold Returns: - - t: Time component + Time component """ if len(y_true) == 1: return np.array([0]) - + time_diff = np.diff(np.squeeze(y_true)[degrading_start:][::-1]) time = np.zeros(len(y_true)) if degrading_start > 0: @@ -317,20 +315,16 @@ def _compute_time(y_true: np.array, degrading_start: int) -> np.array: else: time[0 : degrading_start + 1] = 1 time[degrading_start + 1 :] = time_diff - + return np.cumsum(time) def _fit_picewise_linear_regression(self, y: np.array) -> PiecewesieLinearFunction: - """Fit the array trough a picewise linear regression - - Parameters - ---------- - y : np.array - Points to be fitted + """ + Fit the array trough a picewise linear regression - Returns - ------- - PiecewesieLinearFunction + Parameters: + y: Points to be fitted + Returns: The Picewise linear function fitted """ pwlr = PiecewiseLinearRegression(not_increasing=self.fit_line_not_increasing) @@ -351,11 +345,15 @@ def mae(self, sample_weight=None) -> float: return np.mean(sw * np.abs(self.y_true[:N] - self.y_pred)) def noisiness(self) -> float: - """How much the predictions resemble a line + """ + How much the predictions resembles a line This metric is computed as the mse of the fitted values with respect to the least squares fitted line of this values + + Returns: + The Mean Absolute Error of the fitted values with respect to the least squares fitted line """ return mae(self.y_pred_fitted, self.y_pred) @@ -380,36 +378,32 @@ def end_of_life(self): else: return self.time[z[0]] - def maintenance_point(self, m: float = 0): - """Compute the maintenance point + def maintenance_point(self, m: float = 0) -> float: + """ + Compute the maintenance point The maintenance point is computed as the predicted end of life - m - Parameters - ----------- - m: float, optional - Fault horizon Defaults to 0. + Parameters: + m: Fault horizon Defaults to 0. - Returns - -------- - float - Time of maintenance + Returns: + Time of maintenance """ return self.predicted_end_of_life() - m - def unexploited_lifetime(self, m: float = 0): - """Compute the unexploited lifetime given a fault horizon window + def unexploited_lifetime(self, m: float = 0) -> float: + """ + Compute the unexploited lifetime given a fault horizon window Machine Learning for Predictive Maintenance: A Multiple Classifiers Approach Susto, G. A., Schirru, A., Pampuri, S., McLoone, S., & Beghi, A. (2015). - Parameters - ---------- - m: float, optional - Fault horizon windpw. Defaults to 0. + Parameters: + m: Fault horizon window. Defaults to 0. Returns: - float: unexploited lifetime + Unexploited lifetime """ if self.maintenance_point(m) < self.end_of_life(): @@ -417,19 +411,18 @@ def unexploited_lifetime(self, m: float = 0): else: return 0 - def unexpected_break(self, m: float = 0, tolerance: float = 0): - """Compute wether an unexpected break will produce using a fault horizon window of size m + def unexpected_break(self, m: float = 0, tolerance: float = 0) -> bool: + """ + Compute weather an unexpected break will produce using a fault horizon window of size m Machine Learning for Predictive Maintenance: A Multiple Classifiers Approach Susto, G. A., Schirru, A., Pampuri, S., McLoone, S., & Beghi, A. (2015). Parameters: - - m: Fault horizon windpw. + m: Fault horizon window. 
Returns: - - Unexploited lifetime + A boolean indicating if an unexpected break will occur """ if self.maintenance_point(m) - tolerance < self.end_of_life(): return False @@ -438,13 +431,14 @@ def unexpected_break(self, m: float = 0, tolerance: float = 0): def split_lives_indices(y_true: np.array) -> List[List[int]]: - """Obtain a list of indices for each life + """ + Obtain a list of indices for each life Parameters: y_true: True vector with the RUL Returns: - l: A list with the indices belonging to each life + A list with the indices belonging to each life """ assert len(y_true) >= 2 lives_indices = ( @@ -467,17 +461,17 @@ def split_lives( fit_line_not_increasing: Optional[bool] = False, time: Optional[int] = None, ) -> List[FittedLife]: - """Divide an array of predictions into a list of FittedLife Object + """ + Divide an array of predictions into a list of FittedLife Object Parameters: y_true: The true RUL target y_pred: The predicted RUL - fit_line_not_increasing : Optional[bool], optional - Wether the fit line can increase, by default False - time: A vector with timestamps. If omitted wil be computed from y_true + fit_line_not_increasing: Weather the fit line can increase, by default False + time: A vector with timestamps. If omitted will be computed from y_true Returns: - lives: FittedLife list + FittedLife list """ lives = [] for r in split_lives_indices(results.true_RUL): @@ -495,9 +489,6 @@ def split_lives( return lives - - - def unexploited_lifetime(d: PredictionResult, window_size: int, step: int): bb = [split_lives(cv) for cv in d] return unexploited_lifetime_from_cv(bb, window_size, step) @@ -512,7 +503,6 @@ def unexploited_lifetime_from_cv( for m in windows: jj = [] for r in lives: - ul_cv_list = [life.unexploited_lifetime(m) for life in r] jj.extend(ul_cv_list) @@ -525,19 +515,19 @@ def unexploited_lifetime_from_cv( def unexpected_breaks( d: List[PredictionResult], window_size: int, step: int ) -> Tuple[np.ndarray, np.ndarray]: - """Compute the risk of unexpected breaks with respect to the maintenance window size + """ + Compute the risk of unexpected breaks with respect to the maintenance window size Parameters: - ---------- - d: Dictionary with the results - window_size: Maximum size of the maintenance windows - step: Number of points in which compute the risks. - step different maintenance windows will be used. + d: Dictionary with the results + window_size: Maximum size of the maintenance windows + step: Number of points in which compute the risks. + step different maintenance windows will be used. Returns: - Tuple[np.ndarray, np.ndarray] - * Maintenance window size evaluated - * Risk computed for every window size used + A tuple of np.arrays with: + - Maintenance window size evaluated + - Risk computed for every window size used """ bb = [split_lives(fold) for fold in d] @@ -547,22 +537,19 @@ def unexpected_breaks( def unexpected_breaks_from_cv( lives: List[List[FittedLife]], window_size: int, n: int ) -> Tuple[np.ndarray, np.ndarray]: - """Compute the risk of unexpected breaks given a Cross-Validation results - - Parameters - ---------- - lives : List[List[FittedLife]] - Cross validation results. 
- window_size : int - Maximum size of the maintenance window - n : int - Number of points to evaluate the risk of unexpected breaks - - Returns - ------- - Tuple[np.ndarray, np.ndarray] - * Maintenance window size evaluated - * Risk computed for every window size used + """ + Compute the risk of unexpected breaks given a Cross-Validation results + + Parameters: + lives: Cross validation results. + window_size: Maximum size of the maintenance window + n: Number of points to evaluate the risk of unexpected breaks + + + Returns: + A tuple of np.arrays with: + - Maintenance window size evaluated + - Risk computed for every window size used """ std_per_window = [] mean_per_window = [] @@ -603,13 +590,7 @@ def metric_J(d, window_size: int, step: int): def cv_regression_metrics_single_model( results: List[PredictionResult], threshold: float = np.inf ): - errors = { - "MAE": [], - "MAE SW": [], - "MSE": [], - "MSE SW": [], - "MAPE": [] - } + errors = {"MAE": [], "MAE SW": [], "MSE": [], "MSE SW": [], "MAPE": []} for result in results: y_mask = np.where(result.true_RUL <= threshold)[0] y_true = np.squeeze(result.true_RUL[y_mask]) @@ -621,7 +602,6 @@ def cv_regression_metrics_single_model( if len(np.unique(y_pred)) == 1: continue - sw = compute_sample_weight( "relative", y_true, @@ -668,41 +648,41 @@ def cv_regression_metrics_single_model( errors1 = {} for k in errors.keys(): - errors1[k] = ufloat(np.round(np.mean(errors[k]),2), np.round(np.std(errors[k]), 2)) + errors1[k] = ufloat( + np.round(np.mean(errors[k]), 2), np.round(np.std(errors[k]), 2) + ) return errors1 def cv_regression_metrics( results_dict: Dict[str, List[PredictionResult]], threshold: float = np.inf ) -> dict: - """Compute regression metrics for each model + """ + Compute regression metrics for each model Parameters: - data: Dictionary with the model predictions. 
- threshold: Compute metrics errors only in RUL values less than the threshold Returns: + A dictionary with the following structure: + d: { ['Model]: + { + 'MAE': { + 'mean': + 'std': + }, + 'MAE SW': { + 'mean': + 'std': + }, + 'MSE': { + 'mean': + 'std': + }, + } + ] - - d: { ['Model]: - { - 'MAE': { - 'mean': - 'std': - }, - 'MAE SW': { - 'mean': - 'std': - }, - 'MSE': { - 'mean': - 'std': - }, - } - ] - """ out = {} for model_name in results_dict.keys(): diff --git a/ceruleo/transformation/features/cast.py b/ceruleo/transformation/features/cast.py index 2af09dae..a0b27984 100644 --- a/ceruleo/transformation/features/cast.py +++ b/ceruleo/transformation/features/cast.py @@ -15,19 +15,26 @@ class CastTo(TransformerStep): """Cast to a given datatype Example: - step = CastTo(type='float32') Parameters: - - type: Type name to convert to - name: Name of the step + type: Data Type to cast to + name: Name of the step, by default None """ - def __init__(self, *, type:str, name:Optional[str]=None): + + def __init__(self, *, type: str, name: Optional[str] = None): super().__init__(name=name) self.type = type + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Cast to a given datatype + + Parameters: + X: DataFrame to transform - def transform(self, X: pd.DataFrame): + Returns: + Transformed DataFrame + """ return X.astype(self.type) diff --git a/ceruleo/transformation/features/denoising.py b/ceruleo/transformation/features/denoising.py index 6161f80d..385ef75d 100644 --- a/ceruleo/transformation/features/denoising.py +++ b/ceruleo/transformation/features/denoising.py @@ -22,17 +22,13 @@ def __init__(self, window: int, order: int = 2, name: Optional[str] = None): self.order = order def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: - """Return a new dataframe with the features filtered - - Parameters - ---------- - X : pd.DataFrame - Input life + """ + Return a new dataframe with the features filtered + Parameters: + X: Input life - Returns - ------- - pd.DataFrame + Returns: A new DatafFrame with the same index as the input with the features filtered """ if X.shape[0] > self.window: @@ -46,16 +42,14 @@ def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: class MeanFilter(TransformerStep): - """Filter each feature using a rolling mean filter - - Parameters - ---------- - window : int - Size of the rolling window - min_periods : int, optional - Minimum number of points of the rolling window, by default 15 - name : Optional[str], optional - Name of the step, by default None + """ + Filter each feature using a rolling mean filter + + Parameters: + window: Size of the rolling window + min_periods: Minimum number of non-null points of the rolling window, by default 15 + name: Name of the step, by default None + center: Wether the guassian window should be centered, by default False """ def __init__( @@ -71,6 +65,15 @@ def __init__( self.center = center def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: + """ + Return a new dataframe with the features filtered + + Parameters: + X: Input life + + Returns: + A new DatafFrame with the same index as the input with the features filtered + """ return X.rolling( self.window, min_periods=self.min_periods, @@ -81,14 +84,10 @@ def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: class MedianFilter(TransformerStep): """Filter each feature using a rolling median filter - Parameters - ---------- - window : int - Size of the rolling window - min_periods : int, optional - Minimum number of points of the rolling 
window, by default 15 - name : Optional[str], optional - Name of the step, by default None + Parameters: + window: Size of the rolling window + min_periods: Minimum number of points of the rolling window, by default 15 + name: Name of the step, by default None """ def __init__(self, window: int, min_periods: int = 15, name: Optional[str] = None): @@ -97,18 +96,27 @@ def __init__(self, window: int, min_periods: int = 15, name: Optional[str] = Non self.min_periods = min_periods def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: + """ + Return a new dataframe with the features filtered + + Parameters: + X: Input life + + Returns: + A new DatafFrame with the same index as the input with the features filtered + """ return X.rolling(self.window, min_periods=self.min_periods).median( numeric_only=True ) class OneDimensionalKMeans(TransformerStep): - """Clusterize each feature into a number of clusters + """ + Clusterize each feature into a number of clusters + + Parameters: + n_clusters: Number of clusters, by default 5 - Parameters - ---------- - n_clusters : int - Number of clusters to obtain per cluster """ def __init__(self, n_clusters: int = 5, name: Optional[str] = None): @@ -116,7 +124,13 @@ def __init__(self, n_clusters: int = 5, name: Optional[str] = None): self.clusters = {} self.n_clusters = n_clusters - def partial_fit(self, X): + def partial_fit(self, X: pd.DataFrame): + """ + Fit the model to the input data to obtain the clusters + + Parameters: + X: Input life + """ if len(self.clusters) == 0: for c in X.columns: self.clusters[c] = MiniBatchKMeans( @@ -128,19 +142,14 @@ def partial_fit(self, X): return self def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: - """Transform the input dataframe - - Parameters - ---------- - X : pd.DataFrame - Input life + """ + Transform the input dataframe + Parameters: + X: Input life - Returns - ------- - pd.DataFrame - A new DataFrame with the same index as the input. - Each feature is replaced with the clusters of each point + Returns: + A new DataFrame with the same index as the input. 
Each feature is replaced with the clusters of each point """ X = X.copy() for c in X.columns: @@ -151,14 +160,12 @@ def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: class MultiDimensionalKMeans(TransformerStep): - """Clusterize data points and replace each feature with the centroid feature its belong - - Parameters - ---------- - n_clusters : int, optional - Number of clusters to obtain by default 5 - name : Optional[str], optional - Name of the step, by default None + """ + Clusterize data points and replace each feature with the centroid feature it belongs to + + Parameters: + n_clusters: Number of clusters to obtain by default 5 + name: Name of the step, by default None """ def __init__(self, n_clusters: int = 5, name: Optional[str] = None): @@ -166,24 +173,24 @@ def __init__(self, n_clusters: int = 5, name: Optional[str] = None): self.n_clusters = n_clusters self.clusters = MiniBatchKMeans(n_clusters=self.n_clusters, n_init="auto") - def partial_fit(self, X): + def partial_fit(self, X: pd.DataFrame): + """ + Fit the model to the input data to obtain the clusters + + Parameters: + X: Input life + """ self.clusters.partial_fit(X) return self def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: """Transform the input life with the centroid information - Parameters - ---------- - X : pd.DataFrame - Input life + Parameters: + X: Input life - - Returns - ------- - pd.DataFrame - A new DataFrame in which each point was replaced by the - centroid its belong + Returns: + A new DataFrame in which each point was replaced by the centroid it belongs to """ X = X.copy() @@ -192,14 +199,12 @@ def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: class EWMAFilter(TransformerStep): - """Filter each feature using EWM - - Parameters - ---------- - window : int - Size of window - name : Optional[str], optional - Name of the step, by default None + """ + Filter each feature using EWM (Exponential Moving Window) + + Parameters: + span: Time constant of the EMA (Exponential Moving Average) + name: Name of the step, by default None """ def __init__(self, span: float, name: Optional[str] = None): @@ -207,22 +212,27 @@ def __init__(self, span: float, name: Optional[str] = None): self.span = span def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: + """ + Return a new dataframe with the features filtered + + Parameters: + X: Input life + + Returns: + A new DatafFrame with the same index as the input with the features filtered + """ return X.ewm(span=self.span, ignore_na=True).mean() class GaussianFilter(TransformerStep): - """Apply a gaussian filter - - Parameters - ---------- - window_size : int - Size of the gaussian filter - std : float - Standard deviation of the filter - min_points : int, optional - Minimun nomber of points of the rolling window, by default 1 - center : bool, optional - Wether the guassian window should be centered, by default False + """ + Apply a gaussian filter + + Parameters: + window_size: Size of the gaussian filter + std: Standard deviation of the filter + min_points: Minimun nomber of points of the rolling window, by default 1 + center: Wether the guassian window should be centered, by default False """ def __init__( @@ -241,6 +251,15 @@ def __init__( self.min_points = min_points def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Return a new dataframe with the features filtered + + Parameters: + X: Input life + + Returns: + A new DatafFrame with the same index as the input with the features filtered + """ return X.rolling( 
window=self.window_size, win_type="gaussian", diff --git a/ceruleo/transformation/features/entropy.py b/ceruleo/transformation/features/entropy.py index c9bb2026..40be261c 100644 --- a/ceruleo/transformation/features/entropy.py +++ b/ceruleo/transformation/features/entropy.py @@ -25,19 +25,10 @@ class LocalEntropyMeasures(TransformerStep): - Local Entropy Rate - Parameters - ---------- - min_points : int, optional - The minimun number of points of the expanding window, by default 2 - to_compute : List[str], optional - List of the features to compute, by default None - Valid values are: - 'local_active_information' - 'local_block_entropy' - 'local_entropy_rate' - name : Optional[str], optional - Name of the step, by default None - + Parameters: + min_points: The minimun number of points of the expanding window, by default 2 + to_compute: List of the features to compute, by default None. Valid values are: 'local_active_information', 'local_block_entropy', 'local_entropy_rate' + name: Name of the step, by default None """ def __init__( @@ -72,7 +63,17 @@ def _local_block_entropy(self, s: pd.Series): def _local_entropy_rate(self, s: pd.Series): return entropy_rate(s.values, self.window, local=True) - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + + """ + Return a new dataframe with Entropy features computed for each feature in the input + + Parameters: + X: Input life + + Returns: + A new DataFrame with the at maximum three times the number of columns as the input since three Entropy features are computed for each feature in the input + """ X_new_n_columns = len(X.columns) * len(self.to_compute) i = 0 diff --git a/ceruleo/transformation/features/extraction.py b/ceruleo/transformation/features/extraction.py index 5cd195a8..820d6377 100644 --- a/ceruleo/transformation/features/extraction.py +++ b/ceruleo/transformation/features/extraction.py @@ -53,13 +53,25 @@ def transform(self, X: pd.DataFrame): class ColumnWiseSum(TransformerStep): """ Compute the column-wise sum each column + + Parameters: + column_name: Name of the unique column which is returned """ def __init__(self, column_name: str, name: Optional[str] = None): super().__init__(name=name) self.column_name = column_name - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + A single-column DataFrame containing the column-wise sum for each input sample + """ return pd.DataFrame(X.sum(axis=1), columns=[self.column_name]) @@ -67,6 +79,15 @@ class SampleNumber(TransformerStep): """Return a column with increasing number""" def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + A DataFrame with increasing sample indexes. 
+ """ df = pd.DataFrame(index=X.index) df["sample_number"] = list(range(X.shape[0])) return df @@ -113,6 +134,15 @@ def fit(self, X: pd.DataFrame, y=None): return self def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + A DataFrame with shape equal to (X.shape[0],n_unique(feature)) containin the One Hot Encoding for the input feature + """ categories = sorted(list([c for c in self.categories if c is not None])) d = pd.Categorical(X[self.feature], categories=categories) @@ -139,21 +169,19 @@ def __init__( self.categories = set() self.encoder = None - def transform(self, X, y=None): - """Return a new DataFrame with the feature encoded with integer numbers - - Parameters - ---------- - X : pd.DataFrame - The input life - y : [type], optional + def transform(self, X: pd.DataFrame, y: Optional[type]=None) -> pd.DataFrame: + """ + Return a new DataFrame with the feature encoded with integer numbers + Parameters; + X: The input life + y: [type], optional - Returns - ------- - pd.DataFrame - A new dataframe with the same index as the input - with 1 column + Parameters: + X: The input life + + Returns: + A new dataframe with the same index as the input with 1 column containing the encoding of the input feature. """ def hash(x): @@ -171,12 +199,9 @@ def hash(x): class SimpleEncodingCategorical(TransformerStep): """Compute a simple numerical encoding for a given feature - Parameters - ---------- - feature : str - Feature name from which compute the simple encoding - name : Optional[str], optional - Step name, by default None + Parameters: + feature: Feature name from which compute the simple encoding + name: Step name, by default None """ def __init__(self, *, feature: Optional[str] = None, name: Optional[str] = None): @@ -185,18 +210,14 @@ def __init__(self, *, feature: Optional[str] = None, name: Optional[str] = None) self.categories = set() self.encoder = None - def partial_fit(self, X: pd.DataFrame, y=None): + def partial_fit(self, X: pd.DataFrame, y=None) -> "SimpleEncodingCategorical": """Compute incrementally the set of possible categories - Parameters - ---------- - X : pd.DataFrame - The input life + Parameters: + X: The input life - Returns - ------- - SimpleEncodingCategorical - self + Returns: + Instance of class SimpleEncodingCategorical """ if self.feature is None: self.feature = X.columns[0] @@ -205,19 +226,15 @@ def partial_fit(self, X: pd.DataFrame, y=None): return self - def fit(self, X: pd.DataFrame, y=None): - """Compute the set of possible categories - - Parameters - ---------- - X : pd.DataFrame - The input life - + def fit(self, X: pd.DataFrame, y=None) -> "SimpleEncodingCategorical": + """ + Compute the set of possible categories - Returns - ------- - OneHotCategorical - self + Parameters: + X: The input life + + Returns: + Instance of class SimpleEncodingCategorical """ if self.feature is None: self.feature = X.columns[0] @@ -225,21 +242,14 @@ def fit(self, X: pd.DataFrame, y=None): self.categories.update(set(X[self.feature].unique())) return self - def transform(self, X, y=None): + def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: """Return a new DataFrame with the feature encoded with integer numbers - Parameters - ---------- - X : pd.DataFrame - The input life - y : [type], optional + Parameters: + X: The input life - - Returns - ------- - pd.DataFrame - A new dataframe with the same index as the input - with 1 column + Returns: + A new dataframe with 
the same index as the input with 1 column with the Simple Encoding of the input feature. """ categories = sorted(list([c for c in self.categories if c is not None])) d = pd.Categorical(X[self.feature], categories=categories) @@ -272,16 +282,9 @@ class LifeStatistics(TransformerStep): - Hurst - Parameters - ---------- - to_compute : List[str], optional - List of the features to compute, by default None - Valid values are: - 'kurtosis', 'skewness', 'max', 'min', 'std', 'peak', 'impulse', - 'clearance', 'rms', 'shape', 'crest', 'hurst' - name : Optional[str], optional - Name of the step, by default None - + Parameters: + to_compute: List of the features to compute, by default None. Valid values are:'kurtosis', 'skewness', 'max', 'min', 'std', 'peak', 'impulse','clearance', 'rms', 'shape', 'crest', 'hurst' + name: Name of the step, by default None """ def __init__( @@ -367,19 +370,14 @@ def _crest(self, s: pd.Series): return 0 def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Compute features from the given life - - Parameters - ---------- - X : pd.DataFrame - Input life - - Returns - ------- - pd.DataFrame - A new DataFrame with one row with n columns. - Let m be the number of features of the life and - f the len(to_compute) ten where n = m x f, + """ + Compute features from the given life + + Parameters: + X: The input life + + Returns: + A new DataFrame with one row and with n columns. Let m be the number of features of the life and f the len(to_compute), then n = m x f, """ X_new = pd.DataFrame(index=[0]) for c in X.columns: @@ -409,19 +407,11 @@ class RollingStatistics(TransformerStep): - Shape - Crest - Parameters - ---------- - window:int - Size of the rolling window - min_points : int, optional - The minimun number of points of the expanding window, by default 15 - to_compute : Optional[List[Str]], optional - Name of features to compute - Possible values are: - 'kurtosis', 'skewness', 'max', 'min', 'std', 'peak', 'impulse', - 'clearance', 'rms', 'shape', 'crest' - name: Optiona[str] - Name of the step, by default None + Parameters: + window: Size of the rolling window, by default 15 + min_points: The minimun number of points of the expanding window + to_compute: Name of features to compute. Possible values are: 'kurtosis', 'skewness', 'max', 'min', 'std', 'peak', 'impulse', 'clearance', 'rms', 'shape', 'crest' + name: Name of the step, by default None """ @@ -575,7 +565,16 @@ def _transform_specific( out = getattr(self, f"_{stats}")(X[c], rolling[c], abs_rolling[c]) X_new.loc[:, feature] = out.values - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Compute features from the given life + + Parameters: + X: The input life + + Returns: + A new DataFrame with one row and with n columns. 
Let m be the number of features of the life and f the len(to_compute), then n = m x f + """ columns = self._compute_column_names(X) X_new = pd.DataFrame(index=X.index, columns=columns) @@ -609,24 +608,17 @@ class ExpandingStatistics(TransformerStep): - Hurst - Parameters - ---------- - min_points : int, optional - The minimun number of points of the expanding window, by default 2 - to_compute : List[str], optional - List of the features to compute, by default None - Valid values are: - 'kurtosis', 'skewness', 'max', 'min', 'std', 'peak', 'impulse', - 'clearance', 'rms', 'shape', 'crest', 'hurst' - name : Optional[str], optional - Name of the step, by default None + Parameters: + min_points: The minimun number of points of the expanding window, by default 2 + to_compute: List of the features to compute, by default None. Valid values are: 'kurtosis', 'skewness', 'max', 'min', 'std', 'peak', 'impulse','clearance', 'rms', 'shape', 'crest', 'hurst' + name: Name of the step, by default None """ def __init__( self, *, - min_points=2, + min_points: int=2, to_compute: List[str] = None, specific: Optional[Dict[str, List[str]]] = None, name: Optional[str] = None, @@ -889,7 +881,17 @@ def _transform_specific( ) X_new.loc[:, feature] = out.values - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Compute features from the given life + + Parameters: + X: The input life + + Returns: + A new DataFrame with one row and with n columns. Let m be the number of features of the life and f the len(to_compute), then n = m x f, + """ + columns = self._compute_column_names(X) X_new = pd.DataFrame(index=X.index, columns=columns) @@ -911,16 +913,14 @@ class Difference(TransformerStep): X[features1] - X[features2] - Parameters - + Parameters: feature_set1: Feature list of the first group to substract feature_set2:Feature list of the second group to substract name: Name of the step, by default None - """ def __init__( - self, *, feature_set1: list, feature_set2: list, name: Optional[str] = None + self, *, feature_set1: List[str], feature_set2: List[str], name: Optional[str] = None ): super().__init__(name=name) if len(feature_set1) != len(feature_set2): @@ -932,6 +932,15 @@ def __init__( self.feature_names_computed = False def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + A DataFrame with two columns containing the result of the differences between the two sets of input features + """ if not self.feature_names_computed: self.feature_set1 = [self.find_feature(X, c) for c in self.feature_set1] self.feature_set2 = [self.find_feature(X, c) for c in self.feature_set2] @@ -944,19 +953,25 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: class EMD(TransformerStep): """Compute the empirical mode decomposition of each feature - Parameters - ---------- - n : int - Number of modes to compute - name : Optional[str], optional - [description], by default 'EMD' + Parameters: + n: Number of modes to compute + name: Name of the step, by default None """ def __init__(self, *, n: int, name: Optional[str] = "EMD"): super().__init__(name=name) self.n = n - def transform(self, X): + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ Apply transformation to the input life + + Parameters: + X: The input life + + Returns: + A DataFrame where the number of columns is n times the one of the input life, since each features is substituted by the n modes of its EMD + + """ 
new_X = pd.DataFrame(index=X.index) for c in X.columns: try: @@ -974,6 +989,7 @@ def transform(self, X): class SlidingNonOverlappingEMD(TransformerStep): + def __init__( self, *, window_size: int, max_imfs: int, keep: Optional[int] = None, **kwargs ): @@ -1012,12 +1028,13 @@ def _emd(values: np.ndarray): class EMDFilter(TransformerStep): - """Filter the signals using Empirical Mode decomposition + """ + Filter the signals using Empirical Mode decomposition - Parameters - ---------- - n: int - Number of + Parameters: + n: Number of modes + min_imf: Min Intrinsic Mode Function + max_imf: Max Intrinsic Mode Function """ def __init__( @@ -1028,7 +1045,16 @@ def __init__( self.min_imf = min_imf self.max_imf = max_imf - def transform(self, X): + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + A DataFrame with the same shape of the input life and with the result of the EMD Filter application + """ new_X = pd.DataFrame(index=X.index) for c in X.columns: @@ -1043,10 +1069,21 @@ def transform(self, X): class ChangesDetector(TransformerStep): """Compute how many changes there are in a categorical variable + + ['a', 'a', 'b', 'c] -> [0, 0, 1, 1] """ def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Appply the transformation to the input life + + Parameters: + X: The input life + + Returns: + A DataFrame with boolean values representing weather changes were applied to the input variable or not + """ return X != X.shift(axis=0) @@ -1054,6 +1091,16 @@ class Interactions(TransformerStep): """Compute pairwise interactions between the features""" def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to one life + + Parameters: + X: The input life + + Returns: + DataFrame containing the pairwise interaction values + + """ X_new = pd.DataFrame(index=X.index) for c1, c2 in itertools.combinations(X.columns, 2): X_new[f"{c1}_{c2}"] = X[c1] * X[c2] diff --git a/ceruleo/transformation/features/extraction_frequency.py b/ceruleo/transformation/features/extraction_frequency.py index fc12b748..e8fa0a69 100644 --- a/ceruleo/transformation/features/extraction_frequency.py +++ b/ceruleo/transformation/features/extraction_frequency.py @@ -9,51 +9,59 @@ @jit(nopython=True, error_model='numpy') -def get_moment(y, moment) -> np.float32: +def get_moment(y: np.array, moment: int) -> np.float32: """ - Returns the (non centered) moment of the distribution y: - E[y**moment] = \\sum_i[index(y_i)^moment * y_i] / \\sum_i[y_i] - :param y: the discrete distribution from which one wants to calculate the moment - :type y: pandas.Series or np.array - :param moment: the moment one wants to calcalate (choose 1,2,3, ... ) - :type moment: int - :return: the moment requested - :return type: float + Compute the (non centered) moment of the input distribution + + Parameters: + y: the discrete distribution from which one wants to calculate the moment + moment: the moment one wants to calculate (choose 1,2,3, ... 
) + + Returns: + The moment requested """ return y.dot(np.arange(len(y), dtype=np.float64)**moment) / y.sum() @jit(nopython=True, error_model='numpy') -def get_centroid(y) -> np.float32: +def get_centroid(y: np.array) -> np.float32: """ - :param y: the discrete distribution from which one wants to calculate the centroid - :type y: pandas.Series or np.array - :return: the centroid of distribution y (aka distribution mean, first moment) - :return type: float + Compute the centroid of the input distribution (aka distribution mean, first moment) + + Parameters: + y: the discrete distribution from which one wants to calculate the centroid + + Returns: + The centroid of distribution y """ return get_moment(y, 1) @jit(nopython=True, error_model='numpy') -def get_variance(y) -> np.float32: +def get_variance(y: np.array) -> np.float32: """ - :param y: the discrete distribution from which one wants to calculate the variance - :type y: pandas.Series or np.array - :return: the variance of distribution y - :return type: float + Compute the variance of the input distribution (aka distribution second central moment) + + Parameters: + y: the discrete distribution from which one wants to calculate the variance + + Returns: + The variance of distribution y """ + # Here we are implementing the formula var(X) = E[X**2] (second moment) - E[X]**2 (first moment squared) return get_moment(y, 2) - get_centroid(y) ** 2 @jit(nopython=True, error_model='numpy') -def get_skew(y) -> np.float32: +def get_skew(y: np.array) -> np.float32: """ - Calculates the skew as the third standardized moment. - Ref: https://en.wikipedia.org/wiki/Skewness#Definition - :param y: the discrete distribution from which one wants to calculate the skew - :type y: pandas.Series or np.array - :return: the skew of distribution y - :return type: float + Calculates the skew as the third standardized moment. (Ref: https://en.wikipedia.org/wiki/Skewness#Definition) + + Parameters: + y: the discrete distribution from which one wants to calculate the skew + + Returns: + The skew of distribution y """ variance = get_variance(y) @@ -69,14 +77,15 @@ def get_skew(y) -> np.float32: @jit(nopython=True, error_model='numpy') -def get_kurtosis(y) -> np.float32: +def get_kurtosis(y: np.array) -> np.float32: """ - Calculates the kurtosis as the fourth standardized moment. - Ref: https://en.wikipedia.org/wiki/Kurtosis#Pearson_moments - :param y: the discrete distribution from which one wants to calculate the kurtosis - :type y: pandas.Series or np.array - :return: the kurtosis of distribution y - :return type: float + Calculates the kurtosis as the fourth standardized moment. (Ref: https://en.wikipedia.org/wiki/Kurtosis#Pearson_moments) + + Parameters: + y: the discrete distribution from which one wants to calculate the kurtosis + + Returns: + The kurtosis of distribution y """ variance = get_variance(y) @@ -93,16 +102,15 @@ def get_kurtosis(y) -> np.float32: @jit(debug=True, nopython=True, error_model='numpy') -def compute_frequency_features(x): +def compute_frequency_features(x: np.array) -> np.array: """ Returns the spectral centroid (mean), variance, skew, and kurtosis of the absolute fourier transform spectrum. 
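All of the helpers above reduce to the non-central moment over the bin indices of the spectrum, E[i**m] = sum(i**m * y_i) / sum(y_i). The following standalone numpy sketch (synthetic signal, invented values, no guard for a near-zero variance) reproduces the centroid, variance, skew and kurtosis computations from that single formula.

```python
import numpy as np

def moment(y: np.ndarray, m: int) -> float:
    # Non-central moment over bin indices: E[i**m] = sum(i**m * y_i) / sum(y_i)
    return y.dot(np.arange(len(y), dtype=np.float64) ** m) / y.sum()

# Toy signal: the "distribution" is the magnitude of its FFT spectrum
x = np.sin(np.linspace(0, 20 * np.pi, 512)) + 0.1 * np.random.randn(512)
y = np.abs(np.fft.rfft(x))

centroid = moment(y, 1)                                   # first moment
variance = moment(y, 2) - centroid ** 2                   # E[i^2] - E[i]^2
skew = (moment(y, 3) - 3 * centroid * variance - centroid ** 3) / variance ** 1.5
kurtosis = (moment(y, 4) - 4 * centroid * moment(y, 3)
            + 6 * centroid ** 2 * moment(y, 2) - 3 * centroid ** 4) / variance ** 2
print(centroid, variance, skew, kurtosis)
```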
- :param x: the time series to calculate the feature of - :type x: numpy.ndarray - :param param: contains dictionaries {"aggtype": s} where s str and in ["centroid", "variance", - "skew", "kurtosis"] - :type param: list - :return: the different feature values - :return type: pandas.Series + + Parameters: + x: the time series on which the spectral features are calculated + + Returns: + An array containig the spectral centroid, variance, skew, and kurtosis of the absolute fourier transform spectrum. """ with objmode(fft_abs='float64[:]'): diff --git a/ceruleo/transformation/features/hurst.py b/ceruleo/transformation/features/hurst.py index 9534e78d..99d85dfe 100644 --- a/ceruleo/transformation/features/hurst.py +++ b/ceruleo/transformation/features/hurst.py @@ -1,45 +1,38 @@ from numba import guvectorize import numpy as np +import pandas as pd @guvectorize("float64[:], int64, int64, int64, float64[:]", "(m),(),(),()->()", cache=True, nopython=True) -def hurst_rs(x, min_chunksize, max_chunksize, num_chunksize, out): - """Estimate the Hurst exponent using R/S method. +def hurst_rs(x: np.array, min_chunksize: int, max_chunksize: int, num_chunksize: int, out: np.array) -> float: + """ + Estimate the Hurst exponent using R/S method. Estimates the Hurst (H) exponent using the R/S method from the time series. The R/S method consists of dividing the series into pieces of equal size - `series_len` and calculating the rescaled range. This repeats the process + series_len and calculating the rescaled range. This repeats the process for several `series_len` values and adjusts data regression to obtain the H. `series_len` will take values between `min_chunksize` and `max_chunksize`, the step size from `min_chunksize` to `max_chunksize` can be controlled through the parameter `step_chunksize`. - Parameters - ---------- - x : 1D-array - A time series to calculate hurst exponent, must have more elements - than `min_chunksize` and `max_chunksize`. - min_chunksize : int - This parameter allow you control the minimum window size. - max_chunksize : int - This parameter allow you control the maximum window size. - num_chunksize : int - This parameter allow you control the size of the step from minimum to - maximum window size. Bigger step means fewer calculations. - out : 1-element-array, optional - one element array to store the output. - - Returns - ------- - H : float - A estimation of Hurst exponent. - + Parameters: + x: A time series to calculate hurst exponent, must have more elements than `min_chunksize` and `max_chunksize`. + min_chunksize: This parameter allow you control the minimum window size. + max_chunksize: This parameter allow you control the maximum window size. + num_chunksize: This parameter allow you control the size of the step from minimum to maximum window size. Bigger step means fewer calculations. + out: One element array to store the output. + + + Returns: + Estimation of Hurst exponent. + References ---------- - Hurst, H. E. (1951). Long term storage capacity of reservoirs. ASCE - Transactions, 116(776), 770-808. - Alessio, E., Carbone, A., Castelli, G. et al. Eur. Phys. J. B (2002) 27: - 197. http://dx.doi.org/10.1140/epjb/e20020150 + Hurst, H. E. (1951). Long term storage capacity of reservoirs. ASCE + Transactions, 116(776), 770-808. + Alessio, E., Carbone, A., Castelli, G. et al. Eur. Phys. J. B (2002) 27: + 197. 
http://dx.doi.org/10.1140/epjb/e20020150 """ N = len(x) max_chunksize += 1 @@ -81,8 +74,9 @@ def hurst_rs(x, min_chunksize, max_chunksize, num_chunksize, out): out[0] = H -def hurst_dma(prices, min_chunksize=8, max_chunksize=200, num_chunksize=5): - """Estimate the Hurst exponent using R/S method. +def hurst_dma(prices: np.array, min_chunksize: int=8, max_chunksize: int=200, num_chunksize: int=5) -> float: + """ + Estimate the Hurst exponent using the DMA method. Estimates the Hurst (H) exponent using the DMA method from the time series. The DMA method consists on calculate the moving average of size `series_len` @@ -93,22 +87,20 @@ def hurst_dma(prices, min_chunksize=8, max_chunksize=200, num_chunksize=5): `min_chunksize` to `max_chunksize` can be controlled through the parameter `step_chunksize`. - Parameters - ---------- - prices - min_chunksize - max_chunksize - num_chunksize - - Returns - ------- - hurst_exponent : float + Parameters: + prices: A time series to calculate hurst exponent, must have more elements than `min_chunksize` and `max_chunksize`. + min_chunksize: This parameter allow you control the minimum window size. + max_chunksize: This parameter allow you control the maximum window size. + num_chunksize: This parameter allow you control the size of the step from minimum to maximum window size. Bigger step means fewer calculations. + + Returns: Estimation of hurst exponent. + References ---------- - Alessio, E., Carbone, A., Castelli, G. et al. Eur. Phys. J. B (2002) 27: - 197. http://dx.doi.org/10.1140/epjb/e20020150 + Alessio, E., Carbone, A., Castelli, G. et al. Eur. Phys. J. B (2002) 27: + 197. http://dx.doi.org/10.1140/epjb/e20020150 """ max_chunksize += 1 @@ -129,34 +121,31 @@ def hurst_dma(prices, min_chunksize=8, max_chunksize=200, num_chunksize=5): return H -def hurst_dsod(x): - """Estimate Hurst exponent on data timeseries. +def hurst_dsod(x: np.array) -> float: + """ + Estimate Hurst exponent on data timeseries using the DSOD (Discrete Second Order Derivative). The estimation is based on the discrete second order derivative. Consists on get two different noise of the original series and calculate the standard deviation and calculate the slope of two point with that values. source: https://gist.github.com/wmvanvliet/d883c3fe1402c7ced6fc - Parameters - ---------- - x : numpy array - time series to estimate the Hurst exponent for. - - Returns - ------- - h : float - The estimation of the Hurst exponent for the given time series. + Parameters: + x: Time series to estimate the Hurst exponent for. + + Returns: + Estimation of the Hurst exponent for the given time series. + References ---------- - Istas, J.; G. Lang (1994), “Quadratic variations and estimation of the local - Hölder index of data Gaussian process,” Ann. Inst. Poincaré, 33, pp. 407–436. + Istas, J.; G. Lang (1994), “Quadratic variations and estimation of the local + Hölder index of data Gaussian process,” Ann. Inst. Poincaré, 33, pp. 407–436. Notes - ----- - This hurst_ets is data literal traduction of wfbmesti.m of waveleet toolbox - from matlab. + ---------- + This hurst_ets is data literal traduction of wfbmesti.m of waveleet toolbox from matlab. """ y = np.cumsum(np.diff(x, axis=0), axis=0) @@ -176,56 +165,47 @@ def hurst_dsod(x): return 0.5 * np.log2(s2 / s1) -def hurst_exponent(prices, min_chunksize=8, max_chunksize=200, num_chunksize=5, - method='RS'): - """Estimates Hurst Exponent. 
+def hurst_exponent(prices: Union[np.ndarray, pd.Series, pd.DataFrame], min_chunksize: int = 8, max_chunksize: int = 200, num_chunksize: int = 5, + method: str = 'RS') -> float: + """ + Estimates Hurst Exponent. Estimate the hurst exponent following one of 3 methods. Each method - Parameters - ---------- - prices : numpy.ndarray, pandas.Series or pandas.DataFrame - A time series to estimate hurst exponent. - min_chunksize : int, optional - Minimum chunk size of the original series. This parameter doesn't have - any effect with DSOD method. - max_chunksize : int, optional - Maximum chunk size of the original series. This parameter doesn't have - any effect with DSOD method. - step_chunksize : int, optional - Step used to select next the chunk size which divide the original - series. This parameter doesn't have any effect with DSOD method. - method : {'RS', 'DMA', 'DSOD', 'all'} - The methods can take one of that values, - RS : rescaled range. - DMA : deviation moving average. - DSOD : discrete second order derivative. - - - Returns - ------- - hurst_exponent : float + Parameters: + prices: A time series to estimate the Hurst exponent. + min_chunksize: Minimum chunk size of the original series. This parameter has no effect with the DSOD method, by default 8 + max_chunksize: Maximum chunk size of the original series. This parameter has no effect with the DSOD method, by default 200 + num_chunksize: Step used to select the next chunk size which divides the original series. This parameter has no effect with the DSOD method, by default 5 + method: One of {'RS', 'DMA', 'DSOD', 'all'}. RS: rescaled range, DMA: deviation moving average, DSOD: discrete second order derivative, by default 'RS' + + + Returns: Estimation of hurst_exponent according to the method selected. + References ---------- + RS: Hurst, H. E. (1951). Long term storage capacity of reservoirs. ASCE + Transactions, 116(776), 770-808. + DMA: Alessio, E., Carbone, A., Castelli, G. et al. Eur. Phys. J. B (2002) + 27: 197. http://dx.doi.org/10.1140/epjb/e20020150 + DSOD: Istas, J.; G. Lang (1994), “Quadratic variations and estimation of + the local Hölder index of data Gaussian process,” Ann. Inst. Poincaré, + 33, pp. 407–436. Notes - ----- - The hurst exponent is an estimation which is important because there is no - data closed equation for it instead we have some methods to estimate it with - high variations among them. + ---------- + The Hurst exponent can only be estimated: there is no closed-form equation for it, only several estimation methods whose results can vary considerably among each other.
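A short usage sketch of `hurst_exponent` as declared above. The import path is taken from this patch's file header and the random-walk input is synthetic, so treat this as an illustration rather than a reference result; for a pure random walk the estimate should come out roughly around 0.5.

```python
import numpy as np
from ceruleo.transformation.features.hurst import hurst_exponent  # module path from the diff header

rng = np.random.default_rng(42)
walk = np.cumsum(rng.standard_normal(2000))  # synthetic random walk

for method in ("RS", "DMA", "DSOD"):
    h = hurst_exponent(walk, min_chunksize=8, max_chunksize=200, num_chunksize=5, method=method)
    print(method, h)
```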
See Also - -------- - hurst_rs, hurst_dma, hurst_dsod + ---------- + hurst_rs, hurst_dma, hurst_dsod """ if len(prices) == 0: return np.nan diff --git a/ceruleo/transformation/features/imputers.py b/ceruleo/transformation/features/imputers.py index 08585f20..370f4c48 100644 --- a/ceruleo/transformation/features/imputers.py +++ b/ceruleo/transformation/features/imputers.py @@ -13,14 +13,13 @@ class PerColumnImputer(TransformerStep): """Impute the values of each column following a simple rule The imputing is made following this rule: - -np.inf -> min - np.inf -> max - nan -> median - - Parameters - ---------- - name : Optional[str], optional - Step name, by default None + * -np.inf -> min + * np.inf -> max + * nan -> median + + Parameters: + name: Step name, by default None + """ def __init__(self, *, name: Optional[str] = None): @@ -29,7 +28,14 @@ def __init__(self, *, name: Optional[str] = None): self.data_max = None self.data_median = None - def partial_fit(self, X, y=None): + def partial_fit(self, X:pd.DataFrame, y=None) -> pd.DataFrame: + """ + Fit the transformation incrementally + + Parameters: + X: The input life + + """ X = X.replace([np.inf, -np.inf], np.nan) col_to_max = X.max() col_to_min = X.max() @@ -51,7 +57,14 @@ def _remove_na(self): self.data_min.fillna(0, inplace=True) self.data_median.fillna(0, inplace=True) - def fit(self, X, y=None): + def fit(self, X:pd.DataFrame, y=None) -> pd.DataFrame: + """ + Fit the transformation + + Parameters: + X: The input life + + """ X = X.replace([np.inf, -np.inf], np.nan) col_to_max = X.max() col_to_min = X.max() @@ -64,7 +77,14 @@ def fit(self, X, y=None): self._remove_na() return self - def transform(self, X, y=None): + def transform(self, X:pd.DataFrame, y=None) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + """ X_new = X.copy() for c in X_new.columns: X_new[c].replace([np.inf], self.data_max[c], inplace=True) @@ -72,7 +92,14 @@ def transform(self, X, y=None): X_new[c].replace([np.nan], self.data_median[c], inplace=True) return X_new - def description(self): + def description(self) -> tuple: + """ + Transformation's Description + + Returns: + A tuple with the transformation name and the Max, Min and Median values for each feature. 
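The per-column replacement rule documented for `PerColumnImputer` can be stated in a few lines of plain pandas. The tiny frames below are invented for illustration, with the statistics taken from a separate training frame, as the step does after `fit`.

```python
import numpy as np
import pandas as pd

train = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
X = pd.DataFrame({"a": [np.inf, -np.inf, np.nan, 2.5]})

# Documented rule: -inf -> column min, +inf -> column max, NaN -> column median,
# all statistics taken from the fitted (training) data
stats = {"min": train.min(), "max": train.max(), "median": train.median()}
X_new = X.copy()
for c in X_new.columns:
    X_new[c] = X_new[c].replace(np.inf, stats["max"][c])
    X_new[c] = X_new[c].replace(-np.inf, stats["min"][c])
    X_new[c] = X_new[c].fillna(stats["median"][c])
print(X_new)  # expected: 4.0, 1.0, 2.5, 2.5
```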
+ + """ name = super().description() data = [] for k in self.data_max.index: @@ -93,52 +120,47 @@ class NaNtoInf(TransformerStep): """Replace NaN for inf""" def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: - """Transform the input life replacing Nan for inf - - Parameters - ---------- - X : pd.DataFrame - Input Dataframe to be transformed + """ + Transform the input life replacing Nan for inf - Returns - ------- - pd.DataFrame - A dataframe with she same index as the input without NaN values + Parameters: + X: Input Dataframe to be transformed + + Returns: + A dataframe with she same index as the input with the NaN values replaced with inf """ return X.replace([np.inf, -np.inf], np.nan) + #It should be -> X.replace(np.nan,[np.inf, -np.inf]) + #return X.replace(np.nan,np.inf) class MedianImputer(TransformerStep): - """Impute missing values with the median value of the training set + """ + Impute missing values with the median value of the training set - Parameters - ---------- - name : Optional[str] - The name of the step + Parameters: + name: The name of the step + """ def __init__(self, *, name: Optional[str] = None): super().__init__(name=name) self.tdigest_dict = None - def fit(self, X, y=None): + def fit(self, X:pd.DataFrame, y=None): """Compute the median value - Parameters - ---------- - X : pd.DataFrame - The input life + Parameters: + X: The input life """ self.median = X.median(axis=0).to_dict() return self - def partial_fit(self, X): + def partial_fit(self, X:pd.DataFrame): """Compute the median value incrementally - Parameters - ---------- - X : pd.DataFrame - The input life + Parameters: + X: The input life """ if self.tdigest_dict is None: self.tdigest_dict = {c: TDigest() for c in X.columns} @@ -149,19 +171,16 @@ def partial_fit(self, X): c: self.tdigest_dict[c].percentile(50) for c in self.tdigest_dict.keys() } - def transform(self, X, y=None): - """Return a new dataframe with the missing values replaced by the fitted median + def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: + """ + Return a new dataframe with the missing values replaced by the fitted median - Parameters - ---------- - X : pd.DataFrame - Life + Parameters: + X: The input life - Returns - ------- - pd.DataFrame - A new DataFrame with the same index as the input with the Na values replaced + Returns: + A new DataFrame with the same index as the input with the Na values replaced by the fitted median """ return X.fillna(value=self.median) @@ -169,23 +188,20 @@ def transform(self, X, y=None): class MeanImputer(TransformerStep): """Impute missing values with the mean value of the training set - Parameters - ---------- - name : Optional[str] - The name of the step + Parameters: + name: The name of the step + """ def __init__(self, *, name: Optional[str] = None): super().__init__(name=name) self.sum = None - def partial_fit(self, X, y=None): + def partial_fit(self, X:pd.DataFrame, y=None): """Compute the mean value incrementally - Parameters - ---------- - X : pd.DataFrame - The input life + Parameters: + X: The input life """ if self.sum is None: self.sum = X.sum(axis=0) @@ -196,13 +212,11 @@ def partial_fit(self, X, y=None): self.mean = (self.sum / self.counts).to_dict() return self - def fit(self, X, y=None): - """Compute the mean value + def fit(self, X:pd.DataFrame, y=None): + """Compute the mean value - Parameters - ---------- - X : pd.DataFrame - The input life + Parameters: + X: The input life """ self.mean = X.mean(axis=0).to_dict() return self @@ -210,34 +224,26 @@ def fit(self, 
X, y=None): def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: """Return a new dataframe with the missing values replaced by the fitted mean - Parameters - ---------- - X : pd.DataFrame - Life - + Parameters: + X: The input life - Returns - ------- - pd.DataFrame - A new DataFrame with the same index as the input with the Na values replaced + Returns: + A new DataFrame with the same index as the input with the Na values replaced by the fitted mean """ return X.fillna(value=self.mean) class ApplyRollingImputer(TransformerStep): - """Impute missing values using a function over a rolling window - - Parameters - ---------- - - window_size : int - Window size of the rolling window + """ + Impute missing values using a function over a rolling window - func: Callable - The function to call in each window + Parameters: + window_size: Window size of the rolling window + func: The function to call in each window + """ - def __init__(self, *, window_size: int, func, **kwargs): + def __init__(self, *, window_size: int, func: callable, **kwargs): super().__init__(**kwargs) self.window_size = window_size self.function = func @@ -245,12 +251,11 @@ def __init__(self, *, window_size: int, func, **kwargs): self.sum = None def partial_fit(self, X: pd.DataFrame): - """Compute incrementally the mean value to use as default value to impute + """ + Compute incrementally the mean value to use as default value to impute - Parameters - ---------- - X : pd.DataFrame - The input lfie + Parameters: + X: The input life """ if self.sum is None: self.sum = X.sum(axis=0) @@ -262,30 +267,25 @@ def partial_fit(self, X: pd.DataFrame): return self def fit(self, X: pd.DataFrame): - """Compute a default value in case there are not valid values in the rolling window + """ + Compute a default value in case there are not valid values in the rolling window - Parameters - ---------- - X : pd.DataFrame - The input life + Parameters: + X: The input life """ self.default_value = np.mean(X, axis=0) self.default_value[~np.isfinite(self.default_value)] = 0 return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform the input life - - Parameters - ---------- - X : pd.DataFrame - The input life to be transformed - - Returns - ------- - pd.DataFrame - A new life with the same index as the input with the missing values - replaced by the output of the function supplied + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new life with the same index as the input with the missing values replaced by the output of the function supplied """ X = X.copy() @@ -302,25 +302,20 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: class RollingMedianImputer(ApplyRollingImputer): """Impute missing values with the median value on a rolling window - Parameters - ---------- - - window_size : int - Window size of the rolling window + Parameters: + window_size: Window size of the rolling window """ def __init__(self, *, window_size: int, name:str=None): - super().__init__(window_size, np.median, name=name) + super().__init__(window_size, func=np.median, name=name) class RollingMeanImputer(ApplyRollingImputer): """Impute missing values with the mean value on a rolling window - Parameters - ---------- - - window_size : int - Window size of the rolling window + Parameters: + window_size: Window size of the rolling window + """ def __init__(self, *, window_size: int, name:str=None): @@ -330,7 +325,16 @@ def __init__(self, *, window_size: int, name:str=None): class 
ForwardFillImputer(TransformerStep): """Impute forward filling the values""" - def transform(self, X): + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new life with the same index as the input with the missing values replaced by the value in the succesive timestamp + """ if not isinstance(X, pd.DataFrame): raise ValueError("Input array must be a data frame") return X.ffill() @@ -339,19 +343,36 @@ def transform(self, X): class BackwardFillImputer(TransformerStep): """Impute forward filling the values""" - def transform(self, X): + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new life with the same index as the input with the missing values replaced by the value in the previous timestamp + """ if not isinstance(X, pd.DataFrame): raise ValueError("Input array must be a data frame") return X.bfill() class FillImputer(TransformerStep): - def __init__(self, *, value, name: Optional[str] = None): + def __init__(self, *, value: float, name: Optional[str] = None): super().__init__(name=name) self.value = value - """Impute forward filling the values + """Impute substituting the missing values with a value specified in the input """ + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life - def transform(self, X): + Parameters: + X: The input life to be transformed + + Returns: + A new life with the same index as the input with the missing values replaced by the value specified in the input + """ return X.fillna(value=self.value) diff --git a/ceruleo/transformation/features/operations.py b/ceruleo/transformation/features/operations.py index 66cd3f6b..d9449fcf 100644 --- a/ceruleo/transformation/features/operations.py +++ b/ceruleo/transformation/features/operations.py @@ -6,10 +6,34 @@ class Sum(TransformerStep): - def transform(self, X: List[pd.DataFrame]): + """ + Concatenate multiple run-to-failure cycles vertically + """ + def transform(self, X: List[pd.DataFrame]) -> pd.DataFrame: + """ + Apply the concatenation + + Parameters: + X: List of run-to-failure cycles to concatenate + + Returns: + A dataframe with the concatenated run-to-failure cycles + """ return reduce(lambda x, y: x.add(y, fill_value=0), X) class Divide(TransformerStep): - def transform(self, X: List[pd.DataFrame]): + """ + Divide multiple run-to-failure cycles vertically + """ + def transform(self, X: List[pd.DataFrame]) -> pd.DataFrame: + """ + Apply the division + + Parameters: + X: List of run-to-failure cycles to divide + + Returns: + A dataframe with the divided run-to-failure cycles + """ return reduce(lambda x, y: x.divide(y, fill_value=0), X) \ No newline at end of file diff --git a/ceruleo/transformation/features/outliers.py b/ceruleo/transformation/features/outliers.py index 756fb383..801aa623 100644 --- a/ceruleo/transformation/features/outliers.py +++ b/ceruleo/transformation/features/outliers.py @@ -12,20 +12,20 @@ class IQROutlierRemover(TransformerStep): - """Remove values outside (Q1 - margin*IQR, Q2 + margin*IQR) + """ + Remove values outside (Q1 - margin*IQR, Q2 + margin*IQR) If clip is True the values will be clipped between the range, otherwise the values are going to be replaced by inf and -inf - - Parameters: - lower_quantile: Lower quantile threshold for the non-anomalous values - upper_quantile: Upper quantile threshold for the non-anomalous values - 
margin: How many times the IQR gets multiplied + lower_quantile: Lower quantile threshold for the non-anomalous values, by default 0.25 + upper_quantile: Upper quantile threshold for the non-anomalous values, by default 0.75 + margin: How many times the IQR gets multiplied, by default 1.5 proportion_to_sample: If you want to compute the quantiles in an smaller proportion of data - you can specify it + you can specify it, by default 1.0 + clip: Whether to clip the values outside the range, by default False + name: Name of the step, by default None """ @@ -33,8 +33,8 @@ def __init__( self, lower_quantile: float = 0.25, upper_quantile: float = 0.75, - margin=1.5, - proportion_to_sample=1.0, + margin: float=1.5, + proportion_to_sample: float=1.0, clip: bool = False, name: Optional[str] = None, prefer_partial_fit: bool = False, @@ -48,7 +48,13 @@ def __init__( self.upper_quantile = upper_quantile self.clip = clip - def partial_fit(self, X): + def partial_fit(self, X: pd.DataFrame): + """ + Compute the quantiles of the data and the interquartile range incrementally + + Parameters: + X: Input life + """ if X.shape[0] == 1: return self if self.proportion_to_sample < 1: @@ -74,7 +80,13 @@ def partial_fit(self, X): self.IQR = {c: self.Q3[c] - self.Q1[c] for c in self.Q1.keys()} return self - def fit(self, X): + def fit(self, X: pd.DataFrame): + """ + Compute the quantiles of the data and the interquartile range + + Parameters: + X: Input life + """ if self.proportion_to_sample < 1: sampled_points = np.random.choice( X.shape[0], int(X.shape[0] * self.proportion_to_sample), replace=False @@ -87,7 +99,16 @@ def fit(self, X): self.Q3 = self.Q3.to_dict() return self - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Remove the outliers from the input life. + + Parameters: + X: Input life + + Returns: + A new DataFrame with the outliers removed + """ X = X.copy() check_is_fitted(self, "Q1") check_is_fitted(self, "Q3") @@ -116,18 +137,17 @@ def description(self): class BeyondQuartileOutlierRemover(TransformerStep): - """Remove values outside (Q1, Q3) + """ + Remove values outside (Q1, Q3) If clip is True the values will be clipped between the range, otherwise the values are going to be replaced by inf and -inf - - Parameters: - lower_quantile: Lower quantile threshold for the non-anomalous values - upper_quantile: Upper quantile threshold for the non-anomalous values - clip: Wether to clip the values outside the range.
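A minimal usage sketch of `IQROutlierRemover` with the defaults documented above; the import path comes from this patch's file header and the data are synthetic. With `clip=False`, a point outside (Q1 - margin*IQR, Q3 + margin*IQR) should come back as +/- inf rather than being clipped.

```python
import numpy as np
import pandas as pd
from ceruleo.transformation.features.outliers import IQROutlierRemover  # module path from the diff header

rng = np.random.default_rng(0)
X = pd.DataFrame({"sensor": rng.normal(0.0, 1.0, 500)})
X.loc[10, "sensor"] = 50.0  # inject an obvious outlier

remover = IQROutlierRemover(lower_quantile=0.25, upper_quantile=0.75, margin=1.5, clip=False)
remover.fit(X)
print(remover.transform(X).loc[10, "sensor"])  # expected to be inf
```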
- + lower_quantile: Lower quantile threshold for the non-anomalous values, by default 0.25 + upper_quantile: Upper quantile threshold for the non-anomalous values, by default 0.75 + clip: Wether to clip the values outside the range, by default False + name: Name of the step, by default None """ def __init__( @@ -150,7 +170,13 @@ def __init__( self.Q3 = None self.quantile_estimator = None - def partial_fit(self, X): + def partial_fit(self, X: pd.DataFrame): + """ + Compute the quantiles of the data incrementally + + Parameters: + X: Input life + """ if X.shape[0] == 1: return self if self.quantile_estimator is None: @@ -161,7 +187,13 @@ def partial_fit(self, X): self.quantile_estimator.update(X.select_dtypes(include="number")) return self - def fit(self, X): + def fit(self, X: pd.DataFrame): + """ + Compute the quantiles of the data + + Parameters: + X: Input life + """ if self.subsample < 1: sampled_points = np.random.choice( @@ -173,7 +205,16 @@ def fit(self, X): return self - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Remove the outliers from the input life. + + Parameters: + X: Input life + + Returns: + A new DataFrame with the outliers removed + """ if self.Q1 is None: self.Q1 = self.quantile_estimator.estimate_quantile(self.lower_quantile) @@ -199,11 +240,16 @@ def description(self): class ZScoreOutlierRemover(TransformerStep): """ - X = np.random.rand(500, 5) * np.random.randn(500, 5) * 15 - imput = ZScoreImputer(1.5) - imput.fit(X) - X_t = imput.transform(X) + Remove values outside (mean - number_of_std_allowed*std, mean + number_of_std_allowed*std). The outliers are set to NaN + + Parameters: + number_of_std_allowed: Number of standard deviations to consider a point an outlier + name: Name of the step, by default None """ + #X = np.random.rand(500, 5) * np.random.randn(500, 5) * 15 + #imput = ZScoreImputer(1.5) + #imput.fit(X) + #X_t = imput.transform(X) def __init__( self, @@ -216,11 +262,26 @@ def __init__( self.number_of_std_allowed = number_of_std_allowed self.scaler = StandardScaler() - def fit(self, X): + def fit(self, X: pd.DataFrame): + """ + Fit a StandardScaler to the data + + Parameters: + X: Input life + """ self.scaler.fit(X) return self - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Remove the outliers from the input life. 
+ + Parameters: + X: Input life + + Returns: + A new DataFrame with the outliers removed + """ X_new = self.scaler.transform(X) X_new[np.abs(X_new) > self.number_of_std_allowed] = np.nan return pd.DataFrame(X_new, columns=X.columns, index=X.index) @@ -229,12 +290,17 @@ def transform(self, X): class EWMAOutOfRange(TransformerStep): """ Compute the EWMA limits and mark as NaN points outside UCL and LCL + + Parameters: + lambda_: Parameter for the EWMA, by default 0.5 + return_mask: Wether to return a mask with the outliers or the original data with the outliers marked as NaN, by default False + name: Name of the step, by default None """ def __init__( self, *, - lambda_=0.5, + lambda_ : float=0.5, return_mask: bool = False, name: Optional[str] = None, prefer_partial_fit: bool = False, @@ -246,7 +312,13 @@ def __init__( self.columns = None self.return_mask = return_mask - def partial_fit(self, X, y=None): + def partial_fit(self, X: pd.DataFrame, y=None): + """ + Compute the EWMA limits incrementally + + Parameters: + X: Input life + """ if self.columns is None: self.columns = X.columns.values else: @@ -267,14 +339,29 @@ def _compute_limits(self, X): LCL = mean - 3 * s return (pd.Series(LCL, index=self.columns), pd.Series(UCL, index=self.columns)) - def fit(self, X, y=None): + def fit(self, X: pd.DataFrame, y=None): + """ + Compute the EWMA limits + + Parameters: + X: Input life + """ self.columns = X.columns LCL, UCL = self._compute_limits(X) self.LCL = LCL self.UCL = UCL return self - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Remove the outliers from the input life. + + Parameters: + X: Input life + + Returns: + A new DataFrame with the outliers removed + """ mask = (X[self.columns] < (self.LCL)) | (X[self.columns] > (self.UCL)) if self.return_mask: return mask.astype("int") @@ -283,8 +370,19 @@ def transform(self, X): X[mask] = np.nan return X +# We can do the same with Rolling Median class RollingMeanOutlierRemover(TransformerStep): + """ + Compute the rolling mean and use it to compute the upper and lower bound to define outliers + + Parameters: + window: Window for the rolling mean, by default 15 + lambda_: Multiplier of the std used to define the bounds, by default 3 + return_mask: Wether to return a mask with the outliers or the original data with the outliers marked as NaN, by default False + name: Name of the step, by default None + """ + def __init__( self, *, @@ -298,7 +396,16 @@ def __init__( self.lambda_ = lambda_ self.return_mask = return_mask - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Remove the outliers from the input life. + + Parameters: + X: Input life + + Returns: + A new DataFrame with the outliers removed + """ r = X.rolling(self.window, min_periods=1) std = r.quantile(0.75) - r.quantile(0.25) upper = r.median() + (self.lambda_ * std) @@ -316,17 +423,39 @@ def transform(self, X): class IsolationForestOutlierRemover(TransformerStep): + """ + Remove outliers using Isolation Forests to detect them. 
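The bound computation shown in `RollingMeanOutlierRemover.transform` (rolling median plus `lambda_` times the rolling inter-quartile spread) can be reproduced with plain pandas. The lower bound and the masking step are not visible in this hunk, so they are assumed here to mirror the upper bound; the data and column name are invented.

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
X = pd.DataFrame({"sensor": rng.normal(0.0, 1.0, 200)})
X.loc[50, "sensor"] = 25.0  # spike

window, lambda_ = 15, 3  # documented defaults
r = X.rolling(window, min_periods=1)
spread = r.quantile(0.75) - r.quantile(0.25)   # robust rolling spread, as in the hunk
upper = r.median() + lambda_ * spread
lower = r.median() - lambda_ * spread          # assumed symmetric lower bound
mask = (X > upper) | (X < lower)
print(int(mask["sensor"].sum()), X.mask(mask).loc[50, "sensor"])  # the spike becomes NaN
```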
+ + Parameters: + n_estimators: Number of trees in the forest, by default 100 + name: Name of the step, by default None + """ def __init__(self, *, n_estimators=100, **kwargs): super().__init__(prefer_partial_fit=False, **kwargs) self.n_estimators = n_estimators self.forests = {} def fit(self, X: pd.DataFrame): + """ + Fit the Isolation Forest model to the data + + Parameters: + X: Input life + """ for c in X.columns: self.forests[c] = IsolationForest(n_estimators=self.n_estimators).fit(X[c].values.reshape(-1, 1) ) return self - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Remove the outliers from the input life. + + Parameters: + X: Input life + + Returns: + A new DataFrame with the outliers removed + """ X_new = X.copy() for c in X.columns: r = self.forests[c].predict(X[c].values.reshape(-1, 1) ) diff --git a/ceruleo/transformation/features/resamplers.py b/ceruleo/transformation/features/resamplers.py index 47ecba96..8649660d 100644 --- a/ceruleo/transformation/features/resamplers.py +++ b/ceruleo/transformation/features/resamplers.py @@ -15,19 +15,25 @@ def transform(self, X: pd.DataFrame): class IndexMeanResampler(TransformerStep): """Resample - - When the index of the run-to-failure cycle is a time feature Parameters: - - rule: + rule: Time frequency or rule according to which the data should be resampled """ - def __init__(self, *, rule, **kwargs): + def __init__(self, *, rule: str, **kwargs): super().__init__(**kwargs) self.rule = rule - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + The resampled DataFrame + """ return X.resample(self.rule).mean().dropna() @@ -38,7 +44,6 @@ class SubsamplerTransformer(TransformerStep): Resample the time series with an integer index and interpolate linearly the values Parameters: - time_feature: Time feature steps: Number of steps drop_time_feature: Drop the time feature @@ -54,14 +59,11 @@ def __init__(self, *args, time_feature: str, steps: int, drop_time_feature: bool def partial_fit(self, X: pd.DataFrame): """Obtain the name of the feature used as time - Parameters - ---------- - X : pd.DataFrame - The current time-series to be fitted + Parameters: + X: The current time-series to be fitted - Returns - ------- - self + Returns: + Instance of class IntegerIndexResamplerTransformer """ if self._time_feature is None: @@ -71,7 +73,16 @@ def partial_fit(self, X: pd.DataFrame): return self - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + The resampled DataFrame + """ X = X.groupby(X[self._time_feature] // self.steps, sort=False).mean() if self.drop_time_feature: X = X.drop(columns=[self._time_feature]) @@ -80,18 +91,13 @@ def transform(self, X: pd.DataFrame): class IntegerIndexResamplerTransformer(TransformerStep): - """IntegerIndexResamplerTransformer - + """ Resample the time series with an integer index and interpolate linearly the values - Parameters - ---------- - time_feature : str - Time feature - steps : int - Number of steps - drop_time_feature: bool - Drop the time feature + Parameters: + time_feature: Time feature + steps: Number of steps + drop_time_feature: Drop the time feature """ def __init__(self, *args, time_feature: str, steps: int, drop_time_feature: bool): @@ -102,23 +108,29 @@ def 
__init__(self, *args, time_feature: str, steps: int, drop_time_feature: bool self.drop_time_feature = drop_time_feature def partial_fit(self, X: pd.DataFrame): - """Obtain the name of the feature used as time - - Parameters - ---------- - X : pd.DataFrame - The current time-series to be fitted + """ + Obtain the name of the feature used as time - Returns - ------- - self + Parameters: + X: The current time-series to be fitted + Returns: + Instance of class IntegerIndexResamplerTransformer """ if self._time_feature is None: self._time_feature = self.find_feature(X, self._time_feature_name) return self - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The input life + + Returns: + The resampled DataFrame + """ Told = X[self._time_feature].values Tnew = np.arange(Told.min(), Told.max(), self.steps) new_columns = X.columns.values diff --git a/ceruleo/transformation/features/rolling_windows.py b/ceruleo/transformation/features/rolling_windows.py index 7dbd27d7..55b50006 100644 --- a/ceruleo/transformation/features/rolling_windows.py +++ b/ceruleo/transformation/features/rolling_windows.py @@ -1,8 +1,15 @@ import numpy as np +from typing import Callable +# Window len = L, Stride len/stepsize = S +def _strided_app(a: np.array, L: int, S: int) -> np.array: + """ + Returns an array that is strided -def _strided_app(a, L, S): # Window len = L, Stride len/stepsize = S - """returns an array that is strided + Parameters: + a: Array to be strided + L: Length of the window + S: Stride (S=L/stepsize) """ nrows = ((a.size-L)//S)+1 n = a.strides[0] @@ -18,27 +25,20 @@ def _strided_app(a, L, S): # Window len = L, Stride len/stepsize = S return l -def apply_rolling_data(values : np.ndarray, function, window, step=1): - """Perform a rolling window analysis at the column `col` from `data` +def apply_rolling_data(values : np.ndarray, function: Callable[[np.ndarray], np.ndarray], window: int, step: int =1) -> np.array: + """ + Perform a rolling window analysis at the column `col` from `data` - Given a dataframe `data` with time series, call `function` at - sections of length `window` at the data of column `col`. Append - the results to `data` at a new columns with name `label`. + Given a dataframe `data` with time series, call `function` at sections of length `window` at the data of column `col`. Append the results to `data` at a new columns with name `label`. Parameters: - - data: 1-D Time series of data - function: Function to be called to calculate the rolling window - analysis, the function must receive as input an array or - pandas series. Its output must be either a number or a pandas - series - window: length of the window to perform the analysis - step: step to take between two consecutive windows + values: 1-D Time series of data + function: Function to be called to calculate the rolling window analysis, the function must receive as input an array or pandas series. 
Its output must be either a number or a pandas series + window: Length of the window to perform the analysis + step: Step to take between two consecutive windows, by default 1 Returns: - ------- data: Columns generated by the function applied - """ x = _strided_app(values, window, step) diff --git a/ceruleo/transformation/features/scalers.py b/ceruleo/transformation/features/scalers.py index f98a6629..3129af39 100644 --- a/ceruleo/transformation/features/scalers.py +++ b/ceruleo/transformation/features/scalers.py @@ -8,28 +8,22 @@ class RobustMinMaxScaler(TransformerStep): - """Scale features using statistics that are robust to outliers. + """ + Scale features using statistics that are robust to outliers. - This Scaler scales the data according to the quantile range + This Scaler scales the data according to the quantile range. The IQR is the range between the limits provided, by default, 1st quartile (25th quantile) and the 3rd quartile (75th quantile). The quantiles are approximated using tdigest - Parameters - ---------- - range : tuple - Desired range of transformed data. - clip : bool, optional - Set to True to clip transformed values of held-out data to provided, by default True - lower_quantile : float, optional - Lower limit of the quantile range to compute the scale, by default 0.25 - upper_quantile : float, optional - Upper limit of the quantile range to compute the scale, by default 0.75 - tdigest_size : Optional[int], optional - Size of the t-digest structure, by default 100 - name : Optional[str], optional - Name of the step, by default None + Parameters: + range: Desired range of transformed data. + clip: Set to True to clip transformed values of held-out data to provided, by default True + lower_quantile: Lower limit of the quantile range to compute the scale, by default 0.25 + upper_quantile: Upper limit of the quantile range to compute the scale, by default 0.75 + tdigest_size: Size of the t-digest structure, by default 100 + name: Name of the step, by default None """ def __init__( @@ -66,15 +60,37 @@ def _compute_quantiles(self): def partial_fit(self, df: pd.DataFrame, y=None): + """ + Compute the quantiles of the dataset + + Parameters: + df: The input dataset + """ self.quantile_estimator.update(df) return self def fit(self, df: pd.DataFrame, y=None): + """ + Compute the quantiles of the dataset + + Parameters: + df: The input dataset + """ self.quantile_estimator.update(df) self._compute_quantiles() return self - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Scale the input dataset + + Parameters: + X: The input dataset + + Returns: + A new DataFrame with the same index as the input with the + data scaled with respect to the quantiles of the fiited dataset + """ if self.Q1 is None: self._compute_quantiles() @@ -94,19 +110,16 @@ def transform(self, X: pd.DataFrame): class MinMaxScaler(TransformerStep): - """Transform features by scaling each feature to a given range. + """ + Transform features by scaling each feature to a given range. This transformer scales and translates each feature individually such that it is in the given range on the training set. - Parameters - ---------- - range : tuple - Desired range of transformed data. - clip : bool, optional - Set to True to clip transformed values of held-out data to provided, by default True - name : Optional[str], optional - Name of the step, by default None + Parameters: + range: Desired range of transformed data. 
+ clip: Set to True to clip transformed values of held-out data to provided, by default True + name: Name of the step, by default None """ def __init__( @@ -124,7 +137,13 @@ def __init__( self.data_max = None self.clip = clip - def partial_fit(self, df, y=None): + def partial_fit(self, df: pd.DataFrame, y=None): + """ + Compute the dataset's bounds + + Parameters: + df: The input dataset + """ partial_data_min = df.min(skipna=True) partial_data_max = df.max(skipna=True) if self.data_min is None: @@ -139,13 +158,28 @@ def partial_fit(self, df, y=None): ) return self - def fit(self, df, y=None): + def fit(self, df: pd.DataFrame, y=None): + """ + Compute the dataset's bounds + + Parameters: + df: The input dataset + """ self.data_min = df.min(skipna=True) self.data_max = df.max(skipna=True) return self - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Scale the input dataset + + Parameters: + X: The input dataset + + Returns: + A new DataFrame with the same index as the input with the data scaled in the range inserted in input + """ try: divisor = self.data_max - self.data_min @@ -169,13 +203,11 @@ def description(self): class StandardScaler(TransformerStep): - """Standardize features by removing the mean and scaling to unit variance. - - Parameters - ---------- - name : Optional[str], optional - Name of the step, by default None + """ + Standardize features by removing the mean and scaling to unit variance. + Parameters: + name: Name of the step, by default None """ def __init__(self, *, name: Optional[str] = None): @@ -183,7 +215,13 @@ def __init__(self, *, name: Optional[str] = None): self.std = None self.mean = None - def partial_fit(self, df, y=None): + def partial_fit(self, df: pd.DataFrame, y=None): + """ + Compute mean and std of the dataset + + Parameters: + df: The input dataset + """ if df.shape[0] < 15: return self partial_data_mean = df.mean() @@ -196,20 +234,39 @@ def partial_fit(self, df, y=None): self.std = pd.concat([self.std, partial_data_std], axis=1).mean(axis=1) return self - def fit(self, df, y=None): + def fit(self, df: pd.DataFrame, y=None): + """ + Compute mean and std of the dataset + + Parameters: + df: The input dataset + """ self.mean = df.mean() self.std = df.std() return self - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Scale the input dataset + Parameters: + X: The input dataset + + Returns: + A new DataFrame with the same index as the input with the data scaled to have null mean and unit variance + """ return (X - self.mean) / (self.std) class RobustStandardScaler(TransformerStep): - """Scale features using statistics that are robust to outliers.""" + """ + Scale features using statistics that are robust to outliers. 
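As a plain-pandas reference for the two scalers documented above (the ceruleo classes additionally support incremental fitting through `partial_fit`), the following sketch applies the documented formulas to invented data: min-max scaling to a target range with clipping of held-out values, and standardization to zero mean and unit variance.

```python
import pandas as pd

df_train = pd.DataFrame({"sensor": [10.0, 20.0, 30.0, 40.0]})
df_new = pd.DataFrame({"sensor": [15.0, 45.0]})

# Min-max scaling to a target range (a, b), as documented for MinMaxScaler
a, b = -1.0, 1.0
data_min, data_max = df_train.min(), df_train.max()
scaled = (df_new - data_min) / (data_max - data_min) * (b - a) + a
clipped = scaled.clip(a, b)  # what clip=True would do for held-out data

# Standardization, as documented for StandardScaler: (X - mean) / std
standardized = (df_new - df_train.mean()) / df_train.std()
print(clipped, standardized, sep="\n")
```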
+ + Parameters: + quantile_range: Desired quantile range of transformed data, by defualt (0.25,0.75) + """ - def __init__(self, *, quantile_range=(0.25, 0.75), prefer_partial_fit:bool = False, **kwargs): + def __init__(self, *, quantile_range: tuple=(0.25, 0.75), prefer_partial_fit:bool = False, **kwargs): super().__init__( **kwargs,prefer_partial_fit=prefer_partial_fit) self.quantile_range = quantile_range self.quantile_estimator = QuantileEstimator() @@ -217,18 +274,11 @@ def __init__(self, *, quantile_range=(0.25, 0.75), prefer_partial_fit:bool = Fal self.median = None def fit(self, X: pd.DataFrame, y=None): - """Compute the mean of the dataset - - Parameters - ---------- - X : pd.DataFrame - the input dataset - + """ + Compute the mean of the dataset - Returns - ------- - MeanCentering - self + Parameters: + X: the input dataset """ Q1 = X.quantile(self.quantile_range[0]) Q3 = X.quantile(self.quantile_range[1]) @@ -236,17 +286,11 @@ def fit(self, X: pd.DataFrame, y=None): self.median = X.median() def partial_fit(self, X: pd.DataFrame, y=None): - """Compute incrementally the mean of the dataset - - Parameters - ---------- - X : pd.DataFrame - the input life + """ + Compute incrementally the mean of the dataset - Returns - ------- - MeanCentering - self + Parameters: + X: the input dataset """ if X.shape[0] < 2: return self @@ -264,18 +308,15 @@ def _compute_quantiles(self): self.median = self.quantile_estimator.quantile(0.5) def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Center the input life + """ + Center the input life - Parameters - ---------- - X : pd.DataFrame + Parameters: + X: pd.DataFrame The input life - Returns - ------- - pd.DataFrame - A new DataFrame with the same index as the input with the - data centered with respect to the mean of the fiited dataset + Returns: + A new DataFrame with the same index as the input with the data centered with respect to the mean of the fiited dataset """ if self.IQR is None: self._compute_quantiles() @@ -284,13 +325,10 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: class ScaleInvRUL(TransformerStep): """ - Scale binary columns according the inverse of the RUL. 
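For `RobustStandardScaler` the transform itself is not shown in this hunk; the sketch below assumes the usual robust-scaling formula `(X - median) / IQR`, which is consistent with the median and inter-quartile range the class computes, and uses invented data to show that a single extreme value barely moves the fitted statistics.

```python
import pandas as pd

df = pd.DataFrame({"sensor": [1.0, 2.0, 2.5, 3.0, 100.0]})  # one extreme value

q1, q3 = df.quantile(0.25), df.quantile(0.75)
iqr = q3 - q1          # inter-quartile range, unaffected by the extreme value
median = df.median()   # also unaffected by the extreme value

# Assumed robust-scaling formula: (X - median) / IQR
robust_scaled = (df - median) / iqr
print(robust_scaled)
```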
- Usually this will be used before a CumSum transformation + Scale binary columns according to the inverse of the RUL.Usually this will be used before a CumSum transformation - Parameters - ---------- - rul_column: str - Column with the RUL + Parameters: + rul_column: Column with the RUL """ def __init__(self, *,rul_column: str, name: Optional[str] = None): @@ -301,6 +339,12 @@ def __init__(self, *,rul_column: str, name: Optional[str] = None): self.rul_column = None def partial_fit(self, X: pd.DataFrame): + """ + Fit the scaler + + Parameters: + X: The input dataset + """ if self.rul_column is None: self.rul_column = self.column_name(X, self.rul_column_in) columns = [c for c in X.columns if c != self.rul_column] @@ -322,7 +366,16 @@ def partial_fit(self, X: pd.DataFrame): self.penalty[k] = 1 / np.median(self.RUL_list_per_column[k]) - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Scale the input dataset + + Parameters: + X: The input dataset + + Returns: + A new DataFrame with the same index as the input with the data scaled with respect to the RUL + """ columns = [c for c in X.columns if c != self.rul_column] X_new = pd.DataFrame(index=X.index) for c in columns: @@ -332,24 +385,19 @@ def transform(self, X: pd.DataFrame): class PerCategoricalMinMaxScaler(TransformerStep): - """Performs a minmax scaler partition the data trough some categorical feature + """ + Performs a minmax scaler partition of the data trough some categorical feature Usually, different execution configurations lead to different scales in the features. Therefore, sometimes it is useful to scale the data based on a categorical feature, to reflect the difference in the execution parameters. - Parameters - ---------- - categorical_feature: str - The name of the categorical feature whose values are going to be used - to split each time-series - scaler: Optional[Union[MinMaxScaler,RobustMinMaxScaler]], default MinMaxScaler - The scale to use when scaling the data - scaler_params: dict - Parameters used when constructing the scaler - name: Optional[str] - Name of the transformer - + Parameters: + categorical_feature: str + The name of the categorical feature whose values are going to be used to split each time-series + scaler: The scaler to use when scaling the data, by default MinMaxScaler + scaler_params: Parameters used when constructing the scaler, by default {} + name: Name of the step, by default None """ def __init__( @@ -368,7 +416,13 @@ def __init__( self.scalers = {"default": self.scaler(**self.scaler_params)} - def partial_fit(self, X, y=None): + def partial_fit(self, X: pd.DataFrame, y=None): + """ + Fit the scaler + + Parameters: + X: The input dataset + """ if self.categorical_feature_name is None: self.categorical_feature_name = self.find_feature( X, self.categorical_feature @@ -380,8 +434,16 @@ def partial_fit(self, X, y=None): self.scalers[category].partial_fit(data) self.scalers["default"].partial_fit(data) - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Scale the input dataset using the appropriate scaler for each category + Parameters: + X: The input dataset + + Returns: + A new DataFrame with the same index as the input with the data scaled with respect to the categorical feature + """ X_new = X.drop(columns=[self.categorical_feature_name]) for category, data in X.groupby(self.categorical_feature_name): diff --git a/ceruleo/transformation/features/selection.py 
b/ceruleo/transformation/features/selection.py index 7779e26d..2a3ea411 100644 --- a/ceruleo/transformation/features/selection.py +++ b/ceruleo/transformation/features/selection.py @@ -15,8 +15,7 @@ class ByNameFeatureSelector(TransformerStep): """Select a subset of feature by name Parameters: - - features: Feature name or List of features name to select + features: Feature name or List of features name to select """ def __init__(self, *, features:Union[str, List[str]]= [], name: Optional[str] = None): super().__init__(name=name) @@ -40,7 +39,13 @@ def partial_fit(self, df, y=None): ] return self - def fit(self, df, y=None): + def fit(self, df:pd.DataFrame, y=None): + """ + Find the indices of the features to select + + Parameters: + df: DataFrame containing the input life + """ if len(self.features) > 0: features = [f for f in self.features if f in set(df.columns)] else: @@ -48,7 +53,16 @@ def fit(self, df, y=None): self.features_computed_ = sorted(features) return self - def transform(self, X): + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new DataFrame containing only the selected features + """ return X.loc[:, self.features_computed_].copy() @property @@ -67,11 +81,27 @@ def __str__(self): class PositionFeatures(TransformerStep): - def __init__(self, *, features, name: Optional[str] = None): + """ + Reorder the features of the input life + + Parameters: + features: Dictionary containing the features to reorder and their new position + name: Name of the step, by default None + """ + def __init__(self, *, features: dict, name: Optional[str] = None): super().__init__(name=name) self.features = features - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life by reordering the features + + Parameters: + X: The input life to be transformed + + Returns: + A new DataFrame containing the features in the order specified in the constructor + """ cols = list(X.columns) for name, pos in self.features.items(): a, b = cols.index(name), pos @@ -81,16 +111,38 @@ def transform(self, X): class DiscardByNameFeatureSelector(TransformerStep): - def __init__(self, *, features=[], name: Optional[str] = None): + """ + Remove a list of features from the input life + + Parameters: + features: List of features to discard + name: Name of the step, by default None + """ + def __init__(self, *, features: List=[], name: Optional[str] = None): super().__init__(name=name) self.features = features self.features_indices = None - def fit(self, df, y=None): + def fit(self, df:pd.DataFrame, y=None): + """ + Find the indices of the features to discard + + Parameters: + df: DataFrame containing the set of features to discard + """ self.feature_columns = [f for f in df.columns if f not in self.features] return self - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new DataFrame containing only the features not in the list of features to discard + """ return X.loc[:, self.feature_columns] @property @@ -99,12 +151,25 @@ def n_features(self): class PandasVarianceThreshold(TransformerStep): + """ + Remove features with variance lower than a variance threshold inserted in input + + Parameters: + min_variance: Minimum variance threshold + name: Name of the step, by default None + """ def __init__(self, *, min_variance: float, 
name: Optional[str] = None): super().__init__(name=name) self.min_variance = min_variance self.selected_columns_ = None - def partial_fit(self, X, y=None): + def partial_fit(self, X:pd.DataFrame, y=None): + """ + Find the indexes of the features with variance higher than the threshold + + Parameters: + X: DataFrame containing the input life + """ variances_ = X.var(skipna=True) partial_selected_columns_ = X.columns[variances_ > self.min_variance] if ( @@ -129,8 +194,13 @@ def partial_fit(self, X, y=None): logger.warning("All features were removed") return self - def fit(self, X, y=None): + def fit(self, X:pd.DataFrame, y=None): + """ + Find the indexes of the features with variance higher than the threshold + Parameters: + X: DataFrame containing the input life + """ if not isinstance(X, pd.DataFrame): raise ValueError("Input array must be a data frame") self.variances_ = X.var(skipna=True) @@ -140,19 +210,41 @@ def fit(self, X, y=None): ) return self - def transform(self, X, y=None): + def transform(self, X:pd.DataFrame, y=None) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new life containing only the features with variance higher than the threshold + """ if not isinstance(X, pd.DataFrame): raise ValueError("Input array must be a data frame") return X[self.selected_columns_].copy() class NullProportionSelector(TransformerStep): + """ + Remove features with null proportion higher than a threshold inserted in input + + Parameters: + max_null_proportion: Maximum null proportion threshold + name: Name of the step, by default None + """ def __init__(self, *, max_null_proportion: float, name: Optional[str] = None): super().__init__(name=name) self.max_null_proportion = max_null_proportion self.selected_columns_ = None - def partial_fit(self, X, y=None): + def partial_fit(self, X:pd.DataFrame, y=None): + """ + Find the indexes of the features with null proportion lower than the threshold + + Parameters: + X: DataFrame containing the input life + """ null_proportion = X.isnull().mean() partial_selected_columns_ = X.columns[ @@ -175,7 +267,13 @@ def partial_fit(self, X, y=None): logger.warning("All features were removed") return self - def fit(self, X, y=None): + def fit(self, X:pd.DataFrame, y=None): + """ + Find the indexes of the features with null proportion lower than the threshold + + Parameters: + X: DataFrame containing the input life + """ if not isinstance(X, pd.DataFrame): raise ValueError("Input array must be a data frame") self.null_proportion = X.isnull().mean() @@ -184,7 +282,16 @@ def fit(self, X, y=None): ] return self - def transform(self, X, y=None): + def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new life containing only the features with null proportion lower than the threshold + """ if not isinstance(X, pd.DataFrame): raise ValueError("Input array must be a data frame") return X[self.selected_columns_].copy() @@ -202,27 +309,40 @@ def __init__(self, *, pattern:str, name: Optional[str] = None): self.selected_columns_ = None - def partial_fit(self, df, y=None): + def partial_fit(self, df: pd.DataFrame, y=None): + + """ + Find the features matching the pattern + + Parameters: + df: DataFrame containing the entire set of features + """ if self.selected_columns_ is None: self.selected_columns_ = [f for f in df.columns if self.pattern in f ] return self - def transform(self, X, y=None): + 
def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new life with the same index as the input with the missing values replaced by the value in the succesive timestamp + """ if not isinstance(X, pd.DataFrame): raise ValueError("Input array must be a data frame") return X[self.selected_columns_].copy() - - class ByTypeFeatureSelector(TransformerStep): """Select a subset of feature by type Parameters: - - features: Feature name or List of features name to select, by default [] + type_: Data type to be selected, by default [] """ def __init__(self, *, type_:Union[str, List]= [], name: Optional[str] = None): super().__init__(name=name) @@ -241,7 +361,16 @@ def fit(self, df, y=None): return self - def transform(self, X): + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life + + Parameters: + X: The input life to be transformed + + Returns: + A new DataFrame containing only the features of the selected type + """ return X.loc[:, self.features].copy() @property diff --git a/ceruleo/transformation/features/slicing.py b/ceruleo/transformation/features/slicing.py index 9e5db2eb..a29a04a6 100644 --- a/ceruleo/transformation/features/slicing.py +++ b/ceruleo/transformation/features/slicing.py @@ -14,11 +14,12 @@ class SliceRows(TransformerStep): - """Slice portion of the run-to-failure cycle + """ + Slice portion of the run-to-failure cycle Parameters: - initial: Initial position of the slice - final: Final position of the slice + initial: Initial position of the slice, default RelativeToStart(0) + final: Final position of the slice, default RelativeToEnd(0) """ def __init__( @@ -35,6 +36,15 @@ def __init__( self.final = final def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Slice the run-to-failure cycle + + Parameters: + X: Input dataframe + + Returns: + A dataframe with the sliced run-to-failure cycle + """ if isinstance(self.initial, RelativePosition): initial = self.initial.get(X.shape[0]) else: diff --git a/ceruleo/transformation/features/split.py b/ceruleo/transformation/features/split.py index b1638aed..09beeff3 100644 --- a/ceruleo/transformation/features/split.py +++ b/ceruleo/transformation/features/split.py @@ -11,7 +11,19 @@ class Joiner(TransformerStep): - def transform(self, X: List[pd.DataFrame]): + """ + Join multiple run-to-failure cycles into a single DataFrame + """ + def transform(self, X: List[pd.DataFrame]) -> pd.DataFrame: + """ + Join the input lifes + + Parameters: + X: List of run-to-failure cycles to join + + Returns: + A dataframe with the joined run-to-failure cycles + """ if isinstance(X, list): X_default = X[0] X_q = pd.concat(X[1:]) @@ -21,8 +33,14 @@ def transform(self, X: List[pd.DataFrame]): else: return X - class Filter(TransformerStep): + """ + Filter rows of a dataframe based on a query + + Parameters: + values: Values to filter by + columns: Columns to filter by + """ def __init__( self, *, @@ -43,7 +61,16 @@ def prepare_value(v): [f"({c} == {prepare_value(v)})" for c, v in zip(self.columns, self.values)] ) - def transform(self, X): + def transform(self, X:pd.DataFrame) -> pd.DataFrame: + """ + Filter the dataframe + + Parameters: + X: Input dataframe + + Returns: + A dataframe with the filtered rows + """ if self.values == ["__category_all__"]: return X.drop(columns=self.columns) else: diff --git a/ceruleo/transformation/features/transformation.py b/ceruleo/transformation/features/transformation.py index 
80278838..feecedb6 100644 --- a/ceruleo/transformation/features/transformation.py +++ b/ceruleo/transformation/features/transformation.py @@ -17,34 +17,21 @@ def __init__(self, *args, **kwargs): self.sum = None def fit(self, X: pd.DataFrame, y=None): - """Compute the mean of the dataset - - Parameters - ---------- - X : pd.DataFrame - the input dataset - + """ + Compute the mean of the dataset - Returns - ------- - MeanCentering - self + Parameters: + X: The input dataset """ self.mean = X.mean() return self def partial_fit(self, X: pd.DataFrame, y=None): - """Compute incrementally the mean of the dataset - - Parameters - ---------- - X : pd.DataFrame - the input life + """ + Compute incrementally the mean of the dataset - Returns - ------- - MeanCentering - self + Parameters: + X: The input life """ if self.sum is None: self.sum = X.sum() @@ -58,22 +45,17 @@ def partial_fit(self, X: pd.DataFrame, y=None): def transform(self, X: pd.DataFrame) -> pd.DataFrame: """Center the input life - Parameters - ---------- - X : pd.DataFrame - The input life - - Returns - ------- - pd.DataFrame - A new DataFrame with the same index as the input with the - data centered with respect to the mean of the fiited dataset + Parameters: + X: The input life + + Returns: + A new DataFrame with the same index as the input with the data centered with respect to the mean of the fitted dataset """ return X - self.mean class MedianCentering(TransformerStep): - """Center the data with respect to the mean""" + """Center the data with respect to the median""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -81,34 +63,21 @@ def __init__(self, *args, **kwargs): self.median = None def fit(self, X: pd.DataFrame, y=None): - """Compute the mean of the dataset - - Parameters - ---------- - X : pd.DataFrame - the input dataset - + """ + Compute the median of the dataset - Returns - ------- - MeanCentering - self + Parameters: + X: The input dataset """ self.median = X.median() return self def partial_fit(self, X: pd.DataFrame, y=None): - """Compute incrementally the mean of the dataset - - Parameters - ---------- - X : pd.DataFrame - the input life + """ + Compute incrementally the median of the dataset - Returns - ------- - MeanCentering - self + Parameters: + X: The input life """ if X.shape[0] < 2: return self @@ -127,18 +96,14 @@ def partial_fit(self, X: pd.DataFrame, y=None): return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Center the input life - - Parameters - ---------- - X : pd.DataFrame - The input life + """ + Center the input life - Returns - ------- - pd.DataFrame - A new DataFrame with the same index as the input with the - data centered with respect to the mean of the fiited dataset + Parameters: + X: The input life + + Returns: + A new DataFrame with the same index as the input with the data centered with respect to the median of the fitted dataset """ return X - self.median @@ -147,18 +112,14 @@ class Square(TransformerStep): """Compute the square of the values of each feature""" def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform the input life with the square of the values - - Parameters - ---------- - X : pd.DataFrame - The input life + """ + Transform the input life with the square of the values - Returns - ------- - pd.DataFrame - A new dataframe with the same index as the input with - the square of the values + Parameters: + X: The input life + + Returns: + A new dataframe with the same index as the input with the square of the values """ return
X.pow(2) @@ -167,31 +128,25 @@ class Sqrt(TransformerStep): """Compute the sqrt of the values of each feature""" def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform the input life with the sqrt of the values - - Parameters - ---------- - X : pd.DataFrame - The input life + """ + Transform the input life with the sqrt of the values - Returns - ------- - pd.DataFrame - A new dataframe with the same index as the input with - the sqrt of the values + Parameters: + X: The input life + + Returns: + A new dataframe with the same index as the input with the sqrt of the values """ return X.pow(1.0 / 2) class Scale(TransformerStep): - """Scale each feature by a given vaulue - - Parameters - ---------- - scale_factor : float - Scale factor - name : Optional[str], optional - Name of the step, by default None + """ + Scale each feature by a given value + + Parameters: + scale_factor: Scale factor to apply to each feature + name: Name of the step, by default None """ def __init__(self, *, scale_factor: float, name: Optional[str] = None): @@ -199,135 +154,111 @@ def __init__(self, *, scale_factor: float, name: Optional[str] = None): self.scale_factor = scale_factor def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Return the scaled life - - Parameters - ---------- - X : pd.DataFrame - The input life + """ + Return the scaled life - Returns - ------- - pd.DataFrame + Parameters: + X: The input life + + Returns: Return a new DataFrame with the same index as the input with the scaled features """ return X * self.scale_factor class ExpandingCentering(TransformerStep): - """Center the life using an expanding window - - .. highlight:: python - .. code-block:: python - - X - X.expanding().mean() - + """ + Center the life using an expanding window """ - def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform the live centering it using an expanding window + #.. raw:: html + #

Formula: \(X - X.expanding().mean()\)

- Parameters - ---------- - X : pd.DataFrame - The input life + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the life centering it using an expanding window - Returns - ------- - pd.DataFrame - Return a new DataFrame with the same index as the input with the - data centered + Parameters: + X: The input life + + Returns: + Return a new DataFrame with the same index as the input with the data centered """ return X - X.expanding().mean() class RollingCentering(TransformerStep): - """Center the life using an expanding window + """ + Center the life using a rolling window - .. highlight:: python - .. code-block:: python + """ - X - X.rolling().mean() - """ - def __init__(self, *, window: int, min_points: int, name: Optional[str] = None): + def __init__(self, window: int, min_points: int, name: Optional[str] = None): super().__init__(name=name) self.window = window self.min_points = min_points def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform the live centering it using an expanding window - - Parameters - ---------- - X : pd.DataFrame - The input life + """ + Transform the life centering it using a rolling window - Returns - ------- - pd.DataFrame - Return a new DataFrame with the same index as the input with the - data centered + Parameters: + X: The input life + + Returns: + Return a new DataFrame with the same index as the input with the data centered """ return X - X.rolling(window=self.window, min_periods=self.min_points).mean() class ExpandingNormalization(TransformerStep): """Normalize the life features using an expanding window + """ - .. highlight:: python - .. code-block:: python + #.. highlight:: python + #.. code-block:: python - (X - X.expanding().mean()) / (X.expanding().std()) + #(X - X.expanding().mean()) / (X.expanding().std()) - """ - def transform(self, X): - """Transform the live normalized it using an expanding window - - Parameters - ---------- - X : pd.DataFrame - The input life + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the life normalizing it using an expanding window - Returns - ------- - pd.DataFrame - Return a new DataFrame with the same index as the input with the - data normalized + Parameters: + X: The input life + + Returns: + Return a new DataFrame with the same index as the input with the data normalized """ return (X - X.expanding().mean()) / (X.expanding().std()) class Accumulate(TransformerStep): - """Compute the accumulated sum of each feature. + """ + Compute the accumulated sum of each feature. - This is useful for binary features to compute count + This is useful to compute the count of binary features. 
- Parameters - ---------- - normalize: - https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=6621413 - """ + Parameters: + normalize: Whether to apply the normalization or not, by default False + """ def __init__(self, *, normalize: bool = False, name: Optional[str] = None): super().__init__(name=name) self.normalize = normalize def transform(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform the input life computing the cumulated sum - - Parameters - ---------- - X : pd.DataFrame - Input life + """ + Transform the input life computing the cumulative sum - Returns - ------- - pd.DataFrame - Return a new DataFrame with the same index as the input - with the cumulated sum of the features + Parameters: + X: The input life + + Returns: + Return a new DataFrame with the same index as the input with the cumulative sum of the features """ X1 = X.cumsum() if self.normalize: @@ -339,39 +270,32 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: class Diff(TransformerStep): """Compute the 1 step difference of each feature.""" - def transform(self, X): - """Transform the input life computing the 1 step difference + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life computing the 1 step difference - Parameters - ---------- - X : pd.DataFrame - Input life + Parameters: + X: The input life - Returns - ------- - pd.DataFrame - Return a new DataFrame with the same index as the input - with the difference of the features + Returns: + Return a new DataFrame with the same index as the input with the 1 step difference of the features """ return X.diff() class StringConcatenate(TransformerStep): - """Compute the 1 step difference of each feature.""" - - def transform(self, X): - """Transform the input life computing the 1 step difference + """ + Return a new DataFrame with a single column containing the concatenation of the values of each row. The method works only on string columns and the values are separated by -""" - Parameters - ---------- - X : pd.DataFrame - Input life + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life by concatenating the values of each row - Returns - ------- - pd.DataFrame - Return a new DataFrame with the same index as the input - with the difference of the features + Parameters: + X: The input life + + Returns: + Return a new DataFrame with the same index as the input with a single column containing the concatenation of the row values """ new_X = pd.DataFrame(index=X.index) new_X["concatenation"] = X.agg("-".join, axis=1) @@ -379,37 +303,32 @@ def transform(self, X): class Apply(TransformerStep): - """Apply the function element-wise""" + """Apply the input function element-wise""" def __init__(self, *, fun, name: Optional[str] = None): super().__init__(name=name) self.fun = fun - def transform(self, X): - """Transform the input life computing the 1 step difference + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life by applying the input function to each element Parameters - ---------- - X : pd.DataFrame - Input life - - Returns - ------- - pd.DataFrame - Return a new DataFrame with the same index as the input - with the difference of the features + X: The input life + + Returns: + Return a new DataFrame with the results of the function application to each element. 
""" return X.apply(self.fun) class Clip(TransformerStep): - """Clip values onto a predefined range + """ + Clip values onto a predefined range Parameters: lower: The lower value upper: The Upper value - - """ def __init__(self, *, lower: float, upper: float, **kwargs): @@ -417,22 +336,40 @@ def __init__(self, *, lower: float, upper: float, **kwargs): self.lower = lower self.upper = upper - def transform(self, X: pd.DataFrame): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Transform the input life clipping the values onto a predefined range + + Parameters: + X: The Input life + + Returns: + Return a new DataFrame with the same index as the input with the clipped values + """ return X.clip(self.lower, self.upper) class SubstractLinebase(TransformerStep): - """SubstractLinebase""" + """Subtract the values in the first row from all the rows in the input life""" def __init__(self, *args): super().__init__(*args) - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Apply the transformation to the input life + + Parameters: + X: The Input life + + Returns: + Return a new DataFrame with the same index as the input with the subtraction of the first row + """ return X - X.iloc[0, :] class Peaks(TransformerStep): - """Peaks""" + """Find Peaks in the input life. Return a new DataFrame with the same index as the input with 1 in the position of the peaks and 0 otherwise""" distance: float @@ -440,7 +377,16 @@ def __init__(self, *, distance:float, name : Optional[str] = None): super().__init__(name=name) self.distance = distance - def transform(self, X): + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Detect the peaks in the input life + + Parameters: + X: The Input life + + Returns: + Return a new DataFrame with the same index as the input with the peaks marked as 1 and 0 otherwise. 
+ """ new_X = pd.DataFrame(np.zeros(X.shape), index=X.index, columns=X.columns) for i, c in enumerate(X.columns): peaks_positions, _ = find_peaks(X[c].values, distance=self.distance) diff --git a/docs/models/keras/index.md b/docs/models/keras/index.md index dce5a52c..d5c9037a 100644 --- a/docs/models/keras/index.md +++ b/docs/models/keras/index.md @@ -1,4 +1,4 @@ -# Rererence +# Reference ## Dataset diff --git a/docs/transformation/features/cast.md b/docs/transformation/features/cast.md index 119012eb..f867deb3 100644 --- a/docs/transformation/features/cast.md +++ b/docs/transformation/features/cast.md @@ -2,5 +2,5 @@ ::: ceruleo.transformation.features.cast options: - show_source: false + show_source: True heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/denoising.md b/docs/transformation/features/denoising.md index cdfeaca9..dab1e002 100644 --- a/docs/transformation/features/denoising.md +++ b/docs/transformation/features/denoising.md @@ -2,5 +2,5 @@ ::: ceruleo.transformation.features.denoising options: - show_source: false + show_source: True heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/entropy.md b/docs/transformation/features/entropy.md index 5ce8baf9..86826355 100644 --- a/docs/transformation/features/entropy.md +++ b/docs/transformation/features/entropy.md @@ -2,5 +2,5 @@ ::: ceruleo.transformation.features.entropy options: - show_source: false + show_source: True heading_level: 3 diff --git a/docs/transformation/features/extraction.md b/docs/transformation/features/extraction.md index e69de29b..1b222085 100644 --- a/docs/transformation/features/extraction.md +++ b/docs/transformation/features/extraction.md @@ -0,0 +1,5 @@ + +::: ceruleo.transformation.features.extraction + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/extraction_frequency.md b/docs/transformation/features/extraction_frequency.md new file mode 100644 index 00000000..aaa8996d --- /dev/null +++ b/docs/transformation/features/extraction_frequency.md @@ -0,0 +1,6 @@ +## Extraction Frequency + +::: ceruleo.transformation.features.extraction_frequency + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/hurst.md b/docs/transformation/features/hurst.md new file mode 100644 index 00000000..98e66231 --- /dev/null +++ b/docs/transformation/features/hurst.md @@ -0,0 +1,6 @@ +## Hurst Exponent Estimator + +::: ceruleo.transformation.features.hurst + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/imputers.md b/docs/transformation/features/imputers.md index 79109099..a748d32c 100644 --- a/docs/transformation/features/imputers.md +++ b/docs/transformation/features/imputers.md @@ -3,5 +3,5 @@ ::: ceruleo.transformation.features.imputers options: - show_source: false + show_source: True heading_level: 3 diff --git a/docs/transformation/features/operations.md b/docs/transformation/features/operations.md new file mode 100644 index 00000000..be52d1ae --- /dev/null +++ b/docs/transformation/features/operations.md @@ -0,0 +1,6 @@ +## Operations + +::: ceruleo.transformation.features.operations + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/outliers.md b/docs/transformation/features/outliers.md index 17afc7d2..07e42132 100644 --- a/docs/transformation/features/outliers.md +++ 
b/docs/transformation/features/outliers.md @@ -2,5 +2,5 @@ ::: ceruleo.transformation.features.outliers options: - show_source: false + show_source: True heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/resamplers.md b/docs/transformation/features/resamplers.md index 1095835c..e31171c6 100644 --- a/docs/transformation/features/resamplers.md +++ b/docs/transformation/features/resamplers.md @@ -2,5 +2,5 @@ ::: ceruleo.transformation.features.resamplers options: - show_source: false + show_source: True heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/rolling_windows.md b/docs/transformation/features/rolling_windows.md new file mode 100644 index 00000000..3e3ef464 --- /dev/null +++ b/docs/transformation/features/rolling_windows.md @@ -0,0 +1,6 @@ +## Rolling Windows + +::: ceruleo.transformation.features.rolling_windows + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/scalers.md b/docs/transformation/features/scalers.md new file mode 100644 index 00000000..7a3133b5 --- /dev/null +++ b/docs/transformation/features/scalers.md @@ -0,0 +1,6 @@ +## Scaling + +::: ceruleo.transformation.features.scalers + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/selection.md b/docs/transformation/features/selection.md index 86cefa3d..ba06dbc5 100644 --- a/docs/transformation/features/selection.md +++ b/docs/transformation/features/selection.md @@ -1,6 +1,6 @@ -## Resamplers +## Selectors ::: ceruleo.transformation.features.selection options: - show_source: false + show_source: True heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/slicing.md b/docs/transformation/features/slicing.md new file mode 100644 index 00000000..afccddd3 --- /dev/null +++ b/docs/transformation/features/slicing.md @@ -0,0 +1,6 @@ +## Slicing + +::: ceruleo.transformation.features.slicing + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/split.md b/docs/transformation/features/split.md new file mode 100644 index 00000000..73aa907a --- /dev/null +++ b/docs/transformation/features/split.md @@ -0,0 +1,6 @@ +## Split + +::: ceruleo.transformation.features.split + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/docs/transformation/features/transformation.md b/docs/transformation/features/transformation.md new file mode 100644 index 00000000..bbd75cb3 --- /dev/null +++ b/docs/transformation/features/transformation.md @@ -0,0 +1,6 @@ +## Global Transformations + +::: ceruleo.transformation.features.transformation + options: + show_source: True + heading_level: 3 \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index feffa8bc..047c67ad 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -60,9 +60,17 @@ nav: - Denoising: transformation/features/denoising.md - Entropy: transformation/features/entropy.md - Extraction: transformation/features/extraction.md + - Extraction Frequency: transformation/features/extraction_frequency.md - Imputers: transformation/features/imputers.md - Resamplers: transformation/features/resamplers.md - - Resamplers: transformation/features/selection.md + - Selectors: transformation/features/selection.md + - Scalers: transformation/features/scalers.md + - Slicing: transformation/features/slicing.md + - Split: transformation/features/split.md + - Rolling 
Windows: transformation/features/rolling_windows.md + - Operations: transformation/features/operations.md + - Global Transformations: transformation/features/transformation.md + - Hurst Exponent Estimator: transformation/features/hurst.md - Outliers: transformation/features/outliers.md - Results: - Results: results/results.md @@ -97,11 +105,12 @@ markdown_extensions: anchor_linenums: true - pymdownx.inlinehilite - pymdownx.snippets + - pymdownx.tasklist - pymdownx.superfences - toc: toc_depth: 5 - footnotes - pymdownx.emoji: - emoji_index: !!python/name:materialx.emoji.twemoji - emoji_generator: !!python/name:materialx.emoji.to_svg + #emoji_index: !!python/name:materialx.emoji.twemoji + #emoji_generator: !!python/name:materialx.emoji.to_svg - meta \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 7d769fbb..f2cab4ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,5 @@ mkdocs-material mkdocs-jupyter jupyter_contrib_nbextensions mkdocstrings -mkdocs-bibtex \ No newline at end of file +mkdocs-bibtex +notebook==6.4.12 \ No newline at end of file
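
Usage sketch for the transformation steps documented above: a minimal example, assuming the steps can be applied standalone to a pandas DataFrame (the column names and values below are hypothetical, for illustration only):

    import pandas as pd

    from ceruleo.transformation.features.selection import ByNameFeatureSelector
    from ceruleo.transformation.features.transformation import Accumulate, MeanCentering

    # Hypothetical run-to-failure cycle: two sensor readings and a binary alarm flag
    life = pd.DataFrame(
        {
            "temperature": [70.1, 70.4, 71.0, 72.3],
            "pressure": [1.01, 1.02, 1.00, 1.03],
            "alarm": [0, 1, 0, 1],
        }
    )

    # Keep only the named sensor features
    selector = ByNameFeatureSelector(features=["temperature", "pressure"])
    sensors = selector.fit(life).transform(life)

    # Center each feature with respect to the mean of the fitted data
    centered = MeanCentering().fit(sensors).transform(sensors)

    # Accumulate the binary alarm column to obtain a running count of alarms
    alarm_count = Accumulate().transform(life[["alarm"]])

The partial_fit methods documented in this patch allow the same statistics to be fitted incrementally, one run-to-failure cycle at a time, before transform is applied.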