Commit
Update the MkDocs for the Library Documentation (#33)
* Update mkdocs.yml
* Update requirements.txt
* Update extraction.md file so that it appears in the doc
* Update md files to show the source code
* Update docstrings in the py files
* Update selection.py
* Update mkdocs.yml
* Update selection.py and .md
* Update cast.py
* Update denoising.py
* Update entropy.py
* Update outliers.py
* Added extraction_frequency.py
* Added scalers.md
* Update mkdocs.yml
* Update scalers.py
* Transfer scaler.md to the docs/transformation/features folder
* Added slicing.md and split.md
* Adding operations.md and rolling_windows.md
* Update operations.py and rolling_windows.py
* Update operations.py and rolling_windows.py
* Added transformation.md
* Update transformation.py
* Update mkdocs.yml
* Update transformation.py

---------

Co-authored-by: Lemeda98 <lemeda98@gmail.com>
Co-authored-by: Luciano Lorenti <lucianolorenti@gmail.com>
3 people committed Jan 7, 2024
1 parent c1ccd94 commit 7862b3c
Showing 59 changed files with 2,085 additions and 1,655 deletions.
23 changes: 10 additions & 13 deletions ceruleo/dataset/analysis/correlation.py
@@ -1,5 +1,3 @@


from itertools import combinations
from typing import List, Optional, Tuple

@@ -18,30 +16,29 @@ def correlation_analysis(
Compute the correlation between all the features given an Iterable of executions.
Parameters:
dataset: Dataset of time series
corr_threshold: Threshold to consider two features of a single execution highly correlated
features: List of features to consider when computing the correlations
Returns:
A DataFrame indexed with the column names with the following columns:
pd.DataFrame: A DataFrame indexed with the column names with the following columns:
- Mean Correlation
- Std Correlation
- Percentage of lives with a high correlation
- Abs mean correlation
- Std mean correlation
- Max correlation
- Min correlation
- Mean Correlation
- Std Correlation
- Percentage of lives with a high correlation
- Abs mean correlation
- Std mean correlation
- Max correlation
- Min correlation
"""
if features is None:
features = sorted(list(dataset.common_features()))
else:
features = sorted(list(set(features).intersection(dataset.common_features())))
features = dataset.get_features_of_life(0)[features].corr().columns
correlated_features = []

for ex in iterate_over_features(dataset):
ex = ex[features]
corr_m = ex.corr().fillna(0)
@@ -73,4 +70,4 @@ def percentage_above_treshold(x):
)
output["Max correlation"] = df.groupby(by=["Feature 1", "Feature 2"]).max()
output["Min correlation"] = df.groupby(by=["Feature 1", "Feature 2"]).min()
return output
return output
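
A minimal usage sketch of correlation_analysis as documented above (not part of the diff): the dataset object, the threshold value, and the feature names are assumptions; the function, its module path, and the output columns come from the file shown here.

from ceruleo.dataset.analysis.correlation import correlation_analysis

# `dataset` is a hypothetical ceruleo dataset of run-to-failure cycles exposing
# common_features(), as the function above requires.
summary = correlation_analysis(
    dataset,
    corr_threshold=0.85,  # assumed threshold for "highly correlated" within one cycle
    features=["sensor_a", "sensor_b", "sensor_c"],  # hypothetical feature names
)

# The result is indexed by feature pairs; these columns are listed in the docstring.
print(summary[["Mean Correlation", "Std Correlation", "Max correlation"]].head())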
32 changes: 14 additions & 18 deletions ceruleo/dataset/analysis/distribution.py
@@ -19,7 +19,6 @@ def histogram_per_life(
bins_to_use: np.ndarray,
normalize: bool = True,
) -> List[np.ndarray]:

try:
d = life[feature]
h, _ = np.histogram(d, bins=bins_to_use)
@@ -49,27 +48,22 @@ def features_divergeces(
columns: Optional[List[str]] = None,
show_progress: bool = False,
) -> pd.DataFrame:
"""Compute the divergence between features
"""
Compute the divergence between features
Parameters:
ds: The dataset
number_of_bins: Number of bins
columns: Which columns to use
Returns:
df: A DataFrame in which each row contains the
distances between a feature of two run-to-failure cycles
with the following columns:
- Life 1: Run-to-failure cycle 1
- Life 2: Run-to-failure cycle 2
- W: Wasserstein
- KL: KL Divergence
- feature: The feature name
A DataFrame in which each row contains the distances between a feature of two run-to-failure cycles with the following columns:
- Life 1: Run-to-failure cycle 1
- Life 2: Run-to-failure cycle 2
- W: Wasserstein
- KL: KL Divergence
- feature: The feature name
"""
if columns is None:
columns = ds.numeric_features()
@@ -88,14 +82,16 @@
histograms[feature].append(
histogram_per_life(life, feature, features_bins[feature])
)

df_data = []
for feature in columns:
data = {}
for ((i, h1), (j, h2)) in itertools.combinations(enumerate(histograms[feature]), 2):
for (i, h1), (j, h2) in itertools.combinations(
enumerate(histograms[feature]), 2
):
kl = (np.mean(kl_div(h1, h2)) + np.mean(kl_div(h2, h1))) / 2
wd = wasserstein_distance(h1, h2)
df_data.append((i, j, wd, kl, feature))
df = pd.DataFrame(df_data, columns=["Life 1", "Life 2", "W", "KL", "feature"])

return df
return df
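
Likewise, a hedged sketch of features_divergeces (spelling as in the source) from distribution.py above; the dataset, bin count, and column names are assumptions.

from ceruleo.dataset.analysis.distribution import features_divergeces

divergences = features_divergeces(
    ds=dataset,  # hypothetical dataset of run-to-failure cycles
    number_of_bins=15,  # assumed histogram resolution
    columns=["sensor_a", "sensor_b"],  # hypothetical numeric features
)

# Each row compares one feature's histograms over two cycles ("Life 1", "Life 2")
# via the Wasserstein distance (W) and a symmetrized KL divergence (KL).
most_divergent = divergences.sort_values("W", ascending=False).head()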
121 changes: 57 additions & 64 deletions ceruleo/dataset/analysis/numerical_features.py
@@ -14,33 +14,31 @@
from ceruleo.dataset.utils import iterate_over_features_and_target


def entropy(s: np.ndarray)-> float:
"""Approximate entropy
def entropy(s: np.ndarray) -> float:
"""
Approximate entropy
The approximate entropy quantifies the amount of regularity and the unpredictability of fluctuations over time-series data.
Parameters:
s: A single feature
Returns:
ae: Approximate entropy of feature s
Approximate entropy of feature s
"""
return ant.app_entropy(s)


def correlation(s: np.ndarray, y:Optional[np.ndarray]=None):
"""Correlation of the feature with the target
def correlation(s: np.ndarray, y: Optional[np.ndarray] = None) -> float:
"""
Correlation of the feature with the target
Parameters:
s: A single feature
y: The RUL target
Returns:
c: Correlation between the feature and the RUL target
Correlation between the feature and the RUL target
"""
N = s.shape[0]
if not (s[0] == s).all():
@@ -51,113 +49,110 @@ def correlation(s: np.ndarray, y:Optional[np.ndarray]=None):
return corr


def autocorrelation(s: np.ndarray)-> float:
"""Autocorrelation of a feature
def autocorrelation(s: np.ndarray) -> float:
"""
Autocorrelation of a feature
Parameters:
s: A single feature
Returns:
ac: Autocorrelation of the feature
Autocorrelation of the feature
"""
diff = np.diff(s)
return np.sum(diff ** 2) / s.shape[0]
return np.sum(diff**2) / s.shape[0]


def monotonicity(s: np.ndarray) -> float:
"""Monotonicity of a feature
"""
Monotonicity of a feature: the two extreme values are 0 if the feature is constant and 1 if it is strictly monotonic.
Parameters:
s: A single feature
Returns:
f: Monotonicity of the feature
Monotonicity of the feature
"""
N = s.shape[0]
diff = np.diff(s)
return 1 / (N - 1) * np.abs(np.sum(diff > 0) - np.sum(diff < 0))


def n_unique(s: np.ndarray) -> int:
"""Number of unique values in the array
"""
Number of unique values in the array
Parameters:
s: A single feature
Returns:
n: Number of unique values
Number of unique values
"""
return len(np.unique(s))


def null(s: np.ndarray) -> float:
"""Null proportion for a given feature
"""
Null proportion for a given feature
Parameters:
s: A feature
Returns:
n: Null proportion
Null proportion
"""
return np.mean(~np.isfinite(s))

def mutual_information(x:np.ndarray, y:np.ndarray):

def mutual_information(x: np.ndarray, y: np.ndarray) -> float:
"""Mutual information between a feature and the target
[Reference](Remaining Useful Life Prediction Using Ranking Mutual Information Based Monotonic Health Indicator)
Parameters:
x: A single feature
y: RUL Target
Returns:
float: Mutual information between x and y
Returns:
Mutual information between x and y
"""
x = x.reshape(-1, 1)
x = np.nan_to_num(x)
return mutual_info_regression(x, y)


metrics = {
"std": lambda x, y: np.std(x),
"correlation": lambda x,y: correlation(x, y),
"autocorrelation": lambda x,y:autocorrelation(x),
"monotonicity": lambda x,y:monotonicity(x),
"number_of_unique_elements": lambda x,y:n_unique(x),
'mutual_information': mutual_information,
'null': lambda x, y: null(x),
'entropy': lambda x, y: entropy(x)
"correlation": lambda x, y: correlation(x, y),
"autocorrelation": lambda x, y: autocorrelation(x),
"monotonicity": lambda x, y: monotonicity(x),
"number_of_unique_elements": lambda x, y: n_unique(x),
"mutual_information": mutual_information,
"null": lambda x, y: null(x),
"entropy": lambda x, y: entropy(x),
}
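# Hedged illustration, not part of this module: every callable above takes a
# (feature, target) pair, so on a toy strictly increasing feature
#   s = np.array([1.0, 2.0, 3.0, 4.0, 5.0]); rul = np.array([4.0, 3.0, 2.0, 1.0, 0.0])
# the definitions earlier in this file give:
#   metrics["monotonicity"](s, rul)              -> 1.0  (strictly monotonic)
#   metrics["number_of_unique_elements"](s, rul) -> 5
#   metrics["null"](s, rul)                      -> 0.0  (no NaN/inf entries)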




def analysis_single_time_series(
X: np.ndarray,
y: np.ndarray,
column_names: List[str],
data: Optional[Dict] = None,
what_to_compute: List[str] = [],
) -> dict:
"""Compute the analysis for a single run-to-failure cycle
"""
Compute the analysis for a single run-to-failure cycle
Parameters:
X: Features
X: Input Features
y: RUL Target
column_names: Column names of the features
data: Initial data
what_to_compute: Features to compute
Returns:
dict: Computed info
Dictionary containing the computed info
"""

if data is None:
@@ -171,12 +166,11 @@ def analysis_single_time_series(

m = metrics[what](x_ts, y)


data[column_name][what].append(m)
return data


def merge_analysis(data: dict):
def merge_analysis(data: dict) -> pd.DataFrame:
data_df = defaultdict(lambda: defaultdict(list))
for column_name in data.keys():
for what in data[column_name]:
@@ -195,25 +189,26 @@ def analysis(
show_progress: bool = False,
what_to_compute: List[str] = [],
) -> pd.DataFrame:
"""Compute analysis of numerical features
"""
Compute analysis of numerical features
Parameters:
transformed_dataset: A transformed dataset with features and target
dataset: A transformed dataset with features and target
show_progress: Whether to show the progress when computing the features
what_to_compute: Elements available to compute
- std
- correlation
- autocorrelation,
- monotonicity
- number_of_unique_elements
- mutual_information
- null
- entropy
what_to_compute: Elements available to compute:
- std
- Correlation
- Autocorrelation
- Monotonicity
- Number of unique elements
- Mutual information
- Null
- Entropy
Returns:
df: Dataframe with the columns specified by what_to_compute
Returns:
Dataframe with the columns specified by what_to_compute
"""

if len(what_to_compute) == 0:
@@ -229,7 +224,5 @@ def analysis(
column_names = dataset.numeric_features()
for X, y in iterate_over_features_and_target(dataset):
y = np.squeeze(y)
data = analysis_single_time_series(
X, y, column_names, data, what_to_compute
)
return merge_analysis(data)
data = analysis_single_time_series(X, y, column_names, data, what_to_compute)
return merge_analysis(data)
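
Finally, a hedged end-to-end sketch of the analysis entry point above; the dataset object and the position of its parameter are assumptions, while the metric names and the behaviour of monotonicity follow the code in this file.

import numpy as np
from ceruleo.dataset.analysis.numerical_features import analysis, monotonicity

# Per the definition above, a strictly increasing signal has monotonicity 1.0
# and a constant signal 0.0.
assert monotonicity(np.array([1.0, 2.0, 3.0, 4.0, 5.0])) == 1.0
assert monotonicity(np.array([7.0, 7.0, 7.0, 7.0, 7.0])) == 0.0

# `dataset` is a hypothetical transformed dataset with features and RUL target;
# the metric names match the keys of the `metrics` dictionary above.
report = analysis(
    dataset,
    show_progress=False,
    what_to_compute=["std", "monotonicity", "null"],
)
print(report.head())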
