Improve the analysis module (#41)
* Analysis: Improve the analysis module

* Transformation: Add an option to allow missing (NA) values in MinMaxScaler

* Graphics: Show better labels in the duration histogram

* Analysis: Provide a better HTML repr for the sample rate metric

* Analysis: Add a function to obtain the numerical analysis as a DataFrame

* Tests: Update tests
lucianolorenti authored Jun 17, 2024
1 parent ef911b6 commit a394807
Showing 12 changed files with 2,450 additions and 689 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 3.0.5
+current_version = 3.0.6
 commit = True
 tag = True
2 changes: 1 addition & 1 deletion ceruleo/__init__.py
@@ -9,4 +9,4 @@
 CACHE_PATH.mkdir(parents=True, exist_ok=True)


-__version__ = "3.0.5"
+__version__ = "3.0.6"
52 changes: 43 additions & 9 deletions ceruleo/dataset/analysis/distribution.py
@@ -13,14 +13,25 @@
 logger = logging.getLogger(__name__)


-def histogram_per_life(
-    life: pd.DataFrame,
+def histogram_per_cycle(
+    cycle: pd.DataFrame,
     feature: str,
     bins_to_use: np.ndarray,
     normalize: bool = True,
 ) -> List[np.ndarray]:
+    """Compute the histogram of a feature in a run-to-failure cycle
+
+    Args:
+        cycle (pd.DataFrame): The run-to-failure cycle
+        feature (str): The feature for which to compute the histogram
+        bins_to_use (np.ndarray): The histogram bin edges
+        normalize (bool, optional): Whether to normalize the histogram. Defaults to True.
+
+    Returns:
+        List[np.ndarray]: The histogram of the feature
+    """
     try:
-        d = life[feature]
+        d = cycle[feature]
         h, _ = np.histogram(d, bins=bins_to_use)

         if normalize:
@@ -59,9 +70,9 @@ def features_divergeces(
     Returns:
         A DataFrame in which each row contains the distances between a feature of two run-to-failure cycles, with the following columns:
-            - Life 1: Run-to-failure cycle 1
-            - Life 2: Run-to-failure cycle 2
-            - W: Wasserstein
+            - Cycle 1: Run-to-failure cycle 1
+            - Cycle 2: Run-to-failure cycle 2
+            - Wasserstein: Wasserstein distance
             - KL: KL Divergence
             - feature: The feature name
     """
@@ -80,7 +91,7 @@
         if feature not in histograms:
             histograms[feature] = []
         histograms[feature].append(
-            histogram_per_life(life, feature, features_bins[feature])
+            histogram_per_cycle(life, feature, features_bins[feature])
         )

     df_data = []
@@ -91,7 +102,30 @@
     ):
         kl = (np.mean(kl_div(h1, h2)) + np.mean(kl_div(h2, h1))) / 2
         wd = wasserstein_distance(h1, h2)
-        df_data.append((i, j, wd, kl, feature))
-    df = pd.DataFrame(df_data, columns=["Life 1", "Life 2", "W", "KL", "feature"])
+        df_data.append(
+            (
+                i,
+                j,
+                ds.get_features_of_life(i).shape[0],
+                ds.get_features_of_life(j).shape[0],
+                abs(ds.get_features_of_life(i).shape[0] - ds.get_features_of_life(j).shape[0]),
+                wd,
+                kl,
+                feature,
+            )
+        )
+    df = pd.DataFrame(
+        df_data,
+        columns=[
+            "Cycle 1",
+            "Cycle 2",
+            "Cycle 1 length",
+            "Cycle 2 length",
+            "Abs Length difference",
+            "Wasserstein",
+            "KL",
+            "feature",
+        ],
+    )

     return df
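A minimal usage sketch of the reworked divergence table (illustrative: `ds` stands for any already-loaded AbstractPDMDataset, and passing it as the only argument assumes the remaining parameters of features_divergeces keep their defaults):

    from ceruleo.dataset.analysis.distribution import features_divergeces

    # ds: an already-loaded AbstractPDMDataset (placeholder; construction not shown here)
    divergences = features_divergeces(ds)

    # Each row compares one feature across a pair of run-to-failure cycles,
    # using the column names introduced by this commit.
    most_divergent = divergences.sort_values("Wasserstein", ascending=False)
    print(most_divergent[["feature", "Cycle 1", "Cycle 2", "Wasserstein", "KL"]].head())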
110 changes: 92 additions & 18 deletions ceruleo/dataset/analysis/numerical_features.py
@@ -1,6 +1,5 @@
-
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union

 import antropy as ant
 import numpy as np
@@ -12,6 +11,7 @@
 from ceruleo.dataset.transformed import TransformedDataset
 from ceruleo.dataset.ts_dataset import AbstractPDMDataset
 from ceruleo.dataset.utils import iterate_over_features_and_target
+import pandas as pd


 class MetricType(str, Enum):
@@ -29,7 +29,7 @@ def from_str(s: str) -> "MetricType":
         return MetricType(s)


-class MetricValues(BaseModel):
+class MetricValuesSummary(BaseModel):
     mean: float
     std: float
     max: float
@@ -38,7 +38,28 @@

 class NumericalFeaturesAnalysis(BaseModel):
     feature: str
-    metric: Dict[MetricType, MetricValues]
+    metric: Dict[MetricType, List[float]]
+
+    def summarize(self) -> Dict[MetricType, MetricValuesSummary]:
+        out = {}
+        for metric in self.metric.keys():
+            mean = np.nanmean(self.metric[metric])
+            std = np.nanstd(self.metric[metric])
+            max_ = np.nanmax(self.metric[metric])
+            min_ = np.nanmin(self.metric[metric])
+            out[metric] = MetricValuesSummary(mean=mean, std=std, max=max_, min=min_)
+        return out
+
+    def __getitem__(self, key: str) -> List[float]:
+        return self.metric[MetricType.from_str(key)]
+
+    def _repr_html_(self) -> str:
+        out = "<table>"
+        out += "<tr><th>Metric</th><th>Mean</th><th>Std</th><th>Max</th><th>Min</th></tr>"
+        for metric, summary in self.summarize().items():
+            out += f"<tr><td>{metric}</td><td>{summary.mean}</td><td>{summary.std}</td><td>{summary.max}</td><td>{summary.min}</td></tr>"
+        out += "</table>"
+        return out


 def entropy(s: np.ndarray) -> float:
@@ -120,15 +141,15 @@ def n_unique(s: np.ndarray) -> int:

 def null(s: np.ndarray) -> float:
     """
-    Null proportion for a given feature
+    Null percentage for a given feature

     Parameters:
         s: A feature

     Returns:
-        Null proportion
+        Null percentage
     """
-    return np.mean(~np.isfinite(s))
+    return np.mean(~np.isfinite(s)) * 100


 def mutual_information(x: np.ndarray, y: np.ndarray) -> float:
@@ -151,17 +172,19 @@ def mutual_information(x: np.ndarray, y: np.ndarray) -> float:

 metrics = {
     "std": lambda x, y: np.std(x),
-    "correlation": lambda x, y: correlation(x, y),
-
     "autocorrelation": lambda x, y: autocorrelation(x),
     "monotonicity": lambda x, y: monotonicity(x),
     "number_of_unique_elements": lambda x, y: n_unique(x),
-    "mutual_information": mutual_information,
-
     "null": lambda x, y: null(x),
     "entropy": lambda x, y: entropy(x),
+    "mutual_information": mutual_information,
+    "correlation": lambda x, y: correlation(x, y),
 }


-def analysis_single_cycle(
+def analyze_single_cycle(
     X: np.ndarray,
     y: np.ndarray,
     out: Dict[str, Dict[MetricType, List[float]]],
@@ -206,16 +229,11 @@ def merge_cycle_analysis(
     for column_name in data.keys():
         for what in data[column_name]:
             metric_type = MetricType.from_str(what)
-            out[column_name].metric[metric_type] = MetricValues(
-                mean=np.nanmean(data[column_name][what]),
-                std=np.nanstd(data[column_name][what]),
-                max=np.nanmax(data[column_name][what]),
-                min=np.nanmin(data[column_name][what]),
-            )
+            out[column_name].metric[metric_type] = data[column_name][what]
     return out


-def analysis(
+def analyze(
     dataset: Union[TransformedDataset, AbstractPDMDataset],
     *,
     show_progress: bool = False,
@@ -260,6 +278,62 @@
     }
     for X, y in iterate_over_features_and_target(dataset):
         y = np.squeeze(y)
-        analysis_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)
+        analyze_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)

     return merge_cycle_analysis(data_per_cycle)
+
+
+def analyze_as_dataframe(
+    dataset: Union[TransformedDataset, AbstractPDMDataset],
+    *,
+    show_progress: bool = False,
+    what_to_compute: List[str] = [],
+) -> pd.DataFrame:
+    """
+    Compute the analysis of the numerical features and return it as a DataFrame
+
+    Parameters:
+        dataset: A transformed dataset with features and target
+        show_progress: Whether to show progress while computing the features
+        what_to_compute: Metrics to compute. Available metrics:
+            - std
+            - correlation
+            - autocorrelation
+            - monotonicity
+            - number_of_unique_elements
+            - mutual_information
+            - null
+            - entropy
+
+    Returns:
+        pd.DataFrame: One row per feature; the columns are a (metric, summary statistic) MultiIndex
+    """
+    rr = analyze(dataset, show_progress=show_progress, what_to_compute=what_to_compute)
+
+    out: Dict[Tuple[str, str], List[float]] = {}
+
+    for k, metrics_analysis in rr.items():
+        metrics_summary = metrics_analysis.summarize()
+        for metric_name, metric_values in metrics_summary.items():
+            key_mean = (metric_name.value, "Mean value across the cycles")
+            key_std = (metric_name.value, "Standard deviation across the cycles")
+            key_max = (metric_name.value, "Maximum value found in a cycle")
+            key_min = (metric_name.value, "Minimum value found in a cycle")
+
+            if key_mean not in out:
+                out[key_mean] = []
+                out[key_std] = []
+                out[key_max] = []
+                out[key_min] = []
+
+            out[key_mean].append(metric_values.mean)
+            out[key_std].append(metric_values.std)
+            out[key_max].append(metric_values.max)
+            out[key_min].append(metric_values.min)
+
+    return pd.DataFrame(out, index=rr.keys())
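A short sketch of how the renamed API fits together (illustrative: `ds` and the feature name "feature_1" are placeholders; the function, method, and metric names come from this diff):

    from ceruleo.dataset.analysis.numerical_features import analyze, analyze_as_dataframe

    # Per-feature analysis keeping the raw per-cycle values; each entry is a
    # NumericalFeaturesAnalysis whose .summarize() aggregates across cycles.
    result = analyze(ds, what_to_compute=["std", "monotonicity", "null"])
    print(result["feature_1"].summarize())

    # The same analysis flattened into a DataFrame: one row per feature and a
    # (metric, summary statistic) column MultiIndex.
    df = analyze_as_dataframe(ds, what_to_compute=["std", "monotonicity", "null"])
    print(df["std"])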
26 changes: 20 additions & 6 deletions ceruleo/dataset/analysis/sample_rate.py
@@ -4,21 +4,33 @@
 import numpy as np
 import pandas as pd
 from pydantic import BaseModel
-
+from typing import List
 from ceruleo.dataset.ts_dataset import AbstractPDMDataset
 from ceruleo.utils import pydantic_to_dict

 logger = logging.getLogger(__name__)


 class SampleRateAnalysis(BaseModel):
-    mode: float
+    median: float
     mean: float
     std: float
     unit: str

     def to_pandas(self) -> pd.Series:
         return pd.Series(pydantic_to_dict(self)).to_frame().T

     def __repr__(self) -> str:
         return f"Median: {self.median} | {self.mean} +- {self.std} [{self.unit}]"

+    def _repr_html_(self) -> str:
+        return f"""<div>
+            <p> <span style="font-weight:bold"> Median: </span> {self.median} [{self.unit}] </p>
+            <p> <span style="font-weight:bold"> Mean +- Std: </span> {self.mean:.3f} +- {self.std:.3f} [{self.unit}] </p>
+        </div>
+        """


 def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
     """Obtain an array of time difference between two consecutive samples
@@ -33,9 +45,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
         Array of time differences
     """
-    time_diff = []
+    time_diff: List[float] = []
     for life in ds:
         diff = np.diff(life.index.values)
+        diff = diff[diff <= np.median(diff)]
         if pd.api.types.is_timedelta64_ns_dtype(diff.dtype):
             diff = diff / np.timedelta64(1, unit)
         time_diff.extend(diff)
@@ -44,10 +57,10 @@

 def sample_rate_summary(
-    ds: AbstractPDMDataset, unit: Optional[str] = "s"
+    ds: AbstractPDMDataset, unit: str = "s"
 ) -> SampleRateAnalysis:
     """
-    Obtain the mean, mode and standard deviation of the sample rate of the dataset
+    Obtain the mean, median and standard deviation of the sample rate of the dataset

     Parameters:
         ds: The dataset
@@ -60,5 +73,6 @@
     return SampleRateAnalysis(
         mean=np.mean(sr),
         std=np.std(sr),
-        mode=pd.Series(sr).mode().values[0],
+        median=np.median(sr),
         unit=unit
     )
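A usage sketch for the updated summary (illustrative: `ds` is a placeholder dataset; the attribute and method names come from the model above):

    from ceruleo.dataset.analysis.sample_rate import sample_rate_summary

    # Summarize the time step between consecutive samples, in seconds. Gaps above
    # the per-cycle median are now discarded before aggregating, and the summary
    # reports the median instead of the mode.
    summary = sample_rate_summary(ds, unit="s")
    print(summary.median, summary.mean, summary.std, summary.unit)
    print(summary.to_pandas())  # one-row frame built via pydantic_to_dict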
1 change: 0 additions & 1 deletion ceruleo/dataset/catalog/PHMDataset2018.py
@@ -180,7 +180,6 @@ def track_progress(members):

         path = self.dataset_path / "raw"
         path.mkdir(parents=True, exist_ok=True)
-        print(path / OUTPUT)
         if not (path / OUTPUT).resolve().is_file():
             download(self.url, path)
             logger.info("Decompressing dataset...")
18 changes: 8 additions & 10 deletions ceruleo/dataset/ts_dataset.py
@@ -65,14 +65,6 @@ def number_of_samples_of_time_series(self, i: int) -> int:
     def rul_column(self) -> str:
         raise NotImplementedError

-    def duration(self, life: pd.DataFrame) -> float:
-        return life[self.rul_column].max()
-
-    def number_of_samples(self) -> List[int]:
-        return [
-            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
-        ]
-
     def duration(self, life: pd.DataFrame) -> float:
         """Obtain the duration of the time-series
@@ -82,8 +74,14 @@ def duration(self, life: pd.DataFrame) -> float:
         Returns:
             Duration of the life
         """
-        v = life.index
-        return v.max() - v.min()
+        return life[self.rul_column].max()
+
+    def number_of_samples(self) -> List[int]:
+        return [
+            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
+        ]

     def durations(self, show_progress: bool = False) -> List[float]:
         """
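Worth noting in this hunk: duration() now returns the maximum of the RUL column instead of the index span, which changes the result whenever the two scales differ. A toy illustration (the DataFrame is fabricated; "RUL" stands for whatever rul_column returns):

    import pandas as pd

    # Index spans 8 time units, while RUL counts down from 4.
    life = pd.DataFrame({"RUL": [4, 3, 2, 1, 0]}, index=[0.0, 1.0, 2.0, 3.0, 8.0])

    print(life.index.max() - life.index.min())  # 8.0 -- the old duration()
    print(life["RUL"].max())                    # 4   -- the new duration()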