Improve the analysis module (#41)
* Analysis: Improve the analysis module

* Transformation: Add an option to allow missing (NA) values in MinMaxScaler

* Graphics: Show better labels in the duration histogram

* Analysis: Provide a better HTML repr for the sample rate metric

* Analysis: Add a function to obtain the numerical analysis as a DataFrame

* Tests: Update tests
lucianolorenti authored Jun 17, 2024
1 parent ef911b6 commit a394807
Showing 12 changed files with 2,450 additions and 689 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 3.0.5
+current_version = 3.0.6
 commit = True
 tag = True
2 changes: 1 addition & 1 deletion ceruleo/__init__.py
@@ -9,4 +9,4 @@
 CACHE_PATH.mkdir(parents=True, exist_ok=True)


-__version__ = "3.0.5"
+__version__ = "3.0.6"
52 changes: 43 additions & 9 deletions ceruleo/dataset/analysis/distribution.py
@@ -13,14 +13,25 @@
 logger = logging.getLogger(__name__)


-def histogram_per_life(
-    life: pd.DataFrame,
+def histogram_per_cycle(
+    cycle: pd.DataFrame,
     feature: str,
     bins_to_use: np.ndarray,
     normalize: bool = True,
 ) -> List[np.ndarray]:
+    """Compute the histogram of a feature in a run-to-failure cycle
+
+    Args:
+        cycle (pd.DataFrame): The run-to-failure cycle
+        feature (str): The feature for which to compute the histogram
+        bins_to_use (np.ndarray): The histogram bin edges
+        normalize (bool, optional): Whether to normalize the histogram. Defaults to True.
+
+    Returns:
+        List[np.ndarray]: The histogram of the feature
+    """
     try:
-        d = life[feature]
+        d = cycle[feature]
         h, _ = np.histogram(d, bins=bins_to_use)

         if normalize:
@@ -59,9 +70,9 @@ def features_divergeces(
     Returns:
         A DataFrame in which each row contains the distances between a feature of two run-to-failure cycles, with the following columns:
-            - Life 1: Run-to-failure cycle 1
-            - Life 2: Run-to-failure cycle 2
-            - W: Wasserstein
+            - Cycle 1: Run-to-failure cycle 1
+            - Cycle 2: Run-to-failure cycle 2
+            - Wasserstein: Wasserstein distance
             - KL: KL Divergence
             - feature: The feature name
     """
@@ -80,7 +91,7 @@
         if feature not in histograms:
             histograms[feature] = []
         histograms[feature].append(
-            histogram_per_life(life, feature, features_bins[feature])
+            histogram_per_cycle(life, feature, features_bins[feature])
         )

     df_data = []
@@ -91,7 +102,30 @@
     ):
         kl = (np.mean(kl_div(h1, h2)) + np.mean(kl_div(h2, h1))) / 2
         wd = wasserstein_distance(h1, h2)
-        df_data.append((i, j, wd, kl, feature))
-    df = pd.DataFrame(df_data, columns=["Life 1", "Life 2", "W", "KL", "feature"])
+        df_data.append(
+            (
+                i,
+                j,
+                ds.get_features_of_life(i).shape[0],
+                ds.get_features_of_life(j).shape[0],
+                abs(ds.get_features_of_life(i).shape[0] - ds.get_features_of_life(j).shape[0]),
+                wd,
+                kl,
+                feature,
+            )
+        )
+    df = pd.DataFrame(
+        df_data,
+        columns=[
+            "Cycle 1",
+            "Cycle 2",
+            "Cycle 1 length",
+            "Cycle 2 length",
+            "Abs Length difference",
+            "Wasserstein",
+            "KL",
+            "feature",
+        ],
+    )

     return df
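A minimal usage sketch of the reworked divergence table (illustrative: `ds` stands for any already-loaded AbstractPDMDataset, and passing it as the only argument assumes the remaining parameters of features_divergeces keep their defaults):

    from ceruleo.dataset.analysis.distribution import features_divergeces

    # ds: an already-loaded AbstractPDMDataset (placeholder; construction not shown here)
    divergences = features_divergeces(ds)

    # Each row compares one feature across a pair of run-to-failure cycles,
    # using the column names introduced by this commit.
    most_divergent = divergences.sort_values("Wasserstein", ascending=False)
    print(most_divergent[["feature", "Cycle 1", "Cycle 2", "Wasserstein", "KL"]].head())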
110 changes: 92 additions & 18 deletions ceruleo/dataset/analysis/numerical_features.py
@@ -1,6 +1,5 @@
-
 from enum import Enum
-from typing import Dict, List, Optional, Union
+from typing import Dict, List, Optional, Tuple, Union

 import antropy as ant
 import numpy as np
@@ -12,6 +11,7 @@
 from ceruleo.dataset.transformed import TransformedDataset
 from ceruleo.dataset.ts_dataset import AbstractPDMDataset
 from ceruleo.dataset.utils import iterate_over_features_and_target
+import pandas as pd


 class MetricType(str, Enum):
@@ -29,7 +29,7 @@ def from_str(s: str) -> "MetricType":
         return MetricType(s)


-class MetricValues(BaseModel):
+class MetricValuesSummary(BaseModel):
     mean: float
     std: float
     max: float
@@ -38,7 +38,28 @@

 class NumericalFeaturesAnalysis(BaseModel):
     feature: str
-    metric: Dict[MetricType, MetricValues]
+    metric: Dict[MetricType, List[float]]
+
+    def summarize(self) -> Dict[MetricType, MetricValuesSummary]:
+        out = {}
+        for metric in self.metric.keys():
+            mean = np.nanmean(self.metric[metric])
+            std = np.nanstd(self.metric[metric])
+            max_ = np.nanmax(self.metric[metric])
+            min_ = np.nanmin(self.metric[metric])
+            out[metric] = MetricValuesSummary(mean=mean, std=std, max=max_, min=min_)
+        return out
+
+    def __getitem__(self, key: str) -> List[float]:
+        return self.metric[MetricType.from_str(key)]
+
+    def _repr_html_(self) -> str:
+        out = "<table>"
+        out += "<tr><th>Metric</th><th>Mean</th><th>Std</th><th>Max</th><th>Min</th></tr>"
+        for metric, summary in self.summarize().items():
+            out += f"<tr><td>{metric}</td><td>{summary.mean}</td><td>{summary.std}</td><td>{summary.max}</td><td>{summary.min}</td></tr>"
+        out += "</table>"
+        return out


 def entropy(s: np.ndarray) -> float:
@@ -120,15 +141,15 @@ def n_unique(s: np.ndarray) -> int:

 def null(s: np.ndarray) -> float:
     """
-    Null proportion for a given feature
+    Null percentage for a given feature

     Parameters:
         s: A feature

     Returns:
-        Null proportion
+        Null percentage
     """
-    return np.mean(~np.isfinite(s))
+    return np.mean(~np.isfinite(s)) * 100


 def mutual_information(x: np.ndarray, y: np.ndarray) -> float:
@@ -151,17 +172,19 @@ def mutual_information(x: np.ndarray, y: np.ndarray) -> float:

 metrics = {
     "std": lambda x, y: np.std(x),
-    "correlation": lambda x, y: correlation(x, y),
-
     "autocorrelation": lambda x, y: autocorrelation(x),
     "monotonicity": lambda x, y: monotonicity(x),
     "number_of_unique_elements": lambda x, y: n_unique(x),
-    "mutual_information": mutual_information,
-
     "null": lambda x, y: null(x),
     "entropy": lambda x, y: entropy(x),
+    "mutual_information": mutual_information,
+    "correlation": lambda x, y: correlation(x, y),
 }


-def analysis_single_cycle(
+def analyze_single_cycle(
     X: np.ndarray,
     y: np.ndarray,
     out: Dict[str, Dict[MetricType, List[float]]],
@@ -206,16 +229,11 @@ def merge_cycle_analysis(
     for column_name in data.keys():
         for what in data[column_name]:
             metric_type = MetricType.from_str(what)
-            out[column_name].metric[metric_type] = MetricValues(
-                mean=np.nanmean(data[column_name][what]),
-                std=np.nanstd(data[column_name][what]),
-                max=np.nanmax(data[column_name][what]),
-                min=np.nanmin(data[column_name][what]),
-            )
+            out[column_name].metric[metric_type] = data[column_name][what]
     return out


-def analysis(
+def analyze(
     dataset: Union[TransformedDataset, AbstractPDMDataset],
     *,
     show_progress: bool = False,
@@ -260,6 +278,62 @@
     }
     for X, y in iterate_over_features_and_target(dataset):
         y = np.squeeze(y)
-        analysis_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)
+        analyze_single_cycle(X, y, data_per_cycle, column_names, what_to_compute)

     return merge_cycle_analysis(data_per_cycle)
+
+
+def analyze_as_dataframe(
+    dataset: Union[TransformedDataset, AbstractPDMDataset],
+    *,
+    show_progress: bool = False,
+    what_to_compute: List[str] = [],
+) -> pd.DataFrame:
+    """
+    Compute the analysis of the numerical features and return it as a DataFrame
+
+    Parameters:
+        dataset: A transformed dataset with features and target
+        show_progress: Whether to show progress while computing the features
+        what_to_compute: Metrics to compute. Available metrics:
+            - std
+            - correlation
+            - autocorrelation
+            - monotonicity
+            - number_of_unique_elements
+            - mutual_information
+            - null
+            - entropy
+
+    Returns:
+        pd.DataFrame: One row per feature; the columns are a (metric, summary statistic) MultiIndex
+    """
+    rr = analyze(dataset, show_progress=show_progress, what_to_compute=what_to_compute)
+
+    out: Dict[Tuple[str, str], List[float]] = {}
+
+    for k, metrics_analysis in rr.items():
+        metrics_summary = metrics_analysis.summarize()
+        for metric_name, metric_values in metrics_summary.items():
+            key_mean = (metric_name.value, "Mean value across the cycles")
+            key_std = (metric_name.value, "Standard deviation across the cycles")
+            key_max = (metric_name.value, "Maximum value found in a cycle")
+            key_min = (metric_name.value, "Minimum value found in a cycle")
+
+            if key_mean not in out:
+                out[key_mean] = []
+                out[key_std] = []
+                out[key_max] = []
+                out[key_min] = []
+
+            out[key_mean].append(metric_values.mean)
+            out[key_std].append(metric_values.std)
+            out[key_max].append(metric_values.max)
+            out[key_min].append(metric_values.min)
+
+    return pd.DataFrame(out, index=rr.keys())
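A short sketch of how the renamed API fits together (illustrative: `ds` and the feature name "feature_1" are placeholders; the function, method, and metric names come from this diff):

    from ceruleo.dataset.analysis.numerical_features import analyze, analyze_as_dataframe

    # Per-feature analysis keeping the raw per-cycle values; each entry is a
    # NumericalFeaturesAnalysis whose .summarize() aggregates across cycles.
    result = analyze(ds, what_to_compute=["std", "monotonicity", "null"])
    print(result["feature_1"].summarize())

    # The same analysis flattened into a DataFrame: one row per feature and a
    # (metric, summary statistic) column MultiIndex.
    df = analyze_as_dataframe(ds, what_to_compute=["std", "monotonicity", "null"])
    print(df["std"])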
26 changes: 20 additions & 6 deletions ceruleo/dataset/analysis/sample_rate.py
@@ -4,21 +4,33 @@
 import numpy as np
 import pandas as pd
 from pydantic import BaseModel
-
+from typing import List
 from ceruleo.dataset.ts_dataset import AbstractPDMDataset
 from ceruleo.utils import pydantic_to_dict

 logger = logging.getLogger(__name__)


 class SampleRateAnalysis(BaseModel):
-    mode: float
+    median: float
     mean: float
     std: float
     unit: str

     def to_pandas(self) -> pd.Series:
         return pd.Series(pydantic_to_dict(self)).to_frame().T

     def __repr__(self) -> str:
         return f"Median: {self.median} | {self.mean} +- {self.std} [{self.unit}]"

+    def _repr_html_(self) -> str:
+        return f"""<div>
+            <p> <span style="font-weight:bold"> Median: </span> {self.median} [{self.unit}] </p>
+            <p> <span style="font-weight:bold"> Mean +- Std: </span> {self.mean:.3f} +- {self.std:.3f} [{self.unit}] </p>
+        </div>
+        """


 def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
     """Obtain an array of time difference between two consecutive samples
@@ -33,9 +45,10 @@ def sample_rate(ds: AbstractPDMDataset, unit: str = "s") -> np.ndarray:
         Array of time differences
     """
-    time_diff = []
+    time_diff: List[float] = []
     for life in ds:
         diff = np.diff(life.index.values)
+        diff = diff[diff <= np.median(diff)]
         if pd.api.types.is_timedelta64_ns_dtype(diff.dtype):
             diff = diff / np.timedelta64(1, unit)
         time_diff.extend(diff)
@@ -44,10 +57,10 @@

 def sample_rate_summary(
-    ds: AbstractPDMDataset, unit: Optional[str] = "s"
+    ds: AbstractPDMDataset, unit: str = "s"
 ) -> SampleRateAnalysis:
     """
-    Obtain the mean, mode and standard deviation of the sample rate of the dataset
+    Obtain the mean, median and standard deviation of the sample rate of the dataset

     Parameters:
         ds: The dataset
@@ -60,5 +73,6 @@
     return SampleRateAnalysis(
         mean=np.mean(sr),
         std=np.std(sr),
-        mode=pd.Series(sr).mode().values[0],
+        median=np.median(sr),
         unit=unit
     )
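A usage sketch for the updated summary (illustrative: `ds` is a placeholder dataset; the attribute and method names come from the model above):

    from ceruleo.dataset.analysis.sample_rate import sample_rate_summary

    # Summarize the time step between consecutive samples, in seconds. Gaps above
    # the per-cycle median are now discarded before aggregating, and the summary
    # reports the median instead of the mode.
    summary = sample_rate_summary(ds, unit="s")
    print(summary.median, summary.mean, summary.std, summary.unit)
    print(summary.to_pandas())  # one-row frame built via pydantic_to_dict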
1 change: 0 additions & 1 deletion ceruleo/dataset/catalog/PHMDataset2018.py
@@ -180,7 +180,6 @@ def track_progress(members):

         path = self.dataset_path / "raw"
         path.mkdir(parents=True, exist_ok=True)
-        print(path / OUTPUT)
         if not (path / OUTPUT).resolve().is_file():
             download(self.url, path)
             logger.info("Decompressing dataset...")
18 changes: 8 additions & 10 deletions ceruleo/dataset/ts_dataset.py
@@ -65,14 +65,6 @@ def number_of_samples_of_time_series(self, i: int) -> int:
     def rul_column(self) -> str:
         raise NotImplementedError

-    def duration(self, life: pd.DataFrame) -> float:
-        return life[self.rul_column].max()
-
-    def number_of_samples(self) -> List[int]:
-        return [
-            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
-        ]
-
     def duration(self, life: pd.DataFrame) -> float:
         """Obtain the duration of the time-series
@@ -82,8 +74,14 @@ def duration(self, life: pd.DataFrame) -> float:
         Returns:
             Duration of the life
         """
-        v = life.index
-        return v.max() - v.min()
+        return life[self.rul_column].max()
+
+    def number_of_samples(self) -> List[int]:
+        return [
+            self.number_of_samples_of_time_series(i) for i in tqdm(range(len(self)))
+        ]

     def durations(self, show_progress: bool = False) -> List[float]:
         """
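Worth noting in this hunk: duration() now returns the maximum of the RUL column instead of the index span, which changes the result whenever the two scales differ. A toy illustration (the DataFrame is fabricated; "RUL" stands for whatever rul_column returns):

    import pandas as pd

    # Index spans 8 time units, while RUL counts down from 4.
    life = pd.DataFrame({"RUL": [4, 3, 2, 1, 0]}, index=[0.0, 1.0, 2.0, 3.0, 8.0])

    print(life.index.max() - life.index.min())  # 8.0 -- the old duration()
    print(life["RUL"].max())                    # 4   -- the new duration()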