Commit
Update the MkDocs for the Library Documentation (#33)
* Update mkdocs.yml
* Update requirements.txt
* Update extraction.md file so that it appears in the doc
* Update md files to show the source code
* Update docstrings in the py files
* Update selection.py
* Update mkdocs.yml
* Update selection.py and .md
* Update cast.py
* Update denoising.py
* Update entropy.py
* Update outliers.py
* Added extraction_frequency.py
* Added scalers.md
* Update mkdocs.yml
* Update scalers.py
* Transfer scaler.md to the docs/transformation/features folder
* Added slicing.md and split.md
* Adding operations.md and rolling_windows.md
* Update operations.py and rolling_windows.py
* Update operations.py and rolling_windows.py
* Added transformation.md
* Update transformation.py
* Update mkdocs.yml
* Update transformation.py

---------

Co-authored-by: Lemeda98 <lemeda98@gmail.com>
Co-authored-by: Luciano Lorenti <lucianolorenti@gmail.com>
3 people committed Jan 7, 2024
1 parent c1ccd94 commit 7862b3c
Showing 59 changed files with 2,085 additions and 1,655 deletions.
23 changes: 10 additions & 13 deletions ceruleo/dataset/analysis/correlation.py
@@ -1,5 +1,3 @@


from itertools import combinations
from typing import List, Optional, Tuple

@@ -18,30 +16,29 @@ def correlation_analysis(
Compute the correlation between all the features given an Iterable of executions.
Parameters:
dataset: Dataset of time series
corr_threshold: Threshold to consider two features of a single execution highly correlated
features: List of features to consider when computing the correlations
Returns:
A DataFrame indexed with the column names with the following columns:
pd.DataFrame: A DataFrame indexed with the column names with the following columns:
- Mean Correlation
- Std Correlation
- Percentage of lives with a high correlation
- Abs mean correlation
- Std mean correlation
- Max correlation
- Min correlation
- Mean Correlation
- Std Correlation
- Percentage of lives with a high correlation
- Abs mean correlation
- Std mean correlation
- Max correlation
- Min correlation
"""
if features is None:
features = sorted(list(dataset.common_features()))
else:
features = sorted(list(set(features).intersection(dataset.common_features())))
features = dataset.get_features_of_life(0)[features].corr().columns
correlated_features = []

for ex in iterate_over_features(dataset):
ex = ex[features]
corr_m = ex.corr().fillna(0)
@@ -73,4 +70,4 @@ def percentage_above_treshold(x):
)
output["Max correlation"] = df.groupby(by=["Feature 1", "Feature 2"]).max()
output["Min correlation"] = df.groupby(by=["Feature 1", "Feature 2"]).min()
return output
return output
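
A minimal usage sketch of correlation_analysis as documented above (not part of the diff): the dataset object, the threshold value, and the feature names are assumptions; the function, its module path, and the output columns come from the file shown here.

from ceruleo.dataset.analysis.correlation import correlation_analysis

# `dataset` is a hypothetical ceruleo dataset of run-to-failure cycles exposing
# common_features(), as the function above requires.
summary = correlation_analysis(
    dataset,
    corr_threshold=0.85,  # assumed threshold for "highly correlated" within one cycle
    features=["sensor_a", "sensor_b", "sensor_c"],  # hypothetical feature names
)

# The result is indexed by feature pairs; these columns are listed in the docstring.
print(summary[["Mean Correlation", "Std Correlation", "Max correlation"]].head())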
32 changes: 14 additions & 18 deletions ceruleo/dataset/analysis/distribution.py
@@ -19,7 +19,6 @@ def histogram_per_life(
bins_to_use: np.ndarray,
normalize: bool = True,
) -> List[np.ndarray]:

try:
d = life[feature]
h, _ = np.histogram(d, bins=bins_to_use)
@@ -49,27 +48,22 @@ def features_divergeces(
columns: Optional[List[str]] = None,
show_progress: bool = False,
) -> pd.DataFrame:
"""Compute the divergence between features
"""
Compute the divergence between features
Parameters:
ds: The dataset
number_of_bins: Number of bins
columns: Which columns to use
Returns:
df: A DataFrame in which each row contains the
distances between a feature of two run-to-failure cycles
with the following columns:
- Life 1: Run-to-failure cycle 1
- Life 2: Run-to-failure cycle 2
- W: Wasserstein
- KL: KL Divergence
- feature: The feature name
A DataFrame in which each row contains the distances between a feature of two run-to-failure cycles with the following columns:
- Life 1: Run-to-failure cycle 1
- Life 2: Run-to-failure cycle 2
- W: Wasserstein
- KL: KL Divergence
- feature: The feature name
"""
if columns is None:
columns = ds.numeric_features()
@@ -88,14 +82,16 @@
histograms[feature].append(
histogram_per_life(life, feature, features_bins[feature])
)

df_data = []
for feature in columns:
data = {}
for ((i, h1), (j, h2)) in itertools.combinations(enumerate(histograms[feature]), 2):
for (i, h1), (j, h2) in itertools.combinations(
enumerate(histograms[feature]), 2
):
kl = (np.mean(kl_div(h1, h2)) + np.mean(kl_div(h2, h1))) / 2
wd = wasserstein_distance(h1, h2)
df_data.append((i, j, wd, kl, feature))
df = pd.DataFrame(df_data, columns=["Life 1", "Life 2", "W", "KL", "feature"])

return df
return df
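
Likewise, a hedged sketch of features_divergeces (spelling as in the source) from distribution.py above; the dataset, bin count, and column names are assumptions.

from ceruleo.dataset.analysis.distribution import features_divergeces

divergences = features_divergeces(
    ds=dataset,  # hypothetical dataset of run-to-failure cycles
    number_of_bins=15,  # assumed histogram resolution
    columns=["sensor_a", "sensor_b"],  # hypothetical numeric features
)

# Each row compares one feature's histograms over two cycles ("Life 1", "Life 2")
# via the Wasserstein distance (W) and a symmetrized KL divergence (KL).
most_divergent = divergences.sort_values("W", ascending=False).head()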
121 changes: 57 additions & 64 deletions ceruleo/dataset/analysis/numerical_features.py
@@ -14,33 +14,31 @@
from ceruleo.dataset.utils import iterate_over_features_and_target


def entropy(s: np.ndarray)-> float:
"""Approximate entropy
def entropy(s: np.ndarray) -> float:
"""
Approximate entropy
The approximate entropy quantifies the amount of regularity and the unpredictability of fluctuations over time-series data.
Parameters:
s: A single feature
Returns:
ae: Approximate entropy of feature s
Approximate entropy of feature s
"""
return ant.app_entropy(s)


def correlation(s: np.ndarray, y:Optional[np.ndarray]=None):
"""Correlation of the feature with the target
def correlation(s: np.ndarray, y: Optional[np.ndarray] = None) -> float:
"""
Correlation of the feature with the target
Parameters:
s: A single feature
y: The RUL target
Returns:
c: Correlation between the feature and the RUL target
Correlation between the feature and the RUL target
"""
N = s.shape[0]
if not (s[0] == s).all():
@@ -51,113 +49,110 @@ def correlation(s: np.ndarray, y:Optional[np.ndarray]=None):
return corr


def autocorrelation(s: np.ndarray)-> float:
"""Autocorrelation of a feature
def autocorrelation(s: np.ndarray) -> float:
"""
Autocorrelation of a feature
Parameters:
s: A single feature
Returns:
ac: Autocorrelation of the feature
Autocorrelation of the feature
"""
diff = np.diff(s)
return np.sum(diff ** 2) / s.shape[0]
return np.sum(diff**2) / s.shape[0]


def monotonicity(s: np.ndarray) -> float:
"""Monotonicity of a feature
"""
Monotonicity of a feature: the two extreme values are 0 if the feature is constant and 1 if it is strictly monotonic.
Parameters:
s: A single feature
Returns:
f: Monotonicity of the feature
Monotonicity of the feature
"""
N = s.shape[0]
diff = np.diff(s)
return 1 / (N - 1) * np.abs(np.sum(diff > 0) - np.sum(diff < 0))


def n_unique(s: np.ndarray) -> int:
"""Number of unique values in the array
"""
Number of unique values in the array
Parameters:
s: A single feature
Returns:
n: Number of unique values
Number of unique values
"""
return len(np.unique(s))


def null(s: np.ndarray) -> float:
"""Null proportion for a given feature
"""
Null proportion for a given feature
Parameters:
s: A feature
Returns:
n: Null proportion
Null proportion
"""
return np.mean(~np.isfinite(s))

def mutual_information(x:np.ndarray, y:np.ndarray):

def mutual_information(x: np.ndarray, y: np.ndarray) -> float:
"""Mutual information between a feature and the target
[Reference](Remaining Useful Life Prediction Using Ranking Mutual Information Based Monotonic Health Indicator)
Parameters:
x: A single feature
y: RUL Target
Returns:
float: Mutual information between x and y
Returns:
Mutual information between x and y
"""
x = x.reshape(-1, 1)
x = np.nan_to_num(x)
return mutual_info_regression(x, y)


metrics = {
"std": lambda x, y: np.std(x),
"correlation": lambda x,y: correlation(x, y),
"autocorrelation": lambda x,y:autocorrelation(x),
"monotonicity": lambda x,y:monotonicity(x),
"number_of_unique_elements": lambda x,y:n_unique(x),
'mutual_information': mutual_information,
'null': lambda x, y: null(x),
'entropy': lambda x, y: entropy(x)
"correlation": lambda x, y: correlation(x, y),
"autocorrelation": lambda x, y: autocorrelation(x),
"monotonicity": lambda x, y: monotonicity(x),
"number_of_unique_elements": lambda x, y: n_unique(x),
"mutual_information": mutual_information,
"null": lambda x, y: null(x),
"entropy": lambda x, y: entropy(x),
}
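# Hedged illustration, not part of this module: every callable above takes a
# (feature, target) pair, so on a toy strictly increasing feature
#   s = np.array([1.0, 2.0, 3.0, 4.0, 5.0]); rul = np.array([4.0, 3.0, 2.0, 1.0, 0.0])
# the definitions earlier in this file give:
#   metrics["monotonicity"](s, rul)              -> 1.0  (strictly monotonic)
#   metrics["number_of_unique_elements"](s, rul) -> 5
#   metrics["null"](s, rul)                      -> 0.0  (no NaN/inf entries)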




def analysis_single_time_series(
X: np.ndarray,
y: np.ndarray,
column_names: List[str],
data: Optional[Dict] = None,
what_to_compute: List[str] = [],
) -> dict:
"""Compute the analysis for a single run-to-failure cycle
"""
Compute the analysis for a single run-to-failure cycle
Parameters:
X: Features
X: Input Features
y: RUL Target
column_names: Column names of the features
data: Initial data
what_to_compute: Features to compute
Returns:
dict: Computed info
Dictionary containing the computed info
"""

if data is None:
@@ -171,12 +166,11 @@ def analysis_single_time_series(

m = metrics[what](x_ts, y)


data[column_name][what].append(m)
return data


def merge_analysis(data: dict):
def merge_analysis(data: dict) -> pd.DataFrame:
data_df = defaultdict(lambda: defaultdict(list))
for column_name in data.keys():
for what in data[column_name]:
@@ -195,25 +189,26 @@ def analysis(
show_progress: bool = False,
what_to_compute: List[str] = [],
) -> pd.DataFrame:
"""Compute analysis of numerical features
"""
Compute analysis of numerical features
Parameters:
transformed_dataset: A transformed dataset with features and target
dataset: A transformed dataset with features and target
show_progress: Whether to show the progress when computing the features
what_to_compute: Elements available to compute
- std
- correlation
- autocorrelation,
- monotonicity
- number_of_unique_elements
- mutual_information
- null
- entropy
what_to_compute: Elements available to compute:
- std
- Correlation
- Autocorrelation
- Monotonicity
- Number of unique elements
- Mutual information
- Null
- Entropy
Returns:
df: Dataframe with the columns specified by what_to_compute
Returns:
Dataframe with the columns specified by what_to_compute
"""

if len(what_to_compute) == 0:
@@ -229,7 +224,5 @@ def analysis(
column_names = dataset.numeric_features()
for X, y in iterate_over_features_and_target(dataset):
y = np.squeeze(y)
data = analysis_single_time_series(
X, y, column_names, data, what_to_compute
)
return merge_analysis(data)
data = analysis_single_time_series(X, y, column_names, data, what_to_compute)
return merge_analysis(data)
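
Finally, a hedged end-to-end sketch of the analysis entry point above; the dataset object and the position of its parameter are assumptions, while the metric names and the behaviour of monotonicity follow the code in this file.

import numpy as np
from ceruleo.dataset.analysis.numerical_features import analysis, monotonicity

# Per the definition above, a strictly increasing signal has monotonicity 1.0
# and a constant signal 0.0.
assert monotonicity(np.array([1.0, 2.0, 3.0, 4.0, 5.0])) == 1.0
assert monotonicity(np.array([7.0, 7.0, 7.0, 7.0, 7.0])) == 0.0

# `dataset` is a hypothetical transformed dataset with features and RUL target;
# the metric names match the keys of the `metrics` dictionary above.
report = analysis(
    dataset,
    show_progress=False,
    what_to_compute=["std", "monotonicity", "null"],
)
print(report.head())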
