Cositas

lucianolorenti · Aug 18, 2022 · 8b8c613 · 8b8c613
2 parents 877885d + 4e78cb5
commit 8b8c613
Show file tree

Hide file tree

Showing 12 changed files with 155 additions and 104 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.0
+current_version = 2.0.6
 commit = True
 tag = True
 

diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
@@ -18,6 +18,7 @@ jobs:
       - name: Set up Python ${ matrix.python-version }}
         uses: actions/setup-python@v3
         with:
+<<<<<<< HEAD
           python-version: "3.10"
       - name: FragileTech/bump-version
         uses: FragileTech/bump-version@main
@@ -26,6 +27,24 @@ jobs:
           commit_email: lucianolorenti@gmail.com
           login: lucianolorenti@gmail.com
           token: "${{ secrets.TOKEN_GITHUB }}"
+=======
+          python-version: "3.10"  
+      - name: Bump version
+        run: |            
+            git config --global user.name "Bump bot"
+            git config --global user.email "lucianolorenti@gmail.com"
+            git config --global pull.rebase false            
+            pip install bump2version
+            git remote add remote https://lucianolorenti:${{ secrets.TOKEN_GITHUB }}@github.com/$GITHUB_REPOSITORY
+            git pull --no-edit remote main
+            bump2version --tag --commit --allow-dirty --commit-args="-a" patch            
+      - name: Push changes
+        uses: ad-m/github-push-action@master
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          branch: main
+          tags: true
+>>>>>>> 4e78cb51c368737a91101483a4120e737525b7d6
       - name: Install pypa/build
         run: >-
           python -m

diff --git a/ceruleo/__init__.py b/ceruleo/__init__.py
@@ -9,4 +9,4 @@
 CACHE_PATH.mkdir(parents=True, exist_ok=True)
 
 
-__version__ = "2.0.0"
+__version__ = "2.0.6"
diff --git a/ceruleo/dataset/analysis/numerical_features.py b/ceruleo/dataset/analysis/numerical_features.py
@@ -133,7 +133,8 @@ def mutual_information(x:np.ndarray, y:np.ndarray):
     "monotonicity": lambda x,y:monotonicity(x),
     "number_of_unique_elements": lambda x,y:n_unique(x),    
     'mutual_information': mutual_information,
-    'null': lambda x, y: null(x)
+    'null': lambda x, y: null(x),
+    'entropy': lambda x, y: entropy(x)
 }
 
 
@@ -164,7 +165,7 @@ def analysis_single_time_series(
         data = defaultdict(lambda: defaultdict(list))
     if len(what_to_compute) == 0:
         what_to_compute = list(sorted(metrics.keys()))
-    for column_index in range(X.shape[1]):
+    for column_index in range(len(column_names)):
         column_name = column_names[column_index]
         for what in what_to_compute:
             x_ts = np.squeeze(X.loc[:, column_name].values)
@@ -209,6 +210,7 @@ def analysis(
                         - number_of_unique_elements
                         - mutual_information
                         - null
+                        - entropy
 
     Returns:
 
@@ -225,7 +227,7 @@ def analysis(
     if isinstance(dataset, TransformedDataset):
         column_names = dataset.transformer.column_names
     else:
-        column_names = dataset[0].columns
+        column_names = dataset.numeric_features()
     for X, y in iterate_over_features_and_target(dataset):
         y = np.squeeze(y)
         data = analysis_single_time_series(

diff --git a/ceruleo/graphics/analysis.py b/ceruleo/graphics/analysis.py
@@ -1,44 +1,60 @@
 from typing import List, Optional
-import matplotlib.pyplot as plt
-
-from temporis.dataset.ts_dataset import AbstractTimeSeriesDataset 
-
 
+import matplotlib
+import matplotlib.pyplot as plt
+from ceruleo.dataset.ts_dataset import AbstractTimeSeriesDataset
 
 
 def correlation_analysis(
     dataset: AbstractTimeSeriesDataset,
     corr_threshold: float = 0,
     features: Optional[List[str]] = None,
-    ax =None,
-    **kwargs):
+    ax: Optional[matplotlib.axes.Axes] = None,
+    **kwargs,
+):
+    """Plot the mean correlation of each feature pair as horizontal bars
+
+    Parameters:
     
-    df = correlation_analysis(dataset, corr_threshold, features=list(set(features) - set(['relative_time'])))
-    df1 = df[(df['Abs mean correlation']>corr_threshold)]
-
+        dataset: The dataset
+        corr_threshold: Only pairs whose "Abs mean correlation" exceeds this value are plotted
+        features: Features to analyse; "relative_time" is always excluded
+        ax: Axis to draw on; when None a new figure is created with **kwargs
+
+    Returns:
+        ax: the axis holding the bar plot
+    """
+    # NOTE(review): the call below resolves to this very function (unbounded recursion); presumably the dataset-level correlation analysis was intended - confirm
+    df = correlation_analysis(
+        dataset, corr_threshold, features=list(set(features) - set(["relative_time"]))
+    )
+    df1 = df[(df["Abs mean correlation"] > corr_threshold)]
+
     df1.reset_index(inplace=True)
-    df1.sort_values(by='Mean Correlation', ascending=True, inplace=True)
+    df1.sort_values(by="Mean Correlation", ascending=True, inplace=True)
     if ax is None:
         fig, ax = plt.subplots(**kwargs)
     labels = []
     for i, (_, r) in enumerate(df1.iterrows()):
-        f1 = r['Feature 1']
-        f2 = r['Feature 2']
-        label = f'{f1}\n{f2}'
-        ax.barh(y=i, 
-                width=r['Mean Correlation'], 
-                label=label,
-                xerr=r['Std Correlation'],
-                color="#7878FF")
+        f1 = r["Feature 1"]
+        f2 = r["Feature 2"]
+        label = f"{f1}\n{f2}"
+        ax.barh(
+            y=i,
+            width=r["Mean Correlation"],
+            label=label,
+            xerr=r["Std Correlation"],
+            color="#7878FF",
+        )
         labels.append(label)
 
-    ax.axvline(x=0.90, linestyle='--')
-    ax.axvline(x=-0.90, linestyle='--')
+    ax.axvline(x=0.90, linestyle="--")
+    ax.axvline(x=-0.90, linestyle="--")
 
     ax.set_yticks(list(range(len(labels))))
     ax.set_yticklabels(labels)
     xticks = ax.get_xticks()
 
-    ax.set_xticks([-1,-0.90, -0.5, 0, 0.5,  0.90, 1])
-    ax.set_xlabel('Correlation')
-    return ax
+    ax.set_xticks([-1, -0.90, -0.5, 0, 0.5, 0.90, 1])
+    ax.set_xlabel("Correlation")
+    return ax
diff --git a/ceruleo/models/keras/layers.py b/ceruleo/models/keras/layers.py
@@ -4,24 +4,14 @@
 
 import numpy as np
 import tensorflow as tf
-from tensorflow.keras import Input, Model, Sequential
+from tensorflow.keras import Sequential
 from tensorflow.keras import backend as K
-from tensorflow.keras.layers import (
-    Activation,
-    Add,
-    BatchNormalization,
-    Conv1D,
-    Conv2D,
-    Dense,
-    Dropout,
-    Flatten,
-    Lambda,
-    Activation,
-    GlobalAveragePooling2D,
-    Permute,
-)
+from tensorflow.keras import regularizers
+from tensorflow.keras.layers import (Activation, BatchNormalization, Conv2D,
+                                     Dense, Flatten, Lambda, Layer, Permute,
+                                     Reshape)
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.keras.layers.pooling import GlobalAveragePooling1D, MaxPool1D
+from tensorflow.python.keras.layers.pooling import GlobalAveragePooling1D
 
 
 def ExpandDimension(dim: int = -1):
@@ -185,3 +175,52 @@ def call(self, inputs):
         residual = tf.keras.backend.sign(residual) * n_sub
         residual = RemoveDimension(3)(residual)
         return residual + inputs
+
+
+
+
+class ZeroWeights(tf.keras.constraints.Constraint):
+  """Constraint that zeroes every weight whose magnitude is not above `l1`."""
+
+  def __init__(self, l1:float):
+    self.l1 = l1  # magnitude threshold: weights with |w| <= l1 become 0
+
+  def __call__(self, w):
+    # Cast the keep-mask to w's own dtype so non-float32 weights multiply cleanly
+    return tf.math.multiply(w, tf.cast(tf.abs(w) > self.l1, w.dtype))
+
+  def get_config(self):
+    return {'l1': self.l1}
+
+
+class LASSOLayer(Layer):
+    """Scale every non-batch input element by its own trainable weight.
+
+    The weights carry an L1 penalty of strength `l1` and are constrained by
+    ZeroWeights(l1); the output has the same non-batch shape as the input.
+    """
+
+    def __init__(self, l1:float, **kwargs):
+        super(LASSOLayer, self).__init__(**kwargs)  # forward name/dtype/... to Layer
+        self.l1 = l1
+        self.kernel_regularizer = regularizers.L1(l1)
+
+    def build(self, input_shape):
+        W_size = np.prod(input_shape[1:])  # one weight per non-batch element
+        self.w = self.add_weight(
+            shape=(W_size, ),
+            initializer="random_normal",
+            trainable=True,
+            regularizer=self.kernel_regularizer,
+            constraint=ZeroWeights(self.l1)
+        )
+        self.input_reshape = Reshape((W_size,))
+        self.output_reshape = Reshape(input_shape[1:])
+
+    def call(self, inputs):
+        x = tf.math.multiply(self.w, self.input_reshape(inputs))
+        self.add_metric(tf.math.reduce_sum(tf.cast(tf.abs(self.w) > 0, tf.float32)), name="Number of features")
+        return self.output_reshape(x)
+
+    def get_config(self):
+        return {**super().get_config(), 'l1': self.l1}  # lets from_config rebuild the layer
diff --git a/ceruleo/transformation/features/extraction.py b/ceruleo/transformation/features/extraction.py
@@ -1046,19 +1046,6 @@ class Interactions(TransformerStep):
     """Compute pairwise interactions between the features"""
 
     def transform(self, X: pd.DataFrame) -> pd.DataFrame:
-        """Transform the given life computing the iteractions between features
-
-        Parameters
-        ----------
-        X : pd.DataFrame
-            Input life
-
-        Returns
-        -------
-        pd.DataFrame
-            A new dataframe with the same index as the input
-            with n*(n-1) / 2 columns with the interactions between the features
-        """
         X_new = pd.DataFrame(index=X.index)
         for c1, c2 in itertools.combinations(X.columns, 2):
             X_new[f"{c1}_{c2}"] = X[c1] * X[c2]

diff --git a/ceruleo/transformation/features/outliers.py b/ceruleo/transformation/features/outliers.py
@@ -12,27 +12,20 @@
 
 
 class IQROutlierRemover(TransformerStep):
-    """
-    Impute values outside (Q1 - margin*IQR, Q2 + margin*IQR)
+    """Remove values outside (Q1 - margin*IQR, Q3 + margin*IQR)
 
     If clip is True the values will be clipped between the range,
     otherwise the values are going to be replaced by inf and -inf
 
 
 
-    Parameters
-    ----------
-    lower_quantile: float, default 0.25
-        Lower quantile threshold for the non-anomalous values
-    upper_quantile: float, default 0.75
-        Upper quantile threshold for the non-anomalous values
-    margin: float, default 1.5
-        How many times the IQR gets multiplied
-    proportion_to_sample:float, default 1.0
-        If you want to compute the quantiles in an smaller proportion of data
-        you can specify it
-    clip: bool
-        Wether to clip the values outside the range.
+    Parameters:
+        lower_quantile: Lower quantile threshold for the non-anomalous values
+        upper_quantile: Upper quantile threshold for the non-anomalous values
+        margin: How many times the IQR gets multiplied
+        proportion_to_sample: If you want to compute the quantiles on a smaller proportion
+            of the data, you can specify it
+        clip: Whether to clip the values outside the range.
 
     """
 
@@ -123,22 +116,17 @@ def description(self):
 
 
 class BeyondQuartileOutlierRemover(TransformerStep):
-    """
-    Impute values outside (Q1, Q3)
+    """Remove values outside (Q1, Q3)
 
     If clip is True the values will be clipped between the range,
     otherwise the values are going to be replaced by inf and -inf
 
 
 
-    Parameters
-    ----------
-    lower_quantile: float, default 0.25
-        Lower quantile threshold for the non-anomalous values
-    upper_quantile: float, default 0.75
-        Upper quantile threshold for the non-anomalous values
-    clip: bool
-        Wether to clip the values outside the range.
+    Parameters:
+        lower_quantile:  Lower quantile threshold for the non-anomalous values
+        upper_quantile: Upper quantile threshold for the non-anomalous values
+        clip: Whether to clip the values outside the range.
 
     """
 

diff --git a/ceruleo/transformation/features/rolling_windows.py b/ceruleo/transformation/features/rolling_windows.py
@@ -25,24 +25,19 @@ def apply_rolling_data(values : np.ndarray, function, window, step=1):
     sections of length `window` at the data of column `col`. Append
     the results to `data` at a new columns with name `label`.
 
-    Parameters
-    ----------
-    data : np.ndarray
-          1-D Time series of data
-    function : callable
-        Function to be called to calculate the rolling window
+    Parameters:
+    
+    values: 1-D Time series of data
+    function: Function to be called to calculate the rolling window
         analysis, the function must receive as input an array or
         pandas series. Its output must be either a number or a pandas
         series
-    window : int
-        length of the window to perform the analysis
-    step : int
-        step to take between two consecutive windows
+    window: length of the window to perform the analysis
+    step: step to take between two consecutive windows
 
-    Returns
+    Returns:
     -------
-    data : np.ndarray
-        Columns generated by the function applied
+        data: Columns generated by the function applied
 
     """