Rename sample_weight to w (code quality) (#1457)
* rename sample_weight to w in cluster module files

* rename sample_weight to w in ensemble module files

* rename sample_weight to w in facto module files

* rename sample_weight to w in forest and tree module files

* rename sample_weight to w in metrics module files

* rename sample_weight to w in tree splitter module files

* add note to unreleased.md file

---------

Signed-off-by: mariliatd <mariliatd@gmail.com>
mariliatd committed Nov 23, 2023
1 parent 3758eb1 commit 7b2f1b8
Showing 45 changed files with 272 additions and 245 deletions.
27 changes: 27 additions & 0 deletions docs/releases/unreleased.md
@@ -0,0 +1,27 @@
+# Unreleased
+
+## cluster
+
+- Renamed the `sample_weight` parameter of `learn_one` and `predict_one` to `w`.
+
+## ensemble
+
+- Renamed the `sample_weight` parameter of `learn_one` to `w`.
+
+## facto
+
+- Renamed the `sample_weight` parameter of `learn_one` to `w`.
+
+## forest
+
+- Renamed `sample_weight` to `w`.
+
+## tree
+
+- Renamed `sample_weight` to `w`.
+
+## metrics
+
+- Renamed the `sample_weight` parameter of `update` and `revert` to `w`.

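For downstream code, the rename is a plain keyword change. A minimal before/after sketch, assuming a metric such as `metrics.Accuracy`, whose `update` accepts a weight via the renamed parameter:

```python
from river import metrics

metric = metrics.Accuracy()

# Before this commit, the weight keyword was `sample_weight`:
#   metric.update(y_true=True, y_pred=True, sample_weight=2.0)
# After this commit, the same call uses `w`:
metric.update(y_true=True, y_pred=True, w=2.0)
```
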
4 changes: 2 additions & 2 deletions river/cluster/dbstream.py
@@ -389,7 +389,7 @@ def _recluster(self):

        self.clustering_is_up_to_date = True

-    def learn_one(self, x, sample_weight=None):
+    def learn_one(self, x, w=None):
        self._update(x)

        if self._time_stamp % self.cleanup_interval == 0:
@@ -399,7 +399,7 @@ def learn_one(self, x, sample_weight=None):

        return self

-    def predict_one(self, x, sample_weight=None):
+    def predict_one(self, x, w=None):
        self._recluster()

        min_distance = math.inf

4 changes: 2 additions & 2 deletions river/cluster/denstream.py
@@ -313,7 +313,7 @@ def _initial_dbscan(self):
            else:
                item.covered = False

-    def learn_one(self, x, sample_weight=None):
+    def learn_one(self, x, w=None):
        self._n_samples_seen += 1
        # control the stream speed
        if self._n_samples_seen % self.stream_speed == 0:
@@ -352,7 +352,7 @@ def learn_one(self, x, sample_weight=None):
                    self.o_micro_clusters.pop(j)
        return self

-    def predict_one(self, x, sample_weight=None):
+    def predict_one(self, x, w=None):
        # This function handles the case when a clustering request arrives.
        # implementation of the DBSCAN algorithm proposed by Ester et al.
        if not self.initialized:

4 changes: 2 additions & 2 deletions river/cluster/streamkmeans.py
@@ -84,7 +84,7 @@ def __init__(self, chunk_size=10, n_clusters=2, **kwargs):
        self._temp_chunk = {}
        self.centers = {}

-    def learn_one(self, x, sample_weight=None):
+    def learn_one(self, x, w=None):
        self.time_stamp += 1

        index = self.time_stamp % self.chunk_size
@@ -107,7 +107,7 @@ def learn_one(self, x, sample_weight=None):

        return self

-    def predict_one(self, x, sample_weight=None):
+    def predict_one(self, x, w=None):
        def get_distance(c):
            return utils.math.minkowski_distance(self.centers[c], x, 2)

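`predict_one` above assigns the incoming point to the closest stored center (`minkowski_distance` with `p=2`, i.e. Euclidean). A self-contained sketch of that assignment rule, independent of river's helpers:

```python
import math


def nearest_center(x: dict, centers: dict):
    """Return the key of the center closest to x in Euclidean distance."""

    def distance(center: dict) -> float:
        keys = set(x) | set(center)
        return math.sqrt(sum((x.get(k, 0.0) - center.get(k, 0.0)) ** 2 for k in keys))

    return min(centers, key=lambda c: distance(centers[c]))


centers = {0: {"x1": 0.0, "x2": 0.0}, 1: {"x1": 5.0, "x2": 5.0}}
print(nearest_center({"x1": 4.0, "x2": 4.5}, centers))  # -> 1
```
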
4 changes: 2 additions & 2 deletions river/cluster/textclust.py
@@ -153,7 +153,7 @@ def __init__(
        self.micro_distance = self.distances(self.micro_distance)
        self.macro_distance = self.distances(self.macro_distance)

-    def learn_one(self, x, t=None, sample_weight=None):
+    def learn_one(self, x, t=None, w=None):
        localdict = {}
        for key in x.keys():
            new_key = key
@@ -213,7 +213,7 @@ def learn_one(self, x, t=None, sample_weight=None):

    ## predicts the cluster number. The type specifies whether this should happen on micro-cluster
    ## or macro-cluster level
-    def predict_one(self, x, sample_weight=None, type="micro"):
+    def predict_one(self, x, w=None, type="micro"):
        localdict = {}
        for key in x.keys():
            new_key = key

18 changes: 9 additions & 9 deletions river/ensemble/streaming_random_patches.py
@@ -109,7 +109,7 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
            k = poisson(rate=self.lam, rng=self._rng)
            if k == 0:
                continue
-            model.learn_one(x=x, y=y, sample_weight=k, n_samples_seen=self._n_samples_seen)
+            model.learn_one(x=x, y=y, w=k, n_samples_seen=self._n_samples_seen)

        return self

@@ -532,7 +532,7 @@ def learn_one(
        x: dict,
        y: base.typing.ClfTarget,
        *,
-        sample_weight: int,
+        w: int,
        n_samples_seen: int,
        **kwargs,
    ):
@@ -543,16 +543,16 @@ def learn_one(
            # Use all features
            x_subset = x

-        # TODO Find a way to verify if the model natively supports sample_weight
-        for _ in range(int(sample_weight)):
+        # TODO Find a way to verify if the model natively supports sample_weight (w)
+        for _ in range(int(w)):
            self.model.learn_one(x=x_subset, y=y, **kwargs)

        if self._background_learner:
            # Train the background learner
            # Note: Pass the original instance x so features are correctly
            # selected based on the corresponding subspace
            self._background_learner.learn_one(
-                x=x, y=y, sample_weight=sample_weight, n_samples_seen=n_samples_seen  # type: ignore
+                x=x, y=y, w=w, n_samples_seen=n_samples_seen  # type: ignore
            )

        if not self.disable_drift_detector and not self.is_background_learner:
@@ -830,7 +830,7 @@ def learn_one(
        x: dict,
        y: base.typing.RegTarget,
        *,
-        sample_weight: int,
+        w: int,
        n_samples_seen: int,
        **kwargs,
    ):
@@ -842,8 +842,8 @@ def learn_one(
            # Use all features
            x_subset = x

-        # TODO Find a way to verify if the model natively supports sample_weight
-        for _ in range(int(sample_weight)):
+        # TODO Find a way to verify if the model natively supports sample_weight (w)
+        for _ in range(int(w)):
            self.model.learn_one(x=x_subset, y=y, **kwargs)

        # Drift detection input
@@ -860,7 +860,7 @@ def learn_one(
            # Note: Pass the original instance x so features are correctly
            # selected based on the corresponding subspace
            self._background_learner.learn_one(
-                x=x, y=y, sample_weight=sample_weight, n_samples_seen=n_samples_seen  # type: ignore
+                x=x, y=y, w=w, n_samples_seen=n_samples_seen  # type: ignore
            )

        if not self.disable_drift_detector and not self.is_background_learner:

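The TODO in both hunks explains the surrounding loop: when the wrapped model has no native weight parameter, an integer weight `w` is emulated by feeding it the same sample `w` times. A minimal sketch of that idea with a toy classifier; `MajorityClass` and `learn_weighted` are illustrative, not part of river:

```python
from collections import Counter


class MajorityClass:
    """A toy incremental classifier with no native weight support."""

    def __init__(self):
        self.counts = Counter()

    def learn_one(self, x, y):
        self.counts[y] += 1
        return self

    def predict_one(self, x):
        return self.counts.most_common(1)[0][0] if self.counts else None


def learn_weighted(model, x, y, w: int):
    # Emulate an integer sample weight by repeating the update,
    # exactly as the `for _ in range(int(w))` loop above does.
    for _ in range(int(w)):
        model.learn_one(x, y)
    return model


model = learn_weighted(MajorityClass(), {"x1": 1.0}, True, w=3)
print(model.predict_one({"x1": 0.0}))  # True
```
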
8 changes: 4 additions & 4 deletions river/facto/base.py
@@ -65,28 +65,28 @@ def __init__(
    def _init_latents(self) -> collections.defaultdict:
        """Initializes latent weights dict."""

-    def learn_one(self, x, y, sample_weight=1.0):
+    def learn_one(self, x, y, w=1.0):
        x = self._ohe_cat_features(x)

        if self.sample_normalization:
            x_l2_norm = sum(xj**2 for xj in x.values()) ** 0.5
            x = {j: xj / x_l2_norm for j, xj in x.items()}

-        return self._learn_one(x, y, sample_weight=sample_weight)
+        return self._learn_one(x, y, w=w)

    def _ohe_cat_features(self, x):
        """One hot encodes string features considering them as categorical."""
        return dict((f"{j}_{xj}", 1) if isinstance(xj, str) else (j, xj) for j, xj in x.items())

-    def _learn_one(self, x, y, sample_weight=1.0):
+    def _learn_one(self, x, y, w=1.0):
        # Calculate the gradient of the loss with respect to the raw output
        g_loss = self.loss.gradient(y_true=y, y_pred=self._raw_dot(x))

        # Clamp the gradient to avoid numerical instability
        g_loss = utils.math.clamp(g_loss, minimum=-self.clip_gradient, maximum=self.clip_gradient)

        # Apply the sample weight
-        g_loss *= sample_weight
+        g_loss *= w

        # Update the intercept
        intercept_lr = self.intercept_lr.get(self.weight_optimizer.n_iterations)

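In `facto`, the weight enters the update by scaling the loss gradient (`g_loss *= w`), which for SGD amounts to seeing the sample `w` times with the same step size. A minimal sketch of such a weighted step under an assumed squared loss; the names `weights` and `lr` are illustrative, not river's internals:

```python
def weighted_sgd_step(weights: dict, x: dict, y: float, w: float = 1.0, lr: float = 0.01) -> dict:
    # Raw model output: a dot product over the present features.
    y_pred = sum(weights.get(j, 0.0) * xj for j, xj in x.items())
    # Gradient of squared loss w.r.t. the raw output.
    g_loss = 2 * (y_pred - y)
    # The sample weight scales the whole gradient, as in `_learn_one`.
    g_loss *= w
    for j, xj in x.items():
        weights[j] = weights.get(j, 0.0) - lr * g_loss * xj
    return weights
```
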
4 changes: 2 additions & 2 deletions river/forest/adaptive_random_forest.py
@@ -169,9 +169,9 @@ def learn_one(self, x: dict, y: base.typing.Target, **kwargs):
            k = poisson(rate=self.lambda_value, rng=self._rng)
            if k > 0:
                if not self._warning_detection_disabled and self._background[i] is not None:
-                    self._background[i].learn_one(x=x, y=y, sample_weight=k)  # type: ignore
+                    self._background[i].learn_one(x=x, y=y, w=k)  # type: ignore

-                model.learn_one(x=x, y=y, sample_weight=k)
+                model.learn_one(x=x, y=y, w=k)

                drift_input = None
                if not self._warning_detection_disabled:

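The `k` passed as `w` here implements online bagging (Oza & Russell): drawing `k ~ Poisson(lambda)` per tree approximates how often the sample would appear in that tree's bootstrap replica. A self-contained sketch of the scheme; `models` stands for any iterable of river-style learners whose `learn_one` accepts a weight:

```python
import math
import random


def poisson(rate: float, rng: random.Random) -> int:
    """Sample k ~ Poisson(rate) with Knuth's algorithm (assumes rate > 0)."""
    threshold = math.exp(-rate)
    k, p = 0, 1.0
    while p > threshold:
        k += 1
        p *= rng.random()
    return k - 1


def online_bagging_step(models, x, y, lam: float = 6.0, rng: random.Random = random.Random(42)):
    # Each member trains on the sample with a Poisson-distributed weight,
    # skipping it entirely when k == 0, just like the loop above.
    for model in models:
        k = poisson(lam, rng)
        if k > 0:
            model.learn_one(x, y, w=k)
```
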
4 changes: 2 additions & 2 deletions river/forest/online_extra_trees.py
@@ -314,10 +314,10 @@ def learn_one(self, x, y):
            if w == 0:  # Skip model update if w is zero
                continue

-            model.learn_one(x, y, sample_weight=w)
+            model.learn_one(x, y, w=w)

            if i in self._background_trees:
-                self._background_trees[i].learn_one(x, y, sample_weight=w)
+                self._background_trees[i].learn_one(x, y, w=w)

            trained.append(i)

38 changes: 19 additions & 19 deletions river/metrics/base.py
@@ -84,19 +84,19 @@ def __init__(self, cm: confusion.ConfusionMatrix | None = None):
            cm = confusion.ConfusionMatrix()
        self.cm = cm

-    def update(self, y_true, y_pred, sample_weight=1.0):
+    def update(self, y_true, y_pred, w=1.0):
        self.cm.update(
            y_true,
            y_pred,
-            sample_weight=sample_weight,
+            w=w,
        )
        return self

-    def revert(self, y_true, y_pred, sample_weight=1.0):
+    def revert(self, y_true, y_pred, w=1.0):
        self.cm.revert(
            y_true,
            y_pred,
-            sample_weight=sample_weight,
+            w=w,
        )
        return self

@@ -148,21 +148,21 @@ def update(
        self,
        y_true: bool,
        y_pred: bool | float | dict[bool, float],
-        sample_weight=1.0,
+        w=1.0,
    ) -> BinaryMetric:
        if self.requires_labels:
            y_pred = y_pred == self.pos_val
-        return super().update(y_true == self.pos_val, y_pred, sample_weight)
+        return super().update(y_true == self.pos_val, y_pred, w)

    def revert(
        self,
        y_true: bool,
        y_pred: bool | float | dict[bool, float],
-        sample_weight=1.0,
+        w=1.0,
    ) -> BinaryMetric:
        if self.requires_labels:
            y_pred = y_pred == self.pos_val
-        return super().revert(y_true == self.pos_val, y_pred, sample_weight)
+        return super().revert(y_true == self.pos_val, y_pred, w)


class MultiClassMetric(ClassificationMetric):
@@ -224,7 +224,7 @@ def __init__(self, metrics, str_sep=", "):
        super().__init__(metrics)
        self.str_sep = str_sep

-    def update(self, y_true, y_pred, sample_weight=1.0):
+    def update(self, y_true, y_pred, w=1.0):
        # If the metrics are classification metrics, then we have to handle the case where some
        # of the metrics require labels, whilst others need to be fed probabilities
        if hasattr(self, "requires_labels") and not self.requires_labels:
@@ -239,19 +239,19 @@ def update(self, y_true, y_pred, sample_weight=1.0):
            m.update(y_true, y_pred)
        return self

-    def revert(self, y_true, y_pred, sample_weight=1.0):
+    def revert(self, y_true, y_pred, w=1.0):
        # If the metrics are classification metrics, then we have to handle the case where some
        # of the metrics require labels, whilst others need to be fed probabilities
        if hasattr(self, "requires_labels") and not self.requires_labels:
            for m in self:
                if m.requires_labels:
-                    m.revert(y_true, max(y_pred, key=y_pred.get), sample_weight)
+                    m.revert(y_true, max(y_pred, key=y_pred.get), w)
                else:
-                    m.revert(y_true, y_pred, sample_weight)
+                    m.revert(y_true, y_pred, w)
            return self

        for m in self:
-            m.revert(y_true, y_pred, sample_weight)
+            m.revert(y_true, y_pred, w)
        return self

    def get(self):
@@ -333,12 +333,12 @@ def __init__(self):
    def _eval(self, y_true, y_pred):
        pass

-    def update(self, y_true, y_pred, sample_weight=1.0):
-        self._mean.update(x=self._eval(y_true, y_pred), w=sample_weight)
+    def update(self, y_true, y_pred, w=1.0):
+        self._mean.update(x=self._eval(y_true, y_pred), w=w)
        return self

-    def revert(self, y_true, y_pred, sample_weight=1.0):
-        self._mean.revert(x=self._eval(y_true, y_pred), w=sample_weight)
+    def revert(self, y_true, y_pred, w=1.0):
+        self._mean.revert(x=self._eval(y_true, y_pred), w=w)
        return self

    def get(self):
@@ -354,11 +354,11 @@ class ClusteringMetric(base.Base, abc.ABC):
    _fmt = ",.6f" # Use commas to separate big numbers and show 6 decimals

    @abc.abstractmethod
-    def update(self, x, y_pred, centers, sample_weight=1.0) -> ClusteringMetric:
+    def update(self, x, y_pred, centers, w=1.0) -> ClusteringMetric:
        """Update the metric."""

    @abc.abstractmethod
-    def revert(self, x, y_pred, centers, sample_weight=1.0) -> ClusteringMetric:
+    def revert(self, x, y_pred, centers, w=1.0) -> ClusteringMetric:
        """Revert the metric."""

    @abc.abstractmethod

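The `_mean` seen above absorbs `w` as a weighted running mean, which is what makes `revert` exact rather than approximate. A minimal sketch of such a mean, assuming only the update/revert contract visible in this hunk:

```python
class WeightedMean:
    """A running mean where each sample carries a weight w."""

    def __init__(self):
        self.total_weight = 0.0
        self.mean = 0.0

    def update(self, x: float, w: float = 1.0):
        self.total_weight += w
        self.mean += (w / self.total_weight) * (x - self.mean)
        return self

    def revert(self, x: float, w: float = 1.0):
        # Undo an earlier update(x, w); exact because the mean is linear
        # in the weighted contributions.
        self.total_weight -= w
        if self.total_weight > 0:
            self.mean -= (w / self.total_weight) * (x - self.mean)
        else:
            self.mean = 0.0
        return self
```
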
20 changes: 10 additions & 10 deletions river/metrics/confusion.py
@@ -62,22 +62,22 @@ def __getitem__(self, key):
        """Syntactic sugar for accessing the counts directly."""
        return self.data[key]

-    def update(self, y_true, y_pred, sample_weight=1.0):
+    def update(self, y_true, y_pred, w=1.0):
        self.n_samples += 1
-        self._update(y_true, y_pred, sample_weight)
+        self._update(y_true, y_pred, w)
        return self

-    def revert(self, y_true, y_pred, sample_weight=1.0):
+    def revert(self, y_true, y_pred, w=1.0):
        self.n_samples -= 1
-        # Revert is equal to subtracting so we pass the negative sample_weight
-        self._update(y_true, y_pred, -sample_weight)
+        # Revert is equal to subtracting so we pass the negative sample_weight (w)
+        self._update(y_true, y_pred, -w)
        return self

-    def _update(self, y_true, y_pred, sample_weight):
-        self.data[y_true][y_pred] += sample_weight
-        self.total_weight += sample_weight
-        self.sum_row[y_true] += sample_weight
-        self.sum_col[y_pred] += sample_weight
+    def _update(self, y_true, y_pred, w):
+        self.data[y_true][y_pred] += w
+        self.total_weight += w
+        self.sum_row[y_true] += w
+        self.sum_col[y_pred] += w

    @property
    def classes(self):

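Since `revert` is just `_update` with a negated weight, a matched update/revert pair leaves every counter unchanged. A quick usage sketch, assuming the post-rename API:

```python
from river import metrics

cm = metrics.ConfusionMatrix()
cm.update(y_true="cat", y_pred="dog", w=2.0)
cm.revert(y_true="cat", y_pred="dog", w=2.0)

# The weighted counts cancel out exactly.
assert cm.total_weight == 0.0
```
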
4 changes: 2 additions & 2 deletions river/metrics/mse.py
@@ -81,5 +81,5 @@ class RMSLE(RMSE):
    """

-    def update(self, y_true, y_pred, sample_weight=1.0):
-        return super().update(math.log(y_true + 1), math.log(y_pred + 1), sample_weight)
+    def update(self, y_true, y_pred, w=1.0):
+        return super().update(math.log(y_true + 1), math.log(y_pred + 1), w)

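As the hunk shows, `RMSLE` is `RMSE` computed in log space, i.e. on `log(1 + y)`. A quick standalone check of that definition in plain Python:

```python
import math

pairs = [(3.0, 2.5), (0.5, 0.5), (2.0, 2.1)]  # (y_true, y_pred)
rmsle = math.sqrt(
    sum((math.log(yt + 1) - math.log(yp + 1)) ** 2 for yt, yp in pairs) / len(pairs)
)
print(f"{rmsle:.4f}")
```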