Memory-efficient subsampling of training data
maartenbreddels authored and NicolasHug committed Dec 18, 2018
1 parent 0d1e68f commit 66accfa
pygbm/gradient_boosting.py: 13 changes (8 additions, 5 deletions)
@@ -169,11 +169,14 @@ def fit(self, X, y):
         # Subsample the training set for score-based monitoring.
         if do_early_stopping:
             subsample_size = 10000
-            indices = np.arange(X_binned_train.shape[0])
-            if X_binned_train.shape[0] > subsample_size:
-                indices = rng.choice(indices, subsample_size)
-            X_binned_small_train = X_binned_train[indices]
-            y_small_train = y_train[indices]
+            n_samples_train = X_binned_train.shape[0]
+            if n_samples_train > subsample_size:
+                indices = rng.choice(X_binned_train.shape[0], subsample_size)
+                X_binned_small_train = X_binned_train[indices]
+                y_small_train = y_train[indices]
+            else:
+                X_binned_small_train = X_binned_train
+                y_small_train = y_train
             # Predicting is faster on C-contiguous arrays.
             X_binned_small_train = np.ascontiguousarray(X_binned_small_train)
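The memory saving comes from two points: the old code always built an index array with np.arange and always fancy-indexed the training data, which copies the full arrays even when the training set is already smaller than subsample_size; the new code only draws indices (and only copies) when subsampling is actually needed, and otherwise reuses the original arrays. Below is a minimal standalone sketch of that pattern, not the pygbm API itself: the function name and signature are illustrative, and rng is assumed to be a NumPy RandomState.

import numpy as np

def subsample_for_monitoring(X_binned_train, y_train, rng, subsample_size=10000):
    """Sketch of the subsampling pattern from the commit above (illustrative only).

    Builds an index array only when the training set is larger than
    ``subsample_size``; otherwise the original arrays are reused, avoiding
    a full fancy-indexing copy of the data.
    """
    n_samples_train = X_binned_train.shape[0]
    if n_samples_train > subsample_size:
        # Draw subsample_size indices from range(n_samples_train) without
        # explicitly materializing an np.arange array first.
        indices = rng.choice(n_samples_train, subsample_size)
        X_binned_small_train = X_binned_train[indices]
        y_small_train = y_train[indices]
    else:
        # Small dataset: keep references to the original arrays (no copy).
        X_binned_small_train = X_binned_train
        y_small_train = y_train
    # Prediction is faster on C-contiguous arrays; this is a no-op when the
    # array is already C-contiguous.
    return np.ascontiguousarray(X_binned_small_train), y_small_train

For example, with rng = np.random.RandomState(42) and 50,000 training rows, the sketch returns a 10,000-row contiguous subsample; with 5,000 rows it returns the original arrays without copying (beyond the contiguity check).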
