Memory-efficient subsampling of training data
maartenbreddels authored and NicolasHug committed Dec 18, 2018
1 parent 0d1e68f commit 66accfa
pygbm/gradient_boosting.py: 13 changes (8 additions, 5 deletions)
@@ -169,11 +169,14 @@ def fit(self, X, y):
         # Subsample the training set for score-based monitoring.
         if do_early_stopping:
             subsample_size = 10000
-            indices = np.arange(X_binned_train.shape[0])
-            if X_binned_train.shape[0] > subsample_size:
-                indices = rng.choice(indices, subsample_size)
-            X_binned_small_train = X_binned_train[indices]
-            y_small_train = y_train[indices]
+            n_samples_train = X_binned_train.shape[0]
+            if n_samples_train > subsample_size:
+                indices = rng.choice(X_binned_train.shape[0], subsample_size)
+                X_binned_small_train = X_binned_train[indices]
+                y_small_train = y_train[indices]
+            else:
+                X_binned_small_train = X_binned_train
+                y_small_train = y_train
             # Predicting is faster on C-contiguous arrays.
             X_binned_small_train = np.ascontiguousarray(X_binned_small_train)
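The memory saving comes from two points: the old code always built an index array with np.arange and always fancy-indexed the training data, which copies the full arrays even when the training set is already smaller than subsample_size; the new code only draws indices (and only copies) when subsampling is actually needed, and otherwise reuses the original arrays. Below is a minimal standalone sketch of that pattern, not the pygbm API itself: the function name and signature are illustrative, and rng is assumed to be a NumPy RandomState.

import numpy as np

def subsample_for_monitoring(X_binned_train, y_train, rng, subsample_size=10000):
    """Sketch of the subsampling pattern from the commit above (illustrative only).

    Builds an index array only when the training set is larger than
    ``subsample_size``; otherwise the original arrays are reused, avoiding
    a full fancy-indexing copy of the data.
    """
    n_samples_train = X_binned_train.shape[0]
    if n_samples_train > subsample_size:
        # Draw subsample_size indices from range(n_samples_train) without
        # explicitly materializing an np.arange array first.
        indices = rng.choice(n_samples_train, subsample_size)
        X_binned_small_train = X_binned_train[indices]
        y_small_train = y_train[indices]
    else:
        # Small dataset: keep references to the original arrays (no copy).
        X_binned_small_train = X_binned_train
        y_small_train = y_train
    # Prediction is faster on C-contiguous arrays; this is a no-op when the
    # array is already C-contiguous.
    return np.ascontiguousarray(X_binned_small_train), y_small_train

For example, with rng = np.random.RandomState(42) and 50,000 training rows, the sketch returns a 10,000-row contiguous subsample; with 5,000 rows it returns the original arrays without copying (beyond the contiguity check).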
