mlpack · rcurtin · Nov 15, 2020 · Oct 8, 2020 · Oct 10, 2020 · Oct 10, 2020
diff --git a/src/mlpack/core/data/split_data.hpp b/src/mlpack/core/data/split_data.hpp
@@ -17,6 +17,138 @@
 
 namespace mlpack {
 namespace data {
+
+/**
+ * Given an input dataset and labels, stratify into a training set and test set.
+ * For each label, floor(count * testRatio) of the points carrying that label
+ * are placed in the test set and the remaining points in the training set.
+ *
+ * Labels are used directly as indices into a count vector of length
+ * (max label + 1), so they must be non-negative integers; keeping them in the
+ * range [0, n), where n is the number of different labels, is recommended.
+ * The NormalizeLabels() function in mlpack::data can be used for this.
+ * inputLabel is expected to hold one label per column of input. If input has
+ * no columns, all four outputs are set to empty objects.
+ *
+ * Example usage below. This overload places the stratified dataset into the
+ * four output parameters given (trainData, testData, trainLabel,
+ * and testLabel).
+ *
+ * @code
+ * arma::mat input = loadData();
+ * arma::Row<size_t> label = loadLabel();
+ * arma::mat trainData;
+ * arma::mat testData;
+ * arma::Row<size_t> trainLabel;
+ * arma::Row<size_t> testLabel;
+ * math::RandomSeed(100); // Set the seed if you like.
+ *
+ * // Stratify the dataset into a training and test set, with 30% of the data
+ * // being held out for the test set.
+ * StratifiedSplit(input, label, trainData,
+ *                 testData, trainLabel, testLabel, 0.3);
+ * @endcode
+ *
+ * @param input Input dataset to stratify.
+ * @param inputLabel Input labels to stratify.
+ * @param trainData Matrix to store training data into.
+ * @param testData Matrix to store test data into.
+ * @param trainLabel Vector to store training labels into.
+ * @param testLabel Vector to store test labels into.
+ * @param testRatio Fraction of dataset to use for test set (between 0 and 1).
+ * @param shuffleData If true, the sample order is shuffled; otherwise, each
+ *     sample is visited in linear order. (Default true.)
+ */
+template<typename T, typename U>
+void StratifiedSplit(const arma::Mat<T>& input,
+                     const arma::Row<U>& inputLabel,
+                     arma::Mat<T>& trainData,
+                     arma::Mat<T>& testData,
+                     arma::Row<U>& trainLabel,
+                     arma::Row<U>& testLabel,
+                     const double testRatio,
+                     const bool shuffleData = true)
+{
+  // An empty dataset has no labels to stratify (inputLabel.max() and
+  // input.n_cols - 1 below both need at least one point): return empty sets.
+  if (input.n_cols == 0)
+  {
+    trainData.set_size(input.n_rows, 0);
+    testData.set_size(input.n_rows, 0);
+    trainLabel.set_size(0);
+    testLabel.set_size(0);
+    return;
+  }
+
+  /**
+   * Worked example with a test ratio of 0.2 and the labels
+   * 0 0 0 0 0 (5 0s)
+   * 1 1 1 1 1 1 1 1 1 1 1 (11 1s)
+   *
+   * The test set receives floor(5 * 0.2) = 1 point with label 0 and
+   * floor(11 * 0.2) = 2 points with label 1, so the split is:
+   * Train set (4 0s, 9 1s)
+   * Test set (1 0s, 2 1s)
+   */
+  const U maxLabel = inputLabel.max();
+
+  // First pass: count how many points carry each label.
+  arma::uvec labelCounts;
+  labelCounts.zeros(maxLabel + 1);
+  for (const U label : inputLabel)
+    ++labelCounts[label];
+
+  // Per-label test quota, computed once so that the output sizes and the
+  // assignment loop below use exactly the same integer value.
+  arma::uvec testQuota(labelCounts.n_elem);
+  size_t trainSize = 0;
+  size_t testSize = 0;
+  for (arma::uword c = 0; c < labelCounts.n_elem; ++c)
+  {
+    testQuota[c] = static_cast<arma::uword>(floor(labelCounts[c] * testRatio));
+    testSize += testQuota[c];
+    trainSize += labelCounts[c] - testQuota[c];
+  }
+
+  trainData.set_size(input.n_rows, trainSize);
+  testData.set_size(input.n_rows, testSize);
+  trainLabel.set_size(trainSize);
+  testLabel.set_size(testSize);
+
+  // Visit the points in index order, or in a random order when shuffling.
+  arma::uvec order =
+      arma::linspace<arma::uvec>(0, input.n_cols - 1, input.n_cols);
+
+  if (shuffleData)
+  {
+    order = arma::shuffle(order);
+  }
+
+  // Second pass: a point goes to the test set while its label's quota is not
+  // yet filled; every later point with that label goes to the training set.
+  arma::uvec testLabelCounts;
+  testLabelCounts.zeros(labelCounts.n_elem);
+  size_t trainIdx = 0;
+  size_t testIdx = 0;
+  for (const arma::uword i : order)
+  {
+    const U label = inputLabel[i];
+    if (testLabelCounts[label] < testQuota[label])
+    {
+      ++testLabelCounts[label];
+      testData.col(testIdx) = input.col(i);
+      testLabel[testIdx] = label;
+      ++testIdx;
+    }
+    else
+    {
+      trainData.col(trainIdx) = input.col(i);
+      trainLabel[trainIdx] = label;
+      ++trainIdx;
+    }
+  }
+}
+
 /**
  * Given an input dataset and labels, split into a training set and test set.
  * Example usage below.  This overload places the split dataset into the four
@@ -167,7 +299,10 @@ void Split(const arma::Mat<T>& input,
  * @param inputLabel Input labels to split.
  * @param testRatio Percentage of dataset to use for test set (between 0 and 1).
  * @param shuffleData If true, the sample order is shuffled; otherwise, each
- *       sample is visited in linear order. (Default true).
+ *     sample is visited in linear order. (Default true).
+ * @param stratifyData If true, the train and test splits are stratified
+ *     so that the ratio of each class in the training and test sets is the same
+ *     as in the original dataset. (Default false.)
  * @return std::tuple containing trainData (arma::Mat<T>), testData
  *      (arma::Mat<T>), trainLabel (arma::Row<U>), and testLabel (arma::Row<U>).
  */
@@ -176,15 +311,24 @@ std::tuple<arma::Mat<T>, arma::Mat<T>, arma::Row<U>, arma::Row<U>>
 Split(const arma::Mat<T>& input,
       const arma::Row<U>& inputLabel,
       const double testRatio,
-      const bool shuffleData = true)
+      const bool shuffleData = true,
+      const bool stratifyData = false)
 {
   arma::Mat<T> trainData;
   arma::Mat<T> testData;
   arma::Row<U> trainLabel;
   arma::Row<U> testLabel;
 
-  Split(input, inputLabel, trainData, testData, trainLabel, testLabel,
-      testRatio, shuffleData);
+  if (stratifyData)
+  {
+    StratifiedSplit(input, inputLabel, trainData, testData, trainLabel,
+        testLabel, testRatio, shuffleData);
+  }
+  else
+  {
+    Split(input, inputLabel, trainData, testData, trainLabel, testLabel,
+        testRatio, shuffleData);
+  }
 
   return std::make_tuple(std::move(trainData),
                          std::move(testData),

diff --git a/src/mlpack/methods/preprocess/preprocess_split_main.cpp b/src/mlpack/methods/preprocess/preprocess_split_main.cpp
@@ -69,6 +69,10 @@ BINDING_EXAMPLE(
         "test_ratio", 0.3, "training", "X_train", "training_labels", "y_train",
         "test", "X_test", "test_labels", "y_test"));
 
+BINDING_EXAMPLE(
+    "To maintain the ratio of each class in the train and test sets, the " +
+    PRINT_PARAM_STRING("stratify_data") + " option can be used.");
+
 // See also...
 BINDING_SEE_ALSO("@preprocess_binarize", "#preprocess_binarize");
 BINDING_SEE_ALSO("@preprocess_describe", "#preprocess_describe");
@@ -90,6 +94,7 @@ PARAM_DOUBLE_IN("test_ratio", "Ratio of test set; if not set,"
 
 PARAM_INT_IN("seed", "Random seed (0 for std::time(NULL)).", "s", 0);
 PARAM_FLAG("no_shuffle", "Avoid shuffling and splitting the data.", "S");
+PARAM_FLAG("stratify_data", "Stratify the data according to labels", "z");
 
 using namespace mlpack;
 using namespace mlpack::data;
@@ -102,6 +107,7 @@ static void mlpackMain()
   // Parse command line options.
   const double testRatio = IO::GetParam<double>("test_ratio");
   const bool shuffleData = IO::GetParam<bool>("no_shuffle");
+  const bool stratifyData = IO::GetParam<bool>("stratify_data");
 
   if (IO::GetParam<int>("seed") == 0)
     mlpack::math::RandomSeed(std::time(NULL));
@@ -148,11 +154,15 @@ static void mlpackMain()
         IO::GetParam<arma::Mat<size_t>>("input_labels");
     arma::Row<size_t> labelsRow = labels.row(0);
 
-    const auto value = data::Split(data, labelsRow, testRatio, !shuffleData);
-    Log::Info << "Training data contains " << get<0>(value).n_cols << " points."
-        << endl;
-    Log::Info << "Test data contains " << get<1>(value).n_cols << " points."
-        << endl;
+    Timer::Start("splitting_data");
+    const auto value =
+        data::Split(data, labelsRow, testRatio, !shuffleData, stratifyData);
+    Timer::Stop("splitting_data");
+
+    Log::Info << "Training data contains "
+        << get<0>(value).n_cols << " points." << endl;
+    Log::Info << "Test data contains "
+        << get<1>(value).n_cols << " points." << endl;
 
     if (IO::HasParam("training"))
       IO::GetParam<arma::mat>("training") = std::move(get<0>(value));
@@ -167,7 +177,10 @@ static void mlpackMain()
   }
   else // We have no labels, so just split the dataset.
   {
+    Timer::Start("splitting_data");
     const auto value = data::Split(data, testRatio, !shuffleData);
+    Timer::Stop("splitting_data");
+
     Log::Info << "Training data contains " << get<0>(value).n_cols << " points."
         << endl;
     Log::Info << "Test data contains " << get<1>(value).n_cols << " points."