microsoft · dylanw-oss · Apr 26, 2023 · May 18, 2023 · May 19, 2023
@@ -0,0 +1,74 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.azure.synapse.ml.stages
+
+import com.microsoft.azure.synapse.ml.codegen.Wrappable
+import com.microsoft.azure.synapse.ml.logging.SynapseMLLogging
+import org.apache.spark.ml.{Pipeline, Transformer}
+import org.apache.spark.ml.feature.StringIndexer
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.util._
+import org.apache.spark.sql.functions.{col, when}
+import org.apache.spark.sql.types._
+import org.apache.spark.sql.{DataFrame, Dataset}
+import spray.json.DefaultJsonProtocol.IntJsonFormat
+
+/** Companion of the [[LumpFeatures]] transformer; extending `DefaultParamsReadable` supplies the
+  * `read`/`load` entry points used to restore a saved instance.
+  */
+object LumpFeatures extends DefaultParamsReadable[LumpFeatures]
+
+/** <code>LumpFeatures</code> takes a dataframe and a JSON map of lumping rules and returns a dataframe
+  * made of the original columns, except that every column named in the rules is replaced by its
+  * <code>StringIndexer</code> index rendered as a string, with every index at or beyond the rule's
+  * limit k collapsed into the single value k.
+  *
+  * This transformer can be used to handle high cardinality, skewed categorical columns before encoding.
+  *
+  * Example rules: <code>{"color": 2, "shape": 3}</code> keeps color indices "0" and "1" and lumps all
+  * remaining colors into "2"; shapes keep "0", "1", "2" and everything else becomes "3".
+  */
+class LumpFeatures(val uid: String) extends Transformer with Wrappable with DefaultParamsWritable
+  with SynapseMLLogging {
+  logClass()
+
+  def this() = this(Identifiable.randomUID("LumpCategoricalFeatures"))
+
+  /** JSON object mapping a column name to the number of top levels (k) to keep for that column. */
+  val lumpRules: Param[String] = new Param(this, "lumpRules", "JSON string representing lumping rules")
+
+  def getLumpRules: String = $(lumpRules)
+
+  def setLumpRules(value: String): this.type = set(lumpRules, value)
+
+  /** Parses a rules JSON object into a strict (eagerly evaluated) column name -> k map.
+    * A strict `map` is used instead of `mapValues` so values are converted exactly once.
+    */
+  private def parseLumpRules(json: String): Map[String, Int] =
+    spray.json.JsonParser(json).asJsObject.fields.map { case (name, limit) => name -> limit.convertTo[Int] }
+
+  /** Name of the temporary index column for `colName`; the uid suffix keeps it from clashing with
+    * input columns that happen to be called `<colName>_indexed`.
+    */
+  private def indexedColName(colName: String): String = s"${colName}_indexed_$uid"
+
+  /** @param dataset - The input dataset, to be transformed
+    * @return The DataFrame that results from categorical features lumping: same columns as the input,
+    *         with each column named in `lumpRules` replaced by its lumped index as a string
+    */
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    logTransform[DataFrame]({
+      val rules = parseLumpRules(getLumpRules)
+      // Fail fast with a clear message when a rule names a column the dataset does not have.
+      transformSchema(dataset.schema)
+
+      val indexers = rules.keys.map { colName =>
+        new StringIndexer()
+          .setInputCol(colName)
+          .setOutputCol(indexedColName(colName))
+          .setHandleInvalid("keep")
+      }
+      val indexedDF = new Pipeline().setStages(indexers.toArray).fit(dataset).transform(dataset)
+
+      // Keep top k levels for each categorical column
+      rules.foldLeft(indexedDF) { case (df, (colName, k)) =>
+        val tmpCol = indexedColName(colName)
+        // StringIndexer emits doubles; go through int first so the string reads "2" rather than "2.0".
+        val index = col(tmpCol).cast(IntegerType)
+        df.withColumn(colName, when(index >= k, k).otherwise(index).cast(StringType))
+          .drop(tmpCol)
+      }
+    })
+  }
+
+  /** Reports the output schema: every column named in `lumpRules` becomes a string column, all other
+    * fields are passed through untouched.
+    *
+    * If `lumpRules` has not been set the schema is returned unchanged (`transform` still requires it).
+    *
+    * @param schema - The input schema
+    * @return The schema produced by `transform`
+    */
+  def transformSchema(schema: StructType): StructType = {
+    val lumpedCols = get(lumpRules).map(parseLumpRules(_).keySet).getOrElse(Set.empty[String])
+    val missing = lumpedCols.diff(schema.fieldNames.toSet)
+    require(missing.isEmpty,
+      s"lumpRules refers to columns missing from the input: ${missing.mkString(", ")}")
+
+    StructType(schema.map { field =>
+      // NOTE(review): nullability is copied from the input field as an approximation - confirm
+      // against the runtime schema if downstream stages depend on it.
+      if (lumpedCols.contains(field.name)) StructField(field.name, StringType, field.nullable) else field
+    })
+  }
+
+  def copy(extra: ParamMap): LumpFeatures = defaultCopy(extra)
+}
+
+
@@ -0,0 +1,62 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package com.microsoft.azure.synapse.ml.stages
+
+import com.microsoft.azure.synapse.ml.core.test.base.TestBase
+import com.microsoft.azure.synapse.ml.core.test.fuzzing.{TestObject, TransformerFuzzing}
+import org.apache.spark.ml.util.MLReadable
+
+/** Tests for [[LumpFeatures]]: one explicit lumping scenario plus the shared transformer fuzzing
+  * checks driven by `testObjects` and `reader`.
+  */
+class LumpFeaturesSuite extends TestBase with TransformerFuzzing[LumpFeatures] {
+  import spark.implicits._
+
+  val value = "value"
+  val color = "color"
+  val shape = "shape"
+  val size = "size"
+
+  // Rules shared by the explicit test and the fuzzing objects: limit 2 for color, 3 for shape.
+  lazy val lumpRules = """{"color":2, "shape":3}"""
+
+  lazy val input = Seq(
+    (0, "Blue", "Rectangle", 2),
+    (0, "Blue", "Pentagon", 3),
+    (0, "Blue", "Hexagon", 7),
+    (1, "Blue", "Circle", 6),
+    (1, "Yellow", "Circle", 2),
+    (1, "Yellow", "Square", 5),
+    (2, "Yellow", "Square", 1),
+    (2, "Yellow", "Square", 7),
+    (2, "White", "Triangle", 4),
+    (3, "Gray", "Triangle", 3),
+    (3, "Black", "Triangle", 9),
+    (3, "Cerulean", "Triangle", 8)
+  ).toDF(value, color, shape, size)
+
+  // Expected: Blue -> "0", Yellow -> "1", every other color -> "2";
+  // Triangle -> "0", Square -> "1", Circle -> "2", every other shape -> "3".
+  lazy val expectedLumpedDF = Seq(
+    (0, "0", "3", 2),
+    (0, "0", "3", 3),
+    (0, "0", "3", 7),
+    (1, "0", "2", 6),
+    (1, "1", "2", 2),
+    (1, "1", "1", 5),
+    (2, "1", "1", 1),
+    (2, "1", "1", 7),
+    (2, "2", "0", 4),
+    (3, "2", "0", 3),
+    (3, "2", "0", 9),
+    (3, "2", "0", 8)
+  ).toDF(value, color, shape, size)
+
+  test("basic functionality to lumping categorical columns") {
+    val lumpedDF = new LumpFeatures()
+      .setLumpRules(lumpRules)
+      .transform(input)
+
+    // NOTE(review): confirm verifyResult compares cell values and not only the frame's shape;
+    // if it does not, add an explicit row-by-row comparison here.
+    assert(verifyResult(expectedLumpedDF, lumpedDF))
+  }
+
+  // The fuzzing object must carry lumpRules (the param has no default) and a frame that actually
+  // contains the columns named in those rules.
+  def testObjects(): Seq[TestObject[LumpFeatures]] = List(new TestObject(
+    new LumpFeatures().setLumpRules(lumpRules), input))
+
+  // Serialization round-trips must be read back through LumpFeatures' own companion.
+  override def reader: MLReadable[_] = LumpFeatures
+}