From aa166b55cfb4d38a7964dc7e51d5879f310999fb Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Fri, 20 Aug 2021 17:01:49 -0700 Subject: [PATCH 01/32] Initial PDP version. --- .../ml/spark/explainers/ICEExplainer.scala | 171 ++++++++++++++++++ .../ml/spark/explainers/LocalExplainer.scala | 32 ---- .../ml/spark/explainers/SharedParams.scala | 40 +++- .../explainers/split1/ICEExplainerSuite.scala | 72 ++++++++ 4 files changed, 282 insertions(+), 33 deletions(-) create mode 100644 core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala create mode 100644 core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala new file mode 100644 index 0000000000..4c479d139b --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -0,0 +1,171 @@ +package com.microsoft.ml.spark.explainers +import com.microsoft.ml.spark.core.contracts.HasOutputCol +import com.microsoft.ml.spark.core.schema.DatasetExtensions +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, Params} +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.stat.Summarizer + + +trait ICEFeatureParams extends Params { + val feature = new Param[String] (this, "feature", "Feature to explain") + def getFeature: String = $(feature) + def setFeature(value: String): this.type = set(feature, value) + + val featureType = new Param[String] (this, "featureType", "Type of feature to explain") + def getFeatureType: String = $(featureType) + def setFeatureType(value: String): this.type = set(featureType, value) + + val topNValues = new IntParam (this, "topNValues", "topNValues") + def getTopNValues: Int = $(topNValues) + def setTopNValues(value: Int): this.type = set(topNValues, value) + + val nSplits = new IntParam (this, "nSplits", "nSplits") + def getNSplits: Int = $(nSplits) + def setNSplits(value: Int): this.type = set(nSplits, value) + + val rangeMax = new DoubleParam(this, "rangeMax", "rangeMax") + def getRangeMax: Double = $(rangeMax) + def setRangeMax(value: Double): this.type = set(rangeMax, value) + + val rangeMin = new DoubleParam(this, "rangeMin", "rangeMin") + def getRangeMin: Double = $(rangeMin) + def setRangeMin(value: Double): this.type = set(rangeMin, value) + + setDefault(featureType -> "discrete", topNValues -> 100, nSplits -> 20) + +} + +class ICETransformer(override val uid: String) extends Transformer + with HasNumSamples + with HasExplainTarget + with HasModel + with ICEFeatureParams + with HasOutputCol { + + /* transform: + 1) gives feature values + 2) individual series plots + + */ + def this() = { + this(Identifiable.randomUID("ICETransformer")) + } + + def transform(instances: Dataset[_]): DataFrame = { + + val df = instances.toDF + val idCol = DatasetExtensions.findUnusedColumnName("id", df) + val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) + val dfWithId = df + .withColumn(idCol, monotonically_increasing_id()) + .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) + + + val values = $(featureType).toLowerCase match { 
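+      // Descriptive note (added): discrete features keep the topNValues most frequent
+      // values observed in the data; continuous features split the [rangeMin, rangeMax]
+      // range into nSplits even intervals, with the bounds defaulting to the column's
+      // observed min/max when not supplied.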
+ case "discrete" => + collectDiscreteValues(dfWithId, $(feature), $(topNValues)) + case "continuous" => + collectSplits(dfWithId, $(feature), $(nSplits), get(rangeMin), get(rangeMax)) + case other => + throw new IllegalArgumentException( + s"The feature type can only be 'discrete' or 'continuous'. Instead it is set to '$other'" + ) + } + + val dataType = dfWithId.schema($(feature)).dataType + val explodeFunc = explode(array(values.map(v => lit(v).cast(dataType)): _*)) + + val predicted = getModel.transform(dfWithId.withColumn($(feature), explodeFunc)) + val targetCol = DatasetExtensions.findUnusedColumnName("target", predicted) + + val explainTarget = extractTarget(predicted.schema, targetClasses) + val result = predicted.withColumn(targetCol, explainTarget) + + result + .groupBy($(feature)) + .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) + .withColumnRenamed($(feature), "__feature__value__") + .withColumn("__feature__name__", lit($(feature))) + .select("__feature__name__", "__feature__value__", "__feature__importance__") + } + + private def collectDiscreteValues[_](df: DataFrame, feature: String, topNValues: Int): Array[_] = { + val values = df + .groupBy(col(feature)) + .agg(count("*").as("__feature__count__")) + .orderBy(col("__feature__count__").desc) + .head(topNValues) + .map(row => row.get(0)) + values + } + + private def collectSplits(df: DataFrame, feature: String, nSplits: Int, + rangeMin: Option[Double], rangeMax: Option[Double]): Array[Double] = { + def createNSplits(n: Int)(from: Double, to: Double): Seq[Double] = { + (0 to n) map { + i => (to - from) / n * i + from + } + } + + val featureCol = df.schema(feature) + + val createSplits = createNSplits(nSplits) _ + + val values = if (rangeMin.isDefined && rangeMax.isDefined) { + val (mi, ma) = (rangeMin.get, rangeMax.get) + + // The ranges are defined + featureCol.dataType match { + case _@(ByteType | IntegerType | LongType | ShortType) => + if (ma.toLong - mi.toLong <= nSplits) { + // For integral types, no need to create more splits than needed. + (mi.toLong to ma.toLong) map (_.toDouble) + } else { + createSplits(mi, ma) + } + case _ => + createSplits(mi, ma) + } + } else { + // The ranges need to be calculated from background dataset. + featureCol.dataType match { + case _@(ByteType | IntegerType | LongType | ShortType) => + val Row(minValue: Long, maxValue: Long) = df + .agg(min(col(feature)).cast(LongType), max(col(feature)).cast(LongType)) + .head + + val mi = rangeMin.map(_.toLong).getOrElse(minValue) + val ma = rangeMax.map(_.toLong).getOrElse(maxValue) + + if (ma - mi <= nSplits) { + // For integral types, no need to create more splits than needed. 
+ (mi to ma) map (_.toDouble) + } else { + createSplits(mi, ma) + } + case _ => + val Row(minValue: Double, maxValue: Double) = df + .agg(min(col(feature)).cast(DoubleType), max(col(feature)).cast(DoubleType)) + .head + + val mi = rangeMin.getOrElse(minValue) + val ma = rangeMax.getOrElse(maxValue) + createSplits(mi, ma) + } + } + values.toArray + } + + override def copy(extra: ParamMap): Transformer = this.defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + this.validateSchema(schema) + schema.add(getOutputCol, ArrayType(VectorType)) + } +} diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala index dd1578fb75..4c3bd98c20 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/LocalExplainer.scala @@ -31,38 +31,6 @@ trait LocalExplainer } protected def preprocess(df: DataFrame): DataFrame = df - - /** - * This function supports a variety of target column types: - * - NumericType: in the case of a regression model - * - VectorType: in the case of a typical Spark ML classification model with probability output - * - ArrayType(NumericType): in the case where the output was converted to an array of numeric types. - * - MapType(IntegerType, NumericType): this is to support ZipMap type of output for sklearn models via ONNX runtime. - */ - private[explainers] def extractTarget(schema: StructType, targetClassesCol: String): Column = { - val toVector = UDFUtils.oldUdf( - (values: Seq[Double]) => Vectors.dense(values.toArray), - VectorType - ) - - val target = schema(getTargetCol).dataType match { - case _: NumericType => - toVector(array(col(getTargetCol))) - case VectorType => - SlicerFunctions.vectorSlicer(col(getTargetCol), col(targetClassesCol)) - case ArrayType(elementType: NumericType, _) => - SlicerFunctions.arraySlicer(elementType)(col(getTargetCol), col(targetClassesCol)) - case MapType(_: IntegerType, valueType: NumericType, _) => - SlicerFunctions.mapSlicer(valueType)(col(getTargetCol), col(targetClassesCol)) - case other => - throw new IllegalArgumentException( - s"Only numeric types, vector type, array of numeric types and map types with numeric value type " + - s"are supported as target column. The current type is $other." 
- ) - } - - target - } } object LocalExplainer { diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala index 5502cfa35b..f8205157fd 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/SharedParams.scala @@ -3,9 +3,14 @@ package com.microsoft.ml.spark.explainers +import com.microsoft.ml.spark.core.utils.SlicerFunctions +import org.apache.spark.injections.UDFUtils import org.apache.spark.ml.Transformer +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param._ -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{array, col} +import org.apache.spark.sql.{Column, DataFrame} import org.apache.spark.sql.types._ trait CanValidateSchema { @@ -149,4 +154,37 @@ trait HasExplainTarget extends Params with CanValidateSchema { } setDefault(targetClasses -> Array.empty[Int]) + + /** + * This function supports a variety of target column types: + * - NumericType: in the case of a regression model + * - VectorType: in the case of a typical Spark ML classification model with probability output + * - ArrayType(NumericType): in the case where the output was converted to an array of numeric types. + * - MapType(IntegerType, NumericType): this is to support ZipMap type of output for sklearn models via ONNX runtime. + */ + + private[explainers] def extractTarget(schema: StructType, targetClassesCol: String): Column = { + val toVector = UDFUtils.oldUdf( + (values: Seq[Double]) => Vectors.dense(values.toArray), + VectorType + ) + + val target = schema(getTargetCol).dataType match { + case _: NumericType => + toVector(array(col(getTargetCol))) + case VectorType => + SlicerFunctions.vectorSlicer(col(getTargetCol), col(targetClassesCol)) + case ArrayType(elementType: NumericType, _) => + SlicerFunctions.arraySlicer(elementType)(col(getTargetCol), col(targetClassesCol)) + case MapType(_: IntegerType, valueType: NumericType, _) => + SlicerFunctions.mapSlicer(valueType)(col(getTargetCol), col(targetClassesCol)) + case other => + throw new IllegalArgumentException( + s"Only numeric types, vector type, array of numeric types and map types with numeric value type " + + s"are supported as target column. The current type is $other." 
+ ) + } + + target + } } diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala new file mode 100644 index 0000000000..e1165253bd --- /dev/null +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -0,0 +1,72 @@ +package com.microsoft.ml.spark.explainers.split1 + +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer, VectorAssembler} +import com.microsoft.ml.spark.core.test.base.TestBase +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ +import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} +import com.microsoft.ml.spark.explainers.ICETransformer + + +class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { + + import spark.implicits._ + val data: DataFrame = (1 to 100).flatMap(_ => Seq( + (-5d, "a", -5d, 0), + (-5d, "b", -5d, 0), + (5d, "a", 5d, 1), + (5d, "b", 5d, 1) + )).toDF("col1", "col2", "col3", "label") + + val new_data = data.withColumn("col4", rand()*100) + + new_data.show() + + val pipeline: Pipeline = new Pipeline().setStages(Array( + new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), + new OneHotEncoder().setInputCol("col2_ind").setOutputCol("col2_enc"), + new VectorAssembler().setInputCols(Array("col1", "col2_enc", "col3", "col4")).setOutputCol("features"), + new LogisticRegression().setLabelCol("label").setFeaturesCol("features") + )) + + + val model: PipelineModel = pipeline.fit(new_data) + + val ice = new ICETransformer() + + ice.setModel(model).setOutputCol("iceValues").setTargetCol("probability").setFeature("col1") + .setTargetClasses(Array(1)) + + + val output = ice.transform(new_data) + output.show() + + val iceCon = new ICETransformer() + + iceCon.setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setFeature("col4") + .setFeatureType("continuous") + .setTargetClasses(Array(1)) + + val outputCon = iceCon.transform(new_data) + outputCon.show() + + + val iceCon1 = new ICETransformer() + + iceCon1.setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setFeature("col4") + .setFeatureType("continuous") + .setRangeMin(0.0) + .setRangeMax(100.0) + .setTargetClasses(Array(1)) + + val outputCon1 = iceCon.transform(new_data) + outputCon1.show() + +} From 151ef9926d230dca238d6caf2257f8de732d0c03 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Tue, 21 Sep 2021 11:28:31 -0700 Subject: [PATCH 02/32] Apply suggestions --- .../ml/spark/explainers/ICEExplainer.scala | 58 +++++++++++++------ .../explainers/split1/ICEExplainerSuite.scala | 2 + 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index 4c479d139b..96531b4bcf 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -12,37 +12,65 @@ import org.apache.spark.ml.param._ import org.apache.spark.ml.stat.Summarizer -trait ICEFeatureParams extends Params { - val feature = new Param[String] (this, "feature", "Feature to explain") +trait ICEFeatureParams extends Params with HasNumSamples { + val feature = new Param[String] ( + this, + "feature", + "Feature to explain" + ) 
def getFeature: String = $(feature) def setFeature(value: String): this.type = set(feature, value) - val featureType = new Param[String] (this, "featureType", "Type of feature to explain") + val featureType = new Param[String] ( + this, + "featureType", + "Type of feature to explain", + ParamValidators.inArray(Array("discrete", "continuous")) + ) def getFeatureType: String = $(featureType) def setFeatureType(value: String): this.type = set(featureType, value) - val topNValues = new IntParam (this, "topNValues", "topNValues") + val topNValues = new IntParam ( + this, + "topNValues", + "topNValues", + ParamValidators.gt(0) + ) def getTopNValues: Int = $(topNValues) def setTopNValues(value: Int): this.type = set(topNValues, value) - val nSplits = new IntParam (this, "nSplits", "nSplits") + val nSplits = new IntParam ( + this, + "nSplits", + "nSplits", + ParamValidators.gt(0) + ) def getNSplits: Int = $(nSplits) def setNSplits(value: Int): this.type = set(nSplits, value) - val rangeMax = new DoubleParam(this, "rangeMax", "rangeMax") + val rangeMax = new DoubleParam( + this, + "rangeMax", + "rangeMax", + ParamValidators.gtEq(0.0) + ) def getRangeMax: Double = $(rangeMax) def setRangeMax(value: Double): this.type = set(rangeMax, value) - val rangeMin = new DoubleParam(this, "rangeMin", "rangeMin") + val rangeMin = new DoubleParam( + this, + "rangeMin", + "rangeMin", + ParamValidators.gtEq(0.0) + ) def getRangeMin: Double = $(rangeMin) def setRangeMin(value: Double): this.type = set(rangeMin, value) - setDefault(featureType -> "discrete", topNValues -> 100, nSplits -> 20) + setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, nSplits -> 20) } class ICETransformer(override val uid: String) extends Transformer - with HasNumSamples with HasExplainTarget with HasModel with ICEFeatureParams @@ -57,25 +85,20 @@ class ICETransformer(override val uid: String) extends Transformer this(Identifiable.randomUID("ICETransformer")) } - def transform(instances: Dataset[_]): DataFrame = { + def transform(ds: Dataset[_]): DataFrame = { - val df = instances.toDF - val idCol = DatasetExtensions.findUnusedColumnName("id", df) + val df = ds.toDF val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) val dfWithId = df - .withColumn(idCol, monotonically_increasing_id()) .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) + transformSchema(df.schema) val values = $(featureType).toLowerCase match { case "discrete" => collectDiscreteValues(dfWithId, $(feature), $(topNValues)) case "continuous" => collectSplits(dfWithId, $(feature), $(nSplits), get(rangeMin), get(rangeMax)) - case other => - throw new IllegalArgumentException( - s"The feature type can only be 'discrete' or 'continuous'. 
Instead it is set to '$other'" - ) } val dataType = dfWithId.schema($(feature)).dataType @@ -165,6 +188,7 @@ class ICETransformer(override val uid: String) extends Transformer override def copy(extra: ParamMap): Transformer = this.defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { + assert(!schema.fieldNames.contains(feature.name), s"The schema does not contain column ${feature.name}") this.validateSchema(schema) schema.add(getOutputCol, ArrayType(VectorType)) } diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala index e1165253bd..c1b916cc4d 100644 --- a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -69,4 +69,6 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val outputCon1 = iceCon.transform(new_data) outputCon1.show() + println("Finished") + } From b47d4103e023339782046eedb0463d420d0be016 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Thu, 23 Sep 2021 15:15:45 -0700 Subject: [PATCH 03/32] Added ICE --- .../ml/spark/explainers/ICEExplainer.scala | 64 +++++++++++++------ .../explainers/split1/ICEExplainerSuite.scala | 25 ++++++-- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index 96531b4bcf..2ef92f464d 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -3,12 +3,11 @@ import com.microsoft.ml.spark.core.contracts.HasOutputCol import com.microsoft.ml.spark.core.schema.DatasetExtensions import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.SQLDataTypes.VectorType -import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, Params} +import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, ParamValidators, Params, _} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} -import org.apache.spark.ml.param._ import org.apache.spark.ml.stat.Summarizer @@ -66,7 +65,16 @@ trait ICEFeatureParams extends Params with HasNumSamples { def getRangeMin: Double = $(rangeMin) def setRangeMin(value: Double): this.type = set(rangeMin, value) - setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, nSplits -> 20) + val kind = new Param[String] ( + this, + "kind", + "pdp or ice", + ParamValidators.inArray(Array("average", "individual")) + ) + def getKind: String = $(kind) + def setKind(value: String): this.type = set(kind, value) + + setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, nSplits -> 20, kind -> "individual") } @@ -88,55 +96,72 @@ class ICETransformer(override val uid: String) extends Transformer def transform(ds: Dataset[_]): DataFrame = { val df = ds.toDF + val idCol = DatasetExtensions.findUnusedColumnName("id", df) val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) val dfWithId = df + .withColumn(idCol, monotonically_increasing_id()) .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) transformSchema(df.schema) + 
val feature = this.getFeature - val values = $(featureType).toLowerCase match { + val values = getFeatureType.toLowerCase match { case "discrete" => - collectDiscreteValues(dfWithId, $(feature), $(topNValues)) + collectDiscreteValues(dfWithId) case "continuous" => - collectSplits(dfWithId, $(feature), $(nSplits), get(rangeMin), get(rangeMax)) + collectSplits(dfWithId, get(rangeMin), get(rangeMax)) } - val dataType = dfWithId.schema($(feature)).dataType + val dataType = dfWithId.schema(feature).dataType val explodeFunc = explode(array(values.map(v => lit(v).cast(dataType)): _*)) - val predicted = getModel.transform(dfWithId.withColumn($(feature), explodeFunc)) + val predicted = getModel.transform(dfWithId.withColumn(feature, explodeFunc)) val targetCol = DatasetExtensions.findUnusedColumnName("target", predicted) val explainTarget = extractTarget(predicted.schema, targetClasses) val result = predicted.withColumn(targetCol, explainTarget) - result - .groupBy($(feature)) - .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) - .withColumnRenamed($(feature), "__feature__value__") - .withColumn("__feature__name__", lit($(feature))) - .select("__feature__name__", "__feature__value__", "__feature__importance__") + //result.show() + + getKind.toLowerCase match { + case "average" => + result + .groupBy(feature) + .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) + .withColumnRenamed(feature, "__feature__value__") + .withColumn("__feature__name__", lit(feature)) + .select("__feature__name__", "__feature__value__", "__feature__importance__") + case "individual" => + // storing as a map feature -> target value + result.groupBy("id") + .agg(collect_list(feature).alias("feature_list"), collect_list(targetCol).alias("target_list")) + .withColumn("__feature__importance__", map_from_arrays(col("feature_list"), col("target_list"))) + .select(idCol, "__feature__importance__") + .orderBy(idCol) + + } } - private def collectDiscreteValues[_](df: DataFrame, feature: String, topNValues: Int): Array[_] = { + private def collectDiscreteValues[_](df: DataFrame): Array[_] = { val values = df - .groupBy(col(feature)) + .groupBy(col(getFeature)) .agg(count("*").as("__feature__count__")) .orderBy(col("__feature__count__").desc) - .head(topNValues) + .head(getTopNValues) .map(row => row.get(0)) values } - private def collectSplits(df: DataFrame, feature: String, nSplits: Int, - rangeMin: Option[Double], rangeMax: Option[Double]): Array[Double] = { + private def collectSplits(df: DataFrame, rangeMin: Option[Double], rangeMax: Option[Double]): Array[Double] = { def createNSplits(n: Int)(from: Double, to: Double): Seq[Double] = { (0 to n) map { i => (to - from) / n * i + from } } + val feature = getFeature val featureCol = df.schema(feature) + val nSplits = getNSplits val createSplits = createNSplits(nSplits) _ @@ -182,6 +207,7 @@ class ICETransformer(override val uid: String) extends Transformer createSplits(mi, ma) } } + values.toArray } diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala index c1b916cc4d..bc4b7dfec0 100644 --- a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -40,7 +40,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val output = ice.transform(new_data) - output.show() 
+ output.show(false) val iceCon = new ICETransformer() @@ -52,7 +52,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor .setTargetClasses(Array(1)) val outputCon = iceCon.transform(new_data) - outputCon.show() + outputCon.show(false) val iceCon1 = new ICETransformer() @@ -66,9 +66,24 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor .setRangeMax(100.0) .setTargetClasses(Array(1)) - val outputCon1 = iceCon.transform(new_data) - outputCon1.show() + val outputCon1 = iceCon1.transform(new_data) + outputCon1.show(false) - println("Finished") + + val pdp = new ICETransformer() + + pdp.setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setFeature("col4") + .setFeatureType("continuous") + .setRangeMin(0.0) + .setRangeMax(100.0) + .setNSplits(3) + .setTargetClasses(Array(1)) + .setKind("average") + + val pdpOutput = pdp.transform(new_data) + pdpOutput.show(false) } From 7d701100888deb2acf3ef43b01941fba9a1bcba9 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Mon, 4 Oct 2021 15:51:38 -0700 Subject: [PATCH 04/32] Apply suggestions and fix --- .../ml/spark/explainers/ICEExplainer.scala | 70 ++++++++++--------- .../explainers/split1/ICEExplainerSuite.scala | 69 ++++++++++-------- 2 files changed, 76 insertions(+), 63 deletions(-) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index 2ef92f464d..7e708af037 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -15,7 +15,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { val feature = new Param[String] ( this, "feature", - "Feature to explain" + "The feature to explain." ) def getFeature: String = $(feature) def setFeature(value: String): this.type = set(feature, value) @@ -23,7 +23,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { val featureType = new Param[String] ( this, "featureType", - "Type of feature to explain", + "Whether the feature is discrete or continuous.", ParamValidators.inArray(Array("discrete", "continuous")) ) def getFeatureType: String = $(featureType) @@ -32,7 +32,8 @@ trait ICEFeatureParams extends Params with HasNumSamples { val topNValues = new IntParam ( this, "topNValues", - "topNValues", + "At most how many discrete values do we collect for discrete features. " + + "The features are ranked by occurrence in descending order.", ParamValidators.gt(0) ) def getTopNValues: Int = $(topNValues) @@ -41,7 +42,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { val nSplits = new IntParam ( this, "nSplits", - "nSplits", + "How many ways to split the value range for continuous feature.", ParamValidators.gt(0) ) def getNSplits: Int = $(nSplits) @@ -50,31 +51,45 @@ trait ICEFeatureParams extends Params with HasNumSamples { val rangeMax = new DoubleParam( this, "rangeMax", - "rangeMax", + "Specifies the max value of the range for continuous features. " + + "If not specified, it will be computed from the background dataset.", ParamValidators.gtEq(0.0) ) - def getRangeMax: Double = $(rangeMax) + def getRangeMax: Option[Double] = get(rangeMax) def setRangeMax(value: Double): this.type = set(rangeMax, value) val rangeMin = new DoubleParam( this, "rangeMin", - "rangeMin", + "Specifies the min value of the range for continuous features. 
" + + "If not specified, it will be computed from the background dataset.", ParamValidators.gtEq(0.0) ) - def getRangeMin: Double = $(rangeMin) + def getRangeMin: Option[Double] = get(rangeMin) def setRangeMin(value: Double): this.type = set(rangeMin, value) val kind = new Param[String] ( this, "kind", - "pdp or ice", + "Whether to return the partial dependence averaged across all the samples in the dataset or one line per sample.", ParamValidators.inArray(Array("average", "individual")) ) def getKind: String = $(kind) def setKind(value: String): this.type = set(kind, value) - setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, nSplits -> 20, kind -> "individual") + def setDiscreteFeature(feature: String, topN: Int): this.type = { + this.setFeature(feature).setFeatureType("discrete").setTopNValues(topN) + } + + def setContinuousFeature(feature: String, nSplits: Int, + rangeMin: Option[Double] = None, + rangeMax: Option[Double] = None): this.type = { + rangeMin.foreach(this.setRangeMin) + rangeMax.foreach(this.setRangeMax) + this.setFeature(feature).setFeatureType("continuous").setNSplits(nSplits) + } + + setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, kind -> "individual") } @@ -84,11 +99,6 @@ class ICETransformer(override val uid: String) extends Transformer with ICEFeatureParams with HasOutputCol { - /* transform: - 1) gives feature values - 2) individual series plots - - */ def this() = { this(Identifiable.randomUID("ICETransformer")) } @@ -96,7 +106,7 @@ class ICETransformer(override val uid: String) extends Transformer def transform(ds: Dataset[_]): DataFrame = { val df = ds.toDF - val idCol = DatasetExtensions.findUnusedColumnName("id", df) + val idCol = DatasetExtensions.findUnusedColumnName("idCol", df) val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) val dfWithId = df .withColumn(idCol, monotonically_increasing_id()) @@ -109,20 +119,19 @@ class ICETransformer(override val uid: String) extends Transformer case "discrete" => collectDiscreteValues(dfWithId) case "continuous" => - collectSplits(dfWithId, get(rangeMin), get(rangeMax)) + collectSplits(dfWithId) } val dataType = dfWithId.schema(feature).dataType - val explodeFunc = explode(array(values.map(v => lit(v).cast(dataType)): _*)) + val explodeFunc = explode(array(values.map(v => lit(v)): _*).cast(ArrayType(dataType))) - val predicted = getModel.transform(dfWithId.withColumn(feature, explodeFunc)) + val sampled = dfWithId.orderBy(rand()).limit(getNumSamples).cache() + val predicted = getModel.transform(sampled.withColumn(feature, explodeFunc)) val targetCol = DatasetExtensions.findUnusedColumnName("target", predicted) val explainTarget = extractTarget(predicted.schema, targetClasses) val result = predicted.withColumn(targetCol, explainTarget) - //result.show() - getKind.toLowerCase match { case "average" => result @@ -133,12 +142,11 @@ class ICETransformer(override val uid: String) extends Transformer .select("__feature__name__", "__feature__value__", "__feature__importance__") case "individual" => // storing as a map feature -> target value - result.groupBy("id") + val iceFeatures = result.groupBy("idCol") .agg(collect_list(feature).alias("feature_list"), collect_list(targetCol).alias("target_list")) .withColumn("__feature__importance__", map_from_arrays(col("feature_list"), col("target_list"))) .select(idCol, "__feature__importance__") - .orderBy(idCol) - + sampled.join(iceFeatures, idCol) } } @@ -152,22 +160,20 @@ class 
ICETransformer(override val uid: String) extends Transformer values } - private def collectSplits(df: DataFrame, rangeMin: Option[Double], rangeMax: Option[Double]): Array[Double] = { - def createNSplits(n: Int)(from: Double, to: Double): Seq[Double] = { - (0 to n) map { - i => (to - from) / n * i + from - } + private def createNSplits(n: Int)(from: Double, to: Double): Seq[Double] = { + (0 to n) map { + i => (to - from) / n * i + from } + } - val feature = getFeature + private def collectSplits(df: DataFrame): Array[Double] = { + val (feature, nSplits, rangeMin, rangeMax) = (getFeature, getNSplits, getRangeMin, getRangeMax) val featureCol = df.schema(feature) - val nSplits = getNSplits val createSplits = createNSplits(nSplits) _ val values = if (rangeMin.isDefined && rangeMax.isDefined) { val (mi, ma) = (rangeMin.get, rangeMax.get) - // The ranges are defined featureCol.dataType match { case _@(ByteType | IntegerType | LongType | ShortType) => diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala index bc4b7dfec0..8b09fc1ebd 100644 --- a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -12,16 +12,15 @@ import com.microsoft.ml.spark.explainers.ICETransformer class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { import spark.implicits._ - val data: DataFrame = (1 to 100).flatMap(_ => Seq( + val dataDF: DataFrame = (1 to 100).flatMap(_ => Seq( (-5d, "a", -5d, 0), (-5d, "b", -5d, 0), (5d, "a", 5d, 1), (5d, "b", 5d, 1) )).toDF("col1", "col2", "col3", "label") - val new_data = data.withColumn("col4", rand()*100) - - new_data.show() + val data: DataFrame = dataDF.withColumn("col4", rand()*100) + data.show() val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), @@ -29,61 +28,69 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor new VectorAssembler().setInputCols(Array("col1", "col2_enc", "col3", "col4")).setOutputCol("features"), new LogisticRegression().setLabelCol("label").setFeaturesCol("features") )) + val model: PipelineModel = pipeline.fit(data) - val model: PipelineModel = pipeline.fit(new_data) - val ice = new ICETransformer() - - ice.setModel(model).setOutputCol("iceValues").setTargetCol("probability").setFeature("col1") + ice.setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setFeature("col1") .setTargetClasses(Array(1)) - - - val output = ice.transform(new_data) + val output: DataFrame = ice.transform(data) output.show(false) val iceCon = new ICETransformer() - iceCon.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setFeature("col4") - .setFeatureType("continuous") + .setContinuousFeature(feature = "col4", nSplits = 20) .setTargetClasses(Array(1)) - - val outputCon = iceCon.transform(new_data) + val outputCon: DataFrame = iceCon.transform(data) outputCon.show(false) val iceCon1 = new ICETransformer() - iceCon1.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setFeature("col4") - .setFeatureType("continuous") - .setRangeMin(0.0) - .setRangeMax(100.0) + .setContinuousFeature( + feature = "col4", + nSplits = 20, + rangeMin = Some(0.0), + rangeMax = Some(100.0) + ) .setTargetClasses(Array(1)) - - val outputCon1 = 
iceCon1.transform(new_data) + val outputCon1: DataFrame = iceCon1.transform(data) outputCon1.show(false) val pdp = new ICETransformer() - pdp.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setFeature("col4") - .setFeatureType("continuous") - .setRangeMin(0.0) - .setRangeMax(100.0) - .setNSplits(3) + .setContinuousFeature( + feature = "col4", + nSplits = 3, + rangeMin = Some(0.0), + rangeMax = Some(100.0) + ) .setTargetClasses(Array(1)) .setKind("average") - - val pdpOutput = pdp.transform(new_data) + val pdpOutput: DataFrame = pdp.transform(data) pdpOutput.show(false) + val pdpDisc = new ICETransformer() + pdpDisc.setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setDiscreteFeature( + feature = "col4", + topN = 2 + ) + .setTargetClasses(Array(1)) + .setKind("average") + val pdpOutputDisc: DataFrame = pdpDisc.transform(data) + pdpOutputDisc.show(false) + } From f5049e303a1ad56dfa6edb1c520e2b6221dd9d63 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Fri, 15 Oct 2021 17:37:01 -0700 Subject: [PATCH 05/32] Added discrete --- .../ml/spark/explainers/ICEExplainer.scala | 259 +++++++++++------- .../explainers/split1/ICEExplainerSuite.scala | 122 +++++---- 2 files changed, 229 insertions(+), 152 deletions(-) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index 7e708af037..101852b3e8 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -9,64 +9,103 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer +import spray.json.{JsValue, JsonFormat, JsNumber, JsString, JsObject} +case class DiscreteFeature(name: String, numTopValue: Int) { + def validate: Boolean = { + numTopValue > 0 + } +} -trait ICEFeatureParams extends Params with HasNumSamples { - val feature = new Param[String] ( - this, - "feature", - "The feature to explain." - ) - def getFeature: String = $(feature) - def setFeature(value: String): this.type = set(feature, value) +object DiscreteFeature { + implicit val JsonFormat: JsonFormat[DiscreteFeature] = new JsonFormat[DiscreteFeature] { + override def read(json: JsValue): DiscreteFeature = { + val fields = json.asJsObject.fields + val name = fields("name") match { + case JsString(value) => value + case _ => throw new Exception("The name field must be a JsString.") + } + val numTopValues = fields("numTopValues") match { + case JsNumber(value) => value.toInt + case _ => throw new Exception("The numTopValues field must be a JsNumber.") + } - val featureType = new Param[String] ( - this, - "featureType", - "Whether the feature is discrete or continuous.", - ParamValidators.inArray(Array("discrete", "continuous")) - ) - def getFeatureType: String = $(featureType) - def setFeatureType(value: String): this.type = set(featureType, value) + DiscreteFeature(name, numTopValues) - val topNValues = new IntParam ( - this, - "topNValues", - "At most how many discrete values do we collect for discrete features. 
" + - "The features are ranked by occurrence in descending order.", - ParamValidators.gt(0) - ) - def getTopNValues: Int = $(topNValues) - def setTopNValues(value: Int): this.type = set(topNValues, value) + } + override def write(obj: DiscreteFeature): JsValue = { + JsObject(Map("name" -> JsString(obj.name), "numTopValues" -> JsNumber(obj.numTopValue))) + } + } +} - val nSplits = new IntParam ( - this, - "nSplits", - "How many ways to split the value range for continuous feature.", - ParamValidators.gt(0) - ) - def getNSplits: Int = $(nSplits) - def setNSplits(value: Int): this.type = set(nSplits, value) +case class ContinuousFeature(name: String, numSplits: Option[Int], rangeMin: Option[Double], rangeMax: Option[Double]) { + def validate: Boolean = { + (numSplits.isEmpty || numSplits.get > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) + } +} - val rangeMax = new DoubleParam( +object ContinuousFeature { + implicit val JsonFormat: JsonFormat[ContinuousFeature] = new JsonFormat[ContinuousFeature] { + override def read(json: JsValue): ContinuousFeature = { + val fields = json.asJsObject.fields + val name = fields("name") match { + case JsString(value) => value + case _ => throw new Exception("The name field must be a JsString.") + } + val numSplits = fields.get("numSplits").map { + case JsNumber(value) => value.toInt + case _ => 10 + } +// val numSplits = fields("numSplits") match { +// case JsNumber(value) => value.toInt +// case _ => throw new Exception("The numSplits field must be a JsNumber.") +// } + val rangeMin = fields.get("rangeMin").map { + case JsNumber(value) => value.toDouble + } + + val rangeMax = fields.get("rangeMax").map { + case JsNumber(value) => value.toDouble + } + + ContinuousFeature(name, numSplits, rangeMin, rangeMax) + + } + + override def write(obj: ContinuousFeature): JsValue = { + val map = Map("name" -> JsString(obj.name))++ + obj.numSplits.map("numSplits" -> JsNumber(_))++ + // "numSplits" -> JsNumber(obj.numSplits))++ + obj.rangeMin.map("rangeMin" -> JsNumber(_))++ + obj.rangeMax.map("rangeMax" -> JsNumber(_)) + JsObject(map) + } + } +} + + +trait ICEFeatureParams extends Params with HasNumSamples { + + val discreteFeatures = new TypedArrayParam[DiscreteFeature] ( this, - "rangeMax", - "Specifies the max value of the range for continuous features. " + - "If not specified, it will be computed from the background dataset.", - ParamValidators.gtEq(0.0) + "discreteFeatures", + "The list of discrete features to explain.", + {_.forall(_.validate)} ) - def getRangeMax: Option[Double] = get(rangeMax) - def setRangeMax(value: Double): this.type = set(rangeMax, value) - val rangeMin = new DoubleParam( + def setDiscreteFeatures(values: Seq[DiscreteFeature]): this.type = this.set(discreteFeatures, values) + def getDiscreteFeatures: Seq[DiscreteFeature] = $(discreteFeatures) + + val continuousFeatures = new TypedArrayParam[ContinuousFeature] ( this, - "rangeMin", - "Specifies the min value of the range for continuous features. 
" + - "If not specified, it will be computed from the background dataset.", - ParamValidators.gtEq(0.0) + "continuousFeatures", + "The list of continuous features to explain.", + {_.forall(_.validate)} ) - def getRangeMin: Option[Double] = get(rangeMin) - def setRangeMin(value: Double): this.type = set(rangeMin, value) + + def setContinuousFeatures(values: Seq[ContinuousFeature]): this.type = this.set(continuousFeatures, values) + def getContinuousFeatures: Seq[ContinuousFeature] = $(continuousFeatures) val kind = new Param[String] ( this, @@ -77,19 +116,21 @@ trait ICEFeatureParams extends Params with HasNumSamples { def getKind: String = $(kind) def setKind(value: String): this.type = set(kind, value) - def setDiscreteFeature(feature: String, topN: Int): this.type = { - this.setFeature(feature).setFeatureType("discrete").setTopNValues(topN) - } +// def setDiscreteFeature(feature: String, topN: Int): this.type = { +// this.setFeature(feature).setFeatureType("discrete").setTopNValues(topN) +// } +// +// def setContinuousFeature(feature: String, nSplits: Int, +// rangeMin: Option[Double] = None, +// rangeMax: Option[Double] = None): this.type = { +// rangeMin.foreach(this.setRangeMin) +// rangeMax.foreach(this.setRangeMax) +// this.setFeature(feature).setFeatureType("continuous").setNSplits(nSplits) +// } - def setContinuousFeature(feature: String, nSplits: Int, - rangeMin: Option[Double] = None, - rangeMax: Option[Double] = None): this.type = { - rangeMin.foreach(this.setRangeMin) - rangeMax.foreach(this.setRangeMax) - this.setFeature(feature).setFeatureType("continuous").setNSplits(nSplits) - } + setDefault(numSamples -> 1000, kind -> "individual") - setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, kind -> "individual") + //setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, kind -> "individual") } @@ -103,59 +144,83 @@ class ICETransformer(override val uid: String) extends Transformer this(Identifiable.randomUID("ICETransformer")) } - def transform(ds: Dataset[_]): DataFrame = { - - val df = ds.toDF - val idCol = DatasetExtensions.findUnusedColumnName("idCol", df) - val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) - val dfWithId = df - .withColumn(idCol, monotonically_increasing_id()) - .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) - - transformSchema(df.schema) - val feature = this.getFeature + def processDiscreteFeature(sampledTotal: DataFrame, idCol: String, targetClassesColumn: String, + feature: DiscreteFeature, values: Array[_]): DataFrame = { - val values = getFeatureType.toLowerCase match { - case "discrete" => - collectDiscreteValues(dfWithId) - case "continuous" => - collectSplits(dfWithId) - } + val sampled = sampledTotal.limit(feature.numTopValue).cache() - val dataType = dfWithId.schema(feature).dataType + val dataType = sampled.schema(feature.name).dataType val explodeFunc = explode(array(values.map(v => lit(v)): _*).cast(ArrayType(dataType))) - val sampled = dfWithId.orderBy(rand()).limit(getNumSamples).cache() - val predicted = getModel.transform(sampled.withColumn(feature, explodeFunc)) + val predicted = getModel.transform(sampled.withColumn(feature.name, explodeFunc)) val targetCol = DatasetExtensions.findUnusedColumnName("target", predicted) - val explainTarget = extractTarget(predicted.schema, targetClasses) + val explainTarget = extractTarget(predicted.schema, targetClassesColumn) val result = predicted.withColumn(targetCol, 
explainTarget) + val featImpName = feature.name + "__imp" + + result.show() + getKind.toLowerCase match { case "average" => result - .groupBy(feature) - .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) - .withColumnRenamed(feature, "__feature__value__") - .withColumn("__feature__name__", lit(feature)) - .select("__feature__name__", "__feature__value__", "__feature__importance__") + .groupBy(feature.name) + .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) + //.withColumnRenamed(feature.name, "__feature__value__") + .withColumn(featImpName, lit(feature.name)) + .select(featImpName, "__feature__importance__") case "individual" => // storing as a map feature -> target value val iceFeatures = result.groupBy("idCol") - .agg(collect_list(feature).alias("feature_list"), collect_list(targetCol).alias("target_list")) - .withColumn("__feature__importance__", map_from_arrays(col("feature_list"), col("target_list"))) - .select(idCol, "__feature__importance__") - sampled.join(iceFeatures, idCol) + .agg(collect_list(feature.name).alias("feature_list"), collect_list(targetCol).alias("target_list")) + .withColumn(featImpName, map_from_arrays(col("feature_list"), col("target_list"))) + .select(idCol, featImpName) + //sampled.join(iceFeatures, idCol) + iceFeatures.select(idCol, featImpName) + } + } + + + + def transform(ds: Dataset[_]): DataFrame = { + + val df = ds.toDF + val idCol = DatasetExtensions.findUnusedColumnName("idCol", df) + val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) + val dfWithId = df + .withColumn(idCol, monotonically_increasing_id()) + .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) + transformSchema(df.schema) + + // collect feature values for all features from original fataset - dfWithId + val discreteFeatures = this.getDiscreteFeatures + + val collectedFeatureValues: Map[DiscreteFeature, Array[_]] = discreteFeatures.map{ + feature => (feature, collectDiscreteValues(dfWithId, feature)) + }.toMap + + val sampled = dfWithId.orderBy(rand()) + + val processFunc: DiscreteFeature => DataFrame = { + f: DiscreteFeature => + processDiscreteFeature(sampled, idCol, targetClasses, f, collectedFeatureValues(f)) } + val stage1 = discreteFeatures map (processFunc) + + val stage2: DataFrame = + stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.join(currDF, Seq(idCol), "inner")) + + sampled.join(stage2, idCol).drop(idCol) + } - private def collectDiscreteValues[_](df: DataFrame): Array[_] = { + private def collectDiscreteValues[_](df: DataFrame, feature: DiscreteFeature): Array[_] = { val values = df - .groupBy(col(getFeature)) + .groupBy(col(feature.name)) .agg(count("*").as("__feature__count__")) .orderBy(col("__feature__count__").desc) - .head(getTopNValues) + .head(feature.numTopValue) .map(row => row.get(0)) values } @@ -166,18 +231,19 @@ class ICETransformer(override val uid: String) extends Transformer } } - private def collectSplits(df: DataFrame): Array[Double] = { - val (feature, nSplits, rangeMin, rangeMax) = (getFeature, getNSplits, getRangeMin, getRangeMax) + private def collectSplits(df: DataFrame, continuousFeature: ContinuousFeature): Array[Double] = { + val (feature, nSplits, rangeMin, rangeMax) = (continuousFeature.name, continuousFeature.numSplits, + continuousFeature.rangeMin, continuousFeature.rangeMax) val featureCol = df.schema(feature) - val createSplits = createNSplits(nSplits) _ + val createSplits = createNSplits(nSplits.get) _ val values = if 
(rangeMin.isDefined && rangeMax.isDefined) { val (mi, ma) = (rangeMin.get, rangeMax.get) // The ranges are defined featureCol.dataType match { case _@(ByteType | IntegerType | LongType | ShortType) => - if (ma.toLong - mi.toLong <= nSplits) { + if (ma.toLong - mi.toLong <= nSplits.get) { // For integral types, no need to create more splits than needed. (mi.toLong to ma.toLong) map (_.toDouble) } else { @@ -197,7 +263,7 @@ class ICETransformer(override val uid: String) extends Transformer val mi = rangeMin.map(_.toLong).getOrElse(minValue) val ma = rangeMax.map(_.toLong).getOrElse(maxValue) - if (ma - mi <= nSplits) { + if (ma - mi <= nSplits.get) { // For integral types, no need to create more splits than needed. (mi to ma) map (_.toDouble) } else { @@ -220,7 +286,6 @@ class ICETransformer(override val uid: String) extends Transformer override def copy(extra: ParamMap): Transformer = this.defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { - assert(!schema.fieldNames.contains(feature.name), s"The schema does not contain column ${feature.name}") this.validateSchema(schema) schema.add(getOutputCol, ArrayType(VectorType)) } diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala index 8b09fc1ebd..8effeea49a 100644 --- a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -6,7 +6,7 @@ import com.microsoft.ml.spark.core.test.base.TestBase import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} -import com.microsoft.ml.spark.explainers.ICETransformer +import com.microsoft.ml.spark.explainers.{DiscreteFeature, ICETransformer} class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { @@ -31,66 +31,78 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val model: PipelineModel = pipeline.fit(data) - val ice = new ICETransformer() - ice.setModel(model) - .setOutputCol("iceValues") - .setTargetCol("probability") - .setFeature("col1") - .setTargetClasses(Array(1)) - val output: DataFrame = ice.transform(data) - output.show(false) - - val iceCon = new ICETransformer() - iceCon.setModel(model) - .setOutputCol("iceValues") - .setTargetCol("probability") - .setContinuousFeature(feature = "col4", nSplits = 20) - .setTargetClasses(Array(1)) - val outputCon: DataFrame = iceCon.transform(data) - outputCon.show(false) +// val ice = new ICETransformer() +// ice.setModel(model) +// .setOutputCol("iceValues") +// .setTargetCol("probability") +// .setDiscreteFeatures(Array(DiscreteFeature("col1", 100), DiscreteFeature("col4", 4))) +// .setTargetClasses(Array(1)) +// val output: DataFrame = ice.transform(data) +// output.show(false) - val iceCon1 = new ICETransformer() - iceCon1.setModel(model) + val iceAvg = new ICETransformer() + iceAvg.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setContinuousFeature( - feature = "col4", - nSplits = 20, - rangeMin = Some(0.0), - rangeMax = Some(100.0) - ) - .setTargetClasses(Array(1)) - val outputCon1: DataFrame = iceCon1.transform(data) - outputCon1.show(false) - - - val pdp = new ICETransformer() - pdp.setModel(model) - .setOutputCol("iceValues") - .setTargetCol("probability") - 
.setContinuousFeature( - feature = "col4", - nSplits = 3, - rangeMin = Some(0.0), - rangeMax = Some(100.0) - ) + .setDiscreteFeatures(Array(DiscreteFeature("col1", 100), DiscreteFeature("col4", 4))) .setTargetClasses(Array(1)) .setKind("average") - val pdpOutput: DataFrame = pdp.transform(data) - pdpOutput.show(false) + val outputAvg: DataFrame = iceAvg.transform(data) + outputAvg.show(false) - val pdpDisc = new ICETransformer() - pdpDisc.setModel(model) - .setOutputCol("iceValues") - .setTargetCol("probability") - .setDiscreteFeature( - feature = "col4", - topN = 2 - ) - .setTargetClasses(Array(1)) - .setKind("average") - val pdpOutputDisc: DataFrame = pdpDisc.transform(data) - pdpOutputDisc.show(false) + +// val iceCon = new ICETransformer() +// iceCon.setModel(model) +// .setOutputCol("iceValues") +// .setTargetCol("probability") +// .setContinuousFeature(feature = "col4", nSplits = 20) +// .setTargetClasses(Array(1)) +// val outputCon: DataFrame = iceCon.transform(data) +// outputCon.show(false) +// +// +// val iceCon1 = new ICETransformer() +// iceCon1.setModel(model) +// .setOutputCol("iceValues") +// .setTargetCol("probability") +// .setContinuousFeature( +// feature = "col4", +// nSplits = 20, +// rangeMin = Some(0.0), +// rangeMax = Some(100.0) +// ) +// .setTargetClasses(Array(1)) +// val outputCon1: DataFrame = iceCon1.transform(data) +// outputCon1.show(false) +// +// +// val pdp = new ICETransformer() +// pdp.setModel(model) +// .setOutputCol("iceValues") +// .setTargetCol("probability") +// .setContinuousFeature( +// feature = "col4", +// nSplits = 3, +// rangeMin = Some(0.0), +// rangeMax = Some(100.0) +// ) +// .setTargetClasses(Array(1)) +// .setKind("average") +// val pdpOutput: DataFrame = pdp.transform(data) +// pdpOutput.show(false) +// +// val pdpDisc = new ICETransformer() +// pdpDisc.setModel(model) +// .setOutputCol("iceValues") +// .setTargetCol("probability") +// .setDiscreteFeature( +// feature = "col4", +// topN = 2 +// ) +// .setTargetClasses(Array(1)) +// .setKind("average") +// val pdpOutputDisc: DataFrame = pdpDisc.transform(data) +// pdpOutputDisc.show(false) } From e6e985e2380f3f1f76651d092bd9aaef32289289 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Mon, 18 Oct 2021 19:34:50 -0700 Subject: [PATCH 06/32] Added logic for discrete features --- .../ml/spark/explainers/ICEExplainer.scala | 43 ++++++----- .../explainers/split1/ICEExplainerSuite.scala | 73 +++---------------- 2 files changed, 30 insertions(+), 86 deletions(-) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index 101852b3e8..77ad96eb1d 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -53,6 +53,9 @@ object ContinuousFeature { case JsString(value) => value case _ => throw new Exception("The name field must be a JsString.") } + + // I don't know how to pass default value. If I make Option, I need to specify it anyway. 
+ val numSplits = fields.get("numSplits").map { case JsNumber(value) => value.toInt case _ => 10 @@ -116,22 +119,8 @@ trait ICEFeatureParams extends Params with HasNumSamples { def getKind: String = $(kind) def setKind(value: String): this.type = set(kind, value) -// def setDiscreteFeature(feature: String, topN: Int): this.type = { -// this.setFeature(feature).setFeatureType("discrete").setTopNValues(topN) -// } -// -// def setContinuousFeature(feature: String, nSplits: Int, -// rangeMin: Option[Double] = None, -// rangeMax: Option[Double] = None): this.type = { -// rangeMin.foreach(this.setRangeMin) -// rangeMax.foreach(this.setRangeMax) -// this.setFeature(feature).setFeatureType("continuous").setNSplits(nSplits) -// } - setDefault(numSamples -> 1000, kind -> "individual") - //setDefault(numSamples -> 1000, featureType -> "discrete", topNValues -> 100, kind -> "individual") - } class ICETransformer(override val uid: String) extends Transformer @@ -160,23 +149,21 @@ class ICETransformer(override val uid: String) extends Transformer val featImpName = feature.name + "__imp" - result.show() getKind.toLowerCase match { case "average" => result .groupBy(feature.name) .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) - //.withColumnRenamed(feature.name, "__feature__value__") - .withColumn(featImpName, lit(feature.name)) - .select(featImpName, "__feature__importance__") + .agg(collect_list(feature.name).alias("feature_value_list"), + collect_list("__feature__importance__").alias("feature_imp_list")) + .withColumn(featImpName, map_from_arrays(col("feature_value_list"), col("feature_imp_list"))) + .select(featImpName) case "individual" => - // storing as a map feature -> target value val iceFeatures = result.groupBy("idCol") .agg(collect_list(feature.name).alias("feature_list"), collect_list(targetCol).alias("target_list")) .withColumn(featImpName, map_from_arrays(col("feature_list"), col("target_list"))) .select(idCol, featImpName) - //sampled.join(iceFeatures, idCol) iceFeatures.select(idCol, featImpName) } } @@ -193,8 +180,9 @@ class ICETransformer(override val uid: String) extends Transformer .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) transformSchema(df.schema) - // collect feature values for all features from original fataset - dfWithId + // collect feature values for all features from original dataset - dfWithId val discreteFeatures = this.getDiscreteFeatures + //val continuousFeature = this.getContinuousFeatures val collectedFeatureValues: Map[DiscreteFeature, Array[_]] = discreteFeatures.map{ feature => (feature, collectDiscreteValues(dfWithId, feature)) @@ -206,13 +194,24 @@ class ICETransformer(override val uid: String) extends Transformer f: DiscreteFeature => processDiscreteFeature(sampled, idCol, targetClasses, f, collectedFeatureValues(f)) } - val stage1 = discreteFeatures map (processFunc) + val stage1 = discreteFeatures map (processFunc) + + // I don't know how it's better to handle this 2 cases. 
For pdp we don't have idCol + // and also don't merge it with input data + + getKind.toLowerCase match { + case "individual" => val stage2: DataFrame = stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.join(currDF, Seq(idCol), "inner")) sampled.join(stage2, idCol).drop(idCol) + case "average" => + val stage2: DataFrame = stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.crossJoin(currDF)) + stage2 + } + } private def collectDiscreteValues[_](df: DataFrame, feature: DiscreteFeature): Array[_] = { diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala index 8effeea49a..973482706d 100644 --- a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -20,7 +20,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor )).toDF("col1", "col2", "col3", "label") val data: DataFrame = dataDF.withColumn("col4", rand()*100) - data.show() + val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), @@ -31,15 +31,14 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val model: PipelineModel = pipeline.fit(data) - -// val ice = new ICETransformer() -// ice.setModel(model) -// .setOutputCol("iceValues") -// .setTargetCol("probability") -// .setDiscreteFeatures(Array(DiscreteFeature("col1", 100), DiscreteFeature("col4", 4))) -// .setTargetClasses(Array(1)) -// val output: DataFrame = ice.transform(data) -// output.show(false) + val ice = new ICETransformer() + ice.setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setDiscreteFeatures(Array(DiscreteFeature("col1", 100), DiscreteFeature("col4", 4))) + .setTargetClasses(Array(1)) + val output: DataFrame = ice.transform(data) + output.show(false) val iceAvg = new ICETransformer() iceAvg.setModel(model) @@ -51,58 +50,4 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val outputAvg: DataFrame = iceAvg.transform(data) outputAvg.show(false) - -// val iceCon = new ICETransformer() -// iceCon.setModel(model) -// .setOutputCol("iceValues") -// .setTargetCol("probability") -// .setContinuousFeature(feature = "col4", nSplits = 20) -// .setTargetClasses(Array(1)) -// val outputCon: DataFrame = iceCon.transform(data) -// outputCon.show(false) -// -// -// val iceCon1 = new ICETransformer() -// iceCon1.setModel(model) -// .setOutputCol("iceValues") -// .setTargetCol("probability") -// .setContinuousFeature( -// feature = "col4", -// nSplits = 20, -// rangeMin = Some(0.0), -// rangeMax = Some(100.0) -// ) -// .setTargetClasses(Array(1)) -// val outputCon1: DataFrame = iceCon1.transform(data) -// outputCon1.show(false) -// -// -// val pdp = new ICETransformer() -// pdp.setModel(model) -// .setOutputCol("iceValues") -// .setTargetCol("probability") -// .setContinuousFeature( -// feature = "col4", -// nSplits = 3, -// rangeMin = Some(0.0), -// rangeMax = Some(100.0) -// ) -// .setTargetClasses(Array(1)) -// .setKind("average") -// val pdpOutput: DataFrame = pdp.transform(data) -// pdpOutput.show(false) -// -// val pdpDisc = new ICETransformer() -// pdpDisc.setModel(model) -// .setOutputCol("iceValues") -// .setTargetCol("probability") -// .setDiscreteFeature( -// feature = "col4", -// topN = 2 -// ) -// 
.setTargetClasses(Array(1)) -// .setKind("average") -// val pdpOutputDisc: DataFrame = pdpDisc.transform(data) -// pdpOutputDisc.show(false) - } From a23df5ca53f192cc0efa642ffbc72cf95ffc2ac3 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Wed, 20 Oct 2021 10:43:35 -0700 Subject: [PATCH 07/32] New logic (without unit tests) --- .../ml/spark/explainers/ICEExplainer.scala | 165 ++++++++++-------- .../explainers/split1/ICEExplainerSuite.scala | 9 +- 2 files changed, 94 insertions(+), 80 deletions(-) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index 77ad96eb1d..16e2c41056 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -3,7 +3,7 @@ import com.microsoft.ml.spark.core.contracts.HasOutputCol import com.microsoft.ml.spark.core.schema.DatasetExtensions import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.SQLDataTypes.VectorType -import org.apache.spark.ml.param.{DoubleParam, IntParam, ParamMap, ParamValidators, Params, _} +import org.apache.spark.ml.param.{ParamMap, ParamValidators, Params, _} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ @@ -11,59 +11,67 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer import spray.json.{JsValue, JsonFormat, JsNumber, JsString, JsObject} -case class DiscreteFeature(name: String, numTopValue: Int) { +case class CategoricalFeature(name: String, numTopValues: Option[Int] = None) { def validate: Boolean = { - numTopValue > 0 + numTopValues.forall(_ > 0) + } + + private val defaultNumTopValue = 100 + def getNumTopValue: Int = { + this.numTopValues.getOrElse(defaultNumTopValue) } } -object DiscreteFeature { - implicit val JsonFormat: JsonFormat[DiscreteFeature] = new JsonFormat[DiscreteFeature] { - override def read(json: JsValue): DiscreteFeature = { +object CategoricalFeature { + implicit val JsonFormat: JsonFormat[CategoricalFeature] = new JsonFormat[CategoricalFeature] { + override def read(json: JsValue): CategoricalFeature = { val fields = json.asJsObject.fields val name = fields("name") match { case JsString(value) => value case _ => throw new Exception("The name field must be a JsString.") } - val numTopValues = fields("numTopValues") match { - case JsNumber(value) => value.toInt - case _ => throw new Exception("The numTopValues field must be a JsNumber.") + val numTopValues = fields.get("numTopValues") match { + case Some(JsNumber(value)) => Some(value.toInt) + case _ => None } - DiscreteFeature(name, numTopValues) + CategoricalFeature(name, numTopValues) } - override def write(obj: DiscreteFeature): JsValue = { - JsObject(Map("name" -> JsString(obj.name), "numTopValues" -> JsNumber(obj.numTopValue))) + override def write(obj: CategoricalFeature): JsValue = { + val map = Map("name" -> JsString(obj.name))++ + obj.numTopValues.map("numTopValues" -> JsNumber(_)) + JsObject(map) } } } -case class ContinuousFeature(name: String, numSplits: Option[Int], rangeMin: Option[Double], rangeMax: Option[Double]) { +case class NumericFeature(name: String, numSplits: Option[Int] = None, + rangeMin: Option[Double] = None, rangeMax: Option[Double] = None) { def validate: Boolean = { - (numSplits.isEmpty || numSplits.get > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) + 
numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) + } + + private val defaultNumSplits = 10 + def getNumSplits: Int = { + this.numSplits.getOrElse(defaultNumSplits) } } -object ContinuousFeature { - implicit val JsonFormat: JsonFormat[ContinuousFeature] = new JsonFormat[ContinuousFeature] { - override def read(json: JsValue): ContinuousFeature = { +object NumericFeature { + implicit val JsonFormat: JsonFormat[NumericFeature] = new JsonFormat[NumericFeature] { + override def read(json: JsValue): NumericFeature = { val fields = json.asJsObject.fields val name = fields("name") match { case JsString(value) => value case _ => throw new Exception("The name field must be a JsString.") } - // I don't know how to pass default value. If I make Option, I need to specify it anyway. - - val numSplits = fields.get("numSplits").map { - case JsNumber(value) => value.toInt - case _ => 10 + val numSplits = fields.get("numSplits") match { + case Some(JsNumber(value)) => Some(value.toInt) + case _ => None } -// val numSplits = fields("numSplits") match { -// case JsNumber(value) => value.toInt -// case _ => throw new Exception("The numSplits field must be a JsNumber.") -// } + val rangeMin = fields.get("rangeMin").map { case JsNumber(value) => value.toDouble } @@ -72,14 +80,13 @@ object ContinuousFeature { case JsNumber(value) => value.toDouble } - ContinuousFeature(name, numSplits, rangeMin, rangeMax) + NumericFeature(name, numSplits, rangeMin, rangeMax) } - override def write(obj: ContinuousFeature): JsValue = { + override def write(obj: NumericFeature): JsValue = { val map = Map("name" -> JsString(obj.name))++ obj.numSplits.map("numSplits" -> JsNumber(_))++ - // "numSplits" -> JsNumber(obj.numSplits))++ obj.rangeMin.map("rangeMin" -> JsNumber(_))++ obj.rangeMax.map("rangeMax" -> JsNumber(_)) JsObject(map) @@ -90,37 +97,38 @@ object ContinuousFeature { trait ICEFeatureParams extends Params with HasNumSamples { - val discreteFeatures = new TypedArrayParam[DiscreteFeature] ( + val categoricalFeatures = new TypedArrayParam[CategoricalFeature] ( this, - "discreteFeatures", - "The list of discrete features to explain.", + "categoricalFeatures", + "The list of categorical features to explain.", {_.forall(_.validate)} ) - def setDiscreteFeatures(values: Seq[DiscreteFeature]): this.type = this.set(discreteFeatures, values) - def getDiscreteFeatures: Seq[DiscreteFeature] = $(discreteFeatures) + def setCategoricalFeatures(values: Seq[CategoricalFeature]): this.type = this.set(categoricalFeatures, values) + def getCategoricalFeatures: Seq[CategoricalFeature] = $(categoricalFeatures) - val continuousFeatures = new TypedArrayParam[ContinuousFeature] ( + val numericFeatures = new TypedArrayParam[NumericFeature] ( this, - "continuousFeatures", - "The list of continuous features to explain.", + "numericFeatures", + "The list of numeric features to explain.", {_.forall(_.validate)} ) - def setContinuousFeatures(values: Seq[ContinuousFeature]): this.type = this.set(continuousFeatures, values) - def getContinuousFeatures: Seq[ContinuousFeature] = $(continuousFeatures) + def setNumericFeatures(values: Seq[NumericFeature]): this.type = this.set(numericFeatures, values) + def getNumericFeatures: Seq[NumericFeature] = $(numericFeatures) val kind = new Param[String] ( this, "kind", - "Whether to return the partial dependence averaged across all the samples in the dataset or one line per sample.", + "Whether to return the partial dependence averaged across all the samples in the " + + 
"dataset or individual feature importance per sample.", ParamValidators.inArray(Array("average", "individual")) ) def getKind: String = $(kind) def setKind(value: String): this.type = set(kind, value) - setDefault(numSamples -> 1000, kind -> "individual") - + setDefault(kind -> "individual", numericFeatures -> Seq.empty[NumericFeature], + categoricalFeatures -> Seq.empty[CategoricalFeature]) } class ICETransformer(override val uid: String) extends Transformer @@ -133,35 +141,34 @@ class ICETransformer(override val uid: String) extends Transformer this(Identifiable.randomUID("ICETransformer")) } - def processDiscreteFeature(sampledTotal: DataFrame, idCol: String, targetClassesColumn: String, - feature: DiscreteFeature, values: Array[_]): DataFrame = { + def processFeature(sampled: DataFrame, idCol: String, targetClassesColumn: String, + feature: String, values: Array[_]): DataFrame = { - val sampled = sampledTotal.limit(feature.numTopValue).cache() - - val dataType = sampled.schema(feature.name).dataType + val dataType = sampled.schema(feature).dataType val explodeFunc = explode(array(values.map(v => lit(v)): _*).cast(ArrayType(dataType))) - val predicted = getModel.transform(sampled.withColumn(feature.name, explodeFunc)) + val predicted = getModel.transform(sampled.withColumn(feature, explodeFunc)) val targetCol = DatasetExtensions.findUnusedColumnName("target", predicted) val explainTarget = extractTarget(predicted.schema, targetClassesColumn) val result = predicted.withColumn(targetCol, explainTarget) - val featImpName = feature.name + "__imp" - + val featImpName = feature + "__imp" + // output schema: 1 row * 1 col (pdp for the given feature: feature_value -> explanations) getKind.toLowerCase match { case "average" => result - .groupBy(feature.name) + .groupBy(feature) .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) - .agg(collect_list(feature.name).alias("feature_value_list"), + .agg(collect_list(feature).alias("feature_value_list"), collect_list("__feature__importance__").alias("feature_imp_list")) .withColumn(featImpName, map_from_arrays(col("feature_value_list"), col("feature_imp_list"))) .select(featImpName) + // output schema: rows * (cols + 1) (ice for the given feature: array(feature_value -> explanations)) case "individual" => - val iceFeatures = result.groupBy("idCol") - .agg(collect_list(feature.name).alias("feature_list"), collect_list(targetCol).alias("target_list")) + val iceFeatures = result.groupBy(idCol) + .agg(collect_list(feature).alias("feature_list"), collect_list(targetCol).alias("target_list")) .withColumn(featImpName, map_from_arrays(col("feature_list"), col("target_list"))) .select(idCol, featImpName) iceFeatures.select(idCol, featImpName) @@ -169,9 +176,7 @@ class ICETransformer(override val uid: String) extends Transformer } - def transform(ds: Dataset[_]): DataFrame = { - val df = ds.toDF val idCol = DatasetExtensions.findUnusedColumnName("idCol", df) val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) @@ -181,24 +186,32 @@ class ICETransformer(override val uid: String) extends Transformer transformSchema(df.schema) // collect feature values for all features from original dataset - dfWithId - val discreteFeatures = this.getDiscreteFeatures - //val continuousFeature = this.getContinuousFeatures + val categoricalFeatures = this.getCategoricalFeatures + val numericFeatures = this.getNumericFeatures - val collectedFeatureValues: Map[DiscreteFeature, Array[_]] = discreteFeatures.map{ - feature => (feature, 
collectDiscreteValues(dfWithId, feature)) + val collectedCatFeatureValues: Map[String, Array[_]] = categoricalFeatures.map { + feature => (feature.name, collectCategoricalValues(dfWithId, feature)) + }.toMap + + val collectedNumFeatureValues: Map[String, Array[_]] = numericFeatures.map { + feature => (feature.name, collectSplits(dfWithId, feature)) }.toMap - val sampled = dfWithId.orderBy(rand()) + val sampled = this.get(numSamples).map { + s => dfWithId.orderBy(rand()).limit(s) + }.getOrElse(dfWithId).cache() - val processFunc: DiscreteFeature => DataFrame = { - f: DiscreteFeature => - processDiscreteFeature(sampled, idCol, targetClasses, f, collectedFeatureValues(f)) + val processCategoricalFunc: CategoricalFeature => DataFrame = { + f: CategoricalFeature => + processFeature(sampled, idCol, targetClasses, f.name, collectedCatFeatureValues(f.name)) } - val stage1 = discreteFeatures map (processFunc) + val processNumericFunc: NumericFeature => DataFrame = { + f: NumericFeature => + processFeature(sampled, idCol, targetClasses, f.name, collectedNumFeatureValues(f.name)) + } - // I don't know how it's better to handle this 2 cases. For pdp we don't have idCol - // and also don't merge it with input data + val stage1 = (categoricalFeatures map processCategoricalFunc) union (numericFeatures map processNumericFunc) getKind.toLowerCase match { case "individual" => @@ -211,15 +224,14 @@ class ICETransformer(override val uid: String) extends Transformer val stage2: DataFrame = stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.crossJoin(currDF)) stage2 } - } - private def collectDiscreteValues[_](df: DataFrame, feature: DiscreteFeature): Array[_] = { + private def collectCategoricalValues[_](df: DataFrame, feature: CategoricalFeature): Array[_] = { val values = df .groupBy(col(feature.name)) .agg(count("*").as("__feature__count__")) .orderBy(col("__feature__count__").desc) - .head(feature.numTopValue) + .head(feature.getNumTopValue) .map(row => row.get(0)) values } @@ -230,19 +242,19 @@ class ICETransformer(override val uid: String) extends Transformer } } - private def collectSplits(df: DataFrame, continuousFeature: ContinuousFeature): Array[Double] = { - val (feature, nSplits, rangeMin, rangeMax) = (continuousFeature.name, continuousFeature.numSplits, - continuousFeature.rangeMin, continuousFeature.rangeMax) + private def collectSplits(df: DataFrame, numericFeature: NumericFeature): Array[Double] = { + val (feature, nSplits, rangeMin, rangeMax) = (numericFeature.name, numericFeature.getNumSplits, + numericFeature.rangeMin, numericFeature.rangeMax) val featureCol = df.schema(feature) - val createSplits = createNSplits(nSplits.get) _ + val createSplits = createNSplits(nSplits) _ val values = if (rangeMin.isDefined && rangeMax.isDefined) { val (mi, ma) = (rangeMin.get, rangeMax.get) // The ranges are defined featureCol.dataType match { case _@(ByteType | IntegerType | LongType | ShortType) => - if (ma.toLong - mi.toLong <= nSplits.get) { + if (ma.toLong - mi.toLong <= nSplits) { // For integral types, no need to create more splits than needed. (mi.toLong to ma.toLong) map (_.toDouble) } else { @@ -262,7 +274,7 @@ class ICETransformer(override val uid: String) extends Transformer val mi = rangeMin.map(_.toLong).getOrElse(minValue) val ma = rangeMax.map(_.toLong).getOrElse(maxValue) - if (ma - mi <= nSplits.get) { + if (ma - mi <= nSplits) { // For integral types, no need to create more splits than needed. 
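          // e.g. with mi = 0, ma = 3 and nSplits = 10 this yields 0.0, 1.0, 2.0, 3.0
          // rather than 10 fractional split points.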
(mi to ma) map (_.toDouble) } else { @@ -278,7 +290,6 @@ class ICETransformer(override val uid: String) extends Transformer createSplits(mi, ma) } } - values.toArray } diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala index 973482706d..81a36464a2 100644 --- a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -6,7 +6,7 @@ import com.microsoft.ml.spark.core.test.base.TestBase import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} -import com.microsoft.ml.spark.explainers.{DiscreteFeature, ICETransformer} +import com.microsoft.ml.spark.explainers.{CategoricalFeature, ICETransformer, NumericFeature} class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { @@ -35,7 +35,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor ice.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setDiscreteFeatures(Array(DiscreteFeature("col1", 100), DiscreteFeature("col4", 4))) + .setCategoricalFeatures(Array(CategoricalFeature("col1", Some(100)), CategoricalFeature("col4", Some(4)))) .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) output.show(false) @@ -44,10 +44,13 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor iceAvg.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setDiscreteFeatures(Array(DiscreteFeature("col1", 100), DiscreteFeature("col4", 4))) + .setCategoricalFeatures(Array(CategoricalFeature("col1", Some(100)), CategoricalFeature("col2"))) + .setNumericFeatures(Array(NumericFeature("col4"), NumericFeature("col4", Some(3), Some(0.0), Some(100.0)))) .setTargetClasses(Array(1)) .setKind("average") val outputAvg: DataFrame = iceAvg.transform(data) outputAvg.show(false) + + } From 43d16481b6fdbd10407430ff49819164078698c6 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 20 Oct 2021 15:31:59 -0700 Subject: [PATCH 08/32] WIP --- .../ml/spark/explainers/ICEExplainer.scala | 168 +++++------------- .../ml/spark/explainers/ICEFeature.scala | 41 +++++ .../explainers/split1/ICEExplainerSuite.scala | 8 +- 3 files changed, 93 insertions(+), 124 deletions(-) create mode 100644 core/src/main/scala/com/microsoft/ml/spark/explainers/ICEFeature.scala diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index 16e2c41056..a5c28061fd 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -9,126 +9,42 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer -import spray.json.{JsValue, JsonFormat, JsNumber, JsString, JsObject} - -case class CategoricalFeature(name: String, numTopValues: Option[Int] = None) { - def validate: Boolean = { - numTopValues.forall(_ > 0) - } - - private val defaultNumTopValue = 100 - def getNumTopValue: Int = { - this.numTopValues.getOrElse(defaultNumTopValue) - } -} - -object CategoricalFeature { - implicit val JsonFormat: 
JsonFormat[CategoricalFeature] = new JsonFormat[CategoricalFeature] { - override def read(json: JsValue): CategoricalFeature = { - val fields = json.asJsObject.fields - val name = fields("name") match { - case JsString(value) => value - case _ => throw new Exception("The name field must be a JsString.") - } - val numTopValues = fields.get("numTopValues") match { - case Some(JsNumber(value)) => Some(value.toInt) - case _ => None - } - - CategoricalFeature(name, numTopValues) - - } - override def write(obj: CategoricalFeature): JsValue = { - val map = Map("name" -> JsString(obj.name))++ - obj.numTopValues.map("numTopValues" -> JsNumber(_)) - JsObject(map) - } - } -} - -case class NumericFeature(name: String, numSplits: Option[Int] = None, - rangeMin: Option[Double] = None, rangeMax: Option[Double] = None) { - def validate: Boolean = { - numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) - } - - private val defaultNumSplits = 10 - def getNumSplits: Int = { - this.numSplits.getOrElse(defaultNumSplits) - } -} - -object NumericFeature { - implicit val JsonFormat: JsonFormat[NumericFeature] = new JsonFormat[NumericFeature] { - override def read(json: JsValue): NumericFeature = { - val fields = json.asJsObject.fields - val name = fields("name") match { - case JsString(value) => value - case _ => throw new Exception("The name field must be a JsString.") - } - - val numSplits = fields.get("numSplits") match { - case Some(JsNumber(value)) => Some(value.toInt) - case _ => None - } - - val rangeMin = fields.get("rangeMin").map { - case JsNumber(value) => value.toDouble - } - - val rangeMax = fields.get("rangeMax").map { - case JsNumber(value) => value.toDouble - } - - NumericFeature(name, numSplits, rangeMin, rangeMax) - - } - - override def write(obj: NumericFeature): JsValue = { - val map = Map("name" -> JsString(obj.name))++ - obj.numSplits.map("numSplits" -> JsNumber(_))++ - obj.rangeMin.map("rangeMin" -> JsNumber(_))++ - obj.rangeMax.map("rangeMax" -> JsNumber(_)) - JsObject(map) - } - } -} - trait ICEFeatureParams extends Params with HasNumSamples { - - val categoricalFeatures = new TypedArrayParam[CategoricalFeature] ( + val categoricalFeatures = new TypedArrayParam[ICECategoricalFeature] ( this, "categoricalFeatures", "The list of categorical features to explain.", {_.forall(_.validate)} ) - def setCategoricalFeatures(values: Seq[CategoricalFeature]): this.type = this.set(categoricalFeatures, values) - def getCategoricalFeatures: Seq[CategoricalFeature] = $(categoricalFeatures) + def setCategoricalFeatures(values: Seq[ICECategoricalFeature]): this.type = this.set(categoricalFeatures, values) + def getCategoricalFeatures: Seq[ICECategoricalFeature] = $(categoricalFeatures) - val numericFeatures = new TypedArrayParam[NumericFeature] ( + val numericFeatures = new TypedArrayParam[ICENumericFeature] ( this, "numericFeatures", "The list of numeric features to explain.", {_.forall(_.validate)} ) - def setNumericFeatures(values: Seq[NumericFeature]): this.type = this.set(numericFeatures, values) - def getNumericFeatures: Seq[NumericFeature] = $(numericFeatures) + def setNumericFeatures(values: Seq[ICENumericFeature]): this.type = this.set(numericFeatures, values) + def getNumericFeatures: Seq[ICENumericFeature] = $(numericFeatures) val kind = new Param[String] ( this, "kind", - "Whether to return the partial dependence averaged across all the samples in the " + - "dataset or individual feature importance per sample.", + "Whether to return the partial 
dependence plot (PDP) averaged across all the samples in the " + + "dataset or individual feature importance (ICE) per sample. " + + "Allowed values are \"average\" for PDP and \"individual\" for ICE.", ParamValidators.inArray(Array("average", "individual")) ) + def getKind: String = $(kind) def setKind(value: String): this.type = set(kind, value) - setDefault(kind -> "individual", numericFeatures -> Seq.empty[NumericFeature], - categoricalFeatures -> Seq.empty[CategoricalFeature]) + setDefault(kind -> "individual", numericFeatures -> Seq.empty[ICENumericFeature], + categoricalFeatures -> Seq.empty[ICECategoricalFeature]) } class ICETransformer(override val uid: String) extends Transformer @@ -141,13 +57,13 @@ class ICETransformer(override val uid: String) extends Transformer this(Identifiable.randomUID("ICETransformer")) } - def processFeature(sampled: DataFrame, idCol: String, targetClassesColumn: String, - feature: String, values: Array[_]): DataFrame = { + private def processFeature(df: DataFrame, idCol: String, targetClassesColumn: String, + feature: String, values: Array[_]): DataFrame = { - val dataType = sampled.schema(feature).dataType + val dataType = df.schema(feature).dataType val explodeFunc = explode(array(values.map(v => lit(v)): _*).cast(ArrayType(dataType))) - val predicted = getModel.transform(sampled.withColumn(feature, explodeFunc)) + val predicted = getModel.transform(df.withColumn(feature, explodeFunc)) val targetCol = DatasetExtensions.findUnusedColumnName("target", predicted) val explainTarget = extractTarget(predicted.schema, targetClassesColumn) @@ -155,23 +71,31 @@ class ICETransformer(override val uid: String) extends Transformer val featImpName = feature + "__imp" - // output schema: 1 row * 1 col (pdp for the given feature: feature_value -> explanations) getKind.toLowerCase match { case "average" => + // PDP output schema: 1 row * 1 col (pdp for the given feature: feature_value -> explanations) + + // TODO: define the temp string column names from DatasetExtensions.findUnusedColumnName result .groupBy(feature) - .agg(Summarizer.mean(col(targetCol)).alias("__feature__importance__")) - .agg(collect_list(feature).alias("feature_value_list"), - collect_list("__feature__importance__").alias("feature_imp_list")) - .withColumn(featImpName, map_from_arrays(col("feature_value_list"), col("feature_imp_list"))) - .select(featImpName) - // output schema: rows * (cols + 1) (ice for the given feature: array(feature_value -> explanations)) + .agg(Summarizer.mean(col(targetCol)).alias("__feature__dependence__")) + .agg( + map_from_arrays( + collect_list(feature), + collect_list("__feature__dependence__") + ).alias(feature) + ) + case "individual" => - val iceFeatures = result.groupBy(idCol) - .agg(collect_list(feature).alias("feature_list"), collect_list(targetCol).alias("target_list")) - .withColumn(featImpName, map_from_arrays(col("feature_list"), col("target_list"))) - .select(idCol, featImpName) - iceFeatures.select(idCol, featImpName) + // ICE output schema: n rows * 2 cols (idCol + ice for the given feature: map(feature_value -> explanations)) + result + .groupBy(idCol) + .agg( + map_from_arrays( + collect_list(feature), + collect_list(targetCol) + ).alias(featImpName) + ) } } @@ -201,24 +125,28 @@ class ICETransformer(override val uid: String) extends Transformer s => dfWithId.orderBy(rand()).limit(s) }.getOrElse(dfWithId).cache() - val processCategoricalFunc: CategoricalFeature => DataFrame = { - f: CategoricalFeature => + val processCategoricalFunc: 
ICECategoricalFeature => DataFrame = { + f: ICECategoricalFeature => processFeature(sampled, idCol, targetClasses, f.name, collectedCatFeatureValues(f.name)) } - val processNumericFunc: NumericFeature => DataFrame = { - f: NumericFeature => + val processNumericFunc: ICENumericFeature => DataFrame = { + f: ICENumericFeature => processFeature(sampled, idCol, targetClasses, f.name, collectedNumFeatureValues(f.name)) } - val stage1 = (categoricalFeatures map processCategoricalFunc) union (numericFeatures map processNumericFunc) + val stage1 = (categoricalFeatures map processCategoricalFunc) ++ (numericFeatures map processNumericFunc) getKind.toLowerCase match { case "individual" => val stage2: DataFrame = stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.join(currDF, Seq(idCol), "inner")) - sampled.join(stage2, idCol).drop(idCol) + val stage3 = (categoricalFeatures ++ numericFeatures).foldLeft(stage2){ + case (accDf, feature) => accDf.withColumnRenamed(feature.name, feature.name + "_dep") + } + + sampled.join(stage3, idCol).drop(idCol) case "average" => val stage2: DataFrame = stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.crossJoin(currDF)) @@ -226,7 +154,7 @@ class ICETransformer(override val uid: String) extends Transformer } } - private def collectCategoricalValues[_](df: DataFrame, feature: CategoricalFeature): Array[_] = { + private def collectCategoricalValues[_](df: DataFrame, feature: ICECategoricalFeature): Array[_] = { val values = df .groupBy(col(feature.name)) .agg(count("*").as("__feature__count__")) @@ -242,7 +170,7 @@ class ICETransformer(override val uid: String) extends Transformer } } - private def collectSplits(df: DataFrame, numericFeature: NumericFeature): Array[Double] = { + private def collectSplits(df: DataFrame, numericFeature: ICENumericFeature): Array[Double] = { val (feature, nSplits, rangeMin, rangeMax) = (numericFeature.name, numericFeature.getNumSplits, numericFeature.rangeMin, numericFeature.rangeMax) val featureCol = df.schema(feature) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEFeature.scala new file mode 100644 index 0000000000..9249f05786 --- /dev/null +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEFeature.scala @@ -0,0 +1,41 @@ +package com.microsoft.ml.spark.explainers + +import spray.json._ +import DefaultJsonProtocol._ + +private[explainers] abstract class ICEFeature(val name: String) { + def validate: Boolean +} + +case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None) + extends ICEFeature(name) { + override def validate: Boolean = { + numTopValues.forall(_ > 0) + } + + private val defaultNumTopValue = 100 + def getNumTopValue: Int = { + this.numTopValues.getOrElse(defaultNumTopValue) + } +} + +object ICECategoricalFeature { + implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = jsonFormat2(ICECategoricalFeature.apply) +} + +case class ICENumericFeature(override val name: String, numSplits: Option[Int] = None, + rangeMin: Option[Double] = None, rangeMax: Option[Double] = None) + extends ICEFeature(name) { + override def validate: Boolean = { + numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) + } + + private val defaultNumSplits = 10 + def getNumSplits: Int = { + this.numSplits.getOrElse(defaultNumSplits) + } +} + +object ICENumericFeature { + implicit val JsonFormat: JsonFormat[ICENumericFeature] = 
jsonFormat4(ICENumericFeature.apply) +} \ No newline at end of file diff --git a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala index 81a36464a2..fb202beac6 100644 --- a/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/ml/spark/explainers/split1/ICEExplainerSuite.scala @@ -6,7 +6,7 @@ import com.microsoft.ml.spark.core.test.base.TestBase import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} -import com.microsoft.ml.spark.explainers.{CategoricalFeature, ICETransformer, NumericFeature} +import com.microsoft.ml.spark.explainers.{ICECategoricalFeature, ICETransformer, ICENumericFeature} class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { @@ -35,7 +35,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor ice.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setCategoricalFeatures(Array(CategoricalFeature("col1", Some(100)), CategoricalFeature("col4", Some(4)))) + .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col4", Some(4)))) .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) output.show(false) @@ -44,8 +44,8 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor iceAvg.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setCategoricalFeatures(Array(CategoricalFeature("col1", Some(100)), CategoricalFeature("col2"))) - .setNumericFeatures(Array(NumericFeature("col4"), NumericFeature("col4", Some(3), Some(0.0), Some(100.0)))) + .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col2"))) + .setNumericFeatures(Array(ICENumericFeature("col4"), ICENumericFeature("col4", Some(3), Some(0.0), Some(100.0)))) .setTargetClasses(Array(1)) .setKind("average") val outputAvg: DataFrame = iceAvg.transform(data) From 9b379e8b294c79f602a284706964ce0e03ca9ea8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Wed, 20 Oct 2021 17:12:04 -0700 Subject: [PATCH 09/32] WIP --- .../ml/spark/explainers/ICEExplainer.scala | 71 ++++++++++++------- 1 file changed, 44 insertions(+), 27 deletions(-) diff --git a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala index a5c28061fd..d7d8246e6e 100644 --- a/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/ml/spark/explainers/ICEExplainer.scala @@ -11,6 +11,10 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer trait ICEFeatureParams extends Params with HasNumSamples { + + val averageKind = "average" + val individualKind = "individual" + val categoricalFeatures = new TypedArrayParam[ICECategoricalFeature] ( this, "categoricalFeatures", @@ -37,7 +41,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { "Whether to return the partial dependence plot (PDP) averaged across all the samples in the " + "dataset or individual feature importance (ICE) per sample. 
" + "Allowed values are \"average\" for PDP and \"individual\" for ICE.", - ParamValidators.inArray(Array("average", "individual")) + ParamValidators.inArray(Array(averageKind, individualKind)) ) def getKind: String = $(kind) @@ -57,7 +61,7 @@ class ICETransformer(override val uid: String) extends Transformer this(Identifiable.randomUID("ICETransformer")) } - private def processFeature(df: DataFrame, idCol: String, targetClassesColumn: String, + private def calcDependence(df: DataFrame, idCol: String, targetClassesColumn: String, feature: String, values: Array[_]): DataFrame = { val dataType = df.schema(feature).dataType @@ -69,10 +73,8 @@ class ICETransformer(override val uid: String) extends Transformer val explainTarget = extractTarget(predicted.schema, targetClassesColumn) val result = predicted.withColumn(targetCol, explainTarget) - val featImpName = feature + "__imp" - getKind.toLowerCase match { - case "average" => + case super.averageKind => // PDP output schema: 1 row * 1 col (pdp for the given feature: feature_value -> explanations) // TODO: define the temp string column names from DatasetExtensions.findUnusedColumnName @@ -86,7 +88,7 @@ class ICETransformer(override val uid: String) extends Transformer ).alias(feature) ) - case "individual" => + case super.individualKind => // ICE output schema: n rows * 2 cols (idCol + ice for the given feature: map(feature_value -> explanations)) result .groupBy(idCol) @@ -94,25 +96,35 @@ class ICETransformer(override val uid: String) extends Transformer map_from_arrays( collect_list(feature), collect_list(targetCol) - ).alias(featImpName) + ).alias(feature) ) } } - def transform(ds: Dataset[_]): DataFrame = { + transformSchema(ds.schema) + val df = ds.toDF val idCol = DatasetExtensions.findUnusedColumnName("idCol", df) val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) val dfWithId = df .withColumn(idCol, monotonically_increasing_id()) .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) - transformSchema(df.schema) + // collect feature values for all features from original dataset - dfWithId val categoricalFeatures = this.getCategoricalFeatures val numericFeatures = this.getNumericFeatures + // TODO: Move the check into transformSchema + // Check for duplicate feature specification + val featureNames = categoricalFeatures.map(_.name) ++ numericFeatures.map(_.name) + + val duplicateFeatureNames = featureNames.groupBy(identity).mapValues(_.length).filter(_._2 > 0).keys.toArray + if (duplicateFeatureNames.nonEmpty) { + throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") + } + val collectedCatFeatureValues: Map[String, Array[_]] = categoricalFeatures.map { feature => (feature.name, collectCategoricalValues(dfWithId, feature)) }.toMap @@ -125,32 +137,37 @@ class ICETransformer(override val uid: String) extends Transformer s => dfWithId.orderBy(rand()).limit(s) }.getOrElse(dfWithId).cache() - val processCategoricalFunc: ICECategoricalFeature => DataFrame = { + val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { f: ICECategoricalFeature => - processFeature(sampled, idCol, targetClasses, f.name, collectedCatFeatureValues(f.name)) + calcDependence(sampled, idCol, targetClasses, f.name, collectedCatFeatureValues(f.name)) } - val processNumericFunc: ICENumericFeature => DataFrame = { + val calcNumericFunc: ICENumericFeature => DataFrame = { f: ICENumericFeature => - processFeature(sampled, idCol, targetClasses, f.name, 
collectedNumFeatureValues(f.name)) + calcDependence(sampled, idCol, targetClasses, f.name, collectedNumFeatureValues(f.name)) } - val stage1 = (categoricalFeatures map processCategoricalFunc) ++ (numericFeatures map processNumericFunc) + val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) getKind.toLowerCase match { - case "individual" => - val stage2: DataFrame = - stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.join(currDF, Seq(idCol), "inner")) - - val stage3 = (categoricalFeatures ++ numericFeatures).foldLeft(stage2){ - case (accDf, feature) => accDf.withColumnRenamed(feature.name, feature.name + "_dep") - } - - sampled.join(stage3, idCol).drop(idCol) - - case "average" => - val stage2: DataFrame = stage1.tail.foldLeft(stage1.head)((accDF, currDF) => accDF.crossJoin(currDF)) - stage2 + case super.individualKind => + dependenceDfs.reduceOption(_.join(_, Seq(idCol), "inner")) + .map { + df => + (categoricalFeatures ++ numericFeatures).foldLeft(df) { + case (accDf, feature) => accDf.withColumnRenamed(feature.name, feature.name + "_dependence") + } + } + .map(sampled.join(_, idCol)).getOrElse( + throw new Exception("No categorical features or numeric features are set to the explainer. " + + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") + ) + + case super.averageKind => + dependenceDfs.reduceOption(_ crossJoin _).getOrElse( + throw new Exception("No categorical features or numeric features are set to the explainer. " + + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") + ) } } From 5ba0bece255cf47125047bbdba88f581eb29ce1a Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Thu, 21 Oct 2021 17:35:40 -0700 Subject: [PATCH 10/32] small fix --- .../synapse/ml/explainers/ICEExplainer.scala | 31 +++++++++---------- .../synapse/ml/explainers/SharedParams.scala | 2 +- .../explainers/split1/ICEExplainerSuite.scala | 6 ++-- 3 files changed, 18 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 18ac671c1c..55774c9946 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -70,26 +70,25 @@ class ICETransformer(override val uid: String) extends Transformer val predicted = getModel.transform(df.withColumn(feature, explodeFunc)) val targetCol = DatasetExtensions.findUnusedColumnName("target", predicted) + val dependenceCol = DatasetExtensions.findUnusedColumnName("feature__dependence", predicted) val explainTarget = extractTarget(predicted.schema, targetClassesColumn) val result = predicted.withColumn(targetCol, explainTarget) getKind.toLowerCase match { - case super.averageKind => + case this.averageKind => // PDP output schema: 1 row * 1 col (pdp for the given feature: feature_value -> explanations) - - // TODO: define the temp string column names from DatasetExtensions.findUnusedColumnName result .groupBy(feature) - .agg(Summarizer.mean(col(targetCol)).alias("__feature__dependence__")) + .agg(Summarizer.mean(col(targetCol)).alias(dependenceCol)) .agg( map_from_arrays( collect_list(feature), - collect_list("__feature__dependence__") + collect_list(dependenceCol) ).alias(feature) ) - case super.individualKind => + case this.individualKind => // ICE output schema: n rows 
* 2 cols (idCol + ice for the given feature: map(feature_value -> explanations)) result .groupBy(idCol) @@ -117,15 +116,6 @@ class ICETransformer(override val uid: String) extends Transformer val categoricalFeatures = this.getCategoricalFeatures val numericFeatures = this.getNumericFeatures - // TODO: Move the check into transformSchema - // Check for duplicate feature specification - val featureNames = categoricalFeatures.map(_.name) ++ numericFeatures.map(_.name) - - val duplicateFeatureNames = featureNames.groupBy(identity).mapValues(_.length).filter(_._2 > 0).keys.toArray - if (duplicateFeatureNames.nonEmpty) { - throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") - } - val collectedCatFeatureValues: Map[String, Array[_]] = categoricalFeatures.map { feature => (feature.name, collectCategoricalValues(dfWithId, feature)) }.toMap @@ -151,7 +141,7 @@ class ICETransformer(override val uid: String) extends Transformer val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) getKind.toLowerCase match { - case super.individualKind => + case this.individualKind => dependenceDfs.reduceOption(_.join(_, Seq(idCol), "inner")) .map { df => @@ -164,7 +154,7 @@ class ICETransformer(override val uid: String) extends Transformer "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") ) - case super.averageKind => + case this.averageKind => dependenceDfs.reduceOption(_ crossJoin _).getOrElse( throw new Exception("No categorical features or numeric features are set to the explainer. " + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") @@ -242,6 +232,13 @@ class ICETransformer(override val uid: String) extends Transformer override def copy(extra: ParamMap): Transformer = this.defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { + // Check for duplicate feature specification + val featureNames = getCategoricalFeatures.map(_.name) ++ getNumericFeatures.map(_.name) + val duplicateFeatureNames = featureNames.groupBy(identity).mapValues(_.length).filter(_._2 > 0).keys.toArray + if (duplicateFeatureNames.nonEmpty) { + throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") + } + this.validateSchema(schema) schema.add(getOutputCol, ArrayType(VectorType)) } diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/SharedParams.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/SharedParams.scala index a895115f42..3a3884baa5 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/SharedParams.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/SharedParams.scala @@ -3,7 +3,7 @@ package com.microsoft.azure.synapse.ml.explainers -import com.microsoft.ml.spark.core.utils.SlicerFunctions +import com.microsoft.azure.synapse.ml.core.utils.SlicerFunctions import org.apache.spark.injections.UDFUtils import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.SQLDataTypes.VectorType diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index fb202beac6..5296700edf 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ 
b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -1,12 +1,12 @@ -package com.microsoft.ml.spark.explainers.split1 +package com.microsoft.azure.synapse.ml.explainers.split1 import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer, VectorAssembler} -import com.microsoft.ml.spark.core.test.base.TestBase +import com.microsoft.azure.synapse.ml.core.test.base.TestBase import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} -import com.microsoft.ml.spark.explainers.{ICECategoricalFeature, ICETransformer, ICENumericFeature} +import com.microsoft.azure.synapse.ml.explainers.{ICETransformer, ICECategoricalFeature, ICENumericFeature} class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { From c0c9ddf4399e86f6700d4cedce091b4185a6ef99 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Fri, 5 Nov 2021 17:42:13 -0700 Subject: [PATCH 11/32] added some unit tests --- .../synapse/ml/explainers/ICEExplainer.scala | 14 +-- .../synapse/ml/explainers/ICEFeature.scala | 93 ++++++++++++++++++- .../explainers/split1/ICEExplainerSuite.scala | 46 +++++++-- 3 files changed, 132 insertions(+), 21 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 55774c9946..878c1ad4a6 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -103,7 +103,6 @@ class ICETransformer(override val uid: String) extends Transformer def transform(ds: Dataset[_]): DataFrame = { transformSchema(ds.schema) - val df = ds.toDF val idCol = DatasetExtensions.findUnusedColumnName("idCol", df) val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) @@ -111,7 +110,6 @@ class ICETransformer(override val uid: String) extends Transformer .withColumn(idCol, monotonically_increasing_id()) .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) - // collect feature values for all features from original dataset - dfWithId val categoricalFeatures = this.getCategoricalFeatures val numericFeatures = this.getNumericFeatures @@ -119,7 +117,6 @@ class ICETransformer(override val uid: String) extends Transformer val collectedCatFeatureValues: Map[String, Array[_]] = categoricalFeatures.map { feature => (feature.name, collectCategoricalValues(dfWithId, feature)) }.toMap - val collectedNumFeatureValues: Map[String, Array[_]] = numericFeatures.map { feature => (feature.name, collectSplits(dfWithId, feature)) }.toMap @@ -132,7 +129,6 @@ class ICETransformer(override val uid: String) extends Transformer f: ICECategoricalFeature => calcDependence(sampled, idCol, targetClasses, f.name, collectedCatFeatureValues(f.name)) } - val calcNumericFunc: ICENumericFeature => DataFrame = { f: ICENumericFeature => calcDependence(sampled, idCol, targetClasses, f.name, collectedNumFeatureValues(f.name)) @@ -144,16 +140,14 @@ class ICETransformer(override val uid: String) extends Transformer case this.individualKind => dependenceDfs.reduceOption(_.join(_, Seq(idCol), "inner")) .map { - df => - (categoricalFeatures ++ numericFeatures).foldLeft(df) { - case (accDf, feature) => 
accDf.withColumnRenamed(feature.name, feature.name + "_dependence") - } + df => (categoricalFeatures ++ numericFeatures).foldLeft(df) { + case (accDf, feature) => accDf.withColumnRenamed(feature.name, feature.name + "_dependence") + } } .map(sampled.join(_, idCol)).getOrElse( throw new Exception("No categorical features or numeric features are set to the explainer. " + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") ) - case this.averageKind => dependenceDfs.reduceOption(_ crossJoin _).getOrElse( throw new Exception("No categorical features or numeric features are set to the explainer. " + @@ -234,7 +228,7 @@ class ICETransformer(override val uid: String) extends Transformer override def transformSchema(schema: StructType): StructType = { // Check for duplicate feature specification val featureNames = getCategoricalFeatures.map(_.name) ++ getNumericFeatures.map(_.name) - val duplicateFeatureNames = featureNames.groupBy(identity).mapValues(_.length).filter(_._2 > 0).keys.toArray + val duplicateFeatureNames = featureNames.groupBy(identity).mapValues(_.length).filter(_._2 > 1).keys.toArray if (duplicateFeatureNames.nonEmpty) { throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") } diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index 61fd3feba6..b154201acf 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -7,8 +7,41 @@ private[explainers] abstract class ICEFeature(val name: String) { def validate: Boolean } +//case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None) +// extends ICEFeature(name) { +// override def validate: Boolean = { +// numTopValues.forall(_ > 0) +// } +// +// private val defaultNumTopValue = 100 +// def getNumTopValue: Int = { +// this.numTopValues.getOrElse(defaultNumTopValue) +// } +//} +// +//object ICECategoricalFeature { +// implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = jsonFormat2(ICECategoricalFeature.apply) +//} +// +//case class ICENumericFeature(override val name: String, numSplits: Option[Int] = None, +// rangeMin: Option[Double] = None, rangeMax: Option[Double] = None) +// extends ICEFeature(name) { +// override def validate: Boolean = { +// numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) +// } +// +// private val defaultNumSplits = 10 +// def getNumSplits: Int = { +// this.numSplits.getOrElse(defaultNumSplits) +// } +//} +// +//object ICENumericFeature { +// implicit val JsonFormat: JsonFormat[ICENumericFeature] = jsonFormat4(ICENumericFeature.apply) +//} + case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None) - extends ICEFeature(name) { + extends ICEFeature(name) { override def validate: Boolean = { numTopValues.forall(_ > 0) } @@ -20,7 +53,27 @@ case class ICECategoricalFeature(override val name: String, numTopValues: Option } object ICECategoricalFeature { - implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = jsonFormat2(ICECategoricalFeature.apply) + implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = new JsonFormat[ICECategoricalFeature] { + override def read(json: JsValue): ICECategoricalFeature = { + val fields = json.asJsObject.fields + val name = fields("name") 
match { + case JsString(value) => value + case _ => throw new Exception("The name field must be a JsString.") + } + val numTopValues = fields.get("numTopValues") match { + case Some(JsNumber(value)) => Some(value.toInt) + case _ => None + } + + ICECategoricalFeature(name, numTopValues) + + } + override def write(obj: ICECategoricalFeature): JsValue = { + val map = Map("name" -> JsString(obj.name))++ + obj.numTopValues.map("numTopValues" -> JsNumber(_)) + JsObject(map) + } + } } case class ICENumericFeature(override val name: String, numSplits: Option[Int] = None, @@ -37,5 +90,37 @@ case class ICENumericFeature(override val name: String, numSplits: Option[Int] = } object ICENumericFeature { - implicit val JsonFormat: JsonFormat[ICENumericFeature] = jsonFormat4(ICENumericFeature.apply) -} \ No newline at end of file + implicit val JsonFormat: JsonFormat[ICENumericFeature] = new JsonFormat[ICENumericFeature] { + override def read(json: JsValue): ICENumericFeature = { + val fields = json.asJsObject.fields + val name = fields("name") match { + case JsString(value) => value + case _ => throw new Exception("The name field must be a JsString.") + } + + val numSplits = fields.get("numSplits") match { + case Some(JsNumber(value)) => Some(value.toInt) + case _ => None + } + + val rangeMin = fields.get("rangeMin").map { + case JsNumber(value) => value.toDouble + } + + val rangeMax = fields.get("rangeMax").map { + case JsNumber(value) => value.toDouble + } + + ICENumericFeature(name, numSplits, rangeMin, rangeMax) + + } + + override def write(obj: ICENumericFeature): JsValue = { + val map = Map("name" -> JsString(obj.name))++ + obj.numSplits.map("numSplits" -> JsNumber(_))++ + obj.rangeMin.map("rangeMin" -> JsNumber(_))++ + obj.rangeMax.map("rangeMax" -> JsNumber(_)) + JsObject(map) + } + } +} diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index 5296700edf..341d5e61a5 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -3,10 +3,12 @@ package com.microsoft.azure.synapse.ml.explainers.split1 import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer, VectorAssembler} import com.microsoft.azure.synapse.ml.core.test.base.TestBase -import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} -import com.microsoft.azure.synapse.ml.explainers.{ICETransformer, ICECategoricalFeature, ICENumericFeature} +import com.microsoft.azure.synapse.ml.explainers.{ICECategoricalFeature, ICENumericFeature, ICETransformer} +import org.apache.spark.ml.linalg.Vector + class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { @@ -21,7 +23,6 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val data: DataFrame = dataDF.withColumn("col4", rand()*100) - val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), new OneHotEncoder().setInputCol("col2_ind").setOutputCol("col2_enc"), @@ -35,22 +36,53 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor 
ice.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") - .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col4", Some(4)))) + .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col4", Some(4)))) .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) - output.show(false) val iceAvg = new ICETransformer() iceAvg.setModel(model) .setOutputCol("iceValues") .setTargetCol("probability") .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col2"))) - .setNumericFeatures(Array(ICENumericFeature("col4"), ICENumericFeature("col4", Some(3), Some(0.0), Some(100.0)))) + .setNumericFeatures(Array(ICENumericFeature("col4", Some(5)))) .setTargetClasses(Array(1)) .setKind("average") val outputAvg: DataFrame = iceAvg.transform(data) - outputAvg.show(false) + test("col2 doesn't contribute to the prediction") { + + val outputCol2: Map[String, Vector] = outputAvg.select("col2").collect().map { + case Row(map: Map[String, Vector]) => + map + }.head + + val impA: Double = outputCol2.get("a").head.toArray.head + val impB: Double = outputCol2.get("b").head.toArray.head + + assert(0.4 < impA && impA < 0.6) + assert(0.4 < impB && impB < 0.6) + + } + + test("The length of explainer map for numeric feature is equal to it's numSplits") { + + val outputCol1: Map[Double, Vector] = outputAvg.select("col4").collect().map { + case Row(map: Map[Double, Vector]) => + map + }.head + + assert(outputCol1.size == iceAvg.getNumericFeatures.head.getNumSplits + 1) + + } + + test("The length of explainer map for categorical feature is equal to it's numTopValues") { + val outputCol: Map[Double, Vector] = output.select("col4_dependence").collect().map { + case Row(map: Map[Double, Vector]) => + map + }.head + assert(outputCol.size === ice.getCategoricalFeatures.last.getNumTopValue) + } } From 51e3d4ffba93aca9a3a96fa3b285db077b814e52 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Thu, 18 Nov 2021 14:06:19 -0800 Subject: [PATCH 12/32] added python code --- .../synapse/ml/explainers/ICEFeature.py | 23 +++++++++++++++++++ .../synapse/ml/explainers/ICETransformer.py | 23 +++++++++++++++++++ .../python/synapse/ml/explainers/__init__.py | 0 .../synapse/ml/explainers/ICEExplainer.scala | 10 ++++++-- .../synapse/ml/explainers/ICEFeature.scala | 2 +- .../explainers/split1/ICEExplainerSuite.scala | 10 ++++++++ .../ml/core/test/fuzzing/FuzzingTest.scala | 3 ++- 7 files changed, 67 insertions(+), 4 deletions(-) create mode 100644 core/src/main/python/synapse/ml/explainers/ICEFeature.py create mode 100644 core/src/main/python/synapse/ml/explainers/ICETransformer.py create mode 100644 core/src/main/python/synapse/ml/explainers/__init__.py diff --git a/core/src/main/python/synapse/ml/explainers/ICEFeature.py b/core/src/main/python/synapse/ml/explainers/ICEFeature.py new file mode 100644 index 0000000000..6bc8f5ce0d --- /dev/null +++ b/core/src/main/python/synapse/ml/explainers/ICEFeature.py @@ -0,0 +1,23 @@ +from pyspark.ml.wrapper import JavaWrapper +from pyspark import SparkContext + +class ICECategoricalFeature(JavaWrapper): + def __init__(self, col: str, numTopValues: int = None): + sc = SparkContext._active_spark_context + numTopValues = sc._jvm.scala.Some(numTopValues) if numTopValues else sc._jvm.scala.Option.empty() + self._java_obj = JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICECategoricalFeature", col, numTopValues) + + def getObject(self): + 
return self._java_obj + +class ICENumericFeature(JavaWrapper): + def __init__(self, col: str, numSplits: int = None, rangeMin: float = None, rangeMax: float = None): + sc = SparkContext._active_spark_context + numSplits = sc._jvm.scala.Some(numSplits) if numSplits else sc._jvm.scala.Option.empty() + rangeMin = sc._jvm.scala.Some(rangeMin) if rangeMin else sc._jvm.scala.Option.empty() + rangeMax = sc._jvm.scala.Some(rangeMax) if rangeMax else sc._jvm.scala.Option.empty() + self._java_obj = JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICENumericFeature", col, numSplits, rangeMin, rangeMax) + + + def getObject(self): + return self._java_obj diff --git a/core/src/main/python/synapse/ml/explainers/ICETransformer.py b/core/src/main/python/synapse/ml/explainers/ICETransformer.py new file mode 100644 index 0000000000..7559e59708 --- /dev/null +++ b/core/src/main/python/synapse/ml/explainers/ICETransformer.py @@ -0,0 +1,23 @@ +from synapse.ml.explainers._ICETransformer import _ICETransformer +from pyspark.ml.common import inherit_doc +from pyspark import SparkContext + +@inherit_doc +class ICETransformer(_ICETransformer): + def setCategoricalFeatures(self, value): + """ + Args: + categoricalFeatures: The list of categorical features to explain. + """ + sc = SparkContext._active_spark_context + feature_list = [v.getObject() for v in value] + return super().setCategoricalFeatures(sc._jvm.PythonUtils.toSeq(feature_list)) + + def setNumericFeatures(self, value): + """ + Args: + categoricalFeatures: The list of categorical features to explain. + """ + sc = SparkContext._active_spark_context + feature_list = [v.getObject() for v in value] + return super().setNumericFeatures(sc._jvm.PythonUtils.toSeq(feature_list)) diff --git a/core/src/main/python/synapse/ml/explainers/__init__.py b/core/src/main/python/synapse/ml/explainers/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 878c1ad4a6..3b860e7f98 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -2,7 +2,7 @@ package com.microsoft.azure.synapse.ml.explainers import com.microsoft.azure.synapse.ml.core.contracts.HasOutputCol import com.microsoft.azure.synapse.ml.core.schema.DatasetExtensions -import org.apache.spark.ml.Transformer +import org.apache.spark.ml.{ComplexParamsWritable, Transformer} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.param.{ParamMap, ParamValidators, Params, _} import org.apache.spark.ml.util.Identifiable @@ -10,6 +10,8 @@ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer +import com.microsoft.azure.synapse.ml.codegen.Wrappable + trait ICEFeatureParams extends Params with HasNumSamples { @@ -56,7 +58,11 @@ class ICETransformer(override val uid: String) extends Transformer with HasExplainTarget with HasModel with ICEFeatureParams - with HasOutputCol { + with HasOutputCol + with Wrappable + with ComplexParamsWritable { + + override protected lazy val pyInternalWrapper = true def this() = { this(Identifiable.randomUID("ICETransformer")) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala 
b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index b154201acf..ca9a4b31dd 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -123,4 +123,4 @@ object ICENumericFeature { JsObject(map) } } -} +} \ No newline at end of file diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index 341d5e61a5..51be4cd64e 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -3,11 +3,13 @@ package com.microsoft.azure.synapse.ml.explainers.split1 import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexer, VectorAssembler} import com.microsoft.azure.synapse.ml.core.test.base.TestBase +import com.microsoft.azure.synapse.ml.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import com.microsoft.azure.synapse.ml.explainers.{ICECategoricalFeature, ICENumericFeature, ICETransformer} import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.util.MLReadable @@ -23,6 +25,8 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val data: DataFrame = dataDF.withColumn("col4", rand()*100) + data.show() + val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), new OneHotEncoder().setInputCol("col2_ind").setOutputCol("col2_enc"), @@ -40,6 +44,8 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) + output.show() + val iceAvg = new ICETransformer() iceAvg.setModel(model) .setOutputCol("iceValues") @@ -85,4 +91,8 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor assert(outputCol.size === ice.getCategoricalFeatures.last.getNumTopValue) } + + //override def testObjects(): Seq[TestObject[ICETransformer]] = Seq(new TestObject(ice, data)) + + //override def reader: MLReadable[_] = ICETransformer } diff --git a/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala b/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala index 0871c13aae..d1e61b73a1 100644 --- a/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala +++ b/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala @@ -240,7 +240,8 @@ class FuzzingTest extends TestBase { "com.microsoft.azure.synapse.ml.explainers.TextLIME", "com.microsoft.azure.synapse.ml.explainers.TextSHAP", "com.microsoft.azure.synapse.ml.explainers.VectorLIME", - "com.microsoft.azure.synapse.ml.explainers.VectorSHAP" + "com.microsoft.azure.synapse.ml.explainers.VectorSHAP", + "com.microsoft.azure.synapse.ml.explainers.ICETransformer" ) pipelineStages.foreach { stage => From bda78823a0da40f087e618ea1fa0b3b659ef8de1 Mon Sep 17 00:00:00 2001 From: ezherdeva <82470223+ezherdeva@users.noreply.github.com> Date: Fri, 19 Nov 2021 16:45:15 -0800 Subject: [PATCH 13/32] Update 
core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala Co-authored-by: Jason Wang --- .../microsoft/azure/synapse/ml/explainers/ICEExplainer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 3b860e7f98..f1d3a859fa 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -22,7 +22,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { this, "categoricalFeatures", "The list of categorical features to explain.", - {_.forall(_.validate)} + _.forall(_.validate) ) def setCategoricalFeatures(values: Seq[ICECategoricalFeature]): this.type = this.set(categoricalFeatures, values) From fa0aa6fbed379303f618d1f9e631119a6f426bc6 Mon Sep 17 00:00:00 2001 From: ezherdeva <82470223+ezherdeva@users.noreply.github.com> Date: Fri, 19 Nov 2021 16:45:33 -0800 Subject: [PATCH 14/32] Update core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala Co-authored-by: Jason Wang --- .../microsoft/azure/synapse/ml/explainers/ICEExplainer.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index f1d3a859fa..8121d87de7 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -32,7 +32,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { this, "numericFeatures", "The list of numeric features to explain.", - {_.forall(_.validate)} + _.forall(_.validate) ) def setNumericFeatures(values: Seq[ICENumericFeature]): this.type = this.set(numericFeatures, values) From adc4301bb82ab117422b8e3146804cf5d73dbb10 Mon Sep 17 00:00:00 2001 From: ezherdeva <82470223+ezherdeva@users.noreply.github.com> Date: Fri, 19 Nov 2021 16:45:55 -0800 Subject: [PATCH 15/32] Update core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala Co-authored-by: Jason Wang --- .../microsoft/azure/synapse/ml/explainers/ICEExplainer.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 8121d87de7..492dcec782 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -117,8 +117,7 @@ class ICETransformer(override val uid: String) extends Transformer .withColumn(targetClasses, this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) // collect feature values for all features from original dataset - dfWithId - val categoricalFeatures = this.getCategoricalFeatures - val numericFeatures = this.getNumericFeatures + val (categoricalFeatures, numericFeatures) = (this.getCategoricalFeatures, this.getNumericFeatures) val collectedCatFeatureValues: Map[String, Array[_]] = categoricalFeatures.map { feature => (feature.name, collectCategoricalValues(dfWithId, feature)) From 058f27bd9e87a3b3bcfb675ee5b2636c23bc9f5a Mon Sep 17 00:00:00 2001 From: 
Elena Zherdeva Date: Fri, 19 Nov 2021 16:46:10 -0800 Subject: [PATCH 16/32] fix1 --- .../synapse/ml/explainers/ICEExplainer.scala | 2 +- .../synapse/ml/explainers/ICEFeature.scala | 57 ++++++++----------- 2 files changed, 24 insertions(+), 35 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 3b860e7f98..edb0ed2196 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -242,4 +242,4 @@ class ICETransformer(override val uid: String) extends Transformer this.validateSchema(schema) schema.add(getOutputCol, ArrayType(VectorType)) } -} +} \ No newline at end of file diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index ca9a4b31dd..2f8f7153c5 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -1,45 +1,17 @@ package com.microsoft.azure.synapse.ml.explainers import spray.json._ -import DefaultJsonProtocol._ private[explainers] abstract class ICEFeature(val name: String) { def validate: Boolean } -//case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None) -// extends ICEFeature(name) { -// override def validate: Boolean = { -// numTopValues.forall(_ > 0) -// } -// -// private val defaultNumTopValue = 100 -// def getNumTopValue: Int = { -// this.numTopValues.getOrElse(defaultNumTopValue) -// } -//} -// -//object ICECategoricalFeature { -// implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = jsonFormat2(ICECategoricalFeature.apply) -//} -// -//case class ICENumericFeature(override val name: String, numSplits: Option[Int] = None, -// rangeMin: Option[Double] = None, rangeMax: Option[Double] = None) -// extends ICEFeature(name) { -// override def validate: Boolean = { -// numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) -// } -// -// private val defaultNumSplits = 10 -// def getNumSplits: Int = { -// this.numSplits.getOrElse(defaultNumSplits) -// } -//} -// -//object ICENumericFeature { -// implicit val JsonFormat: JsonFormat[ICENumericFeature] = jsonFormat4(ICENumericFeature.apply) -//} - +/** + * Represents a single categorical feature to be explained by ICE explainer. + * @param name The name of the categorical feature. + * @param numTopValues The max number of top-occurring values to be included in the categorical feature. + * Default: 100. + */ case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None) extends ICEFeature(name) { override def validate: Boolean = { @@ -52,6 +24,9 @@ case class ICECategoricalFeature(override val name: String, numTopValues: Option } } +/** + * Companion object to provide JSON serializer and deserializer for ICECategoricalFeature . + */ object ICECategoricalFeature { implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = new JsonFormat[ICECategoricalFeature] { override def read(json: JsValue): ICECategoricalFeature = { @@ -76,10 +51,21 @@ object ICECategoricalFeature { } } +/** + * Represents a single numeric feature to be explained by ICE explainer. + * @param name The name of the numeric feature. 
+ * @param numSplits The number of splits for the value range for the numeric feature. + * Default: 10.0 + * @param rangeMin Specifies the min value of the range for the numeric feature. If not specified, + * it will be computed from the background dataset. + * @param rangeMax Specifies the max value of the range for the numeric feature. If not specified, + * it will be computed from the background dataset. + */ case class ICENumericFeature(override val name: String, numSplits: Option[Int] = None, rangeMin: Option[Double] = None, rangeMax: Option[Double] = None) extends ICEFeature(name) { override def validate: Boolean = { + // rangeMax and rangeMin may not be specified, but if specified: rangeMin <= rangeMax. numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) } @@ -89,6 +75,9 @@ case class ICENumericFeature(override val name: String, numSplits: Option[Int] = } } +/** + * Companion object to provide JSON serializer and deserializer for ICENumericFeature. + */ object ICENumericFeature { implicit val JsonFormat: JsonFormat[ICENumericFeature] = new JsonFormat[ICENumericFeature] { override def read(json: JsValue): ICENumericFeature = { From 5d3d38ee203903a83cdbdcd563ee0bc9ae044b8c Mon Sep 17 00:00:00 2001 From: ezherdeva <82470223+ezherdeva@users.noreply.github.com> Date: Fri, 19 Nov 2021 16:47:00 -0800 Subject: [PATCH 17/32] Update core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala Co-authored-by: Jason Wang --- .../microsoft/azure/synapse/ml/explainers/ICEExplainer.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 492dcec782..8ed713a31b 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -126,9 +126,7 @@ class ICETransformer(override val uid: String) extends Transformer feature => (feature.name, collectSplits(dfWithId, feature)) }.toMap - val sampled = this.get(numSamples).map { - s => dfWithId.orderBy(rand()).limit(s) - }.getOrElse(dfWithId).cache() + val sampled = this.get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { f: ICECategoricalFeature => From 172a050c4ee5853c5f2dcf91bbd711be6c924835 Mon Sep 17 00:00:00 2001 From: ezherdeva <82470223+ezherdeva@users.noreply.github.com> Date: Fri, 19 Nov 2021 17:02:16 -0800 Subject: [PATCH 18/32] Update core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala Co-authored-by: Jason Wang --- .../azure/synapse/ml/explainers/ICEExplainer.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 8ed713a31b..30889203b3 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -130,11 +130,13 @@ class ICETransformer(override val uid: String) extends Transformer val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { f: ICECategoricalFeature => - calcDependence(sampled, idCol, targetClasses, f.name, 
collectedCatFeatureValues(f.name)) + val values = collectCategoricalValues(dfWithId, f) + calcDependence(sampled, idCol, targetClasses, f.name, values) } val calcNumericFunc: ICENumericFeature => DataFrame = { f: ICENumericFeature => - calcDependence(sampled, idCol, targetClasses, f.name, collectedNumFeatureValues(f.name)) + val values = collectSplits(dfWithId, f) + calcDependence(sampled, idCol, targetClasses, f.name, values) } val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) From 69486ed81dd849c89978d82ef2e35255b74bbf02 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Fri, 19 Nov 2021 17:07:18 -0800 Subject: [PATCH 19/32] fix 2 --- .../synapse/ml/explainers/ICEExplainer.scala | 15 +++++++++++---- .../azure/synapse/ml/explainers/ICEFeature.scala | 2 +- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index c1cb0950ed..f353790b52 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -54,6 +54,12 @@ trait ICEFeatureParams extends Params with HasNumSamples { categoricalFeatures -> Seq.empty[ICECategoricalFeature]) } +/** + * Transformer which displays the model dependence on specified features with the given dataframe + * as background dataset. It supports 2 types of plots: individual - dependence per instance and + * average - across all the samples in the dataset. + * Note: This transformer only supports one-way dependence plot. + */ class ICETransformer(override val uid: String) extends Transformer with HasExplainTarget with HasModel @@ -139,6 +145,9 @@ class ICETransformer(override val uid: String) extends Transformer val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) + val errorMessage = "No categorical features or numeric features are set to the explainer. " + + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained." + getKind.toLowerCase match { case this.individualKind => dependenceDfs.reduceOption(_.join(_, Seq(idCol), "inner")) @@ -148,13 +157,11 @@ class ICETransformer(override val uid: String) extends Transformer } } .map(sampled.join(_, idCol)).getOrElse( - throw new Exception("No categorical features or numeric features are set to the explainer. " + - "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") + throw new Exception(errorMessage) ) case this.averageKind => dependenceDfs.reduceOption(_ crossJoin _).getOrElse( - throw new Exception("No categorical features or numeric features are set to the explainer. 
" + - "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") + throw new Exception(errorMessage) ) } } diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index 2f8f7153c5..2023bb489b 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -25,7 +25,7 @@ case class ICECategoricalFeature(override val name: String, numTopValues: Option } /** - * Companion object to provide JSON serializer and deserializer for ICECategoricalFeature . + * Companion object to provide JSON serializer and deserializer for ICECategoricalFeature. */ object ICECategoricalFeature { implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = new JsonFormat[ICECategoricalFeature] { From 1d658d5b1be54235cb7eb4200829bcc418a1e2b1 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Sat, 20 Nov 2021 14:51:16 -0800 Subject: [PATCH 20/32] Fixed comments --- .../synapse/ml/explainers/ICEExplainer.scala | 50 ++++++++----------- .../explainers/split1/ICEExplainerSuite.scala | 50 ++++++++++++++----- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index dc42ac5100..d26d057896 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -2,7 +2,7 @@ package com.microsoft.azure.synapse.ml.explainers import com.microsoft.azure.synapse.ml.core.contracts.HasOutputCol import com.microsoft.azure.synapse.ml.core.schema.DatasetExtensions -import org.apache.spark.ml.{ComplexParamsWritable, Transformer} +import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.param.{ParamMap, ParamValidators, Params, _} import org.apache.spark.ml.util.Identifiable @@ -55,7 +55,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { } /** - * Transformer which displays the model dependence on specified features with the given dataframe + * ICETransformer displays the model dependence on specified features with the given dataframe * as background dataset. It supports 2 types of plots: individual - dependence per instance and * average - across all the samples in the dataset. * Note: This transformer only supports one-way dependence plot. 
@@ -124,15 +124,8 @@ class ICETransformer(override val uid: String) extends Transformer // collect feature values for all features from original dataset - dfWithId val (categoricalFeatures, numericFeatures) = (this.getCategoricalFeatures, this.getNumericFeatures) - - val collectedCatFeatureValues: Map[String, Array[_]] = categoricalFeatures.map { - feature => (feature.name, collectCategoricalValues(dfWithId, feature)) - }.toMap - val collectedNumFeatureValues: Map[String, Array[_]] = numericFeatures.map { - feature => (feature.name, collectSplits(dfWithId, feature)) - }.toMap - - val sampled = this.get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache + + val sampled: Dataset[Row] = this.get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { f: ICECategoricalFeature => @@ -147,32 +140,27 @@ class ICETransformer(override val uid: String) extends Transformer val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) - val errorMessage = "No categorical features or numeric features are set to the explainer. " + - "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained." - getKind.toLowerCase match { case this.individualKind => dependenceDfs.reduceOption(_.join(_, Seq(idCol), "inner")) .map { - df => (categoricalFeatures ++ numericFeatures).foldLeft(df) { - case (accDf, feature) => accDf.withColumnRenamed(feature.name, feature.name + "_dependence") - } + df => + (categoricalFeatures ++ numericFeatures).foldLeft(df) { + case (accDf, feature) => accDf.withColumnRenamed(feature.name, feature.name + "_dependence") + } } - .map(sampled.join(_, idCol)).getOrElse( - throw new Exception(errorMessage) - ) + .map(sampled.join(_, Seq(idCol), "inner").drop(idCol)).get case this.averageKind => - dependenceDfs.reduceOption(_ crossJoin _).getOrElse( - throw new Exception(errorMessage) - ) + dependenceDfs.reduceOption(_ crossJoin _).get } } private def collectCategoricalValues[_](df: DataFrame, feature: ICECategoricalFeature): Array[_] = { + val featureCountCol = DatasetExtensions.findUnusedColumnName("__feature__count__", df) val values = df .groupBy(col(feature.name)) - .agg(count("*").as("__feature__count__")) - .orderBy(col("__feature__count__").desc) + .agg(count("*").as(featureCountCol)) + .orderBy(col(featureCountCol).desc) .head(feature.getNumTopValue) .map(row => row.get(0)) values @@ -238,14 +226,20 @@ class ICETransformer(override val uid: String) extends Transformer override def copy(extra: ParamMap): Transformer = this.defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { - // Check for duplicate feature specification + // Check if features are specified val featureNames = getCategoricalFeatures.map(_.name) ++ getNumericFeatures.map(_.name) + if (featureNames.isEmpty) { + throw new Exception("No categorical features or numeric features are set to the explainer. 
" + + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") + } + // Check for duplicate feature specification val duplicateFeatureNames = featureNames.groupBy(identity).mapValues(_.length).filter(_._2 > 1).keys.toArray if (duplicateFeatureNames.nonEmpty) { throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") } - this.validateSchema(schema) schema.add(getOutputCol, ArrayType(VectorType)) } -} \ No newline at end of file +} + +object ICETransformer extends ComplexParamsReadable[ICETransformer] \ No newline at end of file diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index 51be4cd64e..d1f11d1ba1 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -12,8 +12,7 @@ import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.MLReadable - -class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransformer] { +class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] { import spark.implicits._ val dataDF: DataFrame = (1 to 100).flatMap(_ => Seq( @@ -25,8 +24,6 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor val data: DataFrame = dataDF.withColumn("col4", rand()*100) - data.show() - val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), new OneHotEncoder().setInputCol("col2_ind").setOutputCol("col2_enc"), @@ -44,8 +41,6 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) - output.show() - val iceAvg = new ICETransformer() iceAvg.setModel(model) .setOutputCol("iceValues") @@ -56,7 +51,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor .setKind("average") val outputAvg: DataFrame = iceAvg.transform(data) - test("col2 doesn't contribute to the prediction") { + test("col2 doesn't contribute to the prediction.") { val outputCol2: Map[String, Vector] = outputAvg.select("col2").collect().map { case Row(map: Map[String, Vector]) => @@ -71,7 +66,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor } - test("The length of explainer map for numeric feature is equal to it's numSplits") { + test("The length of explainer map for numeric feature is equal to it's numSplits.") { val outputCol1: Map[Double, Vector] = outputAvg.select("col4").collect().map { case Row(map: Map[Double, Vector]) => @@ -82,7 +77,7 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor } - test("The length of explainer map for categorical feature is equal to it's numTopValues") { + test("The length of explainer map for categorical feature is equal to it's numTopValues.") { val outputCol: Map[Double, Vector] = output.select("col4_dependence").collect().map { case Row(map: Map[Double, Vector]) => map @@ -92,7 +87,38 @@ class ICEExplainerSuite extends TestBase {// with TransformerFuzzing[ICETransfor } - //override def testObjects(): Seq[TestObject[ICETransformer]] = Seq(new TestObject(ice, data)) + test("No features specified.") { + val ice = new ICETransformer() + ice.setModel(model) + 
.setOutputCol("iceValues") + .setTargetCol("probability") + .setTargetClasses(Array(1)) + assertThrows[Exception](ice.transform(data)) + } + + test("Duplicate features specified.") { + val ice = new ICETransformer() + ice.setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), + ICECategoricalFeature("col2"), ICECategoricalFeature("col1"))) + .setTargetClasses(Array(1)) + assertThrows[Exception](ice.transform(data)) + } + + test("When setNumSamples is called, ICE returns correct number of rows.") { + val ice = new ICETransformer() + ice.setNumSamples(2) + .setModel(model) + .setOutputCol("iceValues") + .setTargetCol("probability") + .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col4", Some(4)))) + .setTargetClasses(Array(1)) + val output = ice.transform(data) + assert(output.count() == 2) + } - //override def reader: MLReadable[_] = ICETransformer -} + override def testObjects(): Seq[TestObject[ICETransformer]] = Seq(new TestObject(ice, data)) + override def reader: MLReadable[_] = ICETransformer +} \ No newline at end of file From 25ad8fa0814b4d2ecdac8f7da943b8ac524d453f Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Mon, 29 Nov 2021 13:35:13 -0800 Subject: [PATCH 21/32] fix comments --- .../synapse/ml/explainers/ICEExplainer.scala | 36 ++++++++++------ .../synapse/ml/explainers/ICEFeature.scala | 41 ++++++++++++++----- .../explainers/split1/ICEExplainerSuite.scala | 33 +++++++-------- 3 files changed, 70 insertions(+), 40 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index d26d057896..121ce9a07a 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -1,9 +1,10 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
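The tests above read each dependence column back as a map from feature value to the vector of explained target classes. A hedged sketch of that consumption pattern, mirroring the suite (the column name "col2" follows the fixtures at this point in the series; later patches suffix it with "_dependence" or a user-supplied output column name):

    import org.apache.spark.ml.linalg.Vector
    import org.apache.spark.sql.Row

    // `outputAvg` is the result of an average-kind ICETransformer, as in the suite.
    val col2Dependence: Map[String, Vector] = outputAvg.select("col2").collect().map {
      case Row(map: Map[String, Vector]) => map
    }.head
    col2Dependence.foreach { case (value, deps) =>
      // one entry per requested target class (here only class 1)
      println(s"col2 = $value -> ${deps.toArray.mkString(", ")}")
    }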
+ package com.microsoft.azure.synapse.ml.explainers -import com.microsoft.azure.synapse.ml.core.contracts.HasOutputCol import com.microsoft.azure.synapse.ml.core.schema.DatasetExtensions import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} -import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.param.{ParamMap, ParamValidators, Params, _} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions._ @@ -64,7 +65,6 @@ class ICETransformer(override val uid: String) extends Transformer with HasExplainTarget with HasModel with ICEFeatureParams - with HasOutputCol with Wrappable with ComplexParamsWritable { @@ -75,7 +75,7 @@ class ICETransformer(override val uid: String) extends Transformer } private def calcDependence(df: DataFrame, idCol: String, targetClassesColumn: String, - feature: String, values: Array[_]): DataFrame = { + feature: String, values: Array[_], outputColName: String): DataFrame = { val dataType = df.schema(feature).dataType val explodeFunc = explode(array(values.map(v => lit(v)): _*).cast(ArrayType(dataType))) @@ -97,7 +97,7 @@ class ICETransformer(override val uid: String) extends Transformer map_from_arrays( collect_list(feature), collect_list(dependenceCol) - ).alias(feature) + ).alias(outputColName) ) case this.individualKind => @@ -108,7 +108,7 @@ class ICETransformer(override val uid: String) extends Transformer map_from_arrays( collect_list(feature), collect_list(targetCol) - ).alias(feature) + ).alias(outputColName) ) } } @@ -124,18 +124,18 @@ class ICETransformer(override val uid: String) extends Transformer // collect feature values for all features from original dataset - dfWithId val (categoricalFeatures, numericFeatures) = (this.getCategoricalFeatures, this.getNumericFeatures) - + val sampled: Dataset[Row] = this.get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { f: ICECategoricalFeature => val values = collectCategoricalValues(dfWithId, f) - calcDependence(sampled, idCol, targetClasses, f.name, values) + calcDependence(sampled, idCol, targetClasses, f.name, values, f.getOutputColName) } val calcNumericFunc: ICENumericFeature => DataFrame = { f: ICENumericFeature => val values = collectSplits(dfWithId, f) - calcDependence(sampled, idCol, targetClasses, f.name, values) + calcDependence(sampled, idCol, targetClasses, f.name, values, f.getOutputColName) } val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) @@ -146,12 +146,12 @@ class ICETransformer(override val uid: String) extends Transformer .map { df => (categoricalFeatures ++ numericFeatures).foldLeft(df) { - case (accDf, feature) => accDf.withColumnRenamed(feature.name, feature.name + "_dependence") + case (accDf, feature) => accDf//.withColumnRenamed(feature.name, feature.getOutputColName) } } .map(sampled.join(_, Seq(idCol), "inner").drop(idCol)).get case this.averageKind => - dependenceDfs.reduceOption(_ crossJoin _).get + dependenceDfs.reduce(_ crossJoin _) } } @@ -226,6 +226,18 @@ class ICETransformer(override val uid: String) extends Transformer override def copy(extra: ParamMap): Transformer = this.defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { + // Check the data type for categorical features + val categoricalFeaturesTypes= getCategoricalFeatures.map(_.name).map(schema(_).dataType) + val allowedCategoricalTypes = Array(StringType, 
BooleanType, ByteType, ShortType, IntegerType, LongType) + require(categoricalFeaturesTypes.forall(allowedCategoricalTypes.contains(_)), + s"Data type for categorical features must be String, Boolean, Byte, Short, Integer or Long type.") + + // Check the data type for numeric features + val numericFeaturesTypes= getNumericFeatures.map(_.name).map(schema(_).dataType) + val allowedNumericTypes = Array(FloatType, DoubleType, DecimalType) + require(numericFeaturesTypes.forall(allowedNumericTypes.contains(_)), + s"Data type for numeric features must be Float, Double or Decimal type.") + // Check if features are specified val featureNames = getCategoricalFeatures.map(_.name) ++ getNumericFeatures.map(_.name) if (featureNames.isEmpty) { @@ -238,7 +250,7 @@ class ICETransformer(override val uid: String) extends Transformer throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") } this.validateSchema(schema) - schema.add(getOutputCol, ArrayType(VectorType)) + schema } } diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index 2023bb489b..b12293fe32 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -1,9 +1,14 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + package com.microsoft.azure.synapse.ml.explainers import spray.json._ -private[explainers] abstract class ICEFeature(val name: String) { +private[explainers] abstract class ICEFeature(val name: String, outputColName: Option[String] = None) { def validate: Boolean + private val defaultOutputColName = name + "_dependence" + def getOutputColName: String = this.outputColName.getOrElse(defaultOutputColName) } /** @@ -11,9 +16,12 @@ private[explainers] abstract class ICEFeature(val name: String) { * @param name The name of the categorical feature. * @param numTopValues The max number of top-occurring values to be included in the categorical feature. * Default: 100. + * @param outputColName The name for output column with explanations for the feature. + * Default: input name of the feature + _dependence. */ -case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None) - extends ICEFeature(name) { +case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None, + outputColName: Option[String] = None) + extends ICEFeature(name, outputColName) { override def validate: Boolean = { numTopValues.forall(_ > 0) } @@ -39,13 +47,17 @@ object ICECategoricalFeature { case Some(JsNumber(value)) => Some(value.toInt) case _ => None } - - ICECategoricalFeature(name, numTopValues) + val outputColName = fields.get("outputColName") match { + case Some(JsString(value)) => Some(value) + case _ => None + } + ICECategoricalFeature(name, numTopValues, outputColName) } override def write(obj: ICECategoricalFeature): JsValue = { val map = Map("name" -> JsString(obj.name))++ - obj.numTopValues.map("numTopValues" -> JsNumber(_)) + obj.numTopValues.map("numTopValues" -> JsNumber(_))++ + obj.outputColName.map("outputColName" -> JsString(_)) JsObject(map) } } @@ -60,10 +72,13 @@ object ICECategoricalFeature { * it will be computed from the background dataset. * @param rangeMax Specifies the max value of the range for the numeric feature. 
If not specified, * it will be computed from the background dataset. + * @param outputColName The name for output column with explanations for the feature. + * Default: input name of the feature + "_dependence" */ case class ICENumericFeature(override val name: String, numSplits: Option[Int] = None, - rangeMin: Option[Double] = None, rangeMax: Option[Double] = None) - extends ICEFeature(name) { + rangeMin: Option[Double] = None, rangeMax: Option[Double] = None, + outputColName: Option[String] = None) + extends ICEFeature(name, outputColName) { override def validate: Boolean = { // rangeMax and rangeMin may not be specified, but if specified: rangeMin <= rangeMax. numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) @@ -100,7 +115,12 @@ object ICENumericFeature { case JsNumber(value) => value.toDouble } - ICENumericFeature(name, numSplits, rangeMin, rangeMax) + val outputColName = fields.get("outputColName") match { + case Some(JsString(value)) => Some(value) + case _ => None + } + + ICENumericFeature(name, numSplits, rangeMin, rangeMax, outputColName) } @@ -108,7 +128,8 @@ object ICENumericFeature { val map = Map("name" -> JsString(obj.name))++ obj.numSplits.map("numSplits" -> JsNumber(_))++ obj.rangeMin.map("rangeMin" -> JsNumber(_))++ - obj.rangeMax.map("rangeMax" -> JsNumber(_)) + obj.rangeMax.map("rangeMax" -> JsNumber(_))++ + obj.outputColName.map("outputColName" -> JsString(_)) JsObject(map) } } diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index d1f11d1ba1..1d81fdca53 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -1,3 +1,6 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
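The outputColName plumbing introduced above decides how each dependence column is named: when it is left out, the default is the feature name plus a "_dependence" suffix. A small sketch of both forms; the feature and column names are illustrative only:

    val byDefault = ICECategoricalFeature("col2")
    val renamed = ICECategoricalFeature("col2", numTopValues = Some(5), outputColName = Some("col2_pdp"))
    assert(byDefault.getOutputColName == "col2_dependence")
    assert(renamed.getOutputColName == "col2_pdp")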
+ package com.microsoft.azure.synapse.ml.explainers.split1 import org.apache.spark.ml.{Pipeline, PipelineModel} @@ -6,7 +9,7 @@ import com.microsoft.azure.synapse.ml.core.test.base.TestBase import com.microsoft.azure.synapse.ml.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ -import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} +import org.apache.spark.ml.classification.LogisticRegression import com.microsoft.azure.synapse.ml.explainers.{ICECategoricalFeature, ICENumericFeature, ICETransformer} import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.MLReadable @@ -16,10 +19,10 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] import spark.implicits._ val dataDF: DataFrame = (1 to 100).flatMap(_ => Seq( - (-5d, "a", -5d, 0), - (-5d, "b", -5d, 0), - (5d, "a", 5d, 1), - (5d, "b", 5d, 1) + (-5, "a", -5, 0), + (-5, "b", -5, 0), + (5, "a", 5, 1), + (5, "b", 5, 1) )).toDF("col1", "col2", "col3", "label") val data: DataFrame = dataDF.withColumn("col4", rand()*100) @@ -32,18 +35,15 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] )) val model: PipelineModel = pipeline.fit(data) - val ice = new ICETransformer() ice.setModel(model) - .setOutputCol("iceValues") .setTargetCol("probability") - .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col4", Some(4)))) + .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col3", Some(4)))) .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) val iceAvg = new ICETransformer() iceAvg.setModel(model) - .setOutputCol("iceValues") .setTargetCol("probability") .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col2"))) .setNumericFeatures(Array(ICENumericFeature("col4", Some(5)))) @@ -53,7 +53,7 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] test("col2 doesn't contribute to the prediction.") { - val outputCol2: Map[String, Vector] = outputAvg.select("col2").collect().map { + val outputCol2: Map[String, Vector] = outputAvg.select("col2_dependence").collect().map { case Row(map: Map[String, Vector]) => map }.head @@ -68,7 +68,7 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] test("The length of explainer map for numeric feature is equal to it's numSplits.") { - val outputCol1: Map[Double, Vector] = outputAvg.select("col4").collect().map { + val outputCol1: Map[Double, Vector] = outputAvg.select("col4_dependence").collect().map { case Row(map: Map[Double, Vector]) => map }.head @@ -77,20 +77,19 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] } - test("The length of explainer map for categorical feature is equal to it's numTopValues.") { - val outputCol: Map[Double, Vector] = output.select("col4_dependence").collect().map { + test("The length of explainer map for categorical feature is less or equal to it's numTopValues.") { + val outputCol: Map[Double, Vector] = output.select("col3_dependence").collect().map { case Row(map: Map[Double, Vector]) => map }.head - assert(outputCol.size === ice.getCategoricalFeatures.last.getNumTopValue) + assert(outputCol.size <= ice.getCategoricalFeatures.last.getNumTopValue) } test("No features specified.") { val ice = new ICETransformer() ice.setModel(model) - 
.setOutputCol("iceValues") .setTargetCol("probability") .setTargetClasses(Array(1)) assertThrows[Exception](ice.transform(data)) @@ -99,7 +98,6 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] test("Duplicate features specified.") { val ice = new ICETransformer() ice.setModel(model) - .setOutputCol("iceValues") .setTargetCol("probability") .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col2"), ICECategoricalFeature("col1"))) @@ -111,9 +109,8 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] val ice = new ICETransformer() ice.setNumSamples(2) .setModel(model) - .setOutputCol("iceValues") .setTargetCol("probability") - .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col4", Some(4)))) + .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col3", Some(4)))) .setTargetClasses(Array(1)) val output = ice.transform(data) assert(output.count() == 2) From 8045357a13eeafebed0a9af4e42458099bbfc1fc Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Mon, 29 Nov 2021 14:15:28 -0800 Subject: [PATCH 22/32] fix comments 2 --- .../azure/synapse/ml/explainers/ICEExplainer.scala | 6 ------ 1 file changed, 6 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 121ce9a07a..235c1185c0 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -143,12 +143,6 @@ class ICETransformer(override val uid: String) extends Transformer getKind.toLowerCase match { case this.individualKind => dependenceDfs.reduceOption(_.join(_, Seq(idCol), "inner")) - .map { - df => - (categoricalFeatures ++ numericFeatures).foldLeft(df) { - case (accDf, feature) => accDf//.withColumnRenamed(feature.name, feature.getOutputColName) - } - } .map(sampled.join(_, Seq(idCol), "inner").drop(idCol)).get case this.averageKind => dependenceDfs.reduce(_ crossJoin _) From 2c207d30d735a5dddc7dd48f6f04c03dd75d1e72 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Thu, 2 Dec 2021 17:27:01 -0800 Subject: [PATCH 23/32] last fix --- .../python/synapse/ml/explainers/ICEFeature.py | 10 ++++++---- .../synapse/ml/explainers/ICEExplainer.scala | 15 ++++++++------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/core/src/main/python/synapse/ml/explainers/ICEFeature.py b/core/src/main/python/synapse/ml/explainers/ICEFeature.py index 6bc8f5ce0d..40438c4926 100644 --- a/core/src/main/python/synapse/ml/explainers/ICEFeature.py +++ b/core/src/main/python/synapse/ml/explainers/ICEFeature.py @@ -2,21 +2,23 @@ from pyspark import SparkContext class ICECategoricalFeature(JavaWrapper): - def __init__(self, col: str, numTopValues: int = None): + def __init__(self, col: str, numTopValues: int = None, outputColName: str = None): sc = SparkContext._active_spark_context numTopValues = sc._jvm.scala.Some(numTopValues) if numTopValues else sc._jvm.scala.Option.empty() - self._java_obj = JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICECategoricalFeature", col, numTopValues) + outputColName = sc._jvm.scala.Some(outputColName) if outputColName else sc._jvm.scala.Option.empty() + self._java_obj = 
JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICECategoricalFeature", col, numTopValues, outputColName) def getObject(self): return self._java_obj class ICENumericFeature(JavaWrapper): - def __init__(self, col: str, numSplits: int = None, rangeMin: float = None, rangeMax: float = None): + def __init__(self, col: str, numSplits: int = None, rangeMin: float = None, rangeMax: float = None, outputColName: str = None): sc = SparkContext._active_spark_context numSplits = sc._jvm.scala.Some(numSplits) if numSplits else sc._jvm.scala.Option.empty() rangeMin = sc._jvm.scala.Some(rangeMin) if rangeMin else sc._jvm.scala.Option.empty() rangeMax = sc._jvm.scala.Some(rangeMax) if rangeMax else sc._jvm.scala.Option.empty() - self._java_obj = JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICENumericFeature", col, numSplits, rangeMin, rangeMax) + outputColName = sc._jvm.scala.Some(outputColName) if outputColName else sc._jvm.scala.Option.empty() + self._java_obj = JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICENumericFeature", col, numSplits, rangeMin, rangeMax, outputColName) def getObject(self): diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 235c1185c0..7f48903d14 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -61,6 +61,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { * average - across all the samples in the dataset. * Note: This transformer only supports one-way dependence plot. */ +@org.apache.spark.annotation.Experimental class ICETransformer(override val uid: String) extends Transformer with HasExplainTarget with HasModel @@ -221,19 +222,19 @@ class ICETransformer(override val uid: String) extends Transformer override def transformSchema(schema: StructType): StructType = { // Check the data type for categorical features - val categoricalFeaturesTypes= getCategoricalFeatures.map(_.name).map(schema(_).dataType) + val categoricalFeaturesTypes= getCategoricalFeatures.map(f => schema(f.name).dataType) val allowedCategoricalTypes = Array(StringType, BooleanType, ByteType, ShortType, IntegerType, LongType) - require(categoricalFeaturesTypes.forall(allowedCategoricalTypes.contains(_)), - s"Data type for categorical features must be String, Boolean, Byte, Short, Integer or Long type.") + require(categoricalFeaturesTypes.forall(allowedCategoricalTypes.contains), + s"Data type for categorical features must be ${allowedCategoricalTypes.mkString("[", ",", "]")}.") // Check the data type for numeric features - val numericFeaturesTypes= getNumericFeatures.map(_.name).map(schema(_).dataType) + val numericFeaturesTypes= getNumericFeatures.map(f => schema(f.name).dataType) val allowedNumericTypes = Array(FloatType, DoubleType, DecimalType) - require(numericFeaturesTypes.forall(allowedNumericTypes.contains(_)), - s"Data type for numeric features must be Float, Double or Decimal type.") + require(numericFeaturesTypes.forall(allowedNumericTypes.contains), + s"Data type for numeric features must be ${allowedNumericTypes.mkString("[", ",", "]")}.") // Check if features are specified - val featureNames = getCategoricalFeatures.map(_.name) ++ getNumericFeatures.map(_.name) + val featureNames = (getCategoricalFeatures ++ getNumericFeatures).map(_.name) if 
(featureNames.isEmpty) { throw new Exception("No categorical features or numeric features are set to the explainer. " + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") From fa87e5cc699d226b4fa1b15267270d3196b9b82f Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Thu, 2 Dec 2021 18:01:00 -0800 Subject: [PATCH 24/32] added copyright to py files --- core/src/main/python/synapse/ml/explainers/ICEFeature.py | 3 +++ core/src/main/python/synapse/ml/explainers/ICETransformer.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/core/src/main/python/synapse/ml/explainers/ICEFeature.py b/core/src/main/python/synapse/ml/explainers/ICEFeature.py index 40438c4926..10196de191 100644 --- a/core/src/main/python/synapse/ml/explainers/ICEFeature.py +++ b/core/src/main/python/synapse/ml/explainers/ICEFeature.py @@ -1,3 +1,6 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + from pyspark.ml.wrapper import JavaWrapper from pyspark import SparkContext diff --git a/core/src/main/python/synapse/ml/explainers/ICETransformer.py b/core/src/main/python/synapse/ml/explainers/ICETransformer.py index 7559e59708..c45f9dd87c 100644 --- a/core/src/main/python/synapse/ml/explainers/ICETransformer.py +++ b/core/src/main/python/synapse/ml/explainers/ICETransformer.py @@ -1,3 +1,6 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + from synapse.ml.explainers._ICETransformer import _ICETransformer from pyspark.ml.common import inherit_doc from pyspark import SparkContext From 77b62679e62c2ce01995820190d4e98b2364b532 Mon Sep 17 00:00:00 2001 From: ezherdeva <82470223+ezherdeva@users.noreply.github.com> Date: Fri, 3 Dec 2021 10:58:25 -0800 Subject: [PATCH 25/32] Update src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala Co-authored-by: Kashyap Patel <64443771+ms-kashyap@users.noreply.github.com> --- .../azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala b/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala index 303526f7fd..cbefa8665b 100644 --- a/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala +++ b/src/test/scala/com/microsoft/azure/synapse/ml/core/test/fuzzing/FuzzingTest.scala @@ -241,7 +241,7 @@ class FuzzingTest extends TestBase { "com.microsoft.azure.synapse.ml.explainers.TextSHAP", "com.microsoft.azure.synapse.ml.explainers.VectorLIME", "com.microsoft.azure.synapse.ml.explainers.VectorSHAP", - "com.microsoft.azure.synapse.ml.explainers.ICETransformer" + "com.microsoft.azure.synapse.ml.explainers.ICETransformer", "com.microsoft.azure.synapse.ml.exploratory.AggregateBalanceMeasure", "com.microsoft.azure.synapse.ml.exploratory.DistributionBalanceMeasure", "com.microsoft.azure.synapse.ml.exploratory.FeatureBalanceMeasure" From 7c25c577518d297eb43464750e5084a49f0657c7 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Mon, 6 Dec 2021 22:26:01 -0800 Subject: [PATCH 26/32] fix 2 --- .../synapse/ml/explainers/ICEFeature.py | 28 -------- .../synapse/ml/explainers/ICETransformer.py | 26 -------- .../python/synapse/ml/explainers/__init__.py | 0 .../synapse/ml/explainers/ICEExplainer.scala | 65 +++++++++++++------ .../synapse/ml/explainers/ICEFeature.scala | 36 
+++++++++- .../explainers/split1/ICEExplainerSuite.scala | 47 ++++++++++++-- 6 files changed, 120 insertions(+), 82 deletions(-) delete mode 100644 core/src/main/python/synapse/ml/explainers/ICEFeature.py delete mode 100644 core/src/main/python/synapse/ml/explainers/ICETransformer.py delete mode 100644 core/src/main/python/synapse/ml/explainers/__init__.py diff --git a/core/src/main/python/synapse/ml/explainers/ICEFeature.py b/core/src/main/python/synapse/ml/explainers/ICEFeature.py deleted file mode 100644 index 10196de191..0000000000 --- a/core/src/main/python/synapse/ml/explainers/ICEFeature.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. - -from pyspark.ml.wrapper import JavaWrapper -from pyspark import SparkContext - -class ICECategoricalFeature(JavaWrapper): - def __init__(self, col: str, numTopValues: int = None, outputColName: str = None): - sc = SparkContext._active_spark_context - numTopValues = sc._jvm.scala.Some(numTopValues) if numTopValues else sc._jvm.scala.Option.empty() - outputColName = sc._jvm.scala.Some(outputColName) if outputColName else sc._jvm.scala.Option.empty() - self._java_obj = JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICECategoricalFeature", col, numTopValues, outputColName) - - def getObject(self): - return self._java_obj - -class ICENumericFeature(JavaWrapper): - def __init__(self, col: str, numSplits: int = None, rangeMin: float = None, rangeMax: float = None, outputColName: str = None): - sc = SparkContext._active_spark_context - numSplits = sc._jvm.scala.Some(numSplits) if numSplits else sc._jvm.scala.Option.empty() - rangeMin = sc._jvm.scala.Some(rangeMin) if rangeMin else sc._jvm.scala.Option.empty() - rangeMax = sc._jvm.scala.Some(rangeMax) if rangeMax else sc._jvm.scala.Option.empty() - outputColName = sc._jvm.scala.Some(outputColName) if outputColName else sc._jvm.scala.Option.empty() - self._java_obj = JavaWrapper._new_java_obj("com.microsoft.azure.synapse.ml.explainers.ICENumericFeature", col, numSplits, rangeMin, rangeMax, outputColName) - - - def getObject(self): - return self._java_obj diff --git a/core/src/main/python/synapse/ml/explainers/ICETransformer.py b/core/src/main/python/synapse/ml/explainers/ICETransformer.py deleted file mode 100644 index c45f9dd87c..0000000000 --- a/core/src/main/python/synapse/ml/explainers/ICETransformer.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (C) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See LICENSE in project root for information. - -from synapse.ml.explainers._ICETransformer import _ICETransformer -from pyspark.ml.common import inherit_doc -from pyspark import SparkContext - -@inherit_doc -class ICETransformer(_ICETransformer): - def setCategoricalFeatures(self, value): - """ - Args: - categoricalFeatures: The list of categorical features to explain. - """ - sc = SparkContext._active_spark_context - feature_list = [v.getObject() for v in value] - return super().setCategoricalFeatures(sc._jvm.PythonUtils.toSeq(feature_list)) - - def setNumericFeatures(self, value): - """ - Args: - categoricalFeatures: The list of categorical features to explain. 
- """ - sc = SparkContext._active_spark_context - feature_list = [v.getObject() for v in value] - return super().setNumericFeatures(sc._jvm.PythonUtils.toSeq(feature_list)) diff --git a/core/src/main/python/synapse/ml/explainers/__init__.py b/core/src/main/python/synapse/ml/explainers/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 7f48903d14..274927c618 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -13,6 +13,8 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer import com.microsoft.azure.synapse.ml.codegen.Wrappable +//import scala.collection.JavaConverters +import scala.jdk.CollectionConverters.asScalaBufferConverter trait ICEFeatureParams extends Params with HasNumSamples { @@ -26,9 +28,17 @@ trait ICEFeatureParams extends Params with HasNumSamples { _.forall(_.validate) ) + def setCategoricalFeatures(values: Seq[ICECategoricalFeature]): this.type = this.set(categoricalFeatures, values) def getCategoricalFeatures: Seq[ICECategoricalFeature] = $(categoricalFeatures) + def setCategoricalFeatures(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { + val features: Seq[ICECategoricalFeature] = values.asScala.toSeq.map(ICECategoricalFeature.fromMap) + + this.setCategoricalFeatures(features) + //this.set(categoricalFeatures, features) + } + val numericFeatures = new TypedArrayParam[ICENumericFeature] ( this, "numericFeatures", @@ -39,6 +49,14 @@ trait ICEFeatureParams extends Params with HasNumSamples { def setNumericFeatures(values: Seq[ICENumericFeature]): this.type = this.set(numericFeatures, values) def getNumericFeatures: Seq[ICENumericFeature] = $(numericFeatures) + def setNumericFeatures(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { + val features: Seq[ICENumericFeature] = values.asScala.toSeq.map(ICENumericFeature.fromMap) + + //this.set(numericFeatures, features) + + this.setNumericFeatures(features) + } + val kind = new Param[String] ( this, "kind", @@ -51,7 +69,8 @@ trait ICEFeatureParams extends Params with HasNumSamples { def getKind: String = $(kind) def setKind(value: String): this.type = set(kind, value) - setDefault(kind -> "individual", numericFeatures -> Seq.empty[ICENumericFeature], + setDefault(kind -> "individual", + numericFeatures -> Seq.empty[ICENumericFeature], categoricalFeatures -> Seq.empty[ICECategoricalFeature]) } @@ -69,11 +88,9 @@ class ICETransformer(override val uid: String) extends Transformer with Wrappable with ComplexParamsWritable { - override protected lazy val pyInternalWrapper = true + //override protected lazy val pyInternalWrapper = true - def this() = { - this(Identifiable.randomUID("ICETransformer")) - } + def this() = this(Identifiable.randomUID("ICETransformer")) private def calcDependence(df: DataFrame, idCol: String, targetClassesColumn: String, feature: String, values: Array[_], outputColName: String): DataFrame = { @@ -121,12 +138,12 @@ class ICETransformer(override val uid: String) extends Transformer val targetClasses = DatasetExtensions.findUnusedColumnName("targetClasses", df) val dfWithId = df .withColumn(idCol, monotonically_increasing_id()) - .withColumn(targetClasses, 
this.get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) + .withColumn(targetClasses, get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) // collect feature values for all features from original dataset - dfWithId - val (categoricalFeatures, numericFeatures) = (this.getCategoricalFeatures, this.getNumericFeatures) + val (categoricalFeatures, numericFeatures) = (getCategoricalFeatures, getNumericFeatures) - val sampled: Dataset[Row] = this.get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache + val sampled: Dataset[Row] = get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { f: ICECategoricalFeature => @@ -152,13 +169,11 @@ class ICETransformer(override val uid: String) extends Transformer private def collectCategoricalValues[_](df: DataFrame, feature: ICECategoricalFeature): Array[_] = { val featureCountCol = DatasetExtensions.findUnusedColumnName("__feature__count__", df) - val values = df - .groupBy(col(feature.name)) + df.groupBy(col(feature.name)) .agg(count("*").as(featureCountCol)) .orderBy(col(featureCountCol).desc) .head(feature.getNumTopValue) .map(row => row.get(0)) - values } private def createNSplits(n: Int)(from: Double, to: Double): Seq[Double] = { @@ -222,17 +237,25 @@ class ICETransformer(override val uid: String) extends Transformer override def transformSchema(schema: StructType): StructType = { // Check the data type for categorical features - val categoricalFeaturesTypes= getCategoricalFeatures.map(f => schema(f.name).dataType) val allowedCategoricalTypes = Array(StringType, BooleanType, ByteType, ShortType, IntegerType, LongType) - require(categoricalFeaturesTypes.forall(allowedCategoricalTypes.contains), - s"Data type for categorical features must be ${allowedCategoricalTypes.mkString("[", ",", "]")}.") - - // Check the data type for numeric features - val numericFeaturesTypes= getNumericFeatures.map(f => schema(f.name).dataType) - val allowedNumericTypes = Array(FloatType, DoubleType, DecimalType) - require(numericFeaturesTypes.forall(allowedNumericTypes.contains), - s"Data type for numeric features must be ${allowedNumericTypes.mkString("[", ",", "]")}.") - + getCategoricalFeatures.foreach { + f => + schema(f.name).dataType match { + case StringType| BooleanType | ByteType | ShortType | IntegerType | LongType => + case _ => throw new + Exception(s"Data type for categorical features" + + s" must be ${allowedCategoricalTypes.mkString("[", ",", "]")}.") + } + } + val allowedNumericTypes = Array(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, DecimalType) + getNumericFeatures.foreach { + f => + schema(f.name).dataType match { + case ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType | _: DecimalType => + case _ => throw new + Exception(s"Data type for numeric features must be ${allowedNumericTypes.mkString("[", ",", "]")}.") + } + } // Check if features are specified val featureNames = (getCategoricalFeatures ++ getNumericFeatures).map(_.name) if (featureNames.isEmpty) { diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index b12293fe32..10b05a2f1d 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -52,7 +52,6 @@ object ICECategoricalFeature { case 
_ => None } ICECategoricalFeature(name, numTopValues, outputColName) - } override def write(obj: ICECategoricalFeature): JsValue = { val map = Map("name" -> JsString(obj.name))++ @@ -61,6 +60,19 @@ object ICECategoricalFeature { JsObject(map) } } + def fromMap(inputMap: java.util.HashMap[String, Any]): ICECategoricalFeature = { + val name: String = inputMap.get("name").toString + val numTopValues: Option[Int] = inputMap.get("numTopValues") match { + case value: Integer => Some(Integer2int(value)) + case _ => None + } + val outputColName: Option[String] = inputMap.get("outputColName") match { + case value: String => Some(value) + case _ => None + } + + ICECategoricalFeature(name, numTopValues, outputColName) + } } /** @@ -121,7 +133,6 @@ object ICENumericFeature { } ICENumericFeature(name, numSplits, rangeMin, rangeMax, outputColName) - } override def write(obj: ICENumericFeature): JsValue = { @@ -133,4 +144,25 @@ object ICENumericFeature { JsObject(map) } } + def fromMap(inputMap: java.util.HashMap[String, Any]): ICENumericFeature = { + val name: String = inputMap.get("name").toString + val numSplits: Option[Int] = inputMap.get("numSplits") match { + case value: Integer => Some(Integer2int(value)) + case _ => None + } + val rangeMin: Option[Double] = inputMap.get("rangeMin") match { + case value: java.lang.Double => Some(value.doubleValue()) + case _ => None + } + val rangeMax: Option[Double] = inputMap.get("rangeMax") match { + case value: java.lang.Double => Some(value.doubleValue()) + case _ => None + } + val outputColName = inputMap.get("outputColName") match { + case value: String => Some(value) + case _ => None + } + + ICENumericFeature(name, numSplits, rangeMin, rangeMax, outputColName) + } } \ No newline at end of file diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index 1d81fdca53..19f77a3b85 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -14,6 +14,8 @@ import com.microsoft.azure.synapse.ml.explainers.{ICECategoricalFeature, ICENume import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.MLReadable +import scala.jdk.CollectionConverters.mapAsJavaMapConverter + class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] { @@ -27,6 +29,7 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] val data: DataFrame = dataDF.withColumn("col4", rand()*100) + data.show() val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), new OneHotEncoder().setInputCol("col2_ind").setOutputCol("col2_enc"), @@ -41,15 +44,18 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col3", Some(4)))) .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) + output.show(truncate = false) val iceAvg = new ICETransformer() iceAvg.setModel(model) .setTargetCol("probability") - .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col2"))) + .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col2"), + ICECategoricalFeature("col3"))) 
.setNumericFeatures(Array(ICENumericFeature("col4", Some(5)))) .setTargetClasses(Array(1)) .setKind("average") val outputAvg: DataFrame = iceAvg.transform(data) + outputAvg.show(truncate = false) test("col2 doesn't contribute to the prediction.") { @@ -61,9 +67,21 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] val impA: Double = outputCol2.get("a").head.toArray.head val impB: Double = outputCol2.get("b").head.toArray.head - assert(0.4 < impA && impA < 0.6) - assert(0.4 < impB && impB < 0.6) + val eps = 0.01 + assert((impA - impB).abs < eps) + } + + test("col3 contribute to the prediction.") { + + val outputCol3: Map[Int, Vector] = outputAvg.select("col3_dependence").collect().map { + case Row(map: Map[Int, Vector]) => + map + }.head + + val impFirst: Double = outputCol3.get(-5).head.toArray.head + val impSec: Double = outputCol3.get(5).head.toArray.head + assert((impFirst - impSec).abs > 0.4) } test("The length of explainer map for numeric feature is equal to it's numSplits.") { @@ -74,7 +92,6 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] }.head assert(outputCol1.size == iceAvg.getNumericFeatures.head.getNumSplits + 1) - } test("The length of explainer map for categorical feature is less or equal to it's numTopValues.") { @@ -84,7 +101,6 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] }.head assert(outputCol.size <= ice.getCategoricalFeatures.last.getNumTopValue) - } test("No features specified.") { @@ -116,6 +132,27 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] assert(output.count() == 2) } + test("ICECategoricalFeature is successfully created from java.util.Map") { + //val map = Map("name" -> "my_name", "numTopValues" -> 100).asJava + val map = new java.util.HashMap[String, Any]() + map.put("name", "my_name") + map.put("numTopValues", 100) + val feature = ICECategoricalFeature.fromMap(map) + println(feature) + assert(feature.name == map.get("name")) + assert(feature.numTopValues.contains(map.get("numTopValues"))) + assert(feature.outputColName.isEmpty) + } + + test("Set categorical") { + val map = new java.util.HashMap[String, Any]() + map.put("name", "col2") + map.put("numTopValues", 2) + val feature = ICECategoricalFeature.fromMap(map) + ice.setCategoricalFeatures(Array(feature)) + assert(ice.getCategoricalFeatures.head == feature) + } + override def testObjects(): Seq[TestObject[ICETransformer]] = Seq(new TestObject(ice, data)) override def reader: MLReadable[_] = ICETransformer } \ No newline at end of file From a11c718c772b80e257a58f2e9182ee0e775ba3c3 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Wed, 8 Dec 2021 17:39:56 -0800 Subject: [PATCH 27/32] fix python issue --- .../synapse/ml/explainers/ICETransformer.py | 24 +++++++++++++++++++ .../synapse/ml/explainers/ICEExplainer.scala | 19 +++++++-------- .../explainers/split1/ICEExplainerSuite.scala | 4 ++-- 3 files changed, 34 insertions(+), 13 deletions(-) create mode 100644 core/src/main/python/synapse/ml/explainers/ICETransformer.py diff --git a/core/src/main/python/synapse/ml/explainers/ICETransformer.py b/core/src/main/python/synapse/ml/explainers/ICETransformer.py new file mode 100644 index 0000000000..7b5ccd929c --- /dev/null +++ b/core/src/main/python/synapse/ml/explainers/ICETransformer.py @@ -0,0 +1,24 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
+ +from synapse.ml.explainers._ICETransformer import _ICETransformer +from pyspark.ml.common import inherit_doc +from typing import List, Dict + +@inherit_doc +class ICETransformer(_ICETransformer): + def setCategoricalFeatures(self, value: List[Dict]): + """ + Args: + value: The list of dicts with parameters for categorical features to explain. + """ + self._java_obj.setCategoricalFeatures(value) + return self + + def setNumericFeatures(self, value: List[Dict]): + """ + Args: + value: The list of dicts with parameters for numeric features to explain. + """ + self._java_obj.setNumericFeatures(value) + return self diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 274927c618..dcbb0f0641 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -13,6 +13,8 @@ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer import com.microsoft.azure.synapse.ml.codegen.Wrappable +import scala.collection.mutable + //import scala.collection.JavaConverters import scala.jdk.CollectionConverters.asScalaBufferConverter @@ -28,15 +30,12 @@ trait ICEFeatureParams extends Params with HasNumSamples { _.forall(_.validate) ) - def setCategoricalFeatures(values: Seq[ICECategoricalFeature]): this.type = this.set(categoricalFeatures, values) def getCategoricalFeatures: Seq[ICECategoricalFeature] = $(categoricalFeatures) def setCategoricalFeatures(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { - val features: Seq[ICECategoricalFeature] = values.asScala.toSeq.map(ICECategoricalFeature.fromMap) - + val features: Seq[ICECategoricalFeature] = values.asScala.toSeq.map(f => ICECategoricalFeature.fromMap(f)) this.setCategoricalFeatures(features) - //this.set(categoricalFeatures, features) } val numericFeatures = new TypedArrayParam[ICENumericFeature] ( @@ -51,9 +50,6 @@ trait ICEFeatureParams extends Params with HasNumSamples { def setNumericFeatures(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { val features: Seq[ICENumericFeature] = values.asScala.toSeq.map(ICENumericFeature.fromMap) - - //this.set(numericFeatures, features) - this.setNumericFeatures(features) } @@ -88,7 +84,7 @@ class ICETransformer(override val uid: String) extends Transformer with Wrappable with ComplexParamsWritable { - //override protected lazy val pyInternalWrapper = true + override protected lazy val pyInternalWrapper = true def this() = this(Identifiable.randomUID("ICETransformer")) @@ -237,8 +233,9 @@ class ICETransformer(override val uid: String) extends Transformer override def transformSchema(schema: StructType): StructType = { // Check the data type for categorical features + val (categoricalFeatures, numericFeatures) = (getCategoricalFeatures, getNumericFeatures) val allowedCategoricalTypes = Array(StringType, BooleanType, ByteType, ShortType, IntegerType, LongType) - getCategoricalFeatures.foreach { + categoricalFeatures.foreach { f => schema(f.name).dataType match { case StringType| BooleanType | ByteType | ShortType | IntegerType | LongType => @@ -248,7 +245,7 @@ class ICETransformer(override val uid: String) extends Transformer } } val allowedNumericTypes = Array(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, DecimalType) - getNumericFeatures.foreach { + numericFeatures.foreach 
{ f => schema(f.name).dataType match { case ByteType | ShortType | IntegerType | LongType | FloatType | DoubleType | _: DecimalType => @@ -257,7 +254,7 @@ class ICETransformer(override val uid: String) extends Transformer } } // Check if features are specified - val featureNames = (getCategoricalFeatures ++ getNumericFeatures).map(_.name) + val featureNames = (categoricalFeatures ++ numericFeatures).map(_.name) if (featureNames.isEmpty) { throw new Exception("No categorical features or numeric features are set to the explainer. " + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index 19f77a3b85..777f343f67 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -14,7 +14,7 @@ import com.microsoft.azure.synapse.ml.explainers.{ICECategoricalFeature, ICENume import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.MLReadable -import scala.jdk.CollectionConverters.mapAsJavaMapConverter +import scala.jdk.CollectionConverters._ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] { @@ -149,7 +149,7 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] map.put("name", "col2") map.put("numTopValues", 2) val feature = ICECategoricalFeature.fromMap(map) - ice.setCategoricalFeatures(Array(feature)) + ice.setCategoricalFeatures(List(map).asJava) assert(ice.getCategoricalFeatures.head == feature) } From 6483daf37e286cb3929806327072c49027c9940f Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Wed, 8 Dec 2021 17:48:35 -0800 Subject: [PATCH 28/32] fix python issue (small fix) --- .../microsoft/azure/synapse/ml/explainers/ICEExplainer.scala | 4 ---- .../synapse/ml/explainers/split1/ICEExplainerSuite.scala | 4 ---- 2 files changed, 8 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index dcbb0f0641..7de1100a95 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -12,10 +12,6 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer import com.microsoft.azure.synapse.ml.codegen.Wrappable - -import scala.collection.mutable - -//import scala.collection.JavaConverters import scala.jdk.CollectionConverters.asScalaBufferConverter trait ICEFeatureParams extends Params with HasNumSamples { diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index 777f343f67..c11668f176 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -13,7 +13,6 @@ import org.apache.spark.ml.classification.LogisticRegression import com.microsoft.azure.synapse.ml.explainers.{ICECategoricalFeature, ICENumericFeature, ICETransformer} import 
org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.MLReadable - import scala.jdk.CollectionConverters._ @@ -29,7 +28,6 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] val data: DataFrame = dataDF.withColumn("col4", rand()*100) - data.show() val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), new OneHotEncoder().setInputCol("col2_ind").setOutputCol("col2_enc"), @@ -44,7 +42,6 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col3", Some(4)))) .setTargetClasses(Array(1)) val output: DataFrame = ice.transform(data) - output.show(truncate = false) val iceAvg = new ICETransformer() iceAvg.setModel(model) @@ -55,7 +52,6 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] .setTargetClasses(Array(1)) .setKind("average") val outputAvg: DataFrame = iceAvg.transform(data) - outputAvg.show(truncate = false) test("col2 doesn't contribute to the prediction.") { From ef2c35e30628b1ec6464e0334e26728aee4539d0 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Thu, 9 Dec 2021 19:29:12 -0800 Subject: [PATCH 29/32] fixed python issue --- .../synapse/ml/explainers/ICETransformer.py | 40 +++++++++++++++---- .../synapse/ml/explainers/ICEExplainer.scala | 10 ++++- .../explainers/split1/ICEExplainerSuite.scala | 2 +- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/core/src/main/python/synapse/ml/explainers/ICETransformer.py b/core/src/main/python/synapse/ml/explainers/ICETransformer.py index 7b5ccd929c..d860fbf13e 100644 --- a/core/src/main/python/synapse/ml/explainers/ICETransformer.py +++ b/core/src/main/python/synapse/ml/explainers/ICETransformer.py @@ -3,22 +3,46 @@ from synapse.ml.explainers._ICETransformer import _ICETransformer from pyspark.ml.common import inherit_doc -from typing import List, Dict +from typing import List, Dict, Union @inherit_doc class ICETransformer(_ICETransformer): - def setCategoricalFeatures(self, value: List[Dict]): + def setCategoricalFeatures(self, values: Union[List[str], List[Dict]]): """ Args: - value: The list of dicts with parameters for categorical features to explain. + values: The list of values that represent categorical features to explain. + Values are list of dicts with parameters or just a list of names of categorical features """ - self._java_obj.setCategoricalFeatures(value) + if len(values) == 0: + pass + else: + list_values = [] + for value in values: + if isinstance(value, str): + list_values.append({"name": value}) + elif isinstance(value, dict): + list_values.append(value) + else: + pass + self._java_obj.setCategoricalFeaturesPy(list_values) return self - def setNumericFeatures(self, value: List[Dict]): + def setNumericFeatures(self, values: List[Dict]): """ Args: - value: The list of dicts with parameters for numeric features to explain. + values: The list of values that represent numeric features to explain. 
+ Values are list of dicts with parameters or just a list of names of numeric features """ - self._java_obj.setNumericFeatures(value) - return self + if len(values) == 0: + pass + else: + list_values = [] + for value in values: + if isinstance(value, str): + list_values.append({"name": value}) + elif isinstance(value, dict): + list_values.append(value) + else: + pass + self._java_obj.setNumericFeaturesPy(list_values) + return self \ No newline at end of file diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 7de1100a95..1cfa0eddaa 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -12,7 +12,12 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer import com.microsoft.azure.synapse.ml.codegen.Wrappable +import org.apache.spark.sql.execution.r.MapPartitionsRWrapper + +import scala.collection.generic.SeqForwarder +import scala.collection.{AbstractSeq, LinearSeq, SeqProxy, SeqViewLike, immutable, mutable} import scala.jdk.CollectionConverters.asScalaBufferConverter +import scala.reflect.ClassTag.AnyVal trait ICEFeatureParams extends Params with HasNumSamples { @@ -29,7 +34,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { def setCategoricalFeatures(values: Seq[ICECategoricalFeature]): this.type = this.set(categoricalFeatures, values) def getCategoricalFeatures: Seq[ICECategoricalFeature] = $(categoricalFeatures) - def setCategoricalFeatures(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { + def setCategoricalFeaturesPy(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { val features: Seq[ICECategoricalFeature] = values.asScala.toSeq.map(f => ICECategoricalFeature.fromMap(f)) this.setCategoricalFeatures(features) } @@ -44,11 +49,12 @@ trait ICEFeatureParams extends Params with HasNumSamples { def setNumericFeatures(values: Seq[ICENumericFeature]): this.type = this.set(numericFeatures, values) def getNumericFeatures: Seq[ICENumericFeature] = $(numericFeatures) - def setNumericFeatures(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { + def setNumericFeaturesPy(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { val features: Seq[ICENumericFeature] = values.asScala.toSeq.map(ICENumericFeature.fromMap) this.setNumericFeatures(features) } + val kind = new Param[String] ( this, "kind", diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index c11668f176..e4d4d0ea6d 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -145,7 +145,7 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] map.put("name", "col2") map.put("numTopValues", 2) val feature = ICECategoricalFeature.fromMap(map) - ice.setCategoricalFeatures(List(map).asJava) + ice.setCategoricalFeaturesPy(List(map).asJava) assert(ice.getCategoricalFeatures.head == feature) } From 8c3a6dcd5d1d9949d4a283d81f1309a50ac443bb Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Fri, 10 Dec 
2021 16:55:56 -0800 Subject: [PATCH 30/32] fixed comments and add more docs --- .../synapse/ml/explainers/ICEExplainer.scala | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 1cfa0eddaa..d2ddd85478 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -12,12 +12,8 @@ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.ml.stat.Summarizer import com.microsoft.azure.synapse.ml.codegen.Wrappable -import org.apache.spark.sql.execution.r.MapPartitionsRWrapper - -import scala.collection.generic.SeqForwarder -import scala.collection.{AbstractSeq, LinearSeq, SeqProxy, SeqViewLike, immutable, mutable} import scala.jdk.CollectionConverters.asScalaBufferConverter -import scala.reflect.ClassTag.AnyVal + trait ICEFeatureParams extends Params with HasNumSamples { @@ -104,7 +100,7 @@ class ICETransformer(override val uid: String) extends Transformer val result = predicted.withColumn(targetCol, explainTarget) getKind.toLowerCase match { - case this.averageKind => + case `averageKind` => // PDP output schema: 1 row * 1 col (pdp for the given feature: feature_value -> explanations) result .groupBy(feature) @@ -116,7 +112,7 @@ class ICETransformer(override val uid: String) extends Transformer ).alias(outputColName) ) - case this.individualKind => + case `individualKind` => // ICE output schema: n rows * 2 cols (idCol + ice for the given feature: map(feature_value -> explanations)) result .groupBy(idCol) @@ -138,11 +134,13 @@ class ICETransformer(override val uid: String) extends Transformer .withColumn(idCol, monotonically_increasing_id()) .withColumn(targetClasses, get(targetClassesCol).map(col).getOrElse(lit(getTargetClasses))) - // collect feature values for all features from original dataset - dfWithId + // Collect feature values for all features from original dataset - dfWithId val (categoricalFeatures, numericFeatures) = (getCategoricalFeatures, getNumericFeatures) + // If numSamples is specified, randomly pick numSamples instances from the input dataset val sampled: Dataset[Row] = get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache + // Collect values from the input dataframe and create dependenceDF from them val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { f: ICECategoricalFeature => val values = collectCategoricalValues(dfWithId, f) @@ -156,11 +154,14 @@ class ICETransformer(override val uid: String) extends Transformer val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) + // In the case of ICE, the function will return the initial df with columns corresponding to each feature to explain + // In the case of PDP the function will return df with a shape (1 row * number of features to explain) + getKind.toLowerCase match { - case this.individualKind => + case `individualKind` => dependenceDfs.reduceOption(_.join(_, Seq(idCol), "inner")) .map(sampled.join(_, Seq(idCol), "inner").drop(idCol)).get - case this.averageKind => + case `averageKind` => dependenceDfs.reduce(_ crossJoin _) } } From e49201442aa2c99d2b976909dd603233c2dab857 Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Wed, 15 Dec 2021 15:11:01 -0800 
Subject: [PATCH 31/32] fix comments --- .../synapse/ml/explainers/ICETransformer.py | 4 +- .../synapse/ml/explainers/ICEExplainer.scala | 34 ++++---- .../synapse/ml/explainers/ICEFeature.scala | 84 ++++--------------- .../explainers/split1/ICEExplainerSuite.scala | 65 +++++++------- 4 files changed, 64 insertions(+), 123 deletions(-) diff --git a/core/src/main/python/synapse/ml/explainers/ICETransformer.py b/core/src/main/python/synapse/ml/explainers/ICETransformer.py index d860fbf13e..24e947af65 100644 --- a/core/src/main/python/synapse/ml/explainers/ICETransformer.py +++ b/core/src/main/python/synapse/ml/explainers/ICETransformer.py @@ -7,7 +7,7 @@ @inherit_doc class ICETransformer(_ICETransformer): - def setCategoricalFeatures(self, values: Union[List[str], List[Dict]]): + def setCategoricalFeatures(self, values: List[Union[str, Dict]]): """ Args: values: The list of values that represent categorical features to explain. @@ -27,7 +27,7 @@ def setCategoricalFeatures(self, values: Union[List[str], List[Dict]]): self._java_obj.setCategoricalFeaturesPy(list_values) return self - def setNumericFeatures(self, values: List[Dict]): + def setNumericFeatures(self, values: List[Union[str, Dict]]): """ Args: values: The list of values that represent numeric features to explain. diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index d2ddd85478..9f7664f71d 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -31,7 +31,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { def getCategoricalFeatures: Seq[ICECategoricalFeature] = $(categoricalFeatures) def setCategoricalFeaturesPy(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { - val features: Seq[ICECategoricalFeature] = values.asScala.toSeq.map(f => ICECategoricalFeature.fromMap(f)) + val features: Seq[ICECategoricalFeature] = values.asScala.map(f => ICECategoricalFeature.fromMap(f)) this.setCategoricalFeatures(features) } @@ -46,7 +46,7 @@ trait ICEFeatureParams extends Params with HasNumSamples { def getNumericFeatures: Seq[ICENumericFeature] = $(numericFeatures) def setNumericFeaturesPy(values: java.util.List[java.util.HashMap[String, Any]]): this.type = { - val features: Seq[ICENumericFeature] = values.asScala.toSeq.map(ICENumericFeature.fromMap) + val features: Seq[ICENumericFeature] = values.asScala.map(ICENumericFeature.fromMap) this.setNumericFeatures(features) } @@ -141,18 +141,16 @@ class ICETransformer(override val uid: String) extends Transformer val sampled: Dataset[Row] = get(numSamples).map(dfWithId.orderBy(rand()).limit).getOrElse(dfWithId).cache // Collect values from the input dataframe and create dependenceDF from them - val calcCategoricalFunc: ICECategoricalFeature => DataFrame = { - f: ICECategoricalFeature => - val values = collectCategoricalValues(dfWithId, f) - calcDependence(sampled, idCol, targetClasses, f.name, values, f.getOutputColName) + val features = categoricalFeatures ++ numericFeatures + val dependenceDfs= features.map { + case f: ICECategoricalFeature => + (f, collectCategoricalValues(dfWithId, f)) + case f: ICENumericFeature => + (f, collectSplits(dfWithId, f)) + }.map { + case (f, values) => + calcDependence(sampled, idCol, targetClasses, f.getName, values, f.getOutputColName) } - val calcNumericFunc: ICENumericFeature => 
DataFrame = { - f: ICENumericFeature => - val values = collectSplits(dfWithId, f) - calcDependence(sampled, idCol, targetClasses, f.name, values, f.getOutputColName) - } - - val dependenceDfs = (categoricalFeatures map calcCategoricalFunc) ++ (numericFeatures map calcNumericFunc) // In the case of ICE, the function will return the initial df with columns corresponding to each feature to explain // In the case of PDP the function will return df with a shape (1 row * number of features to explain) @@ -167,10 +165,10 @@ class ICETransformer(override val uid: String) extends Transformer } private def collectCategoricalValues[_](df: DataFrame, feature: ICECategoricalFeature): Array[_] = { - val featureCountCol = DatasetExtensions.findUnusedColumnName("__feature__count__", df) + val featureCount = DatasetExtensions.findUnusedColumnName("__feature__count__", df) df.groupBy(col(feature.name)) - .agg(count("*").as(featureCountCol)) - .orderBy(col(featureCountCol).desc) + .agg(count("*").as(featureCount)) + .orderBy(col(featureCount).desc) .head(feature.getNumTopValue) .map(row => row.get(0)) } @@ -257,7 +255,7 @@ class ICETransformer(override val uid: String) extends Transformer } } // Check if features are specified - val featureNames = (categoricalFeatures ++ numericFeatures).map(_.name) + val featureNames = (categoricalFeatures ++ numericFeatures).map(_.getName) if (featureNames.isEmpty) { throw new Exception("No categorical features or numeric features are set to the explainer. " + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") @@ -267,7 +265,7 @@ class ICETransformer(override val uid: String) extends Transformer if (duplicateFeatureNames.nonEmpty) { throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") } - this.validateSchema(schema) + validateSchema(schema) schema } } diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index 10b05a2f1d..afb32cfd35 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -3,12 +3,14 @@ package com.microsoft.azure.synapse.ml.explainers +import spray.json.DefaultJsonProtocol._ import spray.json._ -private[explainers] abstract class ICEFeature(val name: String, outputColName: Option[String] = None) { +private[explainers] abstract class ICEFeature(name: String, outputColName: Option[String] = None) { def validate: Boolean private val defaultOutputColName = name + "_dependence" def getOutputColName: String = this.outputColName.getOrElse(defaultOutputColName) + def getName: String = name } /** @@ -19,16 +21,15 @@ private[explainers] abstract class ICEFeature(val name: String, outputColName: O * @param outputColName The name for output column with explanations for the feature. * Default: input name of the feature + _dependence. 
*/ -case class ICECategoricalFeature(override val name: String, numTopValues: Option[Int] = None, +case class ICECategoricalFeature(name: String, numTopValues: Option[Int] = None, outputColName: Option[String] = None) extends ICEFeature(name, outputColName) { override def validate: Boolean = { numTopValues.forall(_ > 0) } - private val defaultNumTopValue = 100 def getNumTopValue: Int = { - this.numTopValues.getOrElse(defaultNumTopValue) + this.numTopValues.getOrElse(ICECategoricalFeature.DefaultNumTopValue) } } @@ -36,36 +37,15 @@ case class ICECategoricalFeature(override val name: String, numTopValues: Option * Companion object to provide JSON serializer and deserializer for ICECategoricalFeature. */ object ICECategoricalFeature { - implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = new JsonFormat[ICECategoricalFeature] { - override def read(json: JsValue): ICECategoricalFeature = { - val fields = json.asJsObject.fields - val name = fields("name") match { - case JsString(value) => value - case _ => throw new Exception("The name field must be a JsString.") - } - val numTopValues = fields.get("numTopValues") match { - case Some(JsNumber(value)) => Some(value.toInt) - case _ => None - } - val outputColName = fields.get("outputColName") match { - case Some(JsString(value)) => Some(value) - case _ => None - } - ICECategoricalFeature(name, numTopValues, outputColName) - } - override def write(obj: ICECategoricalFeature): JsValue = { - val map = Map("name" -> JsString(obj.name))++ - obj.numTopValues.map("numTopValues" -> JsNumber(_))++ - obj.outputColName.map("outputColName" -> JsString(_)) - JsObject(map) - } - } + val DefaultNumTopValue: Int = 100 + implicit val JsonFormat: JsonFormat[ICECategoricalFeature] = jsonFormat3(ICECategoricalFeature.apply) def fromMap(inputMap: java.util.HashMap[String, Any]): ICECategoricalFeature = { val name: String = inputMap.get("name").toString val numTopValues: Option[Int] = inputMap.get("numTopValues") match { case value: Integer => Some(Integer2int(value)) case _ => None } + val outputColName: Option[String] = inputMap.get("outputColName") match { case value: String => Some(value) case _ => None @@ -87,7 +67,7 @@ object ICECategoricalFeature { * @param outputColName The name for output column with explanations for the feature. * Default: input name of the feature + "_dependence" */ -case class ICENumericFeature(override val name: String, numSplits: Option[Int] = None, +case class ICENumericFeature(name: String, numSplits: Option[Int] = None, rangeMin: Option[Double] = None, rangeMax: Option[Double] = None, outputColName: Option[String] = None) extends ICEFeature(name, outputColName) { @@ -96,9 +76,8 @@ case class ICENumericFeature(override val name: String, numSplits: Option[Int] = numSplits.forall(_ > 0) && (rangeMax.isEmpty || rangeMin.isEmpty || rangeMin.get <= rangeMax.get) } - private val defaultNumSplits = 10 def getNumSplits: Int = { - this.numSplits.getOrElse(defaultNumSplits) + this.numSplits.getOrElse(ICENumericFeature.DefaultNumSplits) } } @@ -106,58 +85,25 @@ case class ICENumericFeature(override val name: String, numSplits: Option[Int] = * Companion object to provide JSON serializer and deserializer for ICENumericFeature. 
*/ object ICENumericFeature { - implicit val JsonFormat: JsonFormat[ICENumericFeature] = new JsonFormat[ICENumericFeature] { - override def read(json: JsValue): ICENumericFeature = { - val fields = json.asJsObject.fields - val name = fields("name") match { - case JsString(value) => value - case _ => throw new Exception("The name field must be a JsString.") - } - - val numSplits = fields.get("numSplits") match { - case Some(JsNumber(value)) => Some(value.toInt) - case _ => None - } - - val rangeMin = fields.get("rangeMin").map { - case JsNumber(value) => value.toDouble - } - - val rangeMax = fields.get("rangeMax").map { - case JsNumber(value) => value.toDouble - } - - val outputColName = fields.get("outputColName") match { - case Some(JsString(value)) => Some(value) - case _ => None - } - - ICENumericFeature(name, numSplits, rangeMin, rangeMax, outputColName) - } - - override def write(obj: ICENumericFeature): JsValue = { - val map = Map("name" -> JsString(obj.name))++ - obj.numSplits.map("numSplits" -> JsNumber(_))++ - obj.rangeMin.map("rangeMin" -> JsNumber(_))++ - obj.rangeMax.map("rangeMax" -> JsNumber(_))++ - obj.outputColName.map("outputColName" -> JsString(_)) - JsObject(map) - } - } + val DefaultNumSplits: Int = 10 + implicit val JsonFormat: JsonFormat[ICENumericFeature] = jsonFormat5(ICENumericFeature.apply) def fromMap(inputMap: java.util.HashMap[String, Any]): ICENumericFeature = { val name: String = inputMap.get("name").toString val numSplits: Option[Int] = inputMap.get("numSplits") match { case value: Integer => Some(Integer2int(value)) case _ => None } + val rangeMin: Option[Double] = inputMap.get("rangeMin") match { case value: java.lang.Double => Some(value.doubleValue()) case _ => None } + val rangeMax: Option[Double] = inputMap.get("rangeMax") match { case value: java.lang.Double => Some(value.doubleValue()) case _ => None } + val outputColName = inputMap.get("outputColName") match { case value: String => Some(value) case _ => None diff --git a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala index e4d4d0ea6d..08376dc3f0 100644 --- a/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala +++ b/core/src/test/scala/com/microsoft/azure/synapse/ml/explainers/split1/ICEExplainerSuite.scala @@ -19,88 +19,86 @@ import scala.jdk.CollectionConverters._ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] { import spark.implicits._ - val dataDF: DataFrame = (1 to 100).flatMap(_ => Seq( + lazy val dataDF: DataFrame = (1 to 100).flatMap(_ => Seq( (-5, "a", -5, 0), (-5, "b", -5, 0), (5, "a", 5, 1), (5, "b", 5, 1) )).toDF("col1", "col2", "col3", "label") - val data: DataFrame = dataDF.withColumn("col4", rand()*100) + lazy val data: DataFrame = dataDF.withColumn("col4", rand()*100) - val pipeline: Pipeline = new Pipeline().setStages(Array( + lazy val pipeline: Pipeline = new Pipeline().setStages(Array( new StringIndexer().setInputCol("col2").setOutputCol("col2_ind"), new OneHotEncoder().setInputCol("col2_ind").setOutputCol("col2_enc"), new VectorAssembler().setInputCols(Array("col1", "col2_enc", "col3", "col4")).setOutputCol("features"), new LogisticRegression().setLabelCol("label").setFeaturesCol("features") )) - val model: PipelineModel = pipeline.fit(data) + lazy val model: PipelineModel = pipeline.fit(data) - val ice = new ICETransformer() - ice.setModel(model) + lazy val ice: 
ICETransformer = new ICETransformer() + .setModel(model) .setTargetCol("probability") .setCategoricalFeatures(Array(ICECategoricalFeature("col2", Some(2)), ICECategoricalFeature("col3", Some(4)))) .setTargetClasses(Array(1)) - val output: DataFrame = ice.transform(data) + lazy val output: DataFrame = ice.transform(data).cache() - val iceAvg = new ICETransformer() - iceAvg.setModel(model) + lazy val iceAvg: ICETransformer = new ICETransformer() + .setModel(model) .setTargetCol("probability") .setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), ICECategoricalFeature("col2"), ICECategoricalFeature("col3"))) .setNumericFeatures(Array(ICENumericFeature("col4", Some(5)))) .setTargetClasses(Array(1)) .setKind("average") - val outputAvg: DataFrame = iceAvg.transform(data) + lazy val outputAvg: DataFrame = iceAvg.transform(data).cache() + + // Helper function which returns value from first row in a column specified by "colName". + def getFirstValueFromOutput(output: DataFrame, colName: String): Map[_, Vector] = { + output.select(colName).collect().map { + case Row(map: Map[String, Vector]) => map + case Row(map: Map[Int, Vector]) => map + case Row(map: Map[Double, Vector]) => map + }.head + } test("col2 doesn't contribute to the prediction.") { - - val outputCol2: Map[String, Vector] = outputAvg.select("col2_dependence").collect().map { - case Row(map: Map[String, Vector]) => - map - }.head + val outputCol2: Map[String, Vector] = + getFirstValueFromOutput(outputAvg, "col2_dependence").asInstanceOf[Map[String, Vector]] val impA: Double = outputCol2.get("a").head.toArray.head val impB: Double = outputCol2.get("b").head.toArray.head - val eps = 0.01 assert((impA - impB).abs < eps) } test("col3 contribute to the prediction.") { - val outputCol3: Map[Int, Vector] = outputAvg.select("col3_dependence").collect().map { - case Row(map: Map[Int, Vector]) => - map - }.head + val outputCol3: Map[Int, Vector] = + getFirstValueFromOutput(outputAvg, "col3_dependence").asInstanceOf[Map[Int, Vector]] val impFirst: Double = outputCol3.get(-5).head.toArray.head val impSec: Double = outputCol3.get(5).head.toArray.head - assert((impFirst - impSec).abs > 0.4) } test("The length of explainer map for numeric feature is equal to it's numSplits.") { - val outputCol1: Map[Double, Vector] = outputAvg.select("col4_dependence").collect().map { - case Row(map: Map[Double, Vector]) => - map - }.head + val outputCol1: Map[Double, Vector] = + getFirstValueFromOutput(outputAvg, "col4_dependence").asInstanceOf[Map[Double, Vector]] assert(outputCol1.size == iceAvg.getNumericFeatures.head.getNumSplits + 1) } test("The length of explainer map for categorical feature is less or equal to it's numTopValues.") { - val outputCol: Map[Double, Vector] = output.select("col3_dependence").collect().map { - case Row(map: Map[Double, Vector]) => - map - }.head + val outputCol: Map[Double, Vector] = + getFirstValueFromOutput(output, "col3_dependence").asInstanceOf[Map[Double, Vector]] assert(outputCol.size <= ice.getCategoricalFeatures.last.getNumTopValue) } test("No features specified.") { - val ice = new ICETransformer() + val ice: ICETransformer = new ICETransformer() ice.setModel(model) .setTargetCol("probability") .setTargetClasses(Array(1)) @@ -108,7 +106,7 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] } test("Duplicate features specified.") { - val ice = new ICETransformer() + val ice: ICETransformer = new ICETransformer() ice.setModel(model) .setTargetCol("probability") 
.setCategoricalFeatures(Array(ICECategoricalFeature("col1", Some(100)), @@ -118,7 +116,7 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] } test("When setNumSamples is called, ICE returns correct number of rows.") { - val ice = new ICETransformer() + val ice: ICETransformer = new ICETransformer() ice.setNumSamples(2) .setModel(model) .setTargetCol("probability") @@ -129,7 +127,6 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] } test("ICECategoricalFeature is successfully created from java.util.Map") { - //val map = Map("name" -> "my_name", "numTopValues" -> 100).asJava val map = new java.util.HashMap[String, Any]() map.put("name", "my_name") map.put("numTopValues", 100) @@ -151,4 +148,4 @@ class ICEExplainerSuite extends TestBase with TransformerFuzzing[ICETransformer] override def testObjects(): Seq[TestObject[ICETransformer]] = Seq(new TestObject(ice, data)) override def reader: MLReadable[_] = ICETransformer -} \ No newline at end of file +} From 61624ded4ee447f0225ce44e2161a31508523e7f Mon Sep 17 00:00:00 2001 From: Elena Zherdeva Date: Wed, 15 Dec 2021 15:54:54 -0800 Subject: [PATCH 32/32] fix code style --- .../azure/synapse/ml/explainers/ICEExplainer.scala | 11 ++++++++--- .../azure/synapse/ml/explainers/ICEFeature.scala | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala index 9f7664f71d..9bf5f2650c 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEExplainer.scala @@ -50,7 +50,6 @@ trait ICEFeatureParams extends Params with HasNumSamples { this.setNumericFeatures(features) } - val kind = new Param[String] ( this, "kind", @@ -197,6 +196,7 @@ class ICETransformer(override val uid: String) extends Transformer } else { createSplits(mi, ma) } + case _ => createSplits(mi, ma) } @@ -227,6 +227,7 @@ class ICETransformer(override val uid: String) extends Transformer createSplits(mi, ma) } } + values.toArray } @@ -239,12 +240,13 @@ class ICETransformer(override val uid: String) extends Transformer categoricalFeatures.foreach { f => schema(f.name).dataType match { - case StringType| BooleanType | ByteType | ShortType | IntegerType | LongType => + case StringType | BooleanType | ByteType | ShortType | IntegerType | LongType => case _ => throw new Exception(s"Data type for categorical features" + s" must be ${allowedCategoricalTypes.mkString("[", ",", "]")}.") } } + val allowedNumericTypes = Array(ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, DecimalType) numericFeatures.foreach { f => @@ -254,20 +256,23 @@ class ICETransformer(override val uid: String) extends Transformer Exception(s"Data type for numeric features must be ${allowedNumericTypes.mkString("[", ",", "]")}.") } } + // Check if features are specified val featureNames = (categoricalFeatures ++ numericFeatures).map(_.getName) if (featureNames.isEmpty) { throw new Exception("No categorical features or numeric features are set to the explainer. 
" + "Call setCategoricalFeatures or setNumericFeatures to set the features to be explained.") } + // Check for duplicate feature specification val duplicateFeatureNames = featureNames.groupBy(identity).mapValues(_.length).filter(_._2 > 1).keys.toArray if (duplicateFeatureNames.nonEmpty) { throw new Exception(s"Duplicate features specified: ${duplicateFeatureNames.mkString(", ")}") } + validateSchema(schema) schema } } -object ICETransformer extends ComplexParamsReadable[ICETransformer] \ No newline at end of file +object ICETransformer extends ComplexParamsReadable[ICETransformer] diff --git a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala index afb32cfd35..7cd1a2ebab 100644 --- a/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala +++ b/core/src/main/scala/com/microsoft/azure/synapse/ml/explainers/ICEFeature.scala @@ -111,4 +111,4 @@ object ICENumericFeature { ICENumericFeature(name, numSplits, rangeMin, rangeMax, outputColName) } -} \ No newline at end of file +}