Merge pull request #1 from combust/master
Sync with combust master
talalryz authored Jun 16, 2020
2 parents 7b40eb7 + 41dbde9 commit 63a90cc
Showing 115 changed files with 11,063 additions and 551 deletions.
103 changes: 60 additions & 43 deletions .travis.yml
@@ -1,43 +1,60 @@
# Use container-based infrastructure
sudo: required
dist: trusty

services:
- docker

language: scala
scala:
- 2.11.8

python:
- "2.7"
- "3.6"
- "3.7"

install:
- pip install --user tox
script:
- tox -c python/tox.ini -e $(echo py$TRAVIS_PYTHON_VERSION | tr -d .) -v
- travis/travis.sh

notifications:
on_success: change
on_failure: always
urls:
- https://webhooks.gitter.im/e/65cd239c75c243ce635e
slack:
rooms:
- secure: "RLxhNQarrr8WNtJRSx855EJX6w9XsyvzfhJGHrJYcNs5hNZfnCcWJaGRExqSHjd3UKpyZOgOAa0+5IrUl4Pbug7Dzq+Kk5nIOxdDILfcL8giVXV2bMfeGADKyiDdkwyeud4eb0Oc7uVhFfmelW8d3EFPWio4yXUaVGZl0cNpCfx9iF1r0vLwRb7U5fiicuvpz8tNZkw6Vp8lrW0+teeXWpzVopxh4gTZxULlaVvp28OIJ09dS2k02baL6g3L4G4z9IxW4adEdH6DpAbw0JgyAFjZE1BXoEQuoZvHjoYstwy62f+maB8c2xNEB9bzobUruc0JHLKly0AdOqtKL25OCeSD3qBVWZKK/TOSQ4phY+iapI1JB5JCoZj/z/k5YogxdRGjFGyUhupuR8+kqZuAUAegD9N5wOLgwei5GTD2oim90ZixeS1Do7GvqhRiyaRHSL0CKRyJXV/iiDr1a8zV5tW8uREZX29Jq5DObxCTLDrWGjqijNWC06lOV0dhvF83qov9A1c+yulrS28qP7Hn7MXQWETShvlw4YxBCDGQLNJ3BMwA9nOx9O0ZI+0KCNVXPu6n8ZVbahJg0LoYZ2ftGFiwxq1/svlhInUN57MXkgI6iDf7NojAL7urGwN3YWbxuJemnASjPtigkN2yboAsBG/aSjOiYNeIVfna//XrFeI="
- secure: "Afbm/moiSBB7Kn9J9mX4h+ss4cju6WvOBqup0WdkNCUPWKPFs3XRkt8sDZzAyhn7Fu2w4+NclGI/QHhMo8fC+eZG9fhbpPyVoZv1o9e/43qALOB4nXnZGZeGQLdDTFLcx6ylNEK63cQrro5xY7jCwfPYhuakGr9SWkabX6mKqZWThNyH2HPbKQZX540rRwPOxIVgnkhHleCC1vosfgD1brNBt/100cXZXBVjP7GXSJbBDPuKf+cG+bUNET0xxoBK9GurZl8UDhIoo7jfoAj1tY9l0cgzMjxdSO5Yn5Z73DMT3XfiOlT8Hjp+fC37pvssKsfSg72KgTidva71CGwCoJ/B4ly2BtiXNuazzd1vAyCirbJKT8YQwPGihElkaANtMcpWjtnn0v3nTfMvooMT+HRip4kBtfE7Rl+JeVKET0Hhi0Er16eGoCL/XzVuJB7Cfh6B6UJTqHSv3PqeCVP3PQm+qp07VT6KjmMXa2rbMuh5ybs6hQIgZLVc3BeAjGWCZndRxcwgNpG+UvT7Fz87fWOBrN6xZVctXylWatjizf1IXEwBz25X05gMBnafeC3whhX/8jSR3NikwVgY4UQb4mgfSPqXgCeaVQ8VHeLpTM7swVI3q1Vcmms/v7YdPEFGDFLcyYo24GojgBaBZCl7WAk4QxxGrAANqcMpOWXhALo="
on_success: change
on_failure: always

jdk:
- oraclejdk8
cache:
directories:
- $HOME/.ivy2/cache
- $HOME/.sbt/boot/
before_cache:
- find $HOME/.ivy2 -name "ivydata-*.properties" -delete
- find $HOME/.sbt -name "*.lock" -delete
# Use container-based infrastructure
sudo: required
dist: trusty

services:
- docker

language: scala
scala:
- 2.11.8
jdk:
- oraclejdk8 # sbt +compile fails on jdk11 (xenial's default)

jobs:
include:
- stage: "Mleap tests"
name: "Scala Tests"
script:
- travis/travis.sh

- name: "Python 3.6 tests"
language: python
python: 3.6
install:
- pip install "virtualenv<20.0.0" tox # Bug https://github.com/pypa/virtualenv/issues/1551
script:
- make py36_test

- name: "Python 3.7 tests"
language: python # This can't be java and python at the same time
python: 3.7
env:
# Required when not setting 'language: java' to force the correct jdk8
- PATH=$(echo "$PATH" | sed -e 's/:\/usr\/local\/lib\/jvm\/openjdk11\/bin//')
- JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-amd64"
dist: xenial # Travis won't install py37 on trusty
install:
- pip install "virtualenv<20.0.0" tox # Bug https://github.com/pypa/virtualenv/issues/1551
script:
- make py37_test

notifications:
on_success: change
on_failure: always
urls:
- https://webhooks.gitter.im/e/65cd239c75c243ce635e
slack:
rooms:
- secure: "RLxhNQarrr8WNtJRSx855EJX6w9XsyvzfhJGHrJYcNs5hNZfnCcWJaGRExqSHjd3UKpyZOgOAa0+5IrUl4Pbug7Dzq+Kk5nIOxdDILfcL8giVXV2bMfeGADKyiDdkwyeud4eb0Oc7uVhFfmelW8d3EFPWio4yXUaVGZl0cNpCfx9iF1r0vLwRb7U5fiicuvpz8tNZkw6Vp8lrW0+teeXWpzVopxh4gTZxULlaVvp28OIJ09dS2k02baL6g3L4G4z9IxW4adEdH6DpAbw0JgyAFjZE1BXoEQuoZvHjoYstwy62f+maB8c2xNEB9bzobUruc0JHLKly0AdOqtKL25OCeSD3qBVWZKK/TOSQ4phY+iapI1JB5JCoZj/z/k5YogxdRGjFGyUhupuR8+kqZuAUAegD9N5wOLgwei5GTD2oim90ZixeS1Do7GvqhRiyaRHSL0CKRyJXV/iiDr1a8zV5tW8uREZX29Jq5DObxCTLDrWGjqijNWC06lOV0dhvF83qov9A1c+yulrS28qP7Hn7MXQWETShvlw4YxBCDGQLNJ3BMwA9nOx9O0ZI+0KCNVXPu6n8ZVbahJg0LoYZ2ftGFiwxq1/svlhInUN57MXkgI6iDf7NojAL7urGwN3YWbxuJemnASjPtigkN2yboAsBG/aSjOiYNeIVfna//XrFeI="
- secure: "Afbm/moiSBB7Kn9J9mX4h+ss4cju6WvOBqup0WdkNCUPWKPFs3XRkt8sDZzAyhn7Fu2w4+NclGI/QHhMo8fC+eZG9fhbpPyVoZv1o9e/43qALOB4nXnZGZeGQLdDTFLcx6ylNEK63cQrro5xY7jCwfPYhuakGr9SWkabX6mKqZWThNyH2HPbKQZX540rRwPOxIVgnkhHleCC1vosfgD1brNBt/100cXZXBVjP7GXSJbBDPuKf+cG+bUNET0xxoBK9GurZl8UDhIoo7jfoAj1tY9l0cgzMjxdSO5Yn5Z73DMT3XfiOlT8Hjp+fC37pvssKsfSg72KgTidva71CGwCoJ/B4ly2BtiXNuazzd1vAyCirbJKT8YQwPGihElkaANtMcpWjtnn0v3nTfMvooMT+HRip4kBtfE7Rl+JeVKET0Hhi0Er16eGoCL/XzVuJB7Cfh6B6UJTqHSv3PqeCVP3PQm+qp07VT6KjmMXa2rbMuh5ybs6hQIgZLVc3BeAjGWCZndRxcwgNpG+UvT7Fz87fWOBrN6xZVctXylWatjizf1IXEwBz25X05gMBnafeC3whhX/8jSR3NikwVgY4UQb4mgfSPqXgCeaVQ8VHeLpTM7swVI3q1Vcmms/v7YdPEFGDFLcyYo24GojgBaBZCl7WAk4QxxGrAANqcMpOWXhALo="
on_success: change
on_failure: always

cache:
directories:
- $HOME/.ivy2/cache
- $HOME/.sbt/boot/
before_cache:
- find $HOME/.ivy2 -name "ivydata-*.properties" -delete
- find $HOME/.sbt -name "*.lock" -delete
9 changes: 9 additions & 0 deletions Makefile
@@ -0,0 +1,9 @@
SHELL := /bin/bash

.PHONY: py36_test py37_test

py36_test:
source scripts/scala_classpath_for_python.sh && make -C python py36_test

py37_test:
source scripts/scala_classpath_for_python.sh && make -C python py37_test
20 changes: 14 additions & 6 deletions README.md
@@ -45,17 +45,23 @@ moment.

### MLeap/Spark Version

Choose the right verison of the `mleap-spark` module to export your pipeline. The serialization format is backwards compatible between different versions of MLeap. So if you export a pipeline using MLeap 0.11.0 and Spark 2.1, you can still load that pipeline using MLeap runtime version 0.12.0.
Choose the right version of the `mleap-spark` module to export your pipeline. The serialization format is backwards compatible between different versions of MLeap. So if you export a pipeline using MLeap 0.11.0 and Spark 2.1, you can still load that pipeline using MLeap runtime version 0.12.0.

| MLeap Version | Spark Version |
|---------------|---------------|
| 0.16.0 | 2.4.5 |
| 0.15.0 | 2.4 |
| 0.14.0 | 2.4 |
| 0.13.0 | 2.3 |
| 0.12.0 | 2.3 |
| 0.11.0 | 2.2 |
| 0.11.0 | 2.1 |
| 0.11.0 | 2.0 |
| 0.10.3 | 2.2 |
| 0.10.3 | 2.1 |
| 0.10.3 | 2.0 |

Please see the [release notes](RELEASE_NOTES.md) for changes (especially breaking changes) included with each release.

## Setup

@@ -64,7 +70,7 @@ Choose the right verison of the `mleap-spark` module to export your pipeline. Th
#### SBT

```sbt
libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.15.0"
libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.16.0"
```

#### Maven
@@ -73,7 +79,7 @@ libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.15.0"
<dependency>
<groupId>ml.combust.mleap</groupId>
<artifactId>mleap-runtime_2.11</artifactId>
<version>0.15.0</version>
<version>0.16.0</version>
</dependency>
```
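
For context (not part of this diff), a minimal sketch of loading an exported pipeline with mleap-runtime, based on MLeap's documented `MleapSupport` API; the bundle path and variable names are hypothetical:

```scala
import ml.combust.bundle.BundleFile
import ml.combust.mleap.runtime.MleapSupport._
import resource._ // scala-arm, brought in by mleap-runtime

// Load a previously exported pipeline from a bundle zip (path is hypothetical).
val mleapPipeline = (for (bundleFile <- managed(BundleFile("jar:file:/tmp/my-pipeline.zip"))) yield {
  bundleFile.loadMleapBundle().get.root
}).opt.get
```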

@@ -82,7 +88,7 @@ libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.15.0"
#### SBT

```sbt
libraryDependencies += "ml.combust.mleap" %% "mleap-spark" % "0.15.0"
libraryDependencies += "ml.combust.mleap" %% "mleap-spark" % "0.16.0"
```

#### Maven
@@ -91,14 +97,14 @@ libraryDependencies += "ml.combust.mleap" %% "mleap-spark" % "0.15.0"
<dependency>
<groupId>ml.combust.mleap</groupId>
<artifactId>mleap-spark_2.11</artifactId>
<version>0.15.0</version>
<version>0.16.0</version>
</dependency>
```

### Spark Packages

```bash
$ bin/spark-shell --packages ml.combust.mleap:mleap-spark_2.11:0.15.0
$ bin/spark-shell --packages ml.combust.mleap:mleap-spark_2.11:0.16.0
```
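
Likewise, a minimal export sketch based on MLeap's documented Spark support; the pipeline, dataset, and output path are hypothetical:

```scala
import ml.combust.bundle.BundleFile
import ml.combust.mleap.spark.SparkSupport._
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.sql.DataFrame
import resource._

// `pipelineModel` is a fitted PipelineModel and `df` a DataFrame (both hypothetical).
def exportToBundle(pipelineModel: PipelineModel, df: DataFrame): Unit = {
  val sbc = SparkBundleContext().withDataset(pipelineModel.transform(df))
  for (bundleFile <- managed(BundleFile("jar:file:/tmp/my-pipeline.zip"))) {
    pipelineModel.writeBundle.save(bundleFile)(sbc).get
  }
}
```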

### PySpark Integration
@@ -169,6 +175,8 @@ import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer
```

See [PySpark Integration of python/README.md](python/README.md#pyspark-integration) for more.

### Create and Export a Scikit-Learn Pipeline

```python
36 changes: 34 additions & 2 deletions RELEASE_NOTES.md
@@ -1,3 +1,31 @@
# Release 0.16.0

### Breaking Changes
- Fix default ports when running grpc/http requests; default grpc port is 65328 and can be overridden via MLEAP_GRPC_PORT; default http port should be: 65327 and can be overridden via MLEAP_HTTP_PORT

### New Features
- Upgrade to Spark version 2.4.5
- Support for a performant implementation of the XGboost runtime (XGboost Predictor)
- Scikit-learn support for MultinomialLogisticRegression
- Support for min/max values other than defaults (i.e. 0.0 and 1.0) in MinMaxScalerModel
- Support for custom transformers (StringMap, MathUnary, MathBinary) in Pyspark
- Support MLWritable/MLReadable for custom transformers (StringMap, MathUnary, MathBinary) and fix this for Imputer transformer
- Fixes support for loading/storing bundles from/to hdfs in Pyspark
- Improve importing mleap __version__ for python modules

### Bug Fixes
- Fix XGBoost sparse vector support
- Fix MinMaxScalerModel outputs different in Spark vs MLeap
- Fix Spark deserialization for CountVectorizer transformer
- Added support for HandleInvalid in Bucketizer, VectorIndexer
- Fix setting HandleInvalid by default to OneHotEncoder for backwards compatibility
- Fixes MLReader for Imputer mleap implementation of Spark transformer

### Improvements
- Minor documentation updates

### Other Changes

# Release 0.15.0

### Breaking Changes
@@ -20,5 +48,9 @@
- Add default grpc port to docker config
- General documentation improvements

### Other Changes
- None

# Older releases

We make every effort for the serialization format to be backwards compatible between different versions of MLeap. Please note below some important notes regarding backwards compatibility.

- The deprecated OneHotEncoder unfortunately had breaking changes in a few releases. In releases 0.11.0 and 0.12.0, the deserialization into MLeap was broken for OneHotEncoder. When using releases 0.13.0, 0.14.0, and 0.15.0, please ensure that the model returns the same results as before the upgrade, by potentially changing dropLast and handleInvalid values after deserialization. Alternatively, please use MLeap version 0.16.0 or higher, in case you have models serialized with other versions of MLeap that use OneHotEncoder. If your model uses OneHotEncoderEstimator or no one hot encoding, then you should not encounter any of the issues above.
@@ -1,11 +1,8 @@
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.ann.FeedForwardTopology
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel.LabelConverter
import ml.combust.mleap.core.feature.LabeledPoint
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{Vector, Vectors}

/**
@@ -54,7 +54,7 @@ case class NaiveBayesModel(numFeatures: Int,
val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
val thetaMinusNegTheta = Matrices.map(theta, value =>
value - math.log(1.0 - math.exp(value)))
val negThetaSum = negTheta.multiply(ones);
val negThetaSum = negTheta.multiply(ones)

raw.foreachActive((_, value) =>
require(value == 0.0 || value == 1.0,
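For the reasoning behind `thetaMinusNegTheta` and `negThetaSum` above: for a binary feature x, the Bernoulli log-likelihood term x·θ + (1−x)·log(1−e^θ) rewrites as x·(θ − log(1−e^θ)) + log(1−e^θ), so the per-class score becomes a dot product with `thetaMinusNegTheta` plus the constant `negThetaSum`. A small sketch verifying the identity (values are illustrative):

```scala
// Check the algebraic identity used by the Bernoulli scoring code above.
val theta = math.log(0.3)                      // example log P(feature = 1 | class)
val negTheta = math.log(1.0 - math.exp(theta)) // log P(feature = 0 | class)
val thetaMinusNegTheta = theta - negTheta

for (x <- Seq(0.0, 1.0)) {
  val direct     = x * theta + (1.0 - x) * negTheta
  val refactored = x * thetaMinusNegTheta + negTheta
  assert(math.abs(direct - refactored) < 1e-12)
}
```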
@@ -10,14 +10,21 @@ import ml.combust.mleap.core.types.{ScalarType, StructType}
*
* @param splits splits used to determine bucket
*/
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala")
case class BucketizerModel(splits: Array[Double]) extends Model {
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.5/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala")
case class BucketizerModel(splits: Array[Double], handleInvalid: HandleInvalid = HandleInvalid.Error) extends Model {
def apply(feature: Double): Double = {
binarySearchForBuckets(splits, feature)
binarySearchForBuckets(splits, feature, keepInvalid = handleInvalid == HandleInvalid.Keep)
}

def binarySearchForBuckets(splits: Array[Double], feature: Double): Double = {
if (feature == splits.last) {
def binarySearchForBuckets(splits: Array[Double], feature: Double, keepInvalid: Boolean): Double = {
if (feature.isNaN) {
if (keepInvalid) {
splits.length - 1
} else {
throw new RuntimeException("Bucketizer encountered NaN value. To handle or skip NaNs," +
" try setting Bucketizer.handleInvalid.")
}
} else if (feature == splits.last) {
splits.length - 2
} else {
val idx = java.util.Arrays.binarySearch(splits, feature)
@@ -26,8 +33,8 @@ case class BucketizerModel(splits: Array[Double]) extends Model {
} else {
val insertPos = -idx - 1
if (insertPos == 0 || insertPos == splits.length) {
throw new Exception(s"Feature value $feature out of Bucketizer bounds" +
s" [${splits.head}, ${splits.last}]. Check your features, or loosen " +
throw new RuntimeException(s"Feature value $feature out of Bucketizer bounds" +
s" [${splits.head}, ${splits.last}]. Check your features, or loosen " +
s"the lower/upper bound constraints.")
} else {
insertPos - 1
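Based on the updated `BucketizerModel` above, a short sketch of how `handleInvalid` changes NaN handling (splits and values are illustrative):

```scala
import ml.combust.mleap.core.feature.{BucketizerModel, HandleInvalid}

val splits = Array(Double.NegativeInfinity, 0.0, 10.0, Double.PositiveInfinity)

// Default behaviour (HandleInvalid.Error): a NaN feature throws a RuntimeException.
val strict = BucketizerModel(splits)

// With HandleInvalid.Keep, NaNs are routed to an extra bucket (splits.length - 1 = 3).
val keeping = BucketizerModel(splits, handleInvalid = HandleInvalid.Keep)
assert(keeping(Double.NaN) == 3.0)
assert(keeping(5.0) == 1.0) // falls in [0.0, 10.0)
```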
@@ -0,0 +1,28 @@
package ml.combust.mleap.core.feature

sealed trait HandleInvalid {
def asParamString: String
}

object HandleInvalid {
val default = Error

case object Error extends HandleInvalid {
override def asParamString: String = "error"
}

case object Skip extends HandleInvalid {
override def asParamString: String = "skip"
}

case object Keep extends HandleInvalid {
override def asParamString: String = "keep"
}

def fromString(value: String, canSkip: Boolean = true): HandleInvalid = value match {
case "error" => HandleInvalid.Error
case "skip" => if (canSkip) HandleInvalid.Skip else throw new IllegalArgumentException(s"Invalid handler: $value")
case "keep" => HandleInvalid.Keep
case _ => throw new IllegalArgumentException(s"Invalid handler: $value")
}
}
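
A quick sketch of the new `HandleInvalid` helper in use, matching the code above:

```scala
import ml.combust.mleap.core.feature.HandleInvalid

// Parse Spark-style handleInvalid strings into the sealed trait.
assert(HandleInvalid.fromString("keep") == HandleInvalid.Keep)
assert(HandleInvalid.Error.asParamString == "error")

// "skip" is only accepted when the caller allows it; otherwise an
// IllegalArgumentException is thrown.
assert(HandleInvalid.fromString("skip", canSkip = true) == HandleInvalid.Skip)
```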
@@ -15,50 +15,34 @@ import scala.math.{max, min}
* @param originalMin minimum values from training features
* @param originalMax maximum values from training features
*/
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala")
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.0/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala")
case class MinMaxScalerModel(originalMin: Vector,
originalMax: Vector) extends Model {
originalMax: Vector,
minValue: Double = 0.0,
maxValue: Double = 1.0) extends Model {
val originalRange = (originalMax.toBreeze - originalMin.toBreeze).toArray
val minArray = originalMin.toArray

/**Scale a feature vector using the min/max
*
* @param vector feature vector
* @return scaled feature fector
* @return scaled feature vector
*/
def apply(vector: Vector): Vector = {
vector match {
case DenseVector(values) =>
val vs = values.clone()
val size = vs.length
var i = 0
while (i < size) {
if (!values(i).isNaN) {
val raw = if (originalRange(i) != 0) {
max(min(1.0, (values(i) - minArray(i)) / originalRange(i)), 0.0)
} else {
0.5
}
vs(i) = raw
}
i += 1
}
Vectors.dense(vs)
case SparseVector(size, indices, values) =>
val vs = values.clone()
val nnz = vs.length
var i = 0
while (i < nnz) {
val raw = if (originalRange(i) != 0) {
max(min(1.0, (indices(i) - minArray(i)) / originalRange(i)), 0.0)
} else {
0.5
}
vs(i) = raw
i += 1
}
Vectors.sparse(size, indices, vs)
val scale = maxValue - minValue

// 0 in sparse vector will probably be rescaled to non-zero
val values = vector.copy.toArray
val size = values.length
var i = 0
while (i < size) {
if (!values(i).isNaN) {
val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
values(i) = raw * scale + minValue
}
i += 1
}
Vectors.dense(values)
}

override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get
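To make the rescaling above concrete, a small sketch using the updated model with a custom output range (training statistics and values are illustrative):

```scala
import ml.combust.mleap.core.feature.MinMaxScalerModel
import org.apache.spark.ml.linalg.Vectors

// Training stats: feature 0 ranged over [0, 10], feature 1 over [5, 15].
val scaler = MinMaxScalerModel(
  originalMin = Vectors.dense(0.0, 5.0),
  originalMax = Vectors.dense(10.0, 15.0),
  minValue = -1.0,
  maxValue = 1.0)

// Per feature: raw = (value - min) / range, then raw * (maxValue - minValue) + minValue.
// feature 0: (5 - 0) / 10 = 0.5  -> 0.5 * 2 - 1 = 0.0
// feature 1: (15 - 5) / 10 = 1.0 -> 1.0 * 2 - 1 = 1.0
assert(scaler(Vectors.dense(5.0, 15.0)) == Vectors.dense(0.0, 1.0))
```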