Merge pull request #1 from combust/master
Sync with combust master
talalryz authored Jun 16, 2020
2 parents 7b40eb7 + 41dbde9 commit 63a90cc
Showing 115 changed files with 11,063 additions and 551 deletions.
103 changes: 60 additions & 43 deletions .travis.yml
@@ -1,43 +1,60 @@
# Use container-based infrastructure
sudo: required
dist: trusty

services:
- docker

language: scala
scala:
- 2.11.8

python:
- "2.7"
- "3.6"
- "3.7"

install:
- pip install --user tox
script:
- tox -c python/tox.ini -e $(echo py$TRAVIS_PYTHON_VERSION | tr -d .) -v
- travis/travis.sh

notifications:
on_success: change
on_failure: always
urls:
- https://webhooks.gitter.im/e/65cd239c75c243ce635e
slack:
rooms:
- secure: "RLxhNQarrr8WNtJRSx855EJX6w9XsyvzfhJGHrJYcNs5hNZfnCcWJaGRExqSHjd3UKpyZOgOAa0+5IrUl4Pbug7Dzq+Kk5nIOxdDILfcL8giVXV2bMfeGADKyiDdkwyeud4eb0Oc7uVhFfmelW8d3EFPWio4yXUaVGZl0cNpCfx9iF1r0vLwRb7U5fiicuvpz8tNZkw6Vp8lrW0+teeXWpzVopxh4gTZxULlaVvp28OIJ09dS2k02baL6g3L4G4z9IxW4adEdH6DpAbw0JgyAFjZE1BXoEQuoZvHjoYstwy62f+maB8c2xNEB9bzobUruc0JHLKly0AdOqtKL25OCeSD3qBVWZKK/TOSQ4phY+iapI1JB5JCoZj/z/k5YogxdRGjFGyUhupuR8+kqZuAUAegD9N5wOLgwei5GTD2oim90ZixeS1Do7GvqhRiyaRHSL0CKRyJXV/iiDr1a8zV5tW8uREZX29Jq5DObxCTLDrWGjqijNWC06lOV0dhvF83qov9A1c+yulrS28qP7Hn7MXQWETShvlw4YxBCDGQLNJ3BMwA9nOx9O0ZI+0KCNVXPu6n8ZVbahJg0LoYZ2ftGFiwxq1/svlhInUN57MXkgI6iDf7NojAL7urGwN3YWbxuJemnASjPtigkN2yboAsBG/aSjOiYNeIVfna//XrFeI="
- secure: "Afbm/moiSBB7Kn9J9mX4h+ss4cju6WvOBqup0WdkNCUPWKPFs3XRkt8sDZzAyhn7Fu2w4+NclGI/QHhMo8fC+eZG9fhbpPyVoZv1o9e/43qALOB4nXnZGZeGQLdDTFLcx6ylNEK63cQrro5xY7jCwfPYhuakGr9SWkabX6mKqZWThNyH2HPbKQZX540rRwPOxIVgnkhHleCC1vosfgD1brNBt/100cXZXBVjP7GXSJbBDPuKf+cG+bUNET0xxoBK9GurZl8UDhIoo7jfoAj1tY9l0cgzMjxdSO5Yn5Z73DMT3XfiOlT8Hjp+fC37pvssKsfSg72KgTidva71CGwCoJ/B4ly2BtiXNuazzd1vAyCirbJKT8YQwPGihElkaANtMcpWjtnn0v3nTfMvooMT+HRip4kBtfE7Rl+JeVKET0Hhi0Er16eGoCL/XzVuJB7Cfh6B6UJTqHSv3PqeCVP3PQm+qp07VT6KjmMXa2rbMuh5ybs6hQIgZLVc3BeAjGWCZndRxcwgNpG+UvT7Fz87fWOBrN6xZVctXylWatjizf1IXEwBz25X05gMBnafeC3whhX/8jSR3NikwVgY4UQb4mgfSPqXgCeaVQ8VHeLpTM7swVI3q1Vcmms/v7YdPEFGDFLcyYo24GojgBaBZCl7WAk4QxxGrAANqcMpOWXhALo="
on_success: change
on_failure: always

jdk:
- oraclejdk8
cache:
directories:
- $HOME/.ivy2/cache
- $HOME/.sbt/boot/
before_cache:
- find $HOME/.ivy2 -name "ivydata-*.properties" -delete
- find $HOME/.sbt -name "*.lock" -delete
# Use container-based infrastructure
sudo: required
dist: trusty

services:
- docker

language: scala
scala:
- 2.11.8
jdk:
- oraclejdk8 # sbt +compile fails on jdk11 (xenial's default)

jobs:
include:
- stage: "Mleap tests"
name: "Scala Tests"
script:
- travis/travis.sh

- name: "Python 3.6 tests"
language: python
python: 3.6
install:
- pip install "virtualenv<20.0.0" tox # Bug https://github.com/pypa/virtualenv/issues/1551
script:
- make py36_test

- name: "Python 3.7 tests"
language: python # This can't be java and python at the same time
python: 3.7
env:
# Required when not setting 'language: java' to force the correct jdk8
- PATH=$(echo "$PATH" | sed -e 's/:\/usr\/local\/lib\/jvm\/openjdk11\/bin//')
- JAVA_HOME="/usr/lib/jvm/java-1.8.0-openjdk-amd64"
dist: xenial # Travis won't install py37 on trusty
install:
- pip install "virtualenv<20.0.0" tox # Bug https://github.com/pypa/virtualenv/issues/1551
script:
- make py37_test

notifications:
on_success: change
on_failure: always
urls:
- https://webhooks.gitter.im/e/65cd239c75c243ce635e
slack:
rooms:
- secure: "RLxhNQarrr8WNtJRSx855EJX6w9XsyvzfhJGHrJYcNs5hNZfnCcWJaGRExqSHjd3UKpyZOgOAa0+5IrUl4Pbug7Dzq+Kk5nIOxdDILfcL8giVXV2bMfeGADKyiDdkwyeud4eb0Oc7uVhFfmelW8d3EFPWio4yXUaVGZl0cNpCfx9iF1r0vLwRb7U5fiicuvpz8tNZkw6Vp8lrW0+teeXWpzVopxh4gTZxULlaVvp28OIJ09dS2k02baL6g3L4G4z9IxW4adEdH6DpAbw0JgyAFjZE1BXoEQuoZvHjoYstwy62f+maB8c2xNEB9bzobUruc0JHLKly0AdOqtKL25OCeSD3qBVWZKK/TOSQ4phY+iapI1JB5JCoZj/z/k5YogxdRGjFGyUhupuR8+kqZuAUAegD9N5wOLgwei5GTD2oim90ZixeS1Do7GvqhRiyaRHSL0CKRyJXV/iiDr1a8zV5tW8uREZX29Jq5DObxCTLDrWGjqijNWC06lOV0dhvF83qov9A1c+yulrS28qP7Hn7MXQWETShvlw4YxBCDGQLNJ3BMwA9nOx9O0ZI+0KCNVXPu6n8ZVbahJg0LoYZ2ftGFiwxq1/svlhInUN57MXkgI6iDf7NojAL7urGwN3YWbxuJemnASjPtigkN2yboAsBG/aSjOiYNeIVfna//XrFeI="
- secure: "Afbm/moiSBB7Kn9J9mX4h+ss4cju6WvOBqup0WdkNCUPWKPFs3XRkt8sDZzAyhn7Fu2w4+NclGI/QHhMo8fC+eZG9fhbpPyVoZv1o9e/43qALOB4nXnZGZeGQLdDTFLcx6ylNEK63cQrro5xY7jCwfPYhuakGr9SWkabX6mKqZWThNyH2HPbKQZX540rRwPOxIVgnkhHleCC1vosfgD1brNBt/100cXZXBVjP7GXSJbBDPuKf+cG+bUNET0xxoBK9GurZl8UDhIoo7jfoAj1tY9l0cgzMjxdSO5Yn5Z73DMT3XfiOlT8Hjp+fC37pvssKsfSg72KgTidva71CGwCoJ/B4ly2BtiXNuazzd1vAyCirbJKT8YQwPGihElkaANtMcpWjtnn0v3nTfMvooMT+HRip4kBtfE7Rl+JeVKET0Hhi0Er16eGoCL/XzVuJB7Cfh6B6UJTqHSv3PqeCVP3PQm+qp07VT6KjmMXa2rbMuh5ybs6hQIgZLVc3BeAjGWCZndRxcwgNpG+UvT7Fz87fWOBrN6xZVctXylWatjizf1IXEwBz25X05gMBnafeC3whhX/8jSR3NikwVgY4UQb4mgfSPqXgCeaVQ8VHeLpTM7swVI3q1Vcmms/v7YdPEFGDFLcyYo24GojgBaBZCl7WAk4QxxGrAANqcMpOWXhALo="
on_success: change
on_failure: always

cache:
directories:
- $HOME/.ivy2/cache
- $HOME/.sbt/boot/
before_cache:
- find $HOME/.ivy2 -name "ivydata-*.properties" -delete
- find $HOME/.sbt -name "*.lock" -delete
9 changes: 9 additions & 0 deletions Makefile
@@ -0,0 +1,9 @@
SHELL := /bin/bash

.PHONY: py36_test py37_test

py36_test:
source scripts/scala_classpath_for_python.sh && make -C python py36_test

py37_test:
source scripts/scala_classpath_for_python.sh && make -C python py37_test
20 changes: 14 additions & 6 deletions README.md
@@ -45,17 +45,23 @@ moment.

### MLeap/Spark Version

Choose the right verison of the `mleap-spark` module to export your pipeline. The serialization format is backwards compatible between different versions of MLeap. So if you export a pipeline using MLeap 0.11.0 and Spark 2.1, you can still load that pipeline using MLeap runtime version 0.12.0.
Choose the right version of the `mleap-spark` module to export your pipeline. The serialization format is backwards compatible between different versions of MLeap. So if you export a pipeline using MLeap 0.11.0 and Spark 2.1, you can still load that pipeline using MLeap runtime version 0.12.0.

| MLeap Version | Spark Version |
|---------------|---------------|
| 0.16.0 | 2.4.5 |
| 0.15.0 | 2.4 |
| 0.14.0 | 2.4 |
| 0.13.0 | 2.3 |
| 0.12.0 | 2.3 |
| 0.11.0 | 2.2 |
| 0.11.0 | 2.1 |
| 0.11.0 | 2.0 |
| 0.10.3 | 2.2 |
| 0.10.3 | 2.1 |
| 0.10.3 | 2.0 |

Please see the [release notes](RELEASE_NOTES.md) for changes (especially breaking changes) included with each release.

## Setup

@@ -64,7 +70,7 @@ Choose the right verison of the `mleap-spark` module to export your pipeline. Th
#### SBT

```sbt
libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.15.0"
libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.16.0"
```

#### Maven
@@ -73,7 +79,7 @@ libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.15.0"
<dependency>
<groupId>ml.combust.mleap</groupId>
<artifactId>mleap-runtime_2.11</artifactId>
<version>0.15.0</version>
<version>0.16.0</version>
</dependency>
```
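
For context (not part of this diff), a minimal sketch of loading an exported pipeline with mleap-runtime, based on MLeap's documented `MleapSupport` API; the bundle path and variable names are hypothetical:

```scala
import ml.combust.bundle.BundleFile
import ml.combust.mleap.runtime.MleapSupport._
import resource._ // scala-arm, brought in by mleap-runtime

// Load a previously exported pipeline from a bundle zip (path is hypothetical).
val mleapPipeline = (for (bundleFile <- managed(BundleFile("jar:file:/tmp/my-pipeline.zip"))) yield {
  bundleFile.loadMleapBundle().get.root
}).opt.get
```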

@@ -82,7 +88,7 @@ libraryDependencies += "ml.combust.mleap" %% "mleap-runtime" % "0.15.0"
#### SBT

```sbt
libraryDependencies += "ml.combust.mleap" %% "mleap-spark" % "0.15.0"
libraryDependencies += "ml.combust.mleap" %% "mleap-spark" % "0.16.0"
```

#### Maven
@@ -91,14 +97,14 @@ libraryDependencies += "ml.combust.mleap" %% "mleap-spark" % "0.15.0"
<dependency>
<groupId>ml.combust.mleap</groupId>
<artifactId>mleap-spark_2.11</artifactId>
<version>0.15.0</version>
<version>0.16.0</version>
</dependency>
```

### Spark Packages

```bash
$ bin/spark-shell --packages ml.combust.mleap:mleap-spark_2.11:0.15.0
$ bin/spark-shell --packages ml.combust.mleap:mleap-spark_2.11:0.16.0
```
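
Likewise, a minimal export sketch based on MLeap's documented Spark support; the pipeline, dataset, and output path are hypothetical:

```scala
import ml.combust.bundle.BundleFile
import ml.combust.mleap.spark.SparkSupport._
import org.apache.spark.ml.PipelineModel
import org.apache.spark.ml.bundle.SparkBundleContext
import org.apache.spark.sql.DataFrame
import resource._

// `pipelineModel` is a fitted PipelineModel and `df` a DataFrame (both hypothetical).
def exportToBundle(pipelineModel: PipelineModel, df: DataFrame): Unit = {
  val sbc = SparkBundleContext().withDataset(pipelineModel.transform(df))
  for (bundleFile <- managed(BundleFile("jar:file:/tmp/my-pipeline.zip"))) {
    pipelineModel.writeBundle.save(bundleFile)(sbc).get
  }
}
```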

### PySpark Integration
@@ -169,6 +175,8 @@ import mleap.pyspark
from mleap.pyspark.spark_support import SimpleSparkSerializer
```

See [PySpark Integration of python/README.md](python/README.md#pyspark-integration) for more.

### Create and Export a Scikit-Learn Pipeline

```python
36 changes: 34 additions & 2 deletions RELEASE_NOTES.md
@@ -1,3 +1,31 @@
# Release 0.16.0

### Breaking Changes
- Fix default ports when running grpc/http requests; default grpc port is 65328 and can be overridden via MLEAP_GRPC_PORT; default http port should be: 65327 and can be overridden via MLEAP_HTTP_PORT

### New Features
- Upgrade to Spark version 2.4.5
- Support for a performant implementation of the XGboost runtime (XGboost Predictor)
- Scikit-learn support for MultinomialLogisticRegression
- Support for min/max values other than defaults (i.e. 0.0 and 1.0) in MinMaxScalerModel
- Support for custom transformers (StringMap, MathUnary, MathBinary) in Pyspark
- Support MLWritable/MLReadable for custom transformers (StringMap, MathUnary, MathBinary) and fix this for Imputer transformer
- Fixes support for loading/storing bundles from/to hdfs in Pyspark
- Improve importing mleap __version__ for python modules

### Bug Fixes
- Fix XGBoost sparse vector support
- Fix MinMaxScalerModel outputs different in Spark vs MLeap
- Fix Spark deserialization for CountVectorizer transformer
- Added support for HandleInvalid in Bucketizer, VectorIndexer
- Fix setting HandleInvalid by default to OneHotEncoder for backwards compatibility
- Fixes MLReader for Imputer mleap implementation of Spark transformer

### Improvements
- Minor documentation updates

### Other Changes

# Release 0.15.0

### Breaking Changes
@@ -20,5 +48,9 @@
- Add default grpc port to docker config
- General documentation improvements

### Other Changes
- None

# Older releases

We make every effort for the serialization format to be backwards compatible between different versions of MLeap. Please note below some important notes regarding backwards compatibility.

- The deprecated OneHotEncoder unfortunately had breaking changes in a few releases. In releases 0.11.0 and 0.12.0, the deserialization into MLeap was broken for OneHotEncoder. When using releases 0.13.0, 0.14.0, and 0.15.0, please ensure that the model returns the same results as before the upgrade, by potentially changing dropLast and handleInvalid values after deserialization. Alternatively, please use MLeap version 0.16.0 or higher, in case you have models serialized with other versions of MLeap that use OneHotEncoder. If your model uses OneHotEncoderEstimator or no one hot encoding, then you should not encounter any of the issues above.
@@ -1,11 +1,8 @@
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.ann.FeedForwardTopology
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel.LabelConverter
import ml.combust.mleap.core.feature.LabeledPoint
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{Vector, Vectors}

/**
@@ -54,7 +54,7 @@ case class NaiveBayesModel(numFeatures: Int,
val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
val thetaMinusNegTheta = Matrices.map(theta, value =>
value - math.log(1.0 - math.exp(value)))
val negThetaSum = negTheta.multiply(ones);
val negThetaSum = negTheta.multiply(ones)

raw.foreachActive((_, value) =>
require(value == 0.0 || value == 1.0,
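For the reasoning behind `thetaMinusNegTheta` and `negThetaSum` above: for a binary feature x, the Bernoulli log-likelihood term x·θ + (1−x)·log(1−e^θ) rewrites as x·(θ − log(1−e^θ)) + log(1−e^θ), so the per-class score becomes a dot product with `thetaMinusNegTheta` plus the constant `negThetaSum`. A small sketch verifying the identity (values are illustrative):

```scala
// Check the algebraic identity used by the Bernoulli scoring code above.
val theta = math.log(0.3)                      // example log P(feature = 1 | class)
val negTheta = math.log(1.0 - math.exp(theta)) // log P(feature = 0 | class)
val thetaMinusNegTheta = theta - negTheta

for (x <- Seq(0.0, 1.0)) {
  val direct     = x * theta + (1.0 - x) * negTheta
  val refactored = x * thetaMinusNegTheta + negTheta
  assert(math.abs(direct - refactored) < 1e-12)
}
```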
@@ -10,14 +10,21 @@ import ml.combust.mleap.core.types.{ScalarType, StructType}
*
* @param splits splits used to determine bucket
*/
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala")
case class BucketizerModel(splits: Array[Double]) extends Model {
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.5/mllib/src/main/scala/org/apache/spark/ml/feature/Bucketizer.scala")
case class BucketizerModel(splits: Array[Double], handleInvalid: HandleInvalid = HandleInvalid.Error) extends Model {
def apply(feature: Double): Double = {
binarySearchForBuckets(splits, feature)
binarySearchForBuckets(splits, feature, keepInvalid = handleInvalid == HandleInvalid.Keep)
}

def binarySearchForBuckets(splits: Array[Double], feature: Double): Double = {
if (feature == splits.last) {
def binarySearchForBuckets(splits: Array[Double], feature: Double, keepInvalid: Boolean): Double = {
if (feature.isNaN) {
if (keepInvalid) {
splits.length - 1
} else {
throw new RuntimeException("Bucketizer encountered NaN value. To handle or skip NaNs," +
" try setting Bucketizer.handleInvalid.")
}
} else if (feature == splits.last) {
splits.length - 2
} else {
val idx = java.util.Arrays.binarySearch(splits, feature)
@@ -26,8 +33,8 @@ case class BucketizerModel(splits: Array[Double]) extends Model {
} else {
val insertPos = -idx - 1
if (insertPos == 0 || insertPos == splits.length) {
throw new Exception(s"Feature value $feature out of Bucketizer bounds" +
s" [${splits.head}, ${splits.last}]. Check your features, or loosen " +
throw new RuntimeException(s"Feature value $feature out of Bucketizer bounds" +
s" [${splits.head}, ${splits.last}]. Check your features, or loosen " +
s"the lower/upper bound constraints.")
} else {
insertPos - 1
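Based on the updated `BucketizerModel` above, a short sketch of how `handleInvalid` changes NaN handling (splits and values are illustrative):

```scala
import ml.combust.mleap.core.feature.{BucketizerModel, HandleInvalid}

val splits = Array(Double.NegativeInfinity, 0.0, 10.0, Double.PositiveInfinity)

// Default behaviour (HandleInvalid.Error): a NaN feature throws a RuntimeException.
val strict = BucketizerModel(splits)

// With HandleInvalid.Keep, NaNs are routed to an extra bucket (splits.length - 1 = 3).
val keeping = BucketizerModel(splits, handleInvalid = HandleInvalid.Keep)
assert(keeping(Double.NaN) == 3.0)
assert(keeping(5.0) == 1.0) // falls in [0.0, 10.0)
```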
@@ -0,0 +1,28 @@
package ml.combust.mleap.core.feature

sealed trait HandleInvalid {
def asParamString: String
}

object HandleInvalid {
val default = Error

case object Error extends HandleInvalid {
override def asParamString: String = "error"
}

case object Skip extends HandleInvalid {
override def asParamString: String = "skip"
}

case object Keep extends HandleInvalid {
override def asParamString: String = "keep"
}

def fromString(value: String, canSkip: Boolean = true): HandleInvalid = value match {
case "error" => HandleInvalid.Error
case "skip" => if (canSkip) HandleInvalid.Skip else throw new IllegalArgumentException(s"Invalid handler: $value")
case "keep" => HandleInvalid.Keep
case _ => throw new IllegalArgumentException(s"Invalid handler: $value")
}
}
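
A quick sketch of the new `HandleInvalid` helper in use, matching the code above:

```scala
import ml.combust.mleap.core.feature.HandleInvalid

// Parse Spark-style handleInvalid strings into the sealed trait.
assert(HandleInvalid.fromString("keep") == HandleInvalid.Keep)
assert(HandleInvalid.Error.asParamString == "error")

// "skip" is only accepted when the caller allows it; otherwise an
// IllegalArgumentException is thrown.
assert(HandleInvalid.fromString("skip", canSkip = true) == HandleInvalid.Skip)
```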
@@ -15,50 +15,34 @@ import scala.math.{max, min}
* @param originalMin minimum values from training features
* @param originalMax maximum values from training features
*/
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala")
@SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.0/mllib/src/main/scala/org/apache/spark/ml/feature/MinMaxScaler.scala")
case class MinMaxScalerModel(originalMin: Vector,
originalMax: Vector) extends Model {
originalMax: Vector,
minValue: Double = 0.0,
maxValue: Double = 1.0) extends Model {
val originalRange = (originalMax.toBreeze - originalMin.toBreeze).toArray
val minArray = originalMin.toArray

/**Scale a feature vector using the min/max
*
* @param vector feature vector
* @return scaled feature fector
* @return scaled feature vector
*/
def apply(vector: Vector): Vector = {
vector match {
case DenseVector(values) =>
val vs = values.clone()
val size = vs.length
var i = 0
while (i < size) {
if (!values(i).isNaN) {
val raw = if (originalRange(i) != 0) {
max(min(1.0, (values(i) - minArray(i)) / originalRange(i)), 0.0)
} else {
0.5
}
vs(i) = raw
}
i += 1
}
Vectors.dense(vs)
case SparseVector(size, indices, values) =>
val vs = values.clone()
val nnz = vs.length
var i = 0
while (i < nnz) {
val raw = if (originalRange(i) != 0) {
max(min(1.0, (indices(i) - minArray(i)) / originalRange(i)), 0.0)
} else {
0.5
}
vs(i) = raw
i += 1
}
Vectors.sparse(size, indices, vs)
val scale = maxValue - minValue

// 0 in sparse vector will probably be rescaled to non-zero
val values = vector.copy.toArray
val size = values.length
var i = 0
while (i < size) {
if (!values(i).isNaN) {
val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
values(i) = raw * scale + minValue
}
i += 1
}
Vectors.dense(values)
}

override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get
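To make the rescaling above concrete, a small sketch using the updated model with a custom output range (training statistics and values are illustrative):

```scala
import ml.combust.mleap.core.feature.MinMaxScalerModel
import org.apache.spark.ml.linalg.Vectors

// Training stats: feature 0 ranged over [0, 10], feature 1 over [5, 15].
val scaler = MinMaxScalerModel(
  originalMin = Vectors.dense(0.0, 5.0),
  originalMax = Vectors.dense(10.0, 15.0),
  minValue = -1.0,
  maxValue = 1.0)

// Per feature: raw = (value - min) / range, then raw * (maxValue - minValue) + minValue.
// feature 0: (5 - 0) / 10 = 0.5  -> 0.5 * 2 - 1 = 0.0
// feature 1: (15 - 5) / 10 = 1.0 -> 1.0 * 2 - 1 = 1.0
assert(scaler(Vectors.dense(5.0, 15.0)) == Vectors.dense(0.0, 1.0))
```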