From 70be8dd11720fc2d34b764b642c9dcbacd540508 Mon Sep 17 00:00:00 2001 From: mmlspark-bot Date: Fri, 2 Jun 2017 11:57:25 -0400 Subject: [PATCH] Initial content --- .gitignore | 29 + CONTRIBUTING.md | 38 ++ LICENSE | 22 + README.md | 177 ++++++ docs/developer-readme.md | 80 +++ docs/third-party-notices.txt | 298 ++++++++++ docs/your-first-model.md | 109 ++++ .../101 - Adult Census Income Training.ipynb | 139 +++++ ...on Example with Flight Delay Dataset.ipynb | 161 +++++ .../103 - Before and After MMLSpark.ipynb | 286 +++++++++ ...Amazon Book Reviews - TextFeaturizer.ipynb | 186 ++++++ ...202 - Amazon Book Reviews - Word2Vec.ipynb | 228 +++++++ .../301 - CIFAR10 CNTK CNN Evaluation.ipynb | 264 +++++++++ ...302 - Pipeline Image Transformations.ipynb | 236 ++++++++ notebooks/tests/BasicDFOpsSmokeTest.ipynb | 107 ++++ runme | 50 ++ src/.gitignore | 44 ++ src/.sbtopts | 2 + src/build.sbt | 11 + src/checkpoint-data/build.sbt | 1 + .../src/main/scala/CheckpointData.scala | 71 +++ .../src/test/scala/CheckpointDataSuite.scala | 41 ++ src/cntk-model/build.sbt | 3 + src/cntk-model/src/main/python/CNTKModel.py | 21 + src/cntk-model/src/main/scala/CNTKModel.scala | 230 +++++++ .../src/test/scala/CNTKBindingSuite.scala | 60 ++ .../src/test/scala/CNTKModelSuite.scala | 157 +++++ .../src/test/scala/CNTKTestUtils.scala | 74 +++ src/cntk-train/build.sbt | 3 + src/cntk-train/src/main/python/CNTKLearner.py | 23 + .../src/main/scala/BrainscriptBuilder.scala | 117 ++++ .../src/main/scala/CNTKLearner.scala | 168 ++++++ .../src/main/scala/CommandBuilders.scala | 117 ++++ .../src/main/scala/DataConversion.scala | 173 ++++++ .../src/main/scala/TypeMapping.scala | 41 ++ .../src/test/scala/ValidateCntkTrain.scala | 267 +++++++++ .../test/scala/ValidateConfiguration.scala | 28 + .../test/scala/ValidateDataConversion.scala | 83 +++ .../test/scala/ValidateEnvironmentUtils.scala | 14 + src/codegen/build.sbt | 12 + src/codegen/src/main/scala/CodeGen.scala | 79 +++ src/codegen/src/main/scala/Config.scala | 29 + .../src/main/scala/PySparkWrapper.scala | 345 +++++++++++ .../main/scala/PySparkWrapperGenerator.scala | 123 ++++ .../src/main/scala/PySparkWrapperTest.scala | 275 +++++++++ src/compute-model-statistics/build.sbt | 3 + .../main/scala/ComputeModelStatistics.scala | 559 +++++++++++++++++ .../scala/VerifyComputeModelStatistics.scala | 245 ++++++++ src/compute-per-instance-statistics/build.sbt | 3 + .../scala/ComputePerInstanceStatistics.scala | 110 ++++ .../VerifyComputePerInstanceStatistics.scala | 130 ++++ src/core/build.sbt | 1 + src/core/contracts/build.sbt | 1 + .../contracts/src/main/scala/Exceptions.scala | 35 ++ .../contracts/src/main/scala/Metrics.scala | 47 ++ .../contracts/src/main/scala/Params.scala | 134 +++++ src/core/env/build.sbt | 7 + src/core/env/src/main/scala/CodegenTags.scala | 13 + .../env/src/main/scala/Configuration.scala | 51 ++ .../env/src/main/scala/EnvironmentUtils.scala | 52 ++ .../env/src/main/scala/FileUtilities.scala | 139 +++++ src/core/env/src/main/scala/Logging.scala | 23 + src/core/env/src/main/scala/NativeLoader.java | 194 ++++++ .../env/src/main/scala/ProcessUtilities.scala | 26 + src/core/hadoop/build.sbt | 1 + .../hadoop/src/main/scala/HadoopUtils.scala | 176 ++++++ src/core/ml/build.sbt | 3 + .../ml/src/test/scala/HashingTFSpec.scala | 81 +++ src/core/ml/src/test/scala/IDFSpec.scala | 103 ++++ src/core/ml/src/test/scala/NGramSpec.scala | 74 +++ .../ml/src/test/scala/OneHotEncoderSpec.scala | 102 ++++ src/core/ml/src/test/scala/Word2VecSpec.scala | 93 +++ 
src/core/schema/build.sbt | 4 + .../src/main/python/TypeConversionUtils.py | 17 + src/core/schema/src/main/python/Utils.py | 69 +++ .../src/main/scala/BinaryFileSchema.scala | 32 + .../schema/src/main/scala/Categoricals.scala | 317 ++++++++++ .../src/main/scala/DatasetExtensions.scala | 68 +++ .../schema/src/main/scala/ImageSchema.scala | 46 ++ .../src/main/scala/SchemaConstants.scala | 44 ++ .../schema/src/main/scala/SparkSchema.scala | 352 +++++++++++ .../src/test/scala/TestCategoricals.scala | 131 ++++ .../scala/VerifyFastVectorAssembler.scala | 118 ++++ .../src/test/scala/VerifySparkSchema.scala | 56 ++ src/core/spark/build.sbt | 1 + .../spark/src/main/scala/ArrayMapParam.scala | 70 +++ .../spark/src/main/scala/EstimatorParam.scala | 36 ++ .../src/main/scala/FastVectorAssembler.scala | 154 +++++ .../spark/src/main/scala/MapArrayParam.scala | 74 +++ .../src/main/scala/MetadataUtilities.scala | 10 + .../spark/src/main/scala/TransformParam.scala | 58 ++ src/core/test/base/build.sbt | 1 + .../src/main/scala/SparkSessionFactory.scala | 53 ++ .../test/base/src/main/scala/TestBase.scala | 155 +++++ src/core/test/build.sbt | 1 + src/core/test/datagen/build.sbt | 1 + .../src/main/scala/DatasetConstraints.scala | 68 +++ .../src/main/scala/DatasetOptions.scala | 57 ++ .../src/main/scala/GenerateDataType.scala | 37 ++ .../src/main/scala/GenerateDataset.scala | 114 ++++ .../datagen/src/main/scala/GenerateRow.scala | 70 +++ .../src/main/scala/ModuleFuzzingTest.scala | 52 ++ .../test/scala/VerifyGenerateDataset.scala | 46 ++ src/data-conversion/build.sbt | 1 + .../src/main/scala/DataConversion.scala | 161 +++++ .../src/test/scala/VerifyDataConversion.scala | 232 ++++++++ src/downloader/build.sbt | 1 + .../src/main/python/ModelDownloader.py | 101 ++++ .../src/main/scala/ModelDownloader.scala | 260 ++++++++ src/downloader/src/main/scala/Schema.scala | 92 +++ .../src/test/scala/DownloaderSuite.scala | 49 ++ src/featurize/build.sbt | 3 + .../src/main/scala/AssembleFeatures.scala | 499 ++++++++++++++++ src/featurize/src/main/scala/Featurize.scala | 92 +++ .../src/test/scala/VerifyFeaturize.scala | 330 +++++++++++ .../test/scala/benchmarkBasicDataTypes.json | 12 + .../src/test/scala/benchmarkNoOneHot.json | 6 + .../src/test/scala/benchmarkOneHot.json | 6 + .../src/test/scala/benchmarkString.json | 5 + .../scala/benchmarkStringIndexOneHot.json | 6 + .../test/scala/benchmarkStringMissing.json | 5 + .../src/test/scala/benchmarkVectors.json | 7 + src/find-best-model/build.sbt | 3 + .../src/main/scala/FindBestModel.scala | 331 +++++++++++ .../src/test/scala/VerifyFindBestModel.scala | 106 ++++ src/fuzzing/build.sbt | 5 + src/fuzzing/src/test/scala/Fuzzing.scala | 254 ++++++++ src/image-featurizer/build.sbt | 5 + .../src/main/scala/ImageFeaturizer.scala | 128 ++++ .../src/test/scala/ImageFeaturizerSuite.scala | 66 +++ src/image-transformer/build.sbt | 2 + .../src/main/python/ImageTransform.py | 96 +++ .../src/main/scala/ImageTransformer.scala | 314 ++++++++++ .../src/main/scala/UnrollImage.scala | 70 +++ .../test/scala/ImageTransformerSuite.scala | 293 +++++++++ src/multi-column-adapter/build.sbt | 1 + .../src/main/scala/MultiColumnAdapter.scala | 121 ++++ .../test/scala/MultiColumnAdapterSpec.scala | 49 ++ src/partition-sample/build.sbt | 1 + .../src/main/scala/PartitionSample.scala | 117 ++++ .../test/scala/VerifyPartitionSample.scala | 67 +++ src/pipeline-stages/build.sbt | 1 + .../src/main/scala/Repartition.scala | 42 ++ .../src/main/scala/SelectColumns.scala | 63 ++ .../src/test/scala/RepartitionSuite.scala 
| 50 ++ .../src/test/scala/SelectColumnsSuite.scala | 75 +++ src/project/build.sbt | 16 + src/project/build.scala | 201 +++++++ src/project/lib-check.scala | 34 ++ src/project/meta.sbt | 108 ++++ src/project/plugins.sbt | 5 + src/project/scalastyle.scala | 136 +++++ src/readers/build.sbt | 1 + .../src/main/python/BinaryFileReader.py | 52 ++ src/readers/src/main/python/ImageReader.py | 50 ++ .../src/main/scala/AzureBlobReader.scala | 72 +++ .../src/main/scala/AzureSQLReader.scala | 53 ++ .../src/main/scala/BinaryFileReader.scala | 79 +++ src/readers/src/main/scala/FileFormat.scala | 12 + src/readers/src/main/scala/ImageReader.scala | 63 ++ src/readers/src/main/scala/ReaderUtils.scala | 47 ++ src/readers/src/main/scala/Readers.scala | 50 ++ src/readers/src/main/scala/WasbReader.scala | 47 ++ .../test/scala/BinaryFileReaderSuite.scala | 44 ++ .../src/test/scala/ImageReaderSuite.scala | 75 +++ src/summarize-data/build.sbt | 1 + .../src/main/scala/SummarizeData.scala | 189 ++++++ .../src/test/scala/SummarizeDataSuite.scala | 52 ++ src/text-featurizer/build.sbt | 2 + .../src/main/scala/TextFeaturizer.scala | 442 ++++++++++++++ .../src/test/scala/TextFeaturizerSpec.scala | 86 +++ src/train-classifier/build.sbt | 3 + .../src/main/scala/TrainClassifier.scala | 367 ++++++++++++ .../test/scala/VerifyTrainClassifier.scala | 560 ++++++++++++++++++ .../src/test/scala/benchmarkMetrics.csv | 68 +++ src/train-regressor/build.sbt | 2 + .../src/main/scala/TrainRegressor.scala | 246 ++++++++ .../src/test/scala/VerifyTrainRegressor.scala | 184 ++++++ src/utils/build.sbt | 1 + .../src/main/scala/JarLoadingUtils.scala | 139 +++++ .../src/main/scala/ObjectUtilities.scala | 71 +++ .../src/main/scala/PipelineUtilities.scala | 55 ++ tools/bin/mml-exec | 37 ++ tools/build-pr/checkout | 58 ++ tools/build-pr/report | 39 ++ tools/build-pr/shared.sh | 47 ++ tools/config.sh | 274 +++++++++ tools/docker/Dockerfile | 54 ++ tools/docker/bin/EULA.txt | 203 +++++++ tools/docker/bin/eula | 13 + tools/docker/bin/eula.html | 54 ++ tools/docker/bin/eula.py | 37 ++ tools/docker/bin/launcher | 24 + tools/docker/build-docker | 49 ++ tools/docker/build-env | 28 + tools/hdi/install-mmlspark.sh | 165 ++++++ tools/hdi/setup-test-authkey.sh | 34 ++ tools/hdi/update_livy.py | 25 + tools/mmlspark-packages.spec | 67 +++ tools/notebook/postprocess.py | 110 ++++ tools/notebook/tester/NotebookTestSuite.py | 69 +++ tools/notebook/tester/TestNotebooksLocally.py | 36 ++ tools/notebook/tester/TestNotebooksOnHdi.py | 48 ++ tools/notebook/tester/parallel_run.sh | 32 + tools/pip/MANIFEST.in | 5 + tools/pip/README.txt | 8 + tools/pip/generate-pip.sh | 29 + tools/pip/setup.py | 33 ++ tools/pytests/auto-tests | 19 + tools/pytests/notebook-tests | 11 + tools/pytests/shared.sh | 16 + tools/runme/README.txt | 4 + tools/runme/build-readme.tmpl | 12 + tools/runme/build.sh | 249 ++++++++ tools/runme/install.sh | 206 +++++++ tools/runme/runme.sh | 51 ++ tools/runme/show-version | 7 + tools/runme/utils.sh | 450 ++++++++++++++ tools/tests/tags.sh | 70 +++ 219 files changed, 20154 insertions(+) create mode 100644 .gitignore create mode 100644 CONTRIBUTING.md create mode 100644 LICENSE create mode 100644 README.md create mode 100644 docs/developer-readme.md create mode 100644 docs/third-party-notices.txt create mode 100644 docs/your-first-model.md create mode 100644 notebooks/samples/101 - Adult Census Income Training.ipynb create mode 100644 notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb create mode 100644 notebooks/samples/103 - Before 
and After MMLSpark.ipynb create mode 100644 notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb create mode 100644 notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb create mode 100644 notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb create mode 100644 notebooks/samples/302 - Pipeline Image Transformations.ipynb create mode 100644 notebooks/tests/BasicDFOpsSmokeTest.ipynb create mode 100755 runme create mode 100644 src/.gitignore create mode 100644 src/.sbtopts create mode 100644 src/build.sbt create mode 100644 src/checkpoint-data/build.sbt create mode 100644 src/checkpoint-data/src/main/scala/CheckpointData.scala create mode 100644 src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala create mode 100644 src/cntk-model/build.sbt create mode 100644 src/cntk-model/src/main/python/CNTKModel.py create mode 100644 src/cntk-model/src/main/scala/CNTKModel.scala create mode 100644 src/cntk-model/src/test/scala/CNTKBindingSuite.scala create mode 100644 src/cntk-model/src/test/scala/CNTKModelSuite.scala create mode 100644 src/cntk-model/src/test/scala/CNTKTestUtils.scala create mode 100644 src/cntk-train/build.sbt create mode 100644 src/cntk-train/src/main/python/CNTKLearner.py create mode 100644 src/cntk-train/src/main/scala/BrainscriptBuilder.scala create mode 100644 src/cntk-train/src/main/scala/CNTKLearner.scala create mode 100644 src/cntk-train/src/main/scala/CommandBuilders.scala create mode 100644 src/cntk-train/src/main/scala/DataConversion.scala create mode 100644 src/cntk-train/src/main/scala/TypeMapping.scala create mode 100644 src/cntk-train/src/test/scala/ValidateCntkTrain.scala create mode 100644 src/cntk-train/src/test/scala/ValidateConfiguration.scala create mode 100644 src/cntk-train/src/test/scala/ValidateDataConversion.scala create mode 100644 src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala create mode 100644 src/codegen/build.sbt create mode 100644 src/codegen/src/main/scala/CodeGen.scala create mode 100644 src/codegen/src/main/scala/Config.scala create mode 100644 src/codegen/src/main/scala/PySparkWrapper.scala create mode 100644 src/codegen/src/main/scala/PySparkWrapperGenerator.scala create mode 100644 src/codegen/src/main/scala/PySparkWrapperTest.scala create mode 100644 src/compute-model-statistics/build.sbt create mode 100644 src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala create mode 100644 src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala create mode 100644 src/compute-per-instance-statistics/build.sbt create mode 100644 src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala create mode 100644 src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala create mode 100644 src/core/build.sbt create mode 100644 src/core/contracts/build.sbt create mode 100644 src/core/contracts/src/main/scala/Exceptions.scala create mode 100644 src/core/contracts/src/main/scala/Metrics.scala create mode 100644 src/core/contracts/src/main/scala/Params.scala create mode 100644 src/core/env/build.sbt create mode 100644 src/core/env/src/main/scala/CodegenTags.scala create mode 100644 src/core/env/src/main/scala/Configuration.scala create mode 100644 src/core/env/src/main/scala/EnvironmentUtils.scala create mode 100644 src/core/env/src/main/scala/FileUtilities.scala create mode 100644 src/core/env/src/main/scala/Logging.scala create mode 100644 src/core/env/src/main/scala/NativeLoader.java create mode 100644 
src/core/env/src/main/scala/ProcessUtilities.scala create mode 100644 src/core/hadoop/build.sbt create mode 100644 src/core/hadoop/src/main/scala/HadoopUtils.scala create mode 100644 src/core/ml/build.sbt create mode 100644 src/core/ml/src/test/scala/HashingTFSpec.scala create mode 100644 src/core/ml/src/test/scala/IDFSpec.scala create mode 100644 src/core/ml/src/test/scala/NGramSpec.scala create mode 100644 src/core/ml/src/test/scala/OneHotEncoderSpec.scala create mode 100644 src/core/ml/src/test/scala/Word2VecSpec.scala create mode 100644 src/core/schema/build.sbt create mode 100644 src/core/schema/src/main/python/TypeConversionUtils.py create mode 100644 src/core/schema/src/main/python/Utils.py create mode 100644 src/core/schema/src/main/scala/BinaryFileSchema.scala create mode 100644 src/core/schema/src/main/scala/Categoricals.scala create mode 100644 src/core/schema/src/main/scala/DatasetExtensions.scala create mode 100644 src/core/schema/src/main/scala/ImageSchema.scala create mode 100644 src/core/schema/src/main/scala/SchemaConstants.scala create mode 100644 src/core/schema/src/main/scala/SparkSchema.scala create mode 100644 src/core/schema/src/test/scala/TestCategoricals.scala create mode 100644 src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala create mode 100644 src/core/schema/src/test/scala/VerifySparkSchema.scala create mode 100644 src/core/spark/build.sbt create mode 100644 src/core/spark/src/main/scala/ArrayMapParam.scala create mode 100644 src/core/spark/src/main/scala/EstimatorParam.scala create mode 100644 src/core/spark/src/main/scala/FastVectorAssembler.scala create mode 100644 src/core/spark/src/main/scala/MapArrayParam.scala create mode 100644 src/core/spark/src/main/scala/MetadataUtilities.scala create mode 100644 src/core/spark/src/main/scala/TransformParam.scala create mode 100644 src/core/test/base/build.sbt create mode 100644 src/core/test/base/src/main/scala/SparkSessionFactory.scala create mode 100644 src/core/test/base/src/main/scala/TestBase.scala create mode 100644 src/core/test/build.sbt create mode 100644 src/core/test/datagen/build.sbt create mode 100644 src/core/test/datagen/src/main/scala/DatasetConstraints.scala create mode 100644 src/core/test/datagen/src/main/scala/DatasetOptions.scala create mode 100644 src/core/test/datagen/src/main/scala/GenerateDataType.scala create mode 100644 src/core/test/datagen/src/main/scala/GenerateDataset.scala create mode 100644 src/core/test/datagen/src/main/scala/GenerateRow.scala create mode 100644 src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala create mode 100644 src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala create mode 100644 src/data-conversion/build.sbt create mode 100644 src/data-conversion/src/main/scala/DataConversion.scala create mode 100644 src/data-conversion/src/test/scala/VerifyDataConversion.scala create mode 100644 src/downloader/build.sbt create mode 100644 src/downloader/src/main/python/ModelDownloader.py create mode 100644 src/downloader/src/main/scala/ModelDownloader.scala create mode 100644 src/downloader/src/main/scala/Schema.scala create mode 100644 src/downloader/src/test/scala/DownloaderSuite.scala create mode 100644 src/featurize/build.sbt create mode 100644 src/featurize/src/main/scala/AssembleFeatures.scala create mode 100644 src/featurize/src/main/scala/Featurize.scala create mode 100644 src/featurize/src/test/scala/VerifyFeaturize.scala create mode 100644 src/featurize/src/test/scala/benchmarkBasicDataTypes.json create mode 100644 
src/featurize/src/test/scala/benchmarkNoOneHot.json create mode 100644 src/featurize/src/test/scala/benchmarkOneHot.json create mode 100644 src/featurize/src/test/scala/benchmarkString.json create mode 100644 src/featurize/src/test/scala/benchmarkStringIndexOneHot.json create mode 100644 src/featurize/src/test/scala/benchmarkStringMissing.json create mode 100644 src/featurize/src/test/scala/benchmarkVectors.json create mode 100644 src/find-best-model/build.sbt create mode 100644 src/find-best-model/src/main/scala/FindBestModel.scala create mode 100644 src/find-best-model/src/test/scala/VerifyFindBestModel.scala create mode 100644 src/fuzzing/build.sbt create mode 100644 src/fuzzing/src/test/scala/Fuzzing.scala create mode 100644 src/image-featurizer/build.sbt create mode 100644 src/image-featurizer/src/main/scala/ImageFeaturizer.scala create mode 100644 src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala create mode 100644 src/image-transformer/build.sbt create mode 100644 src/image-transformer/src/main/python/ImageTransform.py create mode 100644 src/image-transformer/src/main/scala/ImageTransformer.scala create mode 100644 src/image-transformer/src/main/scala/UnrollImage.scala create mode 100644 src/image-transformer/src/test/scala/ImageTransformerSuite.scala create mode 100644 src/multi-column-adapter/build.sbt create mode 100644 src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala create mode 100644 src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala create mode 100644 src/partition-sample/build.sbt create mode 100644 src/partition-sample/src/main/scala/PartitionSample.scala create mode 100644 src/partition-sample/src/test/scala/VerifyPartitionSample.scala create mode 100644 src/pipeline-stages/build.sbt create mode 100644 src/pipeline-stages/src/main/scala/Repartition.scala create mode 100644 src/pipeline-stages/src/main/scala/SelectColumns.scala create mode 100644 src/pipeline-stages/src/test/scala/RepartitionSuite.scala create mode 100644 src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala create mode 100644 src/project/build.sbt create mode 100644 src/project/build.scala create mode 100644 src/project/lib-check.scala create mode 100644 src/project/meta.sbt create mode 100644 src/project/plugins.sbt create mode 100644 src/project/scalastyle.scala create mode 100644 src/readers/build.sbt create mode 100644 src/readers/src/main/python/BinaryFileReader.py create mode 100644 src/readers/src/main/python/ImageReader.py create mode 100644 src/readers/src/main/scala/AzureBlobReader.scala create mode 100644 src/readers/src/main/scala/AzureSQLReader.scala create mode 100644 src/readers/src/main/scala/BinaryFileReader.scala create mode 100644 src/readers/src/main/scala/FileFormat.scala create mode 100644 src/readers/src/main/scala/ImageReader.scala create mode 100644 src/readers/src/main/scala/ReaderUtils.scala create mode 100644 src/readers/src/main/scala/Readers.scala create mode 100644 src/readers/src/main/scala/WasbReader.scala create mode 100644 src/readers/src/test/scala/BinaryFileReaderSuite.scala create mode 100644 src/readers/src/test/scala/ImageReaderSuite.scala create mode 100644 src/summarize-data/build.sbt create mode 100644 src/summarize-data/src/main/scala/SummarizeData.scala create mode 100644 src/summarize-data/src/test/scala/SummarizeDataSuite.scala create mode 100644 src/text-featurizer/build.sbt create mode 100644 src/text-featurizer/src/main/scala/TextFeaturizer.scala create mode 100644 
src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala create mode 100644 src/train-classifier/build.sbt create mode 100644 src/train-classifier/src/main/scala/TrainClassifier.scala create mode 100644 src/train-classifier/src/test/scala/VerifyTrainClassifier.scala create mode 100644 src/train-classifier/src/test/scala/benchmarkMetrics.csv create mode 100644 src/train-regressor/build.sbt create mode 100644 src/train-regressor/src/main/scala/TrainRegressor.scala create mode 100644 src/train-regressor/src/test/scala/VerifyTrainRegressor.scala create mode 100644 src/utils/build.sbt create mode 100644 src/utils/src/main/scala/JarLoadingUtils.scala create mode 100644 src/utils/src/main/scala/ObjectUtilities.scala create mode 100644 src/utils/src/main/scala/PipelineUtilities.scala create mode 100755 tools/bin/mml-exec create mode 100755 tools/build-pr/checkout create mode 100755 tools/build-pr/report create mode 100644 tools/build-pr/shared.sh create mode 100644 tools/config.sh create mode 100644 tools/docker/Dockerfile create mode 100644 tools/docker/bin/EULA.txt create mode 100755 tools/docker/bin/eula create mode 100644 tools/docker/bin/eula.html create mode 100755 tools/docker/bin/eula.py create mode 100755 tools/docker/bin/launcher create mode 100755 tools/docker/build-docker create mode 100755 tools/docker/build-env create mode 100755 tools/hdi/install-mmlspark.sh create mode 100755 tools/hdi/setup-test-authkey.sh create mode 100755 tools/hdi/update_livy.py create mode 100644 tools/mmlspark-packages.spec create mode 100755 tools/notebook/postprocess.py create mode 100644 tools/notebook/tester/NotebookTestSuite.py create mode 100644 tools/notebook/tester/TestNotebooksLocally.py create mode 100644 tools/notebook/tester/TestNotebooksOnHdi.py create mode 100755 tools/notebook/tester/parallel_run.sh create mode 100644 tools/pip/MANIFEST.in create mode 100644 tools/pip/README.txt create mode 100755 tools/pip/generate-pip.sh create mode 100644 tools/pip/setup.py create mode 100755 tools/pytests/auto-tests create mode 100755 tools/pytests/notebook-tests create mode 100644 tools/pytests/shared.sh create mode 100644 tools/runme/README.txt create mode 100644 tools/runme/build-readme.tmpl create mode 100644 tools/runme/build.sh create mode 100644 tools/runme/install.sh create mode 100755 tools/runme/runme.sh create mode 100755 tools/runme/show-version create mode 100644 tools/runme/utils.sh create mode 100755 tools/tests/tags.sh diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..01eb12d9e8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# include BuildArtifacts.zip which is used in some parts of the build +/BuildArtifacts* +/TestResults +# accommodate installing the build environment locally +/pkgs/ +# useful env configurations +/tools/local-config.sh + +# Generated by tools/build-pr +/.build-pr + +# Ignore these for safety +*.class +*.jar +*.log +*.tgz +*.zip +*.exe +*.pyc +*.pyo + +# Generic editors +.vscode + +# Common things +*~ +.#* +.*.swp +.DS_Store diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..a0694bcc4a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,38 @@ +## Interested in contributing to MMLSpark? We're excited to work with you. + +### You can contribute in many ways + +* Use the library and give feedback +* Report a bug +* Request a feature +* Fix a bug +* Add examples and documentation +* Code a new feature +* Review pull requests + +### How to contribute? 
+ +You can give feedback, report bugs, and request new features anytime by +opening an issue. Also, you can up-vote and comment on existing issues. + +To make a pull request into the repo, such as bug fixes, documentation, +or new features, follow these steps: + +* If it's a new feature, open an issue for preliminary discussion with + us, to ensure your contribution is a good fit and doesn't duplicate + ongoing work. +* Typically, you'll need to accept the Microsoft Contributor License + Agreement (CLA). +* Familiarize yourself with the coding style and guidelines. +* Fork the repository, code your contribution, and create a pull + request. +* Wait for an MMLSpark team member to review and accept it. Be patient + as we iron out the process for a new project. + +A good way to get started contributing is to look for issues with a "help +wanted" label. These are issues that we do want to fix, but don't have +the resources to work on currently. + +*Apache®, Apache Spark, and Spark® are either registered trademarks or +trademarks of the Apache Software Foundation in the United States and/or other +countries.* diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000..e2704e7bac --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) Microsoft Corporation. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000000..feeaff1203 --- /dev/null +++ b/README.md @@ -0,0 +1,177 @@ +# Microsoft Machine Learning for Apache Spark + + + +MMLSpark provides a number of deep learning and data science tools for [Apache +Spark](https://github.com/apache/spark), including seamless integration of Spark +Machine Learning pipelines with [Microsoft Cognitive Toolkit +(CNTK)](https://github.com/Microsoft/CNTK) and [OpenCV](http://www.opencv.org/), +enabling you to quickly create powerful, highly-scalable predictive and +analytical models for large image and text datasets. + +MMLSpark requires Scala 2.11, Spark 2.1+, and either Python 2.7 or +Python 3.5+. See the API documentation +[for Scala](http://mmlspark.azureedge.net/docs/scala/) and +[for PySpark](http://mmlspark.azureedge.net/docs/pyspark/).
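+ +As a quick sanity check after installation, the following minimal sketch can be run from a PySpark shell or notebook; it assumes the package has been installed as described in the setup section below and that the interactive `spark` session PySpark provides is available: + + ```python + import sys + print(sys.version_info) # expect Python 2.7 or 3.5+ + print(spark.version) # expect Spark 2.1 or later + import mmlspark # succeeds once the MMLSpark package is installed + ```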
+ + +## Salient features + +* Easily ingest images from HDFS into a Spark `DataFrame` ([example:301]) +* Pre-process image data using transforms from OpenCV ([example:302]) +* Featurize images with pre-trained deep neural nets using CNTK ([example:301]) +* Train DNN-based image classification models on N-Series GPU VMs on Azure + ([example:301]) +* Featurize free-form text data using convenient APIs on top of primitives in + SparkML via a single transformer ([example:201]) +* Train classification and regression models easily via implicit featurization + of data ([example:101]) +* Compute a rich set of evaluation metrics including per-instance metrics + ([example:102]) + +See our [notebooks](notebooks/samples/) for all examples. + +[example:101]: notebooks/samples/101%20-%20Adult%20Census%20Income%20Training.ipynb + "Adult Census Income Training" +[example:102]: notebooks/samples/102%20-%20Regression%20Example%20with%20Flight%20Delay%20Dataset.ipynb + "Regression Example with Flight Delay Dataset" +[example:201]: notebooks/samples/201%20-%20Amazon%20Book%20Reviews%20-%20TextFeaturizer.ipynb + "Amazon Book Reviews - TextFeaturizer" +[example:301]: notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb + "CIFAR10 CNTK CNN Evaluation" +[example:302]: notebooks/samples/302%20-%20Pipeline%20Image%20Transformations.ipynb + "Pipeline Image Transformations" + + +## A short example + +Below is an excerpt from a simple example of using a pre-trained CNN to classify +images in the CIFAR-10 dataset. View the whole source code as [an example +notebook](notebooks/samples/301%20-%20CIFAR10%20CNTK%20CNN%20Evaluation.ipynb). + + ```python + ... + import mmlspark as mml + # Initialize CNTKModel and define input and output columns + cntkModel = mml.CNTKModel().setInputCol("images").setOutputCol("output").setModelLocation(modelFile) + # Score the dataset with the model via an internal Spark pipeline + scoredImages = cntkModel.transform(imagesWithLabels) + ... + ``` + +See [other sample notebooks](notebooks/samples/) as well as the MMLSpark +documentation for [Scala](http://mmlspark.azureedge.net/docs/scala/) +and [PySpark](http://mmlspark.azureedge.net/docs/pyspark/). + + +## Setup and installation + +### Docker + +The easiest way to evaluate MMLSpark is via our pre-built Docker container. To +do so, run the following command: + + docker run -it -p 8888:8888 microsoft/mmlspark + +Navigate to <http://localhost:8888> in your web browser to run the sample +notebooks. See the +[documentation](http://mmlspark.azureedge.net/docs/pyspark/install.html) +for more on Docker use. + +> Note: If you wish to run a new instance of the Docker image, make sure you +> stop & remove the container with the name `my-mml` (using `docker rm my-mml`) +> before you try to run a new instance, or run it with a `--rm` flag. + +### Spark package + +MMLSpark can be conveniently installed on existing Spark clusters via the +`--packages` option, for example: + + spark-shell --packages com.microsoft.ml.spark:mmlspark_2.11:0.5 \ + --repositories=https://mmlspark.azureedge.net/maven + + pyspark --packages com.microsoft.ml.spark:mmlspark_2.11:0.5 \ + --repositories=https://mmlspark.azureedge.net/maven + + spark-submit --packages com.microsoft.ml.spark:mmlspark_2.11:0.5 \ + --repositories=https://mmlspark.azureedge.net/maven \ + MyApp.jar + + + +### HDInsight + +To install MMLSpark on an existing [HDInsight Spark +Cluster](https://docs.microsoft.com/en-us/azure/hdinsight/), you can execute a +script action on the cluster head and worker nodes.
For instructions on running +script actions, see [this +guide](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-linux#use-a-script-action-during-cluster-creation). + +The script action URL is: + . + +If you're using the Azure Portal to run the script action, go to `Script +actions` ⇒ `Submit new` in the `Overview` section of your cluster blade. In the +`Bash script URI` field, input the script action URL provided above. Mark the +rest of the options as shown on the screenshot to the right. + +Submit, and the cluster should finish configuring within 10 minutes or so. + +### Databricks cloud + +To install MMLSpark on the +[Databricks cloud](http://community.cloud.databricks.com), create a new +[library from Maven coordinates](https://docs.databricks.com/user-guide/libraries.html#libraries-from-maven-pypi-or-spark-packages) +in your workspace. + +For the coordinates use: `com.microsoft.ml.spark:mmlspark:0.5`. Then, under +Advanced Options, use `https://mmlspark.azureedge.net/maven` for the repository. +Ensure this library is attached to all clusters you create. + +Finally, ensure that your Spark cluster has at least Spark 2.1 and Scala 2.11. + +You can use MMLSpark in both your Scala and PySpark notebooks. + +### SBT + +If you are building a Spark application in Scala, add the following lines to +your `build.sbt`: + + ```scala + resolvers += "MMLSpark Repo" at "https://mmlspark.azureedge.net/maven" + libraryDependencies += "com.microsoft.ml.spark" %% "mmlspark" % "0.5" + ``` + +### Building from source + +You can also easily create your own build by cloning this repo and using the main +build script: `./runme`. Run it once to install the needed dependencies, and +again to do a build. See [this guide](docs/developer-readme.md) for more +information. + + +## Contributing & feedback + +This project has adopted the [Microsoft Open Source Code of +Conduct](https://opensource.microsoft.com/codeofconduct/). For more information +see the [Code of Conduct +FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact +[opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional +questions or comments. + +See [CONTRIBUTING.md](CONTRIBUTING.md) for contribution guidelines. + +To give feedback and/or report an issue, open a [GitHub +Issue](https://help.github.com/articles/creating-an-issue/). + + +## Other relevant projects + +* [Microsoft Cognitive Toolkit](https://github.com/Microsoft/CNTK) + +* [Azure Machine Learning + Operationalization](https://github.com/Azure/Machine-Learning-Operationalization) + +*Apache®, Apache Spark, and Spark® are either registered trademarks or +trademarks of the Apache Software Foundation in the United States and/or other +countries.* diff --git a/docs/developer-readme.md b/docs/developer-readme.md new file mode 100644 index 0000000000..04d0e97a03 --- /dev/null +++ b/docs/developer-readme.md @@ -0,0 +1,80 @@ +# MMLSpark + +## Repository Layout + +* `runme`: main build entry point +* `src/`: Scala and Python sources + - `core/`: shared functionality + - `project/`: sbt build-related materials +* `tools/`: build-related tools + + +## Build + +### Build Environment + +Currently, this code is developed and built on Linux. The main build entry +point, `./runme`, will install the needed packages. When everything is +installed, you can use `./runme` again to do a build. + + +### Development + +From now on, you can continue using `./runme` for builds. Alternatively, use +`sbt full-build` to do the build directly through SBT.
The output will show +the individual steps that are running, and you can use them directly as usual +with SBT. For example, use `sbt "project foo-bar" test` to run the tests of +the `foo-bar` sub-project, or `sbt ~compile` to do a full compilation step +whenever any file changes. + +Note that the SBT environment is set up in a way that makes *all* code in +`com.microsoft.ml.spark` available in the Scala console that you get when you +run `sbt console`. This can be a very useful debugging tool, since you get to +play with your code in an interactive REPL. + +Every once in a while the installed libraries will be updated. In this case, +executing `./runme` will update the libraries, and the next run will do a build +as usual. If you're using `sbt` directly, it will warn you whenever there was +a change to the library configurations. + +Note: the libraries are all installed in `$HOME/lib` with a few +executable symlinks in `$HOME/bin`. The environment is configured in +`$HOME/.mmlspark_profile` which will be executed whenever a shell starts. +Occasionally, `./runme` will tell you that there was an update to the +`.mmlspark_profile` file --- when this happens, you can start a new shell +to get the updated version, but you can also apply the changes to your +running shell with `. ~/.mmlspark_profile` which will evaluate its +contents and save a shell restart. + + +## Adding a Module + +To add a new module, create a directory with an appropriate name, and in the +new directory create a `build.sbt` file. The contents of `build.sbt` are +optional, and the file can be completely empty: its presence will make the build +include your directory as a sub-project in SBT work. + +You can put the usual SBT customizations in your `build.sbt`, for example: + + version := "1.0" + name := "A Useful Module" + +In addition, there are a few utilities in `Extras` that can be useful to +specify some things. Currently, there is only one such utility: + + Extras.noJar + +putting this in your `build.sbt` indicates that no `.jar` file should be +created for your sub-project in the `package` step. (Useful, for example, for +build tools and test-only directories.) + +Finally, whenever SBT runs it generates an `autogen.sbt` file that specifies +the sub-projects. This file is generated automatically so there is no need to +edit a central file when you add a module, and therefore customizing what +appears in it is done via "meta comments" in your `build.sbt`. This is +currently used to specify dependencies for your sub-project --- in most cases +you will want to add this: + + //> DependsOn: core + +to use the shared code in the `core` sub-project. diff --git a/docs/third-party-notices.txt b/docs/third-party-notices.txt new file mode 100644 index 0000000000..58540ba262 --- /dev/null +++ b/docs/third-party-notices.txt @@ -0,0 +1,298 @@ +================================================================================ +*** OpenCV +================================================================================ + +By downloading, copying, installing or using the software you agree to +this license. If you do not agree to this license, do not download, +install, copy or use the software. + + + License Agreement + For Open Source Computer Vision Library + (3-clause BSD License) + +Copyright (C) 2000-2016, Intel Corporation, all rights reserved. +Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved. +Copyright (C) 2009-2016, NVIDIA Corporation, all rights reserved.
+Copyright (C) 2010-2013, Advanced Micro Devices, Inc., all rights reserved. +Copyright (C) 2015-2016, OpenCV Foundation, all rights reserved. +Copyright (C) 2015-2016, Itseez Inc., all rights reserved. +Third party copyrights are property of their respective owners. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the names of the copyright holders nor the names of the contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +This software is provided by the copyright holders and contributors "as +is" and any express or implied warranties, including, but not limited +to, the implied warranties of merchantability and fitness for a +particular purpose are disclaimed. In no event shall copyright holders +or contributors be liable for any direct, indirect, incidental, special, +exemplary, or consequential damages (including, but not limited to, +procurement of substitute goods or services; loss of use, data, or +profits; or business interruption) however caused and on any theory of +liability, whether in contract, strict liability, or tort (including +negligence or otherwise) arising in any way out of the use of this +software, even if advised of the possibility of such damage. + + + +================================================================================ +*** File with code "taken from" PCL library +================================================================================ + +Software License Agreement (BSD License) + +Point Cloud Library (PCL) - www.pointclouds.org +Copyright (c) 2009-2012, Willow Garage, Inc. +Copyright (c) 2012-, Open Perception, Inc. +Copyright (c) XXX, respective authors. + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder(s) nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +================================================================================ +*** KAZE +================================================================================ + +Copyright (c) 2012, Pablo Fernández Alcantarilla +All Rights Reserved + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + + * Neither the name of the copyright holders nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +================================================================================ +*** libwebp +================================================================================ + +Copyright (c) 2010, Google Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Google nor the names of its contributors may be + used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED +TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Additional IP Rights Grant (Patents) +------------------------------------ + +"These implementations" means the copyrightable works that implement the +WebM codecs distributed by Google as part of the WebM Project. + +Google hereby grants to you a perpetual, worldwide, non-exclusive, no-charge, +royalty-free, irrevocable (except as stated in this section) patent license to +make, have made, use, offer to sell, sell, import, transfer, and otherwise +run, modify and propagate the contents of these implementations of WebM, where +such license applies only to those patent claims, both currently owned by +Google and acquired in the future, licensable by Google that are necessarily +infringed by these implementations of WebM. This grant does not include claims +that would be infringed only as a consequence of further modification of these +implementations. If you or your agent or exclusive licensee institute or order +or agree to the institution of patent litigation or any other patent +enforcement activity against any entity (including a cross-claim or +counterclaim in a lawsuit) alleging that any of these implementations of WebM +or any code incorporated within any of these implementations of WebM +constitute direct or contributory patent infringement, or inducement of +patent infringement, then any patent rights granted to you under this License +for these implementations of WebM shall terminate as of the date such +litigation is filed." + + + +================================================================================ +*** File with code "based on" a message of Laurent Pinchart on the +*** video4linux mailing list +================================================================================ + +LEGAL ISSUES +============ + +In plain English: + +1. We don't promise that this software works. (But if you find any + bugs, please let us know!) +2. You can use this software for whatever you want. You don't have to + pay us. +3. You may not pretend that you wrote this software. If you use it in a + program, you must acknowledge somewhere in your documentation that + you've used the IJG code. + +In legalese: + +The authors make NO WARRANTY or representation, either express or +implied, with respect to this software, its quality, accuracy, +merchantability, or fitness for a particular purpose. This software is +provided "AS IS", and you, its user, assume the entire risk as to its +quality and accuracy. + +This software is copyright (C) 1991-2013, Thomas G. Lane, Guido +Vollbeding. All Rights Reserved except as specified below. + +Permission is hereby granted to use, copy, modify, and distribute this +software (or portions thereof) for any purpose, without fee, subject to +these conditions: +(1) If any part of the source code for this software is distributed, + then this README file must be included, with this copyright and + no-warranty notice unaltered; and any additions, deletions, or + changes to the original files must be clearly indicated in + accompanying documentation. 
+(2) If only executable code is distributed, then the accompanying + documentation must state that "this software is based in part on the + work of the Independent JPEG Group". +(3) Permission for use of this software is granted only if the user + accepts full responsibility for any undesirable consequences; the + authors accept NO LIABILITY for damages of any kind. + +These conditions apply to any software derived from or based on the IJG +code, not just to the unmodified library. If you use our work, you +ought to acknowledge us. + +Permission is NOT granted for the use of any IJG author's name or +company name in advertising or publicity relating to this software or +products derived from it. This software may be referred to only as "the +Independent JPEG Group's software". + +We specifically permit and encourage the use of this software as the +basis of commercial products, provided that all warranty or liability +claims are assumed by the product vendor. + +The Unix configuration script "configure" was produced with GNU +Autoconf. It is copyright by the Free Software Foundation but is freely +distributable. The same holds for its supporting scripts (config.guess, +config.sub, ltmain.sh). Another support script, install-sh, is +copyright by X Consortium but is also freely distributable. + +The IJG distribution formerly included code to read and write GIF files. +To avoid entanglement with the Unisys LZW patent, GIF reading support +has been removed altogether, and the GIF writer has been simplified to +produce "uncompressed GIFs". This technique does not use the LZW +algorithm; the resulting GIF files are larger than usual, but are +readable by all standard GIF decoders. + +We are required to state that + "The Graphics Interchange Format(c) is the Copyright property of + CompuServe Incorporated. GIF(sm) is a Service Mark property of + CompuServe Incorporated." + + + +================================================================================ +*** File with code copyright Yossi Rubner, as well as code copyright +*** MD-Mathematische Dienste GmbH +================================================================================ + + Copyright (c) 2002, + MD-Mathematische Dienste GmbH + Im Defdahl 5-10 + 44141 Dortmund + Germany + www.md-it.de + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. Redistributions +in binary form must reproduce the above copyright notice, this list of +conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. The name of Contributor +may not be used to endorse or promote products derived from this +software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF +THE POSSIBILITY OF SUCH DAMAGE. diff --git a/docs/your-first-model.md b/docs/your-first-model.md new file mode 100644 index 0000000000..6bafdb1cc0 --- /dev/null +++ b/docs/your-first-model.md @@ -0,0 +1,109 @@ +## Your First Model + +In this example, we construct a basic classification model to predict a person's +income level given demographics data such as education level or marital status. +We also learn how to use Jupyter notebooks for developing and running the model. + + +### Prerequisites + +* You have installed the MMLSpark package, either as a Docker image or on a + Spark cluster, +* You have basic knowledge of the Python language, +* You have a basic understanding of machine learning concepts: training, testing, + classification. + + +### Working with Jupyter Notebooks + +Once you have the MMLSpark package installed, open the Jupyter notebooks folder in +your web browser: + +* Local Docker: `http://localhost:8888` +* Spark cluster: `https:///jupyter` + +Create a new notebook by selecting "New" -> "PySpark3". Let's also give the +notebook a friendlier name, *Adult Census Income Prediction*, by clicking the +title. + + +### Importing Packages and Starting the Spark Application + +At this point, the notebook is not yet running a Spark application. In the +first cell, let's import some needed packages: + + import numpy as np + import pandas as pd + +Click the "run cell" button on the toolbar to start the application. After a +few moments, you should see the message "SparkSession available as 'spark'". +Now you're ready to start coding and running your application. + + +### Reading in Data + +In a typical Spark application, you'd likely work with huge datasets stored on a +distributed file system, such as HDFS. However, to keep this tutorial simple +and quick, we'll copy over a small dataset from a URL. We then read this data +into memory using the Pandas CSV reader, and distribute the data as a Spark +DataFrame. Finally, we show the first 5 rows of the dataset. Copy the following +code to the next cell in your notebook, and run the cell. + + dataFile = "AdultCensusIncome.csv" + import os, urllib.request + if not os.path.isfile(dataFile): + urllib.request.urlretrieve("https://mmlspark.azureedge.net/datasets/" + dataFile, dataFile) + data = spark.createDataFrame(pd.read_csv(dataFile, dtype={" hours-per-week": np.float64})) + data.show(5) + + +### Selecting Features and Splitting Data into Train and Test Sets + +Next, select some features to use in our model. You can try out different +features, but you should include `" income"` as it is the label column the model +is trying to predict. We then split the data into `train` and `test` sets. + + data = data.select([" education", " marital-status", " hours-per-week", " income"]) + train, test = data.randomSplit([0.75, 0.25], seed=123) + + +### Training a Model + +To train the classifier model, we use the `mmlspark.TrainClassifier` class. It +takes in training data and a base SparkML classifier, maps the data into the +format expected by the base classifier algorithm, and fits a model.
+ + from mmlspark.TrainClassifier import TrainClassifier + from pyspark.ml.classification import LogisticRegression + model = TrainClassifier(model=LogisticRegression(), labelCol=" income").fit(train) + +Note that `TrainClassifier` implicitly handles string-valued columns and +binarizes the label column. + + +### Scoring and Evaluating the Model + +Finally, let's score the model against the test set, and use the +`mmlspark.ComputeModelStatistics` class to compute metrics — accuracy, AUC, +precision, recall — from the scored data. + + from mmlspark.ComputeModelStatistics import ComputeModelStatistics + prediction = model.transform(test) + metrics = ComputeModelStatistics().transform(prediction) + metrics.select('accuracy').show() + +And that's it: you've built your first machine learning model using the MMLSpark +package. For help on mmlspark classes and methods, you can use Python's help() +function, for example: + + help(mmlspark.TrainClassifier) + +Next, view our other tutorials to learn how to: +* Tune model parameters to find the best model +* Use SparkML pipelines to build a more complex model +* Use deep neural networks for image classification +* Use text analytics for document classification + +*Apache®, Apache Spark, and Spark® are either registered trademarks or +trademarks of the Apache Software Foundation in the United States and/or other +countries.* diff --git a/notebooks/samples/101 - Adult Census Income Training.ipynb b/notebooks/samples/101 - Adult Census Income Training.ipynb new file mode 100644 index 0000000000..58d0239476 --- /dev/null +++ b/notebooks/samples/101 - Adult Census Income Training.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we try to predict incomes from the *Adult Census* dataset.\n", + "\n", + "First, we import the packages (use `help(mmlspark)` to view contents):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import mmlspark\n", + "\n", + "# help(mmlspark)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's read the data and split it into train and test sets:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dataFile = \"AdultCensusIncome.csv\"\n", + "import os, urllib.request\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "data = spark.createDataFrame(pd.read_csv(dataFile, dtype={\" hours-per-week\": np.float64}))\n", + "data = data.select([\" education\", \" marital-status\", \" hours-per-week\", \" income\"])\n", + "train, test = data.randomSplit([0.75, 0.25], seed=123)\n", + "train.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`TrainClassifier` can be used to initialize and fit a model; it wraps SparkML classifiers.\n", + "You can use `help(mmlspark.TrainClassifier)` to view the different parameters.\n", + "\n", + "Note that it implicitly converts the data into the format expected by the algorithm: it tokenizes", + " and hashes strings, one-hot encodes categorical variables, assembles the features into a vector,", + " and so on. The parameter `numFeatures` controls the number of hashed features."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.TrainClassifier import TrainClassifier\n", + "from pyspark.ml.classification import LogisticRegression\n", + "model = TrainClassifier(model=LogisticRegression(), labelCol=\" income\", numFeatures=256).fit(train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the model is trained, we score it against the test dataset and view metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "prediction = model.transform(test)\n", + "metrics = ComputeModelStatistics().transform(prediction)\n", + "metrics.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we save the model so it can be used in a scoring program." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model.write().overwrite().save(\"AdultCensus.mml\")" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb b/notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb new file mode 100644 index 0000000000..b835110085 --- /dev/null +++ b/notebooks/samples/102 - Regression Example with Flight Delay Dataset.ipynb @@ -0,0 +1,161 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this example, we run a linear regression on the *Flight Delay* dataset to predict the delay times.\n", + "\n", + "We demonstrate how to use the `TrainRegressor` and the `ComputePerInstanceStatistics` APIs.\n", + "\n", + "First, import the packages." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "import mmlspark" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, import the CSV dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# load raw data from small-sized 30 MB CSV file (trimmed to contain just what we use)\n", + "dataFile = \"On_Time_Performance_2012_9.csv\"\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "flightDelay = spark.createDataFrame(\n", + " pd.read_csv(dataFile, dtype={\"Month\": np.float64, \"Quarter\": np.float64,\n", + " \"DayofMonth\": np.float64, \"DayOfWeek\": np.float64,\n", + " \"OriginAirportID\": np.float64, \"DestAirportID\": np.float64,\n", + " \"CRSDepTime\": np.float64, \"CRSArrTime\": np.float64}))\n", + "# Print information on the dataset we loaded\n", + "print(\"records read: \" + str(flightDelay.count()))\n", + "print(\"Schema:\")\n", + "flightDelay.printSchema()\n", + "flightDelay.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the dataset into train and test sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train,test = flightDelay.randomSplit([0.75, 0.25])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train a regressor on dataset with `l-bfgs`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.TrainRegressor import TrainRegressor\n", + "from pyspark.ml.regression import LinearRegression\n", + "lr = LinearRegression().setSolver(\"l-bfgs\").setRegParam(0.1).setElasticNetParam(0.3)\n", + "model = TrainRegressor(model=lr, labelCol=\"ArrDelay\", numFeatures=1 << 18).fit(train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Score the regressor on the test data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "scoredData = model.transform(test)\n", + "scoredData.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, compute and show per-instance statistics, demonstrating the usage", + " of `ComputePerInstanceStatistics`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import ComputePerInstanceStatistics\n", + "evalPerInstance = ComputePerInstanceStatistics().transform(scoredData)\n", + "evalPerInstance.select(\"ArrDelay\", \"Scores\", \"L1_loss\", \"L2_loss\").limit(10).toPandas()" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/103 - Before and After MMLSpark.ipynb b/notebooks/samples/103 - Before and After MMLSpark.ipynb new file mode 100644 index 0000000000..0542221ea2 --- /dev/null +++ b/notebooks/samples/103 - Before and After MMLSpark.ipynb @@ -0,0 +1,286 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Introduction\n", + "\n", + "
", + "\n", + "In this tutorial, we perform the same classification task in two\n", + "diffeerent ways: once using plain **`pyspark`** and once using the\n", + "**`mmlspark`** library. The two methods yield the same performance,\n", + "but one of the two libraries is drastically simpler to use and iterate\n", + "on (can you guess which one?).\n", + "\n", + "The task is simple: Predict whether a user's review of a book sold on\n", + "Amazon is good (rating > 3) or bad based on the text of the review. We\n", + "accomplish this by training LogisticRegression learners with different\n", + "hyperparameters and choosing the best model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Read the data\n", + "\n", + "We download and read in the data. We show a sample below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import mmlspark\n", + "from pyspark.sql.types import IntegerType, StringType, StructType, StructField\n", + "\n", + "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n", + "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n", + " StructField(\"text\", StringType(), False)])\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "raw_data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n", + "raw_data.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Extract more features and process data\n", + "\n", + "Real data however is more complex than the above dataset. It is common\n", + "for a dataset to have features of multiple types: text, numeric,\n", + "categorical. To illustrate how difficult it is to work with these\n", + "datasets, we add two numerical features to the dataset: the **word\n", + "count** of the review and the **mean word length**." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import LongType, FloatType, DoubleType\n", + "def word_count(s):\n", + " return len(s.split())\n", + "def word_length(s):\n", + " import numpy as np\n", + " ss = [len(w) for w in s.split()]\n", + " return round(float(np.mean(ss)), 2)\n", + "word_length_udf = udf(word_length, DoubleType())\n", + "word_count_udf = udf(word_count, IntegerType())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = raw_data \\\n", + " .select(\"rating\", \"text\",\n", + " word_count_udf(\"text\").alias(\"wordCount\"),\n", + " word_length_udf(\"text\").alias(\"wordLength\")) \\\n", + " .withColumn(\"label\", raw_data[\"rating\"] > 3).drop(\"rating\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.show(5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4a. Classify using pyspark\n", + "\n", + "To choose the best LogisticRegression classifier using the `pyspark`\n", + "library, need to *explictly* perform the following steps:\n", + "\n", + "1. 
Process the features:\n", + " * Tokenize the text column\n", + " * Hash the tokenized column into a vector using hashing\n", + " * Merge the numeric features with the vector in the step above\n", + "2. Process the label column: cast it into the proper type.\n", + "3. Train multiple LogisticRegression algorithms on the `train` dataset\n", + " with different hyperparameters\n", + "4. Compute the area under the ROC curve for each of the trained models\n", + " and select the model with the highest metric as computed on the\n", + " `test` dataset\n", + "5. Evaluate the best model on the `validation` set\n", + "\n", + "As you can see below, there is a lot of work involved and a lot of\n", + "steps where something can go wrong!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml.feature import Tokenizer, HashingTF\n", + "from pyspark.ml.feature import VectorAssembler\n", + "\n", + "# Featurize text column\n", + "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"tokenizedText\")\n", + "numFeatures = 10000\n", + "hashingScheme = HashingTF(inputCol=\"tokenizedText\",\n", + " outputCol=\"TextFeatures\",\n", + " numFeatures=numFeatures)\n", + "tokenizedData = tokenizer.transform(data)\n", + "featurizedData = hashingScheme.transform(tokenizedData)\n", + "\n", + "# Merge text and numeric features in one feature column\n", + "feature_columns_array = [\"TextFeatures\", \"wordCount\", \"wordLength\"]\n", + "assembler = VectorAssembler(\n", + " inputCols = feature_columns_array,\n", + " outputCol=\"features\")\n", + "assembledData = assembler.transform(featurizedData)\n", + "\n", + "# Select only columns of interest\n", + "# Convert rating column from boolean to int\n", + "processedData = assembledData \\\n", + " .select(\"label\", \"features\") \\\n", + " .withColumn(\"label\", assembledData.label.cast(IntegerType()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", + "from pyspark.ml.classification import LogisticRegression\n", + "\n", + "# Prepare data for learning\n", + "train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20], seed=123)\n", + "\n", + "# Train the models on the 'train' data\n", + "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", + " for hyperParam in lrHyperParams]\n", + "evaluator = BinaryClassificationEvaluator(rawPredictionCol=\"rawPrediction\",\n", + " metricName=\"areaUnderROC\")\n", + "metrics = []\n", + "models = []\n", + "\n", + "# Select the best model\n", + "for learner in logisticRegressions:\n", + " model = learner.fit(train)\n", + " models.append(model)\n", + " scored_data = model.transform(test)\n", + " metrics.append(evaluator.evaluate(scored_data))\n", + "best_metric = max(metrics)\n", + "best_model = models[metrics.index(best_metric)]\n", + "\n", + "# Save model\n", + "best_model.write().overwrite().save(\"SparkMLExperiment.mmls\")\n", + "# Get AUC on the validation dataset\n", + "scored_val = best_model.transform(validation)\n", + "print(evaluator.evaluate(scored_val))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4b. Classify using mmlspark\n", + "\n", + "Life is a lot simpler when using `mmlspark`!\n", + "\n", + "1. 
The **`TrainClassifier`** Estimator featurizes the data internally,\n", + " as long as the columns selected in the `train`, `test`, `validation`\n", + " dataset represent the features\n", + "\n", + "2. The **`FindBestModel`** Estimator find the best model from a pool of\n", + " trained models by find the model which performs best on the `test`\n", + " dataset given the specified metric\n", + "\n", + "3. The **`CompueModelStatistics`** Transformer computes the different\n", + " metrics on a scored dataset (in our case, the `validation` dataset)\n", + " at the same time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import TrainClassifier, FindBestModel, ComputeModelStatistics\n", + "\n", + "# Prepare data for learning\n", + "train, test, validation = data.randomSplit([0.60, 0.20, 0.20], seed=123)\n", + "\n", + "# Train the models on the 'train' data\n", + "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", + " for hyperParam in lrHyperParams]\n", + "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\", numFeatures=10000).fit(train)\n", + " for lrm in logisticRegressions]\n", + "\n", + "# Select the best model\n", + "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)\n", + "\n", + "# Save model\n", + "bestModel.write().overwrite().save(\"MMLSExperiment.mmls\")\n", + "# Get AUC on the validation dataset\n", + "predictions = bestModel.transform(validation)\n", + "metrics = ComputeModelStatistics().transform(predictions)\n", + "print(metrics.first()[\"AUC\"])" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb b/notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb new file mode 100644 index 0000000000..2d1013747a --- /dev/null +++ b/notebooks/samples/201 - Amazon Book Reviews - TextFeaturizer.ipynb @@ -0,0 +1,186 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Again, try to predict Amazon book ratings greater than 3 out of 5, this time usaging the", + " `TextFeaturizer` module which is a composition of several text analytics APIs that are", + " native to Spark." 
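To make the "composition" concrete, the sketch below shows the kind of plain Spark pipeline that such a text featurizer stands in for: tokenization, stop-word removal, term hashing, and IDF weighting. This is an illustration under assumptions, not the module's actual implementation, and it can only be fitted once the `data` DataFrame is loaded in the cells below.

    # Hand-rolled text featurization with core Spark stages (illustrative only).
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

    manualFeaturizer = Pipeline(stages=[
        Tokenizer(inputCol="text", outputCol="tokens"),
        StopWordsRemover(inputCol="tokens", outputCol="filtered"),
        HashingTF(inputCol="filtered", outputCol="tf", numFeatures=1 << 16),
        IDF(inputCol="tf", outputCol="features", minDocFreq=5)])
    # Later: manualFeaturizer.fit(data).transform(data), once `data` exists.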
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import mmlspark\n", + "from pyspark.sql.types import IntegerType, StringType, StructType, StructField" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n", + "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n", + " StructField(\"text\", StringType(), False)])\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n", + "data.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `TextFeaturizer` to generate our features column. We remove stop words, and use TF-IDF", + " to generate 2²⁰ sparse features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.TextFeaturizer import TextFeaturizer\n", + "textFeaturizer = TextFeaturizer() \\\n", + " .setInputCol(\"text\").setOutputCol(\"features\") \\\n", + " .setUseStopWordsRemover(True).setUseIDF(True).setMinDocFreq(5).setNumFeatures(1 << 16).fit(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "processedData = textFeaturizer.transform(data)\n", + "processedData.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Change the label so that we can predict whether the rating is greater than 3 using a binary", + " classifier." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "processedData = processedData.withColumn(\"label\", processedData[\"rating\"] > 3) \\\n", + " .select([\"features\", \"label\"])\n", + "processedData.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train several Logistic Regression models with different regularizations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20])\n", + "from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", + "\n", + "lrHyperParams = [0.05, 0.1, 0.2, 0.4]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam) for hyperParam in lrHyperParams]\n", + "\n", + "from mmlspark.TrainClassifier import TrainClassifier\n", + "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(train) for lrm in logisticRegressions]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find the model with the best AUC on the test set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import FindBestModel\n", + "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=lrmodels).fit(test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use the optimized `ComputeModelStatistics` API to find the model accuracy." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "predictions = bestModel.transform(validation)\n", + "metrics = ComputeModelStatistics().transform(predictions)\n", + "metrics.first()[\"accuracy\"]" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb b/notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb new file mode 100644 index 0000000000..30d5aa7d64 --- /dev/null +++ b/notebooks/samples/202 - Amazon Book Reviews - Word2Vec.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Yet again, now using the `Word2Vec` Estimator from Spark. We can use the tree-based learners", + " from spark in this scenario due to the lower dimensionality representation of features." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import mmlspark\n", + "from pyspark.sql.types import IntegerType, StringType, StructType, StructField" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dataFile = \"BookReviewsFromAmazon10K.tsv\"\n", + "textSchema = StructType([StructField(\"rating\", IntegerType(), False),\n", + " StructField(\"text\", StringType(), False)])\n", + "import os, urllib\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(\"https://mmlspark.azureedge.net/datasets/\"+dataFile, dataFile)\n", + "data = spark.createDataFrame(pd.read_csv(dataFile, sep=\"\\t\", header=None), textSchema)\n", + "data.limit(10).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Modify the label column to predict a rating greater than 3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "processedData = data.withColumn(\"label\", data[\"rating\"] > 3) \\\n", + " .select([\"text\", \"label\"])\n", + "processedData.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Split the dataset into train, test and validation sets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "train, test, validation = processedData.randomSplit([0.60, 0.20, 0.20])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use `Tokenizer` and `Word2Vec` to generate the features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml import Pipeline\n", + "from pyspark.ml.feature import Tokenizer, Word2Vec\n", + "tokenizer = Tokenizer(inputCol=\"text\", outputCol=\"words\")\n", + "partitions = train.rdd.getNumPartitions()\n", + "word2vec = Word2Vec(maxIter=4, seed=42, inputCol=\"words\", outputCol=\"features\",\n", + " numPartitions=partitions)\n", + "textFeaturizer = Pipeline(stages = [tokenizer, word2vec]).fit(train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Transform each of the train, test and validation datasets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ptrain = textFeaturizer.transform(train).select([\"label\", \"features\"])\n", + "ptest = textFeaturizer.transform(test).select([\"label\", \"features\"])\n", + "pvalidation = textFeaturizer.transform(validation).select([\"label\", \"features\"])\n", + "ptrain.limit(5).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Generate several models with different parameters from the training data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier\n", + "from mmlspark.TrainClassifier import TrainClassifier\n", + "import itertools\n", + "\n", + "lrHyperParams = [0.05, 0.2]\n", + "logisticRegressions = [LogisticRegression(regParam = hyperParam)\n", + " for hyperParam in lrHyperParams]\n", + "lrmodels = [TrainClassifier(model=lrm, labelCol=\"label\").fit(ptrain)\n", + " for lrm in logisticRegressions]\n", + "\n", + "rfHyperParams = itertools.product([5, 10], [3, 5])\n", + "randomForests = [RandomForestClassifier(numTrees=hyperParam[0], maxDepth=hyperParam[1])\n", + " for hyperParam in rfHyperParams]\n", + "rfmodels = [TrainClassifier(model=rfm, labelCol=\"label\").fit(ptrain)\n", + " for rfm in randomForests]\n", + "\n", + "rfHyperParams = itertools.product([8, 16], [3, 5])\n", + "gbtclassifiers = [GBTClassifier(maxBins=hyperParam[0], maxDepth=hyperParam[1])\n", + " for hyperParam in rfHyperParams]\n", + "gbtmodels = [TrainClassifier(model=gbt, labelCol=\"label\").fit(ptrain)\n", + " for gbt in gbtclassifiers]\n", + "\n", + "trainedModels = lrmodels + rfmodels + gbtmodels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Find the best model for the given test dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import FindBestModel\n", + "bestModel = FindBestModel(evaluationMetric=\"AUC\", models=trainedModels).fit(ptest)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the accuracy from the validation dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark.ComputeModelStatistics import ComputeModelStatistics\n", + "predictions = bestModel.transform(pvalidation)\n", + "metrics = ComputeModelStatistics().transform(predictions)\n", + "metrics.first()[\"accuracy\"]" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb b/notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb new file mode 100644 index 0000000000..015e94f2e1 --- /dev/null +++ b/notebooks/samples/301 - CIFAR10 CNTK CNN Evaluation.ipynb @@ -0,0 +1,264 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import CNTKModel, ModelDownloader\n", + "from pyspark.sql.functions import udf\n", + "from pyspark.sql.types import IntegerType\n", + "from os.path import abspath" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set some paths." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cdnURL = \"https://mmlspark.azureedge.net/datasets\"\n", + "\n", + "# Please note that this is a copy of the CIFAR10 dataset originally found here:\n", + "# http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz\n", + "dataFile = \"cifar-10-python.tar.gz\"\n", + "dataURL = cdnURL + \"/CIFAR10/\" + dataFile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "modelName = \"ConvNet\"\n", + "modelDir = \"wasb:///models/\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "local", + "collapsed": false + }, + "outputs": [], + "source": [ + "modelName = \"ConvNet\"\n", + "modelDir = \"file:\" + abspath(\"models\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Get the model and extract the data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os, tarfile, pickle\n", + "import urllib.request\n", + "\n", + "d = ModelDownloader(spark, modelDir)\n", + "model = d.downloadByName(modelName)\n", + "if not os.path.isfile(dataFile):\n", + " urllib.request.urlretrieve(dataURL, dataFile)\n", + "with tarfile.open(dataFile, \"r:gz\") as f:\n", + " test_dict = pickle.load(f.extractfile(\"cifar-10-batches-py/test_batch\"),\n", + " encoding=\"latin1\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Preprocess the images." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import array\n", + "from pyspark.sql.functions import col\n", + "from pyspark.sql.types import *\n", + "\n", + "def reshape_image(record):\n", + " image, label, filename = record\n", + " data = [float(x) for x in image.reshape(3,32,32).flatten()]\n", + " return data, label, filename\n", + "\n", + "convert_to_float = udf(lambda x: x, ArrayType(FloatType()))\n", + "\n", + "image_rdd = zip(test_dict[\"data\"], test_dict[\"labels\"], test_dict[\"filenames\"])\n", + "image_rdd = spark.sparkContext.parallelize(image_rdd).map(reshape_image)\n", + "\n", + "imagesWithLabels = image_rdd.toDF([\"images\", \"labels\", \"filename\"])\n", + "imagesWithLabels = imagesWithLabels.withColumn(\"images\", convert_to_float(col(\"images\")))\n", + "imagesWithLabels.printSchema()\n", + "\n", + "imagesWithLabels.cache()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Evaluate CNTK model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import time\n", + "start = time.time()\n", + "\n", + "# Use CNTK model to get log probabilities\n", + "cntkModel = CNTKModel().setInputCol(\"images\").setOutputCol(\"output\").setModel(spark, model.uri).setOutputNodeName(\"z\")\n", + "scoredImages = cntkModel.transform(imagesWithLabels)\n", + "\n", + "# Transform the log probabilities to predictions\n", + "def argmax(x): return max(enumerate(x),key=lambda p: p[1])[0]\n", + "\n", + "argmaxUDF = udf(argmax, IntegerType())\n", + "imagePredictions = scoredImages.withColumn(\"predictions\", argmaxUDF(\"output\")) \\\n", + " .select(\"predictions\", \"labels\")\n", + "\n", + "numRows = imagePredictions.count()\n", + "\n", + "end = time.time()\n", + "print(\"classifying {} images took {} seconds\".format(numRows,end-start))\n", + "\n", + "# Register the predictions as a temp table for further analysis using SQL\n", + "imagePredictions.registerTempTable(\"ImagePredictions\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Plot confusion matrix." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "%%sql -q -o imagePredictions\n", + "select * from ImagePredictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "%%local\n", + "y, y_hat = imagePredictions[\"labels\"], imagePredictions[\"predictions\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "local", + "collapsed": false + }, + "outputs": [], + "source": [ + "imagePredictions = imagePredictions.toPandas()\n", + "y, y_hat = imagePredictions[\"labels\"], imagePredictions[\"predictions\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from sklearn.metrics import confusion_matrix\n", + "\n", + "cm = confusion_matrix(y, y_hat)\n", + "\n", + "labels = [\"airplane\", \"automobile\", \"bird\", \"cat\", \"deer\", \"dog\", \"frog\",\n", + " \"horse\", \"ship\", \"truck\"]\n", + "plt.imshow(cm, interpolation=\"nearest\", cmap=plt.cm.Blues)\n", + "plt.colorbar()\n", + "tick_marks = np.arange(len(labels))\n", + "plt.xticks(tick_marks, labels, rotation=90)\n", + "plt.yticks(tick_marks, labels)\n", + "plt.xlabel(\"Predicted label\")\n", + "plt.ylabel(\"True Label\")\n", + "plt.show()" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/samples/302 - Pipeline Image Transformations.ipynb b/notebooks/samples/302 - Pipeline Image Transformations.ipynb new file mode 100644 index 0000000000..51c0d5d1dc --- /dev/null +++ b/notebooks/samples/302 - Pipeline Image Transformations.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This example shows how to manipulate the collection of images.\n", + "First, the images are downloaded to the local directory." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "local", + "collapsed": false + }, + "outputs": [], + "source": [ + "IMAGE_PATH = \"datasets/CIFAR10\"\n", + "\n", + "import os, subprocess\n", + "from urllib.request import urlretrieve\n", + "dataFile = \"test.zip\"\n", + "if not os.path.isdir(IMAGE_PATH):\n", + " os.makedirs(IMAGE_PATH)\n", + " urlretrieve(\"https://mmlspark.azureedge.net/datasets/CIFAR10/test.zip\",\n", + " IMAGE_PATH + \".zip\")\n", + " print(subprocess.check_output(\n", + " \"ip=\\\"%s\\\"; cd \\\"$ip\\\" && unzip -q \\\"../$(basename $PWD).zip\\\"\" % IMAGE_PATH,\n", + " stderr = subprocess.STDOUT, shell = True)\n", + " .decode(\"utf-8\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "%%local\n", + "IMAGE_PATH = \"/datasets/CIFAR10/test\"\n", + "import subprocess\n", + "if subprocess.call([\"hdfs\", \"dfs\", \"-test\", \"-d\", IMAGE_PATH]):\n", + " from urllib import urlretrieve\n", + " urlretrieve(\"https://mmlspark.azureedge.net/datasets/CIFAR10/test.zip\", \"/tmp/test.zip\")\n", + " print subprocess.check_output(\"rm -rf /tmp/CIFAR10 && mkdir -p /tmp/CIFAR10 && unzip /tmp/test.zip -d /tmp/CIFAR10\", stderr=subprocess.STDOUT, shell=True)\n", + " print subprocess.check_output(\"hdfs dfs -mkdir -p %s\" % IMAGE_PATH, stderr=subprocess.STDOUT, shell=True)\n", + " print subprocess.check_output(\"hdfs dfs -copyFromLocal -f /tmp/CIFAR10/test/011*.png %s\"%IMAGE_PATH, stderr=subprocess.STDOUT, shell=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "mml-deploy": "hdinsight", + "collapsed": false + }, + "outputs": [], + "source": [ + "IMAGE_PATH = \"/datasets/CIFAR10/test\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The images are loaded from the directory (for fast prototyping, consider loading a fraction of", + " images). Inside the dataframe, each image is a single field in the image column. The image has", + " sub-fields (path, height, width, OpenCV type and OpenCV bytes)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import mmlspark\n", + "import numpy as np\n", + "from mmlspark import toNDArray\n", + "\n", + "images = spark.readImages(IMAGE_PATH, recursive = True, sampleRatio = 0.1).cache()\n", + "images.printSchema()\n", + "print(images.count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When collected from the *DataFrame*, the image data are stored in a *Row*, which is Spark's way", + " to represent structures (in the current example, each dataframe row has a single Image, which", + " itself is a Row). It is possible to address image fields by name and use `toNDArray()` helper", + " function to convert the image into numpy array for further manipulations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "\n", + "data = images.take(3) # take first three rows of the dataframe\n", + "im = data[2][0] # the image is in the first column of a given row\n", + "\n", + "print(\"image type: {}, number of fields: {}\".format(type(im), len(im)))\n", + "print(\"image path: {}\".format(im.path))\n", + "print(\"height: {}, width: {}, OpenCV type: {}\".format(im.height, im.width, im.type))\n", + "\n", + "arr = toNDArray(im) # convert to numpy array\n", + "Image.fromarray(arr, \"RGB\") # display the image inside notebook\n", + "print(images.count())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use ImageTransform for the basic image manipulation: resizing, cropping, etc.\n", + "Internally, operations are pipelined and backed by OpenCV implementation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import ImageTransform\n", + "\n", + "tr = (ImageTransform() # images are resized and then cropped\n", + " .setOutputCol(\"transformed\")\n", + " .resize(height = 200, width = 200)\n", + " .crop(0, 0, height = 180, width = 180) )\n", + "\n", + "small = tr.transform(images).select(\"transformed\")\n", + "\n", + "im = small.take(3)[2][0] # take third image\n", + "Image.fromarray(toNDArray(im), \"RGB\") # display the image inside notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the advanced image manipulations, use Spark UDFs.\n", + "The MMLSpark package provides conversion function between *Spark Row* and", + " *ndarray* image representations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pyspark.sql.functions import udf\n", + "from mmlspark import ImageSchema, toNDArray, toImage\n", + "\n", + "def u(row):\n", + " array = toNDArray(row) # convert Image to numpy ndarray[height, width, 3]\n", + " array[:,:,2] = 0\n", + " return toImage(array) # numpy array back to Spark Row structure\n", + "\n", + "noBlueUDF = udf(u,ImageSchema)\n", + "\n", + "noblue = small.withColumn(\"noblue\", noBlueUDF(small[\"transformed\"])).select(\"noblue\")\n", + "\n", + "im = noblue.take(3)[2][0] # take second image\n", + "Image.fromarray(toNDArray(im), \"RGB\") # display the image inside notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Images could be unrolled into the dense 1D vectors suitable for CNKT evaluation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mmlspark import UnrollImage\n", + "\n", + "unroller = UnrollImage().setInputCol(\"noblue\").setOutputCol(\"unrolled\")\n", + "\n", + "unrolled = unroller.transform(noblue).select(\"unrolled\")\n", + "\n", + "vector = unrolled.take(1)[0][0]\n", + "print(type(vector))\n", + "len(vector.toArray())" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3.0 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/tests/BasicDFOpsSmokeTest.ipynb b/notebooks/tests/BasicDFOpsSmokeTest.ipynb new file mode 100644 index 0000000000..32222c3ecd --- /dev/null +++ b/notebooks/tests/BasicDFOpsSmokeTest.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "assert(\"spark\" in globals())" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "assert(sc.defaultParallelism > 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from sklearn.datasets import load_iris\n", + "from pyspark.sql.types import StringType, FloatType, StructField, StructType\n", + "\n", + "d = load_iris()\n", + "\n", + "def make_records(features, label, label_names):\n", + " temp = [float(f) for f in features]\n", + " temp.append(str(label_names[label]))\n", + " return temp\n", + "\n", + "col_types = [StructField(fname, FloatType(), False) for fname in d[\"feature_names\"]]\n", + "col_types.append(StructField(\"target\", StringType(), False))\n", + "schema = StructType(col_types)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df = spark.createDataFrame([make_records(feature, label, d[\"target_names\"]) \\\n", + " for feature, label in zip(d[\"data\"], d[\"target\"])], schema)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "assert(df.count() == 150)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "expected_columns = d[\"feature_names\"].copy()\n", + "expected_columns.append(\"target\")\n", + "assert(all(actual == expected for actual, expected in zip(df.columns, expected_columns)))" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/runme b/runme new file mode 100755 index 0000000000..8d1cf0c77e --- /dev/null +++ b/runme @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +######################################################################## +# This script serves as both an environment installation script and as a +# build script, for use either on a build machine or a on a developer +# machine. +# +# Arguments to the script are all parsed as variable settings in the +# form of "VAR=VAL", which means that you can use either "X=Y ./runme" +# or "./runme X=Y". +# +# The script works in a "mode" determined by $BUILDMODE. The current +# modes are: +# +# * "build": performs all steps listed below, +# * "server": similar, but intended for use on a build agent, +# * "setup": perform only the setup/update steps, skipping the build, +# * "runtime": similar to "setup", but installs only runtime libraries. +# +# The default (when $BUILDMODE is not set) is similar to doing the +# environment setup part -- but if that does nothing then it continues +# with the build. You can therefore run it once to work directly in the +# IDE, running it again only on updates, or use it to do full builds +# (but you can use sbt/etc for that too). +# +# Here are the steps that are performed: +# +# 1. Setup a working environment with a bunch of needed packages +# installed. (All are installed in $HOME/lib with symlinks in +# $HOME/bin, no system changes.) +# +# 2. Possibly update existing packages if the configuration (in +# "tools/config.sh") was updated. +# +# 3. Ensure that the environment is properly set up for these tools. +# For example, set $PATH to include $HOME/bin, verify a configured +# git identity. +# +# 4. Runs a build, including tests, packaging the result, etc. +# +# Look for `defvar` in "config.sh" to see other variables that customize +# the build (e.g., $TESTS). +######################################################################## + +. 
"$(dirname "$(realpath "${BASH_SOURCE[0]}")")/tools/runme/runme.sh" + +# run an actual build only when this is invoked directly +if ((${#BASH_SOURCE[@]} == 1)); then _runme; fi diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000000..cbb6cff395 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,44 @@ +# Compilation/build/tests +target/ +# Generated python code +/src/main/resources/mmlspark + +# SBT meta-level generated files (build.properties is created by runme) +/autogen.sbt +/project/build.properties +/project/autogen.scala +/project/dependencies.* +/project/project-roots.txt +/scalastyle-config.xml + +# More SBT things +lib_managed/ +src_managed/ +.cache* +.history +.lib +/project/boot/ +/project/plugins/project/ +/project/project/ +/scalastyle-config.xml + +# IntelliJ w/ Scala +.idea +.scala_dependencies +.worksheet +*.sc +*.iml +*.ipr +*.iws +out + +# Additional Eclipse things +.classpath +.project +.settings +.target + +# ENSIME +.ensime +.ensime_lucene +.ensime_cache diff --git a/src/.sbtopts b/src/.sbtopts new file mode 100644 index 0000000000..870a32019f --- /dev/null +++ b/src/.sbtopts @@ -0,0 +1,2 @@ +-J-Xmx4G +-J-XX:ReservedCodeCacheSize=256M diff --git a/src/build.sbt b/src/build.sbt new file mode 100644 index 0000000000..63c5f776fc --- /dev/null +++ b/src/build.sbt @@ -0,0 +1,11 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +name := "mmlspark" + +Extras.rootSettings + +enablePlugins(ScalaUnidocPlugin) + +// Use `in ThisBuild` to provide defaults for all sub-projects +version in ThisBuild := Extras.mmlVer diff --git a/src/checkpoint-data/build.sbt b/src/checkpoint-data/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/checkpoint-data/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/checkpoint-data/src/main/scala/CheckpointData.scala b/src/checkpoint-data/src/main/scala/CheckpointData.scala new file mode 100644 index 0000000000..eaf0d91466 --- /dev/null +++ b/src/checkpoint-data/src/main/scala/CheckpointData.scala @@ -0,0 +1,71 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.param._ +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types._ +import org.apache.spark.storage._ + +trait CheckpointDataParams extends MMLParams { + + // Determines the storage level: MEMORY_ONLY or MEMORY_AND_DISK + val diskIncluded: BooleanParam = BooleanParam(this, "diskIncluded", "Persist to disk as well as memory", false) + final def getDiskIncluded: Boolean = $(diskIncluded) + def setDiskIncluded(value: Boolean): this.type = set(diskIncluded, value) + + // Enables reverse operation to free up memory + val removeCheckpoint: BooleanParam = BooleanParam(this, "removeCheckpoint", "Unpersist a cached dataset", false) + final def getRemoveCheckpoint: Boolean = $(removeCheckpoint) + def setRemoveCheckpoint(value: Boolean): this.type = set(removeCheckpoint, value) + + protected def validateAndTransformSchema(schema: StructType): StructType = { + schema + } + +} + +class CheckpointData(override val uid: String) extends Transformer with CheckpointDataParams { + + def this() = this(Identifiable.randomUID("CheckpointData")) + + override def transform(dataset: Dataset[_]): DataFrame = { + if ($(removeCheckpoint)) { + CheckpointData.clearCache(dataset, false) + } else { + CheckpointData.cache(dataset, $(diskIncluded), false) + } + } + + def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + def copy(extra: ParamMap): CheckpointData = defaultCopy(extra) + +} + +object CheckpointData extends DefaultParamsReadable[CheckpointData]{ + + def clearCache(ds: Dataset[_], blocking: Boolean): DataFrame = { + ds.unpersist(blocking) + ds.toDF + } + + def cache(ds: Dataset[_], disk: Boolean, serialized: Boolean): DataFrame = { + ds.persist(if (disk && serialized) StorageLevel.MEMORY_AND_DISK_SER + else if (serialized) StorageLevel.MEMORY_ONLY_SER + else if (disk) StorageLevel.MEMORY_AND_DISK + else StorageLevel.MEMORY_ONLY) + ds.toDF + } + + def persistToHive(ds: Dataset[_], dbName: String, tableName: String): DataFrame = { + ds.write.mode("overwrite").saveAsTable(dbName + "." + tableName) + ds.toDF + } + +} diff --git a/src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala b/src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala new file mode 100644 index 0000000000..5fd146cc35 --- /dev/null +++ b/src/checkpoint-data/src/test/scala/CheckpointDataSuite.scala @@ -0,0 +1,41 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ + +class CheckpointDataSuite extends TestBase { + + test("Smoke test for Spark session version") { + assert(session.sparkContext.version == + sys.env.getOrElse("SPARK_VERSION", + sys.error("Missing $SPARK_VER environment variable"))) + } + + import session.implicits._ + + test("Cache DF") { + val input = makeBasicDF() + input.createOrReplaceTempView("cachingDFView") + + val checkpointer = new CheckpointData().setDiskIncluded(false).setRemoveCheckpoint(false) + checkpointer.transform(input) + + assert(input.sqlContext.isCached("cachingDFView")) + } + + test("Remove Cache on DF") { + assert(session.sqlContext.isCached("cachingDFView")) + val input = session.table("cachingDFView") + + val checkpointer = new CheckpointData().setDiskIncluded(false).setRemoveCheckpoint(true) + checkpointer.transform(input) + + assert(!input.sqlContext.isCached("cachingDFView")) + } + +} diff --git a/src/cntk-model/build.sbt b/src/cntk-model/build.sbt new file mode 100644 index 0000000000..83b404851c --- /dev/null +++ b/src/cntk-model/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: readers +//> DependsOn: image-transformer diff --git a/src/cntk-model/src/main/python/CNTKModel.py b/src/cntk-model/src/main/python/CNTKModel.py new file mode 100644 index 0000000000..695943e38d --- /dev/null +++ b/src/cntk-model/src/main/python/CNTKModel.py @@ -0,0 +1,21 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +from mmlspark._CNTKModel import _CNTKModel +from pyspark.ml.common import inherit_doc + +@inherit_doc +class CNTKModel(_CNTKModel): + """ + :param SparkSession SparkSession: The SparkSession that will be used to find the model + :param str location: The location of the model, either on local or HDFS + """ + def setModel(self, sparkSession, location): + jSpark = sparkSession._jsparkSession + self._java_obj = self._java_obj.setModel(jSpark, location) + return self diff --git a/src/cntk-model/src/main/scala/CNTKModel.scala b/src/cntk-model/src/main/scala/CNTKModel.scala new file mode 100644 index 0000000000..d84c552879 --- /dev/null +++ b/src/cntk-model/src/main/scala/CNTKModel.scala @@ -0,0 +1,230 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io.File +import javax.xml.bind.DatatypeConverter._ + +import com.microsoft.CNTK.{Function => CNTKFunction, DataType => CNTKDataType, _} +import com.microsoft.ml.spark.schema.DatasetExtensions +import org.apache.commons.io.FileUtils._ +import org.apache.spark.broadcast._ +import org.apache.spark.SparkContext +import org.apache.spark.SparkFiles +import org.apache.spark.ml.Model +import org.apache.spark.ml.linalg.{DenseVector, Vectors} +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} +import org.apache.spark.ml.util._ +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +import scala.collection.mutable +import scala.collection.mutable.ListBuffer + +private object CNTKModelUtils extends java.io.Serializable { + + private def applyModel(inputIndex: Int, + broadcastModelBytes: Broadcast[Array[Byte]], + minibatchSize: Int, + inputNode: Int, + outputNode: Option[String])(inputRows: Iterator[Row]): Iterator[Row] = { + val device = DeviceDescriptor.useDefaultDevice + val m = CNTKModel.loadModelFromBytes(broadcastModelBytes.value, device) + val model = outputNode + .map { name => CNTKLib.AsComposite(Option(m.findByName(name)).getOrElse( + throw new IllegalArgumentException(s"Node $name does not exist"))) } + .getOrElse(m) + + val inputVar = model.getArguments.get(inputNode) + require(inputVar.getDataType() == CNTKDataType.Float, "input variable type is not Float input type") + val inputShape = inputVar.getShape + + // This defines and instantiates an iterator, hasNext and next are the abstract methods that + // define the interface and inputBuffer and outputBuffer hold the input and output rows so that + // they can be joined and returned. 
+ // The logic inside next checks to see if the buffer is empty, and if so sends a new batch off + // to be evaluated + new Iterator[Row] { + val inputBuffer = new ListBuffer[Row]() + val outputBuffer = new ListBuffer[Row]() + val inputSize: Int = inputShape.getTotalSize().toInt + val inputFVV = new FloatVectorVector(minibatchSize.toLong) + val fvs: Array[FloatVector] = + (0 until minibatchSize).map(_ => new FloatVector(inputSize.toLong)).toArray + + def hasNext: Boolean = inputRows.hasNext || outputBuffer.nonEmpty + + def next(): Row = { + if (outputBuffer.isEmpty) { + var paddedRows = 0 + for (i <- 0 until minibatchSize) { + if (inputRows.hasNext) { + val row = inputRows.next() + inputBuffer += row + for ((x, j) <- row.getSeq[Float](inputIndex).view.zipWithIndex) { + fvs(i).set(j, x) + } + } else { + //TODO remove padding after CNTK bug is fixed + paddedRows += 1 + for (j <- 0 until inputSize) { + fvs(i).set(j, 0.0.toFloat) + } + } + inputFVV.set(i, fvs(i)) + } + + val inputVal = + Value.createDenseFloat(inputShape, inputFVV, device) + val inputDataMap = new UnorderedMapVariableValuePtr() + inputDataMap.add(inputVar, inputVal) + + val outputDataMap = new UnorderedMapVariableValuePtr() + val outputVar = model.getOutputs.get(0) + outputDataMap.add(outputVar, null) + + model.evaluate(inputDataMap, outputDataMap, device) + + val outputFVV = new FloatVectorVector() + outputDataMap.getitem(outputVar).copyVariableValueToFloat(outputVar, outputFVV) + assert(outputBuffer.isEmpty, + "The output row buffer should be empty before new elements are added.") + outputBuffer ++= toSeqSeq(outputFVV) + .dropRight(paddedRows) + .map(fs => Row(Vectors.dense(fs.map(_.toDouble).toArray))) + } + val ret = Row.merge(inputBuffer.head, outputBuffer.head) + inputBuffer.remove(0) + outputBuffer.remove(0) + ret + } + } + } + + // here just for serialization + val applyModelFunc = (inputIndex: Int, broadcastModelBytes: Broadcast[Array[Byte]], + minibatchSize: Int, inputNode: Int, + outputNode: Option[String]) => { + (inputRows: Iterator[Row]) => { + applyModel(inputIndex, broadcastModelBytes, minibatchSize, inputNode, outputNode)(inputRows) + } + } + + private def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = { + (0 until fvv.size.toInt).map(i => (0 until fvv.get(i).size.toInt).map(j => fvv.get(i).get(j))) + } +} + +object CNTKModel extends DefaultParamsReadable[CNTKModel] { + def loadModelFromBytes(bytes: Array[Byte], + device: DeviceDescriptor = + DeviceDescriptor.useDefaultDevice): CNTKFunction = { + import java.util.UUID._ + val modelFile = new File(s"$getTempDirectoryPath/$randomUUID.model") + writeByteArrayToFile(modelFile, bytes) + val model = try { + CNTKFunction.load(modelFile.getPath, device) + } finally forceDelete(modelFile) + model + } + + override def load(path: String): CNTKModel = super.load(path) +} + +@InternalWrapper +class CNTKModel(override val uid: String) extends Model[CNTKModel] with DefaultParamsWritable + with HasInputCol with HasOutputCol { + + def this() = this(Identifiable.randomUID("CNTKModel")) + + val model: Param[String] = + new Param(this, "model", "Array of bytes containing the serialized CNTKModel") + def setModel(spark: SparkSession, path: String): CNTKModel = { + val modelBytes = spark.sparkContext.binaryFiles(path).first()._2.toArray + set(model, printBase64Binary(modelBytes)) + } + def getModel: Array[Byte] = parseBase64Binary($(model)) + + val inputNode: IntParam = new IntParam(this, "inputNode", "index of the input node") + def setInputNode(value: Int): this.type = 
set(inputNode, value) + def getInputNode: Int = $(inputNode) + setDefault(inputNode -> 0) + + val outputNodeIndex: IntParam = new IntParam(this, "outputNodeIndex", "index of the output node") + def setOutputNodeIndex(value: Int): this.type = set(outputNodeIndex, value) + def getOutputNodeIndex: Int = $(outputNodeIndex) + + val outputNodeName: Param[String] = new Param(this, "outputNodeName", "name of the output node") + def setOutputNodeName(value: String): this.type = set(outputNodeName, value) + def getOutputNodeName: String = $(outputNodeName) + + val miniBatchSize: IntParam = + new IntParam(this, "miniBatchSize", "size of minibatches", ParamValidators.gt(0)) + def setMiniBatchSize(value: Int): this.type = set(miniBatchSize, value) + def getMiniBatchSize: Int = $(miniBatchSize) + setDefault(miniBatchSize -> 10) + + def transformSchema(schema: StructType): StructType = schema.add(getOutputCol, VectorType) + + override def copy(extra: ParamMap): this.type = defaultCopy(extra) + + def transform(dataset: Dataset[_]): DataFrame = { + val spark = dataset.sparkSession + val sc = spark.sparkContext + val inputIndex = dataset.columns.indexOf(getInputCol) + val device = DeviceDescriptor.useDefaultDevice + + if (inputIndex == -1) + throw new IllegalArgumentException(s"Input column $getInputCol does not exist") + + val model = CNTKModel.loadModelFromBytes(getModel, device) + + val setByName = get(outputNodeName) + val setByIndex = get(outputNodeIndex) + if ((setByName.isDefined && setByIndex.isDefined) || + (!setByName.isDefined && !setByIndex.isDefined)) + throw new Exception("Must specify one and only one of outputNodeName or outputNodeIndex") + + val outputNode: Option[String] = + if (setByName.isDefined) setByName + else setByIndex.map(i => model.getOutputs.get(i).getName) + + val coersionOptionUDF = dataset.schema.fields(inputIndex).dataType match { + case ArrayType(tp, _) => + tp match { + case DoubleType => Some(udf((x: mutable.WrappedArray[Double]) => x.map(_.toFloat))) + case FloatType => None + case _ => + throw new IllegalArgumentException(s"improper column type: $tp, need Array[Float]") + } + case VectorType => Some(udf((x: DenseVector) => x.toArray.map(_.toFloat))) + } + + val coercedCol = DatasetExtensions.findUnusedColumnName("coerced")(dataset.columns.toSet) + val (df, selectedIndex) = coersionOptionUDF match { + case Some(coersionUDF) => + val coercedDF = dataset.toDF().withColumn(coercedCol, coersionUDF(col(getInputCol))) + (coercedDF, coercedDF.columns.indexOf(coercedCol)) + case None => (dataset.toDF(), inputIndex) + } + + val inputType = df.schema($(inputCol)).dataType + val broadcastModelBytes = sc.broadcast(getModel) + val rdd = df.rdd.mapPartitions( + CNTKModelUtils.applyModelFunc(selectedIndex, + broadcastModelBytes, + getMiniBatchSize, + getInputNode, + outputNode)) + val output = spark.createDataFrame(rdd, df.schema.add(StructField(getOutputCol, VectorType))) + + coersionOptionUDF match { + case Some(_) => output.drop(coercedCol) + case None => output + } + } + +} diff --git a/src/cntk-model/src/test/scala/CNTKBindingSuite.scala b/src/cntk-model/src/test/scala/CNTKBindingSuite.scala new file mode 100644 index 0000000000..1bf842e7cf --- /dev/null +++ b/src/cntk-model/src/test/scala/CNTKBindingSuite.scala @@ -0,0 +1,60 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
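+
+// This suite drives the low-level CNTK Java bindings directly, mirroring the batching
+// done in CNTKModelUtils.applyModel. A rough sketch of the round trip (assuming a
+// loaded `model`, its first argument `inputVar` with shape `inputShape`, and a
+// minibatch `batch: Seq[Seq[Float]]`):
+//
+//   val inputVal      = Value.createDenseFloat(inputShape, toFVV(batch), DeviceDescriptor.getCPUDevice)
+//   val inputDataMap  = new UnorderedMapVariableValuePtr()
+//   inputDataMap.add(inputVar, inputVal)
+//   val outputDataMap = new UnorderedMapVariableValuePtr()
+//   val outputVar     = model.getOutputs.get(0)
+//   outputDataMap.add(outputVar, null)
+//   model.evaluate(inputDataMap, outputDataMap, DeviceDescriptor.getCPUDevice)
+//   val outputFVV = new FloatVectorVector()
+//   outputDataMap.getitem(outputVar).copyVariableValueToFloat(outputVar, outputFVV)
+//   val scores: Seq[Seq[Float]] = toSeqSeq(outputFVV)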
+ +package com.microsoft.ml.spark + +import com.microsoft.CNTK.{Function => CNTKFunction, _} + +class CNTKBindingSuite extends LinuxOnly with CNTKTestUtils { + + def toSeqSeq(fvv: FloatVectorVector): Seq[Seq[Float]] = { + (0 until fvv.size.toInt).map(i => + (0 until fvv.get(i).size().toInt).map(j => fvv.get(i).get(j))) + } + + def toFVV(minibatch: Seq[Seq[Float]]): FloatVectorVector = { + minibatch.foldLeft(new FloatVectorVector()) { + case (fvv, floats) => + fvv.add(floats.foldLeft(new FloatVector()) { case (fv, f) => fv.add(f); fv }) + fvv + } + } + + def randomSeqSeq(outerSize: Int, innerSize: Int = 32 * 32 * 3): Seq[Seq[Float]] = { + val r = scala.util.Random + (1 to outerSize).map(i => { + (1 to innerSize).map(j => { + r.nextFloat() + }) + }) + } + + ignore("Evaluate should be able to change batch size ") { + val model = CNTKFunction.load(modelPath, DeviceDescriptor.useDefaultDevice) + val inputVar = model.getArguments.get(0) + val inputShape = inputVar.getShape + + def evaluateRandomMinibatch(batchSize: Int): Seq[Seq[Float]] = { + val fakeImages = randomSeqSeq(batchSize) + val inputFVV = toFVV(fakeImages) + val inputVal = Value.createDenseFloat(inputShape, inputFVV, DeviceDescriptor.getCPUDevice) + val inputDataMap = new UnorderedMapVariableValuePtr() + inputDataMap.add(inputVar, inputVal) + + val outputDataMap = new UnorderedMapVariableValuePtr() + val outputVar = model.getOutputs.get(0) + outputDataMap.add(outputVar, null) + + println(s"evaluating shape ${inputVal.getShape().getDimensions}") + model.evaluate(inputDataMap, outputDataMap, DeviceDescriptor.getCPUDevice) + val outputFVV = new FloatVectorVector() + outputDataMap.getitem(outputVar).copyVariableValueToFloat(outputVar, outputFVV) + toSeqSeq(outputFVV) + } + evaluateRandomMinibatch(1) + evaluateRandomMinibatch(3) + evaluateRandomMinibatch(2) + + } + +} diff --git a/src/cntk-model/src/test/scala/CNTKModelSuite.scala b/src/cntk-model/src/test/scala/CNTKModelSuite.scala new file mode 100644 index 0000000000..13e021530e --- /dev/null +++ b/src/cntk-model/src/test/scala/CNTKModelSuite.scala @@ -0,0 +1,157 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
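+
+// These tests read their inputs relative to the DATASETS_HOME environment variable
+// (CIFAR images and the ConvNet_CIFAR10.model file, see CNTKTestUtils). A minimal
+// usage sketch of the transformer under test, assuming a SparkSession `spark`, a
+// serialized model at `modelPath`, and a DataFrame `df` whose input column holds
+// Array[Float] (Array[Double] and Vector columns are coerced to floats by transform):
+//
+//   val scored = new CNTKModel()
+//     .setModel(spark, modelPath)     // model bytes are base64-encoded into a string Param
+//     .setInputCol("cntk_images")
+//     .setOutputCol("out")
+//     .setMiniBatchSize(10)           // the default minibatch size
+//     .setOutputNodeIndex(3)          // or setOutputNodeName("z"), but not both
+//     .transform(df)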
+ +package com.microsoft.ml.spark + +import java.io.File +import java.util.Date + +import org.apache.commons.io.FileUtils.getTempDirectoryPath +import org.apache.spark.SparkException +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.{Pipeline, PipelineModel} +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types._ + +class CNTKModelSuite extends LinuxOnly with CNTKTestUtils { + + // TODO: Move away from getTempDirectoryPath and have TestBase provide one + val saveFile = s"$getTempDirectoryPath/${new Date()}-spark-z.model" + + def testModel(minibatchSize: Int = 10): CNTKModel = { + new CNTKModel() + .setModel(session, modelPath) + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setMiniBatchSize(minibatchSize) + .setOutputNodeIndex(3) + } + + val images = testImages(session) + + private def checkParameters(minibatchSize: Int) = { + val model = testModel(minibatchSize) + val result = model.transform(images) + compareToTestModel(result) + } + + test("A CNTK model should be able to support setting the input and output node") { + val model = testModel().setInputNode(0) + + val data = makeFakeData(session, 3, featureVectorLength) + val result = model.transform(data) + assert(result.select(outputCol).count() == 3) + } + + test("A CNTK model should support finding a node by name") { + val model = new CNTKModel() + .setModel(session, modelPath) + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setOutputNodeName("z") + + val data = makeFakeData(session, 3, featureVectorLength) + val result = model.transform(data) + assert(result.select(outputCol).collect()(0).getAs[DenseVector](0).size == 10) + assert(result.select(outputCol).count() == 3) + } + + test("throws useful exception when invalid node name is given") { + val model = new CNTKModel() + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setOutputNodeName("nonexistant-node") + .setModel(session, modelPath) + + val data = makeFakeData(session, 3, featureVectorLength) + val se = intercept[SparkException] { model.transform(data).collect() } + assert(se.getCause.isInstanceOf[IllegalArgumentException]) + } + + test("A CNTK model should work on doubles") { + val model = testModel() + val data = makeFakeData(session, 3, featureVectorLength, outputDouble = true) + val result = model.transform(data) + assert(result.select(outputCol).collect()(0).getAs[DenseVector](0).size == 10) + assert(result.count() == 3) + } + + test("A CNTK model should output Vectors and interop with other estimators") { + val model = testModel() + val data = makeFakeData(session, 3, featureVectorLength, outputDouble = true) + val result = model.transform(data) + assert(result.select(outputCol).schema.fields(0).dataType == VectorType) + + val predictions = new LogisticRegression() + .setFeaturesCol(outputCol) + .setLabelCol(labelCol) + .fit(result) + .transform(result) + assert(predictions.select("prediction").collect().length == 3) + } + + test("A CNTK model should have a default minibatch size") { + val model = testModel() + val result = model.transform(images) + compareToTestModel(result) + } + + test("A CNTK model should work on resized batches") { + val model = testModel() + val result = model.transform(images.repartition(1)) + compareToTestModel(result) + //images.printSchema() + //result.show() + } + + test("A CNTK model should work on an empty dataframe") { + val images = 
session.createDataFrame(sc.emptyRDD[Row], + StructType( + StructField(inputCol, ArrayType(FloatType, false)) :: + Nil)) + val model = testModel() + val result = model.transform(images) + assert(result.count == 0) + } + + test("A CNTK Model should process images") { + checkParameters(1) + checkParameters(10) + checkParameters(100) + } + + test("A CNTK Model should be saveable") { + val model = testModel() + model.write.overwrite().save(saveFile) + val modelLoaded = CNTKModel.load(saveFile) + val result = modelLoaded.transform(images) + compareToTestModel(result) + } + + test("A CNTK Model should be pipeline compatible") { + val model = testModel() + val pipe = new Pipeline().setStages(Array(model)).fit(images) + pipe.write.overwrite().save(saveFile) + val pipeLoaded = PipelineModel.load(saveFile) + val result = pipeLoaded.transform(images) + compareToTestModel(result) + } + + test("useful error message if invalid column name is given") { + val model = testModel().setInputCol("images") + val pipe = new Pipeline().setStages(Array(model)).fit(images) + pipe.write.overwrite().save(saveFile) + val pipeLoaded = PipelineModel.load(saveFile) + assertThrows[IllegalArgumentException] { + pipeLoaded.transform(images) + } + } + + override def afterAll(): Unit = { + new File(saveFile).delete() + super.afterAll() + } + +} diff --git a/src/cntk-model/src/test/scala/CNTKTestUtils.scala b/src/cntk-model/src/test/scala/CNTKTestUtils.scala new file mode 100644 index 0000000000..30f134a8f5 --- /dev/null +++ b/src/cntk-model/src/test/scala/CNTKTestUtils.scala @@ -0,0 +1,74 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import org.apache.spark.ml.linalg.DenseVector +import com.microsoft.ml.spark.Readers.implicits._ + +trait CNTKTestUtils { + + val filesRoot = s"${sys.env("DATASETS_HOME")}/" + val imagePath = s"$filesRoot/Images/CIFAR" + val modelPath = s"$filesRoot/CNTKModel/ConvNet_CIFAR10.model" + + val inputCol = "cntk_images" + val outputCol = "out" + val labelCol = "labels" + + val featureVectorLength = 3 * 32 * 32 + + def testModelDF(spark: SparkSession): DataFrame = { + import spark.implicits._ + spark.sparkContext.parallelize(Seq( + Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720, + -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090), + Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990, + -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880), + Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967, + 3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830), + Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430, + -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510), + Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690, + 4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270), + Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470, + 0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF + } + + def testImages(spark: SparkSession): DataFrame = { + val images = spark.readImages(imagePath, true) + + val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol) + + unroll.transform(images).select(inputCol) + } + + def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = { + import spark.implicits._ + if (outputDouble) { + List + 
.fill(rows)(List.fill(size)(0.0).toArray) + .zip(List.fill(rows)(0.0)) + .toDF(inputCol, labelCol) + } else { + List + .fill(rows)(List.fill(size)(0.0.toFloat).toArray) + .zip(List.fill(rows)(0.0)) + .toDF(inputCol, labelCol) + } + } + + protected def compareToTestModel(result: DataFrame) = { + //TODO improve checks + assert(result.columns.toSet == Set(inputCol, outputCol)) + assert(result.count() == testModelDF(result.sparkSession).count()) + val max = result + .select(outputCol) + .collect() + .map(row => row.getAs[DenseVector](0).toArray.max) + .max + assert(max < 10 & max > -10) + } + +} diff --git a/src/cntk-train/build.sbt b/src/cntk-train/build.sbt new file mode 100644 index 0000000000..f418ec0a86 --- /dev/null +++ b/src/cntk-train/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: featurize +//> DependsOn: cntk-model diff --git a/src/cntk-train/src/main/python/CNTKLearner.py b/src/cntk-train/src/main/python/CNTKLearner.py new file mode 100644 index 0000000000..0e985b33ae --- /dev/null +++ b/src/cntk-train/src/main/python/CNTKLearner.py @@ -0,0 +1,23 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +from mmlspark._CNTKLearner import _CNTKLearner +from mmlspark.CNTKModel import CNTKModel as CNTKmod +from pyspark.ml.common import inherit_doc + +@inherit_doc +class CNTKLearner(_CNTKLearner): + """ + Create CNTK model from existing java model + :param py4j.java_gateway.JavaObject java_model: see Scala CNTKModel documentation + """ + def _create_model(self, java_model): + model = CNTKmod() + model._java_obj = java_model + model._transfer_params_from_java() + return model diff --git a/src/cntk-train/src/main/scala/BrainscriptBuilder.scala b/src/cntk-train/src/main/scala/BrainscriptBuilder.scala new file mode 100644 index 0000000000..cbe6172eba --- /dev/null +++ b/src/cntk-train/src/main/scala/BrainscriptBuilder.scala @@ -0,0 +1,117 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.{FileOutputStream, ObjectOutputStream} +import java.util.UUID + +import scala.collection.mutable.ListBuffer +import scala.sys.process._ + +import com.microsoft.ml.spark.schema._ +import FileUtilities._ + +import org.apache.hadoop.fs.Path + +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} +import org.apache.spark.ml._ + +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +// Don't get too excited AK. This is starting to look like a set of contracts.. +case class InputShape(dim: Int, form: String) +case class InputData(format: String, path: String, shapes: Map[String, InputShape]) +case class BrainScriptConfig(name: String, text: Seq[String]) + +// It would be nice to extend from Params for this, but this +// seems more useful than just Spark, so not doing it for now +class BrainScriptBuilder { + + // We need to know a few things: + // 1. Where is the input data? + // 2. How do we configure the training itself? + // 3. Where should we put the outputs? 
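+  //
+  // A rough usage sketch (mirroring the BrainScript configuration test), assuming an
+  // input file "in.txt" in CNTK text format with a 10000-dimensional sparse "features"
+  // input and a 1-dimensional dense "labels" input:
+  //
+  //   val config = new BrainScriptBuilder()
+  //     .setOutputRoot("out")
+  //     .setInputFile("in.txt", "text",
+  //       Map("features" -> InputShape(10000, "sparse"),
+  //           "labels" -> InputShape(1, "dense")))
+  //   val overrides = config.toOverrideConfig()  // command=..., precision=..., reader=[...], etc.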
+
+  var modelName = "ModelOut"
+
+  var inData: Option[InputData] = None
+
+  var rootDir: String = ""
+  var outDir: String = ""
+  var weightPrecision: String = "float"
+
+  var commands = ListBuffer[String]("trainNetwork")
+  var testModel = false
+
+  def setInputFile(path: String, format: String, shapes: Map[String, InputShape]): this.type = {
+    inData = Some(InputData(format, path, shapes))
+    this
+  }
+
+  def setModelName(n: String): this.type = {
+    modelName = n
+    this
+  }
+
+  def getModelPath(): String = {
+    s"""$outDir/Models/$modelName"""
+  }
+
+  def setRootDir(p: String): this.type = {
+    rootDir = p
+    this
+  }
+
+  def setOutputRoot(p: String): this.type = {
+    outDir = p
+    this
+  }
+
+  private def getInputString(): String = {
+    val ips = inData.get.shapes
+      .map { case(name, shape) => name + " = [ dim = " +
+        shape.dim.toString + " ; format = \"" + shape.form + "\" ]" }
+      .mkString("; ")
+    s"input = [ $ips ]"
+  }
+
+  def setCommands(c: String*): this.type = {
+    this
+  }
+
+  def setTestModel(b: Boolean): this.type = {
+    if (!testModel && b) {
+      commands.append("testNetwork")
+    }
+    this
+  }
+
+  def toReaderConfig(): String = {
+    val ipstring = getInputString()
+    val loc = inData.get.path
+    val form = inData.get.format match {
+      case "text" => "CNTKTextFormatReader"
+    }
+    s"""reader = [ readerType = $form ; file = "$loc" ; $ipstring ]"""
+  }
+
+  def toOverrideConfig(): Seq[String] = {
+    val rootOverrides = Seq(
+      s"""command = ${ commands.mkString(":") }""",
+      s"precision=$weightPrecision",
+      "traceLevel=1",
+      "deviceId=\"auto\"",
+      s"""rootDir="$rootDir" """,
+      s"""outputDir="$outDir" """,
+      s"""modelPath="${getModelPath}" """)
+    val commandReaders = commands.map(c => s"$c = [ ${toReaderConfig()} ]")
+
+    rootOverrides ++ commandReaders
+  }
+
+}
diff --git a/src/cntk-train/src/main/scala/CNTKLearner.scala b/src/cntk-train/src/main/scala/CNTKLearner.scala
new file mode 100644
index 0000000000..4d601d4f3d
--- /dev/null
+++ b/src/cntk-train/src/main/scala/CNTKLearner.scala
@@ -0,0 +1,168 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
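+
+// A minimal sketch of how this estimator is typically driven (see ValidateCntkTrain),
+// assuming a labeled DataFrame `trainData` with a "labels" column, a BrainScript
+// configuration string `brainScript`, and a scratch directory `workingDir`:
+//
+//   val model: CNTKModel = new CNTKLearner()
+//     .setBrainScriptText(brainScript)
+//     .setParallelTrain(false)        // set true to train over an MPI ring
+//     .setWorkingDirectory(workingDir)
+//     .fit(trainData)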
+ +package com.microsoft.ml.spark + +import java.net.URI + +import com.microsoft.ml.spark.FileUtilities._ +import org.apache.spark.ml._ +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +trait CNTKParams extends MMLParams { + + // This is only needed until Train* accepts CNTKLearner instead of CL acting like Train* + val labelsColumnName = StringParam(this, "labelsColumnName", "Label col", "labels") + val featuresColumnName = StringParam(this, "featuresColumnName", "feats col", "features") + + // This will go away after the CNTK HDFS Deserializer + val localHdfsMount = StringParam(this, "localHdfsMount", "local mount point for hdfs:///") + val dataTransfer = StringParam(this, "dataTransfer", "transfer strategy", "local") + def setTransferStrategy(s: String): this.type = set(dataTransfer, s) + + // TODO: Convert to enum contract shared with CNTK's HDFS Deserializer + val dataFormat = StringParam(this, "dataFormat", "transfer format", "text") + val weightPrecision = StringParam(this, "weightPrecision", "weights", "double") + val featureCount = IntParam(this, "featureCount", "num features for reduction", 1) + def setFeatureCount(c: Int): this.type = set(featureCount, c) + + val brainScript = StringParam(this, "brainScript", "string of BS config") + def setBrainScriptText(t: String): this.type = set(brainScript, t) + def setBrainScriptFile(f: String): this.type = set(brainScript, FileUtilities.readFile(new File(f))) + + val parallelTrain = BooleanParam(this, "parallelTrain", "train using an MPI ring", true) + def setParallelTrain(b: Boolean): this.type = set(parallelTrain, b) + + val workingDir = StringParam(this, "workingDir", "working directory for CNTK", "tmp") + def setWorkingDirectory(d: String): this.type = set(workingDir, d) + +} + +object CNTKLearner extends DefaultParamsReadable[CNTKLearner] + +@InternalWrapper +class CNTKLearner(override val uid: String) extends Estimator[CNTKModel] with CNTKParams { + + def this() = this(Identifiable.randomUID("CNTKLearner")) + + override def fit(dataset: Dataset[_]): CNTKModel = { + val spark = dataset.sparkSession + val labels = $(labelsColumnName) + val features = $(featuresColumnName) + + // Convert label column to categorical on train, remove rows with missing labels + val convertedLabelDataset = dataset.na.drop(Seq(labels)) + + // This utility function is a stub for the reduction step of Featurize, and + // will probably be covered in TrainClass/Regressor + val reducedData = DataTransferUtils.reduceAndAssemble( + convertedLabelDataset, + labels, + features, + $(weightPrecision), + $(featureCount)) + + // TODO: Very bad hack - we should store vector sizes in schema for quick retrieval + // Apparently this needs some design, not natively supported. This schema transfer + // in general needs to be more robust... 
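+    // Peek at the first row to recover what the schema does not carry: the feature and
+    // label vector dimensions, and whether each is dense or sparse (CNTK needs both).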
+ val feature1 = reducedData.select(features).head.getAs[Vector](0) + val featureDim = feature1.size + val featureForm = feature1 match { + case dv: DenseVector => "dense" + case sv: SparseVector => "sparse" + } + + val label1 = reducedData.select(labels).head.getAs[Vector](0) + val labelDim = label1.size + val labelForm = label1 match { + case dv: DenseVector => "dense" + case sv: SparseVector => "sparse" + } + + val partitions = reducedData.rdd.getNumPartitions + + val cntkrootURI = new URI($(workingDir)) + val cntkrootPath = new File(cntkrootURI).getAbsolutePath + println(s"$uid working in $cntkrootPath") + val relativeInPath = s"$cntkrootURI/$uid-inputdata" + + val writer = $(dataTransfer) match { + case "local" => new LocalWriter(relativeInPath) + case "hdfs-mount" => { + val mntpt = if (isDefined(localHdfsMount)) { + val x = $(localHdfsMount) + println(s"Using override hdfsm point: $x") + x + } else { + val x = sys.env.getOrElse("HDFS_MOUNTPOINT", "tmp/mnt") + println(s"Using deduced hdfsm point: $x") + x + } + println(s"hdfs-mount mounted at $mntpt") + new HdfsMountWriter(mntpt, 1, relativeInPath, spark.sparkContext) + } + case _ => ??? + } + + // Actual data movement step + + // As discussed above, this pipelining needs to be elsewhere, so for now + // creating utility functions for reuse and not combining the steps + val conformedData = $(dataFormat) match { + case "text" => DataTransferUtils.convertDatasetToCNTKTextFormat(reducedData, labels, features) + case "parquet" => reducedData + } + + conformedData.persist() + + val remappedInPath = $(dataFormat) match { + case "text" => writer.checkpointToText(conformedData) + case "parquet" => writer.checkpointToParquet(conformedData) + } + + val relativeOutRoot = s"$cntkrootPath/$uid-outdir" + + val config = new BrainScriptBuilder() + .setOutputRoot(relativeOutRoot) + // TODO: We need a more structured form of converting schema to CNTK config + // this will come in after the parquet + CNTK-as-library work comes in + .setInputFile( + remappedInPath, + $(dataFormat), + Map(features -> InputShape(featureDim, featureForm), + labels -> InputShape(labelDim, labelForm))) + + // Train the learner + val cb = if ($(parallelTrain)) new MPICommandBuilder() else new CNTKCommandBuilder() + cb + .setWorkingDir(cntkrootPath) + .insertBaseConfig($(brainScript)) + .appendOverrideConfig(config.toOverrideConfig) + + val modelRet = ProcessUtils.runProcess(cb.buildCommand) + println(s"CNTK exited with code $modelRet") + if (modelRet != 0) { + // TODO: Use exception heirarchy + throw new Exception("CNTK Training failed. Please view output log for details") + } + + conformedData.unpersist() + + // This does not work :( + // CNTKModel.load(config.getModelPath) + // This also needs a windows dll - currently only runs on linux + new CNTKModel(uid + "-model") + .setModel(spark, config.getModelPath) + .setInputCol(features) + .setOutputCol(labels) + } + + override def copy(extra: ParamMap): Estimator[CNTKModel] = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = ??? + +} diff --git a/src/cntk-train/src/main/scala/CommandBuilders.scala b/src/cntk-train/src/main/scala/CommandBuilders.scala new file mode 100644 index 0000000000..4c02cc0e0c --- /dev/null +++ b/src/cntk-train/src/main/scala/CommandBuilders.scala @@ -0,0 +1,117 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io.{FileOutputStream, ObjectOutputStream} +import java.util.UUID +import java.net.URI + +import scala.collection.mutable.ListBuffer +import scala.sys.process._ + +import com.microsoft.ml.spark.schema._ +import FileUtilities._ + +import org.apache.hadoop.fs.Path + +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} +import org.apache.spark.ml._ + +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +abstract class CNTKCommandBuilderBase { + val command: String + def arguments(): Seq[String] + val configs = ListBuffer.empty[BrainScriptConfig] + + var workingDir = new File(".").toURI + + def setWorkingDir(p: String): this.type = { + workingDir = new File(p).toURI + this + } + + def insertBaseConfig(t: String): this.type = { + configs.insert(0, BrainScriptConfig("baseConfig", Seq(t))) + this + } + + def appendOverrideConfig(t: Seq[String]): this.type = { + configs.append(BrainScriptConfig("overrideConfig", t)) + this + } + + protected def configToFile(c: BrainScriptConfig): String = { + val outFile = new File(new File(workingDir).getAbsolutePath + s"/${c.name}.cntk") + writeFile(outFile, c.text.mkString("\n")) + println(s"wrote string to ${outFile.getName}") + outFile.getAbsolutePath + } + + def buildCommand(): String +} + +class CNTKCommandBuilder(fileBased: Boolean = true) extends CNTKCommandBuilderBase { + val command = "cntk" + val arguments = Seq[String]() + + def buildCommand(): String = { + val cntkArgs = configs + .map(c => if (fileBased) s"configFile=${configToFile(c)} " else c.text.mkString(" ")) + .mkString(" ") + + command + " " + cntkArgs + } +} + +trait MPIConfiguration { + val command = "mpiexec" + // nodename -> workers per node + def nodeConfig: Map[String, Int] +} + +class MPICommandBuilder(fileBased: Boolean = true) extends CNTKCommandBuilderBase with MPIConfiguration { + + def nodeConfig: Map[String, Int] = Map("127.0.0.1" -> EnvironmentUtils.GPUCount.get) + + val argName = "-n" + val arguments = Seq(argName, nodeConfig.head._2.toString) + + def buildCommand(): String = { + val cntkArgs = "cntk " + configs + .map(c => if (fileBased) s"configFile=${configToFile(c)} " else c.text.mkString(" ")) + .mkString(" ") + + Seq(command, arguments.mkString(" "), cntkArgs, "parallelTrain=true").mkString(" ") + } +} + +class MultiNodeParallelLauncher(fileBased: Boolean = false) extends CNTKCommandBuilderBase with MPIConfiguration { + + // The difference here is the requirement of locating + // and passing on the hosts information + val nodeConfig = Map("localhost" -> 1, "remotehost" -> 1) + val arguments = if (EnvironmentUtils.IsWindows) { + Seq("--hosts", nodeConfig.size.toString) ++ nodeConfig.map { case(name, num) => s"$name $num" } + } else { + val hostFile = new File(".", "hostfile.txt") + val txt = nodeConfig.map { case(name, num) => s"$name slots=$num" }.mkString("\n") + writeFile(hostFile, txt) + Seq("-hostfile", hostFile.getCanonicalPath) + } + + def buildCommand(): String = { + val cntkArgs = configs + .map(c => if (fileBased) s"configFile=${configToFile(c)} " else c.text.mkString(" ")) + .mkString(" ") + + Seq(command, arguments.mkString(" "), cntkArgs).mkString(" ") + } + +} diff --git a/src/cntk-train/src/main/scala/DataConversion.scala b/src/cntk-train/src/main/scala/DataConversion.scala new file mode 100644 
index 0000000000..b1821a9a54 --- /dev/null +++ b/src/cntk-train/src/main/scala/DataConversion.scala @@ -0,0 +1,173 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.SparkContext + +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.functions._ + +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.util.Identifiable + +import FileUtilities._ +import hadoop.HadoopUtils + +object DataTransferUtils { + + // This needs to be broken up into a few areas: + // 1. data-conversion is a library that has knowledge + // of type mappings, schema, and a canonical implementation of + // the type map conversion functions themselves + // 2. Featurize must be moved to a library utilized by the Train* APIs + // this is a 2 stage estimator: + // a. Type "reduction" utilizing a typemapper + // b. Assembly via single or "feature channel" based multi vector assembler + // at present, the current architecture limits us to + + def toText(value: Any): String = { + value match { + case v: Vector => return convertVectorToText(v) + case d: Double => return d.toString + case f: Float => return f.toString + } + } + + def toVec(value: Any): Vector = { + value match { + case v: Vector => return v + case d: Double => return new DenseVector(Array(d)) + case f: Float => return new DenseVector(Array(f.toDouble)) + case i: Integer => return new DenseVector(Array(i.toDouble)) + case l: Long => return new DenseVector(Array(l.toDouble)) + } + } + + def convertVectorToText(v: Vector): String = { + val heuristicBloat = 8 + val sb = new StringBuilder(v.numActives * heuristicBloat) + v match { + case sv: SparseVector => { + sv.foreachActive { (idx, value) => + sb.append(idx).append(":").append(value).append(" ") + () + } + } + case dv: DenseVector => { + dv.values.foreach(value => sb.append(value).append(" ")) + } + } + sb.toString + } + + private def col2vec = udf(toVec _) + private def col2str = udf(toText _) + + // This needs to be converted to a pipeline stage as stated above + def reduceAndAssemble(data: Dataset[_], + label: String, + outputVecName: String, + precision: String, + features: Int): DataFrame = { + if (precision != "double") throw new NotImplementedError("only doubles") + + val tempFeaturizer = new Featurize() + .setFeatureColumns(Map(outputVecName -> data.columns.filter(_ != label))) + .setNumberOfFeatures(features) + .setOneHotEncodeCategoricals(true) + .fit(data) + val reduced = tempFeaturizer.transform(data) + reduced.select(col2vec(reduced(label)).as(label), reduced(outputVecName)) + } + + def convertDatasetToCNTKTextFormat(data: Dataset[_], label: String, feats: String): DataFrame = { + val labelStrCol = col2str(data(label)) + val featStrCol = col2str(data(feats)) + val uberCol = concat( + lit(s"|$label "), + labelStrCol, + lit(" "), + lit(s"|$feats "), + featStrCol) + data.select(uberCol.as('value)) + } + +} + +// This is all horrid, find a better way to be cluster/local agnostic +// via DataSource and DataSink-type model. 
This will allow us to move to +// other source/sinks in the future more easily, but think is out of scope here +// TODO: this should become a set of extensions onto CheckpointData, which can +// also return a model that is JSON serializable into the DataSource representation +import java.net.URI + +abstract class DataWriter(destPath: String) { + protected val destUri = new URI(destPath) + protected val relativeDest = destUri.getPath + + protected val partitions: Int + + protected def remapPath(ext: String): String + + def constructedPath: String + + def checkpointToText(data: Dataset[_]): String = { + val fullPath = constructedPath + println(s"Writing dataset to $fullPath") + data.coalesce(partitions).write.text(fullPath) + remapPath("txt") + } + + def checkpointToParquet(data: Dataset[_]): String = { + val fullPath = constructedPath + println(s"Writing dataset to $fullPath") + data.coalesce(partitions).write.format("parquet").save(fullPath) + remapPath("parquet") + } +} + +abstract class NormalWriter(path: String) extends DataWriter(path) { + protected def remapPath(ext: String): String = constructedPath +} + +// This is used when Hadoop creates the actual single part file +// inside the path we've provided - we want that one. +abstract class SingleFileResolver(path: String) extends DataWriter(path) { + protected val remappedRoot: String + + protected def remapPath(extension: String): String = { + val dir = new File(remappedRoot) + println(s"Probing $dir for single file $constructedPath") + val file = dir.listFiles.filter(f => f.isFile && f.getName.endsWith(extension)).head + println(s"Resolving single file ${file.getAbsolutePath}") + file.getAbsolutePath + } +} + +class LocalWriter(path: String) extends SingleFileResolver(path) { + val partitions = 1 + val constructedPath = new URI("file", null, relativeDest, null, null).normalize.toString + + // TODO: Move this logic to Apache commons lang helper that already exists + // And then provide a helper function in FileUtilities. Why doesn't URI normalize properly for new File()? + val remappedRoot = { + val root = if (EnvironmentUtils.IsWindows) "C:" else "" + root + relativeDest + } +} + +class DefaultHdfsWriter(parts: Int, path: String) extends NormalWriter(path) { + val partitions = parts + val constructedPath = new URI(null, null, relativeDest, null, null).toString +} + +class HdfsMountWriter(localMnt: String, parts: Int, path: String, sc: SparkContext) extends SingleFileResolver(path) { + val partitions = parts + // TODO: Why is this required on the edge node? + val hConf = sc.hadoopConfiguration + val namenode = new HadoopUtils(hConf).getActiveNameNode + val constructedPath = new URI("hdfs", namenode, relativeDest, null, null).toString + val remappedRoot = new URI(localMnt).toString + s"/$relativeDest" +} diff --git a/src/cntk-train/src/main/scala/TypeMapping.scala b/src/cntk-train/src/main/scala/TypeMapping.scala new file mode 100644 index 0000000000..2b4f55f92d --- /dev/null +++ b/src/cntk-train/src/main/scala/TypeMapping.scala @@ -0,0 +1,41 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
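+
+// A sketch of the intended two-stage split described below, with a hypothetical
+// concrete reducer (DoubleReducer is illustrative only, not defined in this module):
+//
+//   class DoubleReducer extends SingleTypeReducer(DoubleType) { /* transform impl */ }
+//   // its conversionMap sends every entry of TypeMapping.mmlTypes to DoubleType,
+//   // e.g. BooleanType -> DoubleType, ..., StringType -> DoubleType
+//
+// Stage 2 (the VectorAssembler variants) then decides how the reduced columns are
+// packed into one vector (SingleVectorAssembler) or several (MultiVectorAssembler).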
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.types._ +import org.apache.spark.ml._ + +object TypeMapping { + val mmlTypes = Seq[DataType]( + BooleanType, + ByteType, + ShortType, + IntegerType, + LongType, + FloatType, + DoubleType, + StringType) +} + +trait TypeConversion { + def conversionMap: Map[DataType, DataType] +} + +// This is the root of Featurize stage 1: type mapping, where stage 2 is assembly tactic +// There is one problem I cannot resolve: Can type mapping be dependent on assembly strategy? +// If so, the lines are a bit blurry and it's likely not going to a 2 stage pipeline, but +// rather a single estimator configurable (params) by an ITypeMapping and IAssemblyStrategy, to use +// C# terminology for clarity. +abstract class SingleTypeReducer(target: DataType) extends Transformer with TypeConversion { + private lazy val map = TypeMapping.mmlTypes.map(t => t -> target).toMap + def conversionMap: Map[DataType, DataType] = map +} + +abstract class VectorAssembler() + +class SingleVectorAssembler() extends VectorAssembler +class MultiVectorAssembler extends VectorAssembler +object MultiVectorAssembler { + def create(groups: Map[String, Seq[Int]]): Unit = {} +} diff --git a/src/cntk-train/src/test/scala/ValidateCntkTrain.scala b/src/cntk-train/src/test/scala/ValidateCntkTrain.scala new file mode 100644 index 0000000000..07a20ad989 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateCntkTrain.scala @@ -0,0 +1,267 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.net.URI + +import org.apache.spark.ml.feature.{OneHotEncoder, StringIndexerModel} +import com.microsoft.ml.spark.Readers.implicits._ +import FileUtilities._ +import org.scalatest.{BeforeAndAfterEach, Suite} + +trait TestFileCleanup extends BeforeAndAfterEach { + this: Suite => + var cleanupPath: File + override def afterEach(): Unit = { + try super.afterEach() // To be stackable, must call super.afterEach + finally { + if (cleanupPath.exists) { + FileUtilities.delTree(cleanupPath) + () + } + } + } +} + +class ValidateCntkTrain extends TestBase with TestFileCleanup { + + override var cleanupPath: File = new File(new URI(dir)) + + import session.implicits._ + + val dummyTrainScript = s""" +command = trainNetwork:testNetwork + +precision = "float"; traceLevel = 1 ; deviceId = "auto" + +rootDir = ".." ; dataDir = "$$rootDir$$/DataSets/MNIST" ; +outputDir = "./Output" ; + +modelPath = "$$outputDir$$/Models/01_OneHidden" + +# TRAINING CONFIG +trainNetwork = { + action = "train" + + BrainScriptNetworkBuilder = { + labelDim = 1 # number of distinct labels + + # This model returns multiple nodes as a record, which + # can be accessed using .x syntax. 
+ model(x) = { + h1 = DenseLayer {5, activation=ReLU} (x) + z = LinearLayer {labelDim} (h1) + } + + # inputs + features = Input {9} + labels = Input {labelDim} + + # apply model to features + out = model (features) + + # loss and error computation + ce = CrossEntropyWithSoftmax (labels, out.z) + errs = ClassificationError (labels, out.z) + + # declare special nodes + featureNodes = (features) + labelNodes = (labels) + criterionNodes = (ce) + evaluationNodes = (errs) + outputNodes = (out.z) + } + + SGD = { + epochSize = 60 + minibatchSize = 6 + maxEpochs = 3 + learningRatesPerSample = 0.0001 + momentumAsTimeConstant = 0 + + numMBsToShowResult = 500 + } + + reader = { + readerType = "CNTKTextFormatReader" + # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt). + file = "file:///Train-28x28_cntk_text.txt" + input = { + features = { dim = 784 ; format = "dense" } + labels = { dim = 10 ; format = "dense" } + } + } +} + +# TEST CONFIG +testNetwork = { + action = "test" + minibatchSize = 1024 # reduce this if you run out of memory + + reader = { + readerType = "CNTKTextFormatReader" + file = "file:///Test-28x28_cntk_text.txt" + input = { + features = { dim = 784 ; format = "dense" } + labels = { dim = 10 ; format = "dense" } + } + } +} +""" + + val cifarScript = s""" +# ConvNet applied on CIFAR-10 dataset, with no data augmentation. + +command = TrainNetwork + +precision = "float"; traceLevel = 0 ; deviceId = "auto" + +rootDir = "../../.." ; dataDir = "$$rootDir$$/DataSets/CIFAR-10" ; +outputDir = "./Output" ; + +TrainNetwork = { + action = "train" + + BrainScriptNetworkBuilder = { + imageShape = 32:32:3 + labelDim = 6 + + featMean = 128 + featScale = 1/256 + Normalize{m,f} = x => f .* (x - m) + + model = Sequential ( + Normalize {featMean, featScale} : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + MaxPoolingLayer {(3:3), stride = (2:2)} : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + ConvolutionalLayer {64, (3:3), pad = true} : ReLU : + MaxPoolingLayer {(3:3), stride = (2:2)} : + DenseLayer {256} : ReLU : Dropout : + DenseLayer {128} : ReLU : Dropout : + LinearLayer {labelDim} + ) + + # inputs + features = Input {imageShape} + labels = Input {labelDim} + + # apply model to features + z = model (features) + + # connect to system + ce = CrossEntropyWithSoftmax (labels, z) + errs = ClassificationError (labels, z) + top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action + + featureNodes = (features) + labelNodes = (labels) + criterionNodes = (ce) + evaluationNodes = (errs) # top5Errs only used in Eval + outputNodes = (z) + } + + SGD = { + epochSize = 0 + minibatchSize = 256 + + learningRatesPerSample = 0.0015625*10:0.00046875*10:0.00015625 + momentumAsTimeConstant = 0*20:607.44 + maxEpochs = 30 + L2RegWeight = 0.002 + dropoutRate = 0*5:0.5 + + numMBsToShowResult = 100 + parallelTrain = { + parallelizationMethod = "DataParallelSGD" + parallelizationStartEpoch = 2 # warm start: don't use 1-bit SGD for first epoch + distributedMBReading = true + dataParallelSGD = { gradientBits = 1 } + } + } + + reader = { + readerType = "CNTKTextFormatReader" + file = "$$DataDir$$/Train_cntk_text.txt" + randomize = true + keepDataInMemory = true # cache all data in memory + input = { + features = { dim = 3072 ; format = "dense" } + labels = { dim = 6 ; format = "dense" } + } + } +} +""" + + test("Smoke test for training on a classifier") { + val rawPath = new 
File(s"${sys.env("DATASETS_HOME")}/Binary/Train", "breast-cancer.train.csv").toString + val path = normalizePath(rawPath) + val dataset = session.read + .option("header", true) + .option("inferSchema", true) + .option("nullValue", "?") + .csv(path) + .withColumnRenamed("Label", "labels") + + val learner = new CNTKLearner() + .setBrainScriptText(dummyTrainScript) + .setParallelTrain(false) + .setWorkingDirectory(dir) + + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = learner.fit(trainData) + println(model) + } + + // TODO: Redo this test with the proper image sizes now that full CIFAR is our dataset collection + // Also make this an E2E test and reduce validation scope down to smaller chunks. + test("train and eval CIFAR") { + val trigger = session.sparkContext + val filesRoot = s"${sys.env("DATASETS_HOME")}/" + val imagePath = s"$filesRoot/Images/CIFAR" + + val inputCol = "cntk_images" + val tmpLabel = "labelscol" + val indexedLabel = "idxlabels" + val labelCol = "labels" + + val images = session.readImages(imagePath, true) + + // Label annotation: CIFAR is constructed here as + // 01234-01.png, meaning (len - 5, len - 3) is label + val pathLen = images.first.getStruct(0).getString(0).length + val labeledData = images.withColumn(tmpLabel, images("image.path").substr(pathLen - 5, 2).cast("float")) + + // Unroll images into Spark representation + val unroller = new UnrollImage().setOutputCol(inputCol).setInputCol("image") + val unrolled = unroller.transform(labeledData).select(inputCol, tmpLabel) + + // Prepare Spark-like DF with known labels + + val ohe = new OneHotEncoder().setInputCol(tmpLabel).setOutputCol(labelCol).setDropLast(false) + val dataset = ohe.transform(unrolled).select(inputCol, labelCol) + + //dataset.printSchema() + //dataset.show() + + val learner = new CNTKLearner() + .setBrainScriptText(cifarScript) + // Build machine doesn't have GPUs + .setParallelTrain(false) + .setWorkingDirectory(dir) + + val model = learner.fit(dataset) + .setInputCol(inputCol) + .setOutputCol("out_labels") + .setOutputNodeIndex(3) + + val result = model.transform(dataset) + result.take(1) + //result.show() + } +} diff --git a/src/cntk-train/src/test/scala/ValidateConfiguration.scala b/src/cntk-train/src/test/scala/ValidateConfiguration.scala new file mode 100644 index 0000000000..1a87d035e4 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateConfiguration.scala @@ -0,0 +1,28 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
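+
+// The override config assembled by BrainScriptBuilder.toOverrideConfig is only printed
+// here for now; roughly, it contains lines of the form:
+//
+//   command = trainNetwork
+//   precision=float
+//   traceLevel=1
+//   deviceId="auto"
+//   rootDir="..."
+//   outputDir="out"
+//   modelPath="out/Models/ModelOut"
+//   trainNetwork = [ reader = [ readerType = CNTKTextFormatReader ; file = "in.txt" ; input = [ ... ] ] ]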
+ +package com.microsoft.ml.spark + +class ValidateConfiguration extends TestBase { + + test("Basic BrainScript config E2E") { + val relativeOutRoot = "out" + val remappedInPath = "in.txt" + val dataFormat = "text" + + val config = new BrainScriptBuilder() + .setOutputRoot(relativeOutRoot) + .setInputFile( + remappedInPath, + dataFormat, + Map("features" -> InputShape(10000, "sparse"), + "labels" -> InputShape(1, "dense"))) + + val cb = new CNTKCommandBuilder(false) + .appendOverrideConfig(config.toOverrideConfig) + + // TODO: add assertions to really validate instead + println(cb.buildCommand) + } + +} diff --git a/src/cntk-train/src/test/scala/ValidateDataConversion.scala b/src/cntk-train/src/test/scala/ValidateDataConversion.scala new file mode 100644 index 0000000000..d18c2c13f5 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateDataConversion.scala @@ -0,0 +1,83 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.linalg._ + +import FileUtilities._ + +class ValidateDataConversion extends TestBase { + + import session.implicits._ + + test("vector to text") { + val testVectors = List( + new DenseVector(Array(1.0, 0.0)), + new SparseVector( 1, Array(0), Array(8.0)), + new SparseVector( 100, Array(0,10,18,33,62,67,80), Array(1.0,2.0,1.0,1.0,1.0,1.0,1.0)), + new SparseVector(100000, Array(5833,9467,16680,29018,68900,85762,97510), Array(1.0,1.0,1.0,1.0,1.0,1.0,2.0)) + ) + + val expected = Seq( + "1.0 0.0 ", + "0:8.0 ", + "0:1.0 10:2.0 18:1.0 33:1.0 62:1.0 67:1.0 80:1.0 ", + "5833:1.0 9467:1.0 16680:1.0 29018:1.0 68900:1.0 85762:1.0 97510:2.0 ") + + val outputs = testVectors.map(DataTransferUtils.convertVectorToText) + assert(outputs === expected) + } + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Checkpoint the data") { + val data = createMockDataset + + val rData = DataTransferUtils.reduceAndAssemble(data, mockLabelColumn, "feats", "double", 10) + val cdata = DataTransferUtils.convertDatasetToCNTKTextFormat(rData, mockLabelColumn, "feats") + + val transfer = new LocalWriter(s"$dir/smoke") + val path = transfer.checkpointToText(cdata) + + val out = session.read.text(path) + + assert(verifyResult(cdata, out)) + } + + test("Verify vector labels") { + val data = createMockDataset + val rData1 = DataTransferUtils.reduceAndAssemble(data, mockLabelColumn, "feats", "double", 10) + val rData = DataTransferUtils.reduceAndAssemble(rData1, "feats", "labels", "double", 10) + val cdata = DataTransferUtils.convertDatasetToCNTKTextFormat(rData, "labels", "feats") + + val transfer = new LocalWriter(s"$dir/vectorlabel") + val path = transfer.checkpointToText(cdata) + + val out = session.read.text(path) + + assert(verifyResult(cdata, out)) + } + +} diff --git a/src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala 
b/src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala new file mode 100644 index 0000000000..37cc52a4b3 --- /dev/null +++ b/src/cntk-train/src/test/scala/ValidateEnvironmentUtils.scala @@ -0,0 +1,14 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +class ValidateEnvironmentUtils extends TestBase { + + // This is more of a run harness as asserting this is obviously dumb + ignore("Test env features") { + println(EnvironmentUtils.IsWindows) + println(EnvironmentUtils.GPUCount) + } + +} diff --git a/src/codegen/build.sbt b/src/codegen/build.sbt new file mode 100644 index 0000000000..3e904ff0e0 --- /dev/null +++ b/src/codegen/build.sbt @@ -0,0 +1,12 @@ +//> DependsOn: core + +Extras.noJar + +// Running this project will load all jars, which will fail if they're +// all "provided". This magic makes it as if the "provided" is not +// there for the run task. See https://github.com/sbt/sbt-assembly and +// http://stackoverflow.com/questions/18838944/ +run in Compile := + Defaults + .runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)) + .evaluated diff --git a/src/codegen/src/main/scala/CodeGen.scala b/src/codegen/src/main/scala/CodeGen.scala new file mode 100644 index 0000000000..075a8e3d27 --- /dev/null +++ b/src/codegen/src/main/scala/CodeGen.scala @@ -0,0 +1,79 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +import scala.util.matching.Regex +import java.util.regex.Pattern + +object CodeGen { + + def copyAllFiles(fromDir: File, rx: Regex, toDir: File): Unit = { + if (!fromDir.isDirectory) { println(s"'$fromDir' is not a directory"); return } + allFiles(fromDir, if (rx == null) null else (f => rx.findFirstIn(f.getName) != None)) + .foreach{x => copyFile(x, toDir, overwrite=true)} + } + def copyAllFiles(fromDir: File, extension: String, toDir: File): Unit = + copyAllFiles(fromDir, + if (extension == null || extension == "") null + else (Pattern.quote("." 
+ extension) + "$").r, + toDir) + + def copyAllFilesFromRoots(fromDir: File, roots: List[String], relPath: String, + extension: String, toDir: File): Unit = { + roots.foreach { root => + val dir = new File(new File(fromDir, root), relPath) + if (dir.exists && dir.isDirectory) copyAllFiles(dir, extension, toDir) + } + } + def copyAllFilesFromRoots(fromDir: File, roots: List[String], relPath: String, + rx: Regex, toDir: File): Unit = { + roots.foreach { root => + val dir = new File(new File(fromDir, root), relPath) + if (dir.exists && dir.isDirectory) copyAllFiles(dir, rx, toDir) + } + } + + def generateArtifacts(): Unit = { + println(s"""|Running registration with config: + | topDir: $topDir + | srcDir: $srcDir + | outputDir: $outputDir + | toZipDir: $toZipDir + | pyTestDir: $pyTestDir""".stripMargin) + val roots = // note: excludes the toplevel project + if (!rootsFile.exists) sys.error(s"Could not find roots file at $rootsFile") + else readFile(rootsFile, _.getLines.toList).filter(_ != ".") + println("Creating temp folders") + toZipDir.mkdirs + pyTestDir.mkdirs + println("Copy jar files to output directory") + copyAllFilesFromRoots(srcDir, roots, jarRelPath, + (Pattern.quote("-" + mmlVer + ".jar") + "$").r, + outputDir) + println("Copy source python files") + copyAllFilesFromRoots(srcDir, roots, pyRelPath, "py", toZipDir) + println("Generate python APIs") + PySparkWrapperGenerator() + // build init file + val importStrings = + (copyrightLines.mkString("\n") + "\n\n") +: + allFiles(toZipDir, _.getName.endsWith(".py")) + .filter(f => !f.getName.startsWith(internalPrefix)) + .map(f => s"from mmlspark.${f.getName.dropRight(3)} import *\n") + writeFile(new File(toZipDir, "__init__.py"), importStrings.mkString("")) + // package python zip file + zipFolder(toZipDir, zipFile) + // leave the source files there so they will be included in the super-jar + // if (!delTree(toZipDir)) println(s"Error: could not delete $toZipDir") + } + + def main(args: Array[String]): Unit = { + org.apache.log4j.BasicConfigurator.configure(new org.apache.log4j.varia.NullAppender()) + generateArtifacts() + } + +} diff --git a/src/codegen/src/main/scala/Config.scala b/src/codegen/src/main/scala/Config.scala new file mode 100644 index 0000000000..e6dacd9625 --- /dev/null +++ b/src/codegen/src/main/scala/Config.scala @@ -0,0 +1,29 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import com.microsoft.ml.spark.FileUtilities._ +import sys.process.Process + +object Config { + + val srcDir = new File(".").getCanonicalFile() + val topDir = new File("..").getCanonicalFile() + val rootsFile = new File(srcDir, "project/project-roots.txt") + val outputDir = new File(topDir, "BuildArtifacts/sdk") + val toZipDir = new File(srcDir, "src/main/resources/mmlspark") + val zipFile = new File(outputDir, "mmlspark.zip") + val pyTestDir = new File(topDir, "TestResults/generated_pytests") + val jarRelPath = "target/scala-" + sys.env("SCALA_VERSION") + val pyRelPath = "src/main/python" + val mmlVer = sys.env.getOrElse("MML_VERSION", + Process("../tools/runme/show-version").!!.trim) + val debugMode = sys.env.getOrElse("DEBUGMODE", "").trim.toLowerCase == "true" + val internalPrefix = "_" + + val copyrightLines = + Seq("# Copyright (C) Microsoft Corporation. All rights reserved.", + "# Licensed under the MIT License. 
See LICENSE in the project root for information.") + +} diff --git a/src/codegen/src/main/scala/PySparkWrapper.scala b/src/codegen/src/main/scala/PySparkWrapper.scala new file mode 100644 index 0000000000..406bc7218b --- /dev/null +++ b/src/codegen/src/main/scala/PySparkWrapper.scala @@ -0,0 +1,345 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import scala.collection.mutable.ListBuffer +import scala.tools.nsc.util.DocStrings + +import org.apache.commons.lang3.StringUtils +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.param.Param + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +/** + * :: DeveloperApi :: + * Abstraction for PySpark wrapper generators. + */ +abstract class PySparkWrapper(entryPoint: PipelineStage, + entryPointName: String, + entryPointQualifiedName: String) { + + private val ScopeDepth = " " + private val additionalImports = Map( + ("complexTypes", + s"from ${toZipDir.getName}.TypeConversionUtils import generateTypeConverter, complexTypeConverter"), + ("utils", s"from ${toZipDir.getName}.Utils import *") + ) + + def toPySpark(): String = { + val output = new StringBuilder() + "" + } + + protected val classTemplate = Seq( + copyrightLines.mkString("\n"), + "", + "import sys", + "if sys.version >= '3':", + " basestring = str", + "", + "from pyspark.ml.param.shared import *", + "from pyspark import keyword_only", + "from pyspark.ml.util import JavaMLReadable, JavaMLWritable", + "from pyspark.ml.wrapper import JavaTransformer, JavaEstimator, JavaModel", + "from pyspark.ml.common import inherit_doc", + "%1$s", + "","", + "@inherit_doc", + "class %2$s(%3$s):", + " \"\"\"", + " %9$s", + "%11$s", + " \"\"\"", + "", + " @keyword_only", + " def __init__(self, %4$s):", + " super(%2$s, self).__init__()", + " self._java_obj = self._new_java_obj(\"%5$s\")", + "%6$s", + // since 2.1.1, kwargs is an instance attribute... + " if hasattr(self, \"_input_kwargs\"):", + " kwargs = self._input_kwargs", + " else:", + // ... so this can be removed when we drop support for 2.1.0 + " kwargs = self.__init__._input_kwargs", + " self.setParams(**kwargs)", + "", + " @keyword_only", + " def setParams(self, %4$s):", + " \"\"\"", + " Set the (keyword only) parameters","", + "%10$s", + " \"\"\"", + " if hasattr(self, \"_input_kwargs\"):", + " kwargs = self._input_kwargs", + " else:", + // ... 
same here: remove when we drop support for 2.1.0 + " kwargs = self.setParams._input_kwargs", + " return self._set(**kwargs)\n" + + "%7$s", + "%8$s", + "") + + protected val defineParamsTemplate = + " self.%1$s = Param(self, \"%1$s\", \"%2$s\")" + // Complex parameters need type converters + protected val defineComplexParamsTemplate = + " self.%1$s = Param(self, \"%1$s\", \"%2$s\", %3$s)" + protected val setParamDefaultTemplate = + " self._setDefault(%1$s=%2$s)" + protected val setParamDefaultWithGuidTemplate = + " self._setDefault(%1$s=self.uid + \"%2$s\")" + protected val setTemplate = + Seq("", + " def set%1$s(self, value):", + " \"\"\"\n\n%4$s:param %3$s %5$s\n%4$s\"\"\"", + " self._set(%2$s=value)", + " return self") + protected val getTemplate = + Seq("", + " def get%1$s(self):", + " \"\"\"", + " :return: %2$s", + " :rtype: %3$s", + " \"\"\"", + " return self.getOrDefault(self.%2$s)", + "") + protected val getComplexTemplate = + Seq("", + " def get%1$s(self):", + " \"\"\"", + " :return: %2$s", + " :rtype: %3$s", + " \"\"\"", + " return self._cache[\"%2$s\"]") + protected val saveLoadTemplate = + Seq("", + " @classmethod", + " def read(cls):", + " \"\"\" Returns an MLReader instance for this class. \"\"\"", + " return JavaMMLReader(cls)", + "", + " @staticmethod", + " def getJavaPackage():", + " \"\"\" Returns package name String. \"\"\"", + " return \"%1$s\"", + "", + " @staticmethod", + " def _from_java(java_stage):", + " stage_name=%2$s.__module__", + " return from_java(java_stage, stage_name)","") + + // TODO: Get a brief description of the class from the scala and put it here. There is not a simple + // and intuitive way to do this via reflections, similar to the way that we are able to + // retrieve the parameter explanations, for example. + protected val classDocTemplate = + "This wraps the scala class %1$s\n" + + protected val paramDocTemplate = + "%3$s:param %4$s %2$s" + + val psType: String + private lazy val objectBaseClass: String = "Java" + psType + private lazy val autoInheritedClasses = Seq("JavaMLReadable", "JavaMLWritable", objectBaseClass) + // Complex types are not easily recognized by Py4j. They need special processing. 
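+  // For these, the generated wrapper registers a type converter (generateTypeConverter)
+  // on the Param and keeps the resolved Python object in self._cache, so the getter
+  // returns the cached wrapper rather than round-tripping through Py4j.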
+ private lazy val complexTypes = Set[String]( + "TransformerParam", + "TransformerArrayParam", + "EstimatorParam") + protected def isComplexType(paramType: String): Boolean = complexTypes.contains(paramType) + + protected def getParamExplanation(param: Param[_]): String = { + entryPoint.explainParam(param) + } + + protected def getPythonizedDefault(paramDefault: String, paramType: String, + defaultStringIsParsable: Boolean): String = + paramType match { + case "BooleanParam" => + StringUtils.capitalize(paramDefault) + case "DoubleParam" | "FloatParam" | "IntParam" | "LongParam" => + paramDefault + case x if x == "Param" || defaultStringIsParsable => + "\"" + paramDefault + "\"" + case _ => + "None" + } + + protected def getPythonizedDataType(paramType: String): String = + paramType match { + case "BooleanParam" => "bool" + case "IntParam" => "int" + case "LongParam" => "long" + case "FloatParam" => "float" + case "DoubleParam" => "double" + case "StringParam" => "str" + case "Param" => "str" + case "StringArrayParam" => "list of str" + case "MapArrayParam" => "dict of str to list of str" + case _ => "object" + } + + protected def getParamDefault(param: Param[_]): (String, String) = { + var paramDefault: String = null + var pyParamDefault: String = "None" + var autogenSuffix: String = null + var defaultStringIsParsable: Boolean = true + + if (entryPoint.hasDefault(param)) { + val paramParent: String = param.parent + paramDefault = entryPoint.getDefault(param).get.toString + if (paramDefault.toLowerCase.contains(paramParent.toLowerCase)) + autogenSuffix = paramDefault.substring(paramDefault.lastIndexOf(paramParent) + + paramParent.length) + else { + try{ + entryPoint.getParam(param.name).w(paramDefault) + } + catch{ + case e: Exception => + defaultStringIsParsable = false + } + pyParamDefault = getPythonizedDefault(paramDefault, + param.getClass.getSimpleName, defaultStringIsParsable) + } + } + (pyParamDefault, autogenSuffix) + } + + private def defineParam(param: Param[_]): String = { + defineParamsTemplate + } + + protected def getPysparkWrapperBase: String = { + // Construct relevant strings + val imports = ListBuffer[String](additionalImports("utils")) + val inheritedClasses = ListBuffer[String]() + inheritedClasses ++= autoInheritedClasses + val paramsAndDefaults = ListBuffer[String]() + val paramDefinitionsAndDefaults = ListBuffer[String]() + val paramGettersAndSetters = ListBuffer[String]() + val paramDocList = ListBuffer[String]() + val classParamDocList = ListBuffer[String]() + + // Iterate over the params to build strings + val allParams: Array[Param[_]] = entryPoint.params + // Check for complex types + if(allParams.exists(p => isComplexType(p.getClass.getSimpleName))){ + // Add special imports + imports += additionalImports("complexTypes") + // Add cache + paramDefinitionsAndDefaults += ScopeDepth * 2 + "self._cache = {}" + } + for (param <- allParams) { + val pname = param.name + val docType = getPythonizedDataType(param.getClass.getSimpleName) + paramGettersAndSetters += + setTemplate.mkString("\n").format(StringUtils.capitalize(pname), pname, docType, ScopeDepth * 2, + getParamExplanation(param)) + if(isComplexType(param.getClass.getSimpleName)){ + paramDefinitionsAndDefaults += + defineComplexParamsTemplate.format( + pname, getParamExplanation(param), + s"""generateTypeConverter("$pname", self._cache, complexTypeConverter)""") + paramGettersAndSetters += + getComplexTemplate.mkString("\n").format(StringUtils.capitalize(pname), pname, param.getClass.getSimpleName) + 
paramDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth * 2, param.getClass.getSimpleName) + classParamDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth, param.getClass.getSimpleName) + } + else{ + paramDefinitionsAndDefaults += + defineParamsTemplate.format(pname, getParamExplanation(param)) + paramGettersAndSetters += + getTemplate.mkString("\n").format(StringUtils.capitalize(pname), pname, docType) + paramDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth * 2, docType) + classParamDocList += + paramDocTemplate.format(pname, getParamExplanation(param), ScopeDepth, param.getClass.getSimpleName) + } + + val (pyParamDefault, autogenSuffix) = getParamDefault(param) + paramsAndDefaults += pname + "=" + pyParamDefault + + if (pyParamDefault != "None") { + paramDefinitionsAndDefaults += setParamDefaultTemplate.format(pname, pyParamDefault) + } else if (autogenSuffix != null) { + paramDefinitionsAndDefaults += setParamDefaultWithGuidTemplate.format(pname, autogenSuffix) + } + } + + // Build strings + val importsString = imports.mkString("\n") + val inheritanceString = inheritedClasses.mkString(", ") + val classParamsString = paramsAndDefaults.mkString(", ") + val paramDefinitionsAndDefaultsString = paramDefinitionsAndDefaults.mkString("\n") + val paramGettersAndSettersString = paramGettersAndSetters.mkString("\n") + val saveLoadString = saveLoadTemplate.mkString("\n").format(entryPointQualifiedName, entryPointName) + val classDocString = classDocTemplate.format(entryPointName) + val paramDocString = paramDocList.mkString("\n") + val classParamDocString = classParamDocList.mkString("\n") + + String.format(classTemplate.mkString("\n"), importsString, entryPointName, inheritanceString, + classParamsString, entryPointQualifiedName, + paramDefinitionsAndDefaultsString, paramGettersAndSettersString, saveLoadString, + classDocString, paramDocString, classParamDocString) + "\n" + } + + def pysparkWrapperBuilder(): String = { + getPysparkWrapperBase + } + + def writeWrapperToFile(dir: File): Unit = { + writeFile(new File(dir, entryPointName + ".py"), pysparkWrapperBuilder()) + } +} + +class SparkTransformerWrapper(entryPoint: Transformer, + entryPointName: String, + entryPointQualifiedName: String) + extends PySparkWrapper(entryPoint, + entryPointName, + entryPointQualifiedName) { + + override val psType = "Transformer" +} + +class SparkEstimatorWrapper(entryPoint: Estimator[_], + entryPointName: String, + entryPointQualifiedName: String, + companionModelName: String, + companionModelQualifiedName: String) + extends PySparkWrapper(entryPoint, + entryPointName, + entryPointQualifiedName) { + + private val createModelStringTemplate = Seq( + " def _create_model(self, java_model):", + " return %1$s(java_model)", + "").mkString("\n") + + private val modelClassString = Seq( + "class %1$s(JavaModel, JavaMLWritable, JavaMLReadable):", + " \"\"\"", + " Model fitted by :class:`%2$s`.", + " This class is left empty on purpose.", + " All necessary methods are exposed through inheritance.", + " \"\"\"", + "").mkString("\n") + + override def pysparkWrapperBuilder(): String = { + Seq(super.pysparkWrapperBuilder, + createModelStringTemplate.format(companionModelName), + modelClassString.format(companionModelName, entryPointName), + saveLoadTemplate.mkString("\n").format(companionModelQualifiedName, companionModelName), + "").mkString("\n") + } + + override val psType = "Estimator" + +} diff --git 
a/src/codegen/src/main/scala/PySparkWrapperGenerator.scala b/src/codegen/src/main/scala/PySparkWrapperGenerator.scala new file mode 100644 index 0000000000..e4116aee36 --- /dev/null +++ b/src/codegen/src/main/scala/PySparkWrapperGenerator.scala @@ -0,0 +1,123 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import collection.JavaConverters._ +import java.io.File +import java.lang.reflect.{Type, ParameterizedType} +import java.util.jar._ + +import scala.reflect.internal.util.ScalaClassLoader.URLClassLoader +import org.apache.spark.ml.{Estimator, Transformer} + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +import scala.language.existentials +import com.microsoft.ml.spark.InternalWrapper +import scala.reflect.runtime.universe._ + +object PySparkWrapperGenerator { + + // check if the class is annotated with InternalWrapper + private[spark] def needsInternalWrapper(myClass: Class[_]):Boolean = { + val typ: ClassSymbol = runtimeMirror(myClass.getClassLoader).classSymbol(myClass) + typ.annotations.exists(a => a.tree.tpe =:= typeOf[InternalWrapper]) + } + + private[spark] def pyWrapperName(myClass: Class[_]):String = { + val prefix = if(needsInternalWrapper(myClass)) internalPrefix else "" + prefix + myClass.getSimpleName + } + + def writeWrappersToFile(myClass: Class[_], qualifiedClassName: String): Unit = { + try { + val classInstance = myClass.newInstance() + + val (wrapper: PySparkWrapper, wrapperTests: PySparkWrapperTest) = + classInstance match { + case t: Transformer => + val className = pyWrapperName(myClass) + (new SparkTransformerWrapper(t, className, qualifiedClassName), + new SparkTransformerWrapperTest(t, className, qualifiedClassName)) + case e: Estimator[_] => + var sc = myClass + while(!Seq("Estimator", "Predictor").contains(sc.getSuperclass.getSimpleName)) { + sc = sc.getSuperclass + } + val typeArgs = sc.getGenericSuperclass.asInstanceOf[ParameterizedType] + .getActualTypeArguments + val getModelFromGenericType = (modelType: Type) => { + val modelClass = modelType.getTypeName.split("<").head + (modelClass.split("\\.").last, modelClass) + } + val (modelClass, modelQualifiedClass) = sc.getSuperclass.getSimpleName match { + case "Estimator" => + getModelFromGenericType(typeArgs.head) + case "Predictor" => + getModelFromGenericType(typeArgs(2)) + } + + val className = pyWrapperName(myClass) + (new SparkEstimatorWrapper(e, + className, + qualifiedClassName, + modelClass, + modelQualifiedClass), + new SparkEstimatorWrapperTest(e, + className, + qualifiedClassName, + modelClass, + modelQualifiedClass)) + case _ => return + } + wrapper.writeWrapperToFile(toZipDir) + wrapperTests.writeWrapperToFile(pyTestDir) + if (debugMode) println(s"Generated wrapper for class ${myClass.getSimpleName}") + } catch { + // Classes without default constructor + case ie: InstantiationException => + if (debugMode) println(s"Could not generate wrapper for class ${myClass.getSimpleName}: $ie") + // Classes with "private" modifiers on constructors + case iae: IllegalAccessException => + if (debugMode) println(s"Could not generate wrapper for class ${myClass.getSimpleName}: $iae") + // Classes that require runtime library loading + case ule: UnsatisfiedLinkError => + if (debugMode) println(s"Could not generate wrapper for class ${myClass.getSimpleName}: $ule") + case e: Exception => + println(s"Could not generate wrapper for class 
${myClass.getSimpleName}: ${e.printStackTrace}") + } + } + + def getWrappersFromJarFile(jarFilePath: String, cl2: URLClassLoader): Unit = { + val cld = new URLClassLoader(Array(new File(jarFilePath).toURI.toURL), cl2) + val jfd = new JarFile(jarFilePath) + + using(Seq(cld, jfd)) { s => + val cl = s(0).asInstanceOf[URLClassLoader] + val jarFile = s(1).asInstanceOf[JarFile] + val _ = jarFile.entries.asScala + .filter(e => e.getName.endsWith(".class")) + .map(e => e.getName.replace("/", ".").stripSuffix(".class")) + .filter(q => { + val clazz = cl.loadClass(q) + try { + clazz.getEnclosingClass == null + } catch { + case _: java.lang.NoClassDefFoundError => false + } + }) + .foreach(q => writeWrappersToFile(cl.loadClass(q), q)) + }.get + } + + def apply(): Unit = { + val jarFiles = outputDir.listFiles.filter(_.getName.endsWith(".jar")) + val jarUrls = jarFiles.map(_.toURI.toURL) + using(Seq(new URLClassLoader(jarUrls, this.getClass.getClassLoader))) { s => + jarFiles.foreach(f => getWrappersFromJarFile(f.getAbsolutePath, s(0))) + }.get + } + +} diff --git a/src/codegen/src/main/scala/PySparkWrapperTest.scala b/src/codegen/src/main/scala/PySparkWrapperTest.scala new file mode 100644 index 0000000000..54cf23ec21 --- /dev/null +++ b/src/codegen/src/main/scala/PySparkWrapperTest.scala @@ -0,0 +1,275 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.codegen + +import org.apache.commons.lang3.StringUtils +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.ml.PipelineStage +import org.apache.spark.ml.param.Param + +import com.microsoft.ml.spark.FileUtilities._ +import Config._ + +/** + * :: DeveloperApi :: + * Abstraction for PySpark wrapper generators. 
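+ * Unlike PySparkWrapper, the subclasses here emit the Python unittest files that exercise each
+ * generated wrapper: default construction via setParams, the parameter getters/setters, and,
+ * where possible, a fit or transform call on a small in-memory DataFrame.
+ * For a hypothetical wrapper named MyTransform the generated file contains, roughly,
+ * {{{
+ *   class MyTransformTest(unittest.TestCase):
+ *       def test_MyTransformAllDefaults(self):
+ *           myMyTransform = MyTransform()
+ *           myMyTransform.setParams(inputCol=None)
+ *           self.assertNotEqual(myMyTransform, None)
+ * }}}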
+ */ +abstract class PySparkWrapperTest(entryPoint: PipelineStage, + entryPointName: String, + entryPointQualifiedName: String) { +
+ // general classes are imported from the mmlspark package directly; + // internal classes have to be imported from their packages + private def importClass(entryPointName:String):String = { + if(entryPointName startsWith internalPrefix) s"from mmlspark.$entryPointName import $entryPointName" + else s"from mmlspark import $entryPointName" + } +
+ protected def classTemplate(classParams: String, paramGettersAndSetters: String) = + ( + s"""| + | + |import unittest + |import pandas as pd + |import numpy as np + |import pyspark.ml, pyspark.ml.feature + |from pyspark import SparkContext + |from pyspark.sql import SQLContext + |from pyspark.ml.classification import LogisticRegression + |from pyspark.ml.regression import LinearRegression + |""" + importClass(entryPointName) + + s""" + |from pyspark.ml.feature import Tokenizer + |from mmlspark import TrainClassifier + | + |sc = SparkContext() + | + |class ${entryPointName}Test(unittest.TestCase): + | + | def test_${entryPointName}AllDefaults(self): + | my$entryPointName = $entryPointName() + | my$entryPointName.setParams($classParams) + | self.assertNotEqual(my$entryPointName, None) + | + |$paramGettersAndSetters + | + |""").stripMargin +
+ protected val unittestString = + s"""| + |import os, xmlrunner + |if __name__ == "__main__": + | result = unittest.main(testRunner=xmlrunner.XMLTestRunner(output=os.getenv("TEST_RESULTS","TestResults")), + | failfast=False, buffer=False, catchbreak=False) + |""".stripMargin +
+ protected def setAndGetTemplate(paramName: String, value: String) = + s"""| def test_set$paramName(self): + | my$entryPointName = $entryPointName() + | val = $value + | my$entryPointName.set$paramName(val) + | retVal = my$entryPointName.get$paramName() + | self.assertEqual(val, retVal) + |""".stripMargin +
+ protected def tryFitSetupTemplate(entryPointName: String) = + s"""| def test_$entryPointName(self): + | dog = "dog" + | cat = "cat" + | bird = "bird" + | tmp1 = { + | "col1": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + | "col2": [2, 3, 4, 5, 1, 3, 3, 4, 0, 2, 3, 4], + | "col3": [0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12], + | "col4": [0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34], + | "col5": [dog, cat, dog, cat, dog, bird, dog, cat, dog, bird, dog, cat], + | "col6": [cat, dog, bird, dog, bird, dog, cat, dog, cat, dog, bird, dog], + | "image": [cat, dog, bird, dog, bird, dog, cat, dog, cat, dog, bird, dog] + | } + | sqlC = SQLContext(sc) + | pddf = pd.DataFrame(tmp1) + | pddf["col1"] = pddf["col1"].astype(np.float64) + | pddf["col2"] = pddf["col2"].astype(np.int32) + | data = sqlC.createDataFrame(pddf) + |""".stripMargin +
+ protected def tryTransformTemplate(entryPointName: String, param: String) = + s"""| my$entryPointName = $entryPointName($param) + | prediction = my$entryPointName.transform(data) + | self.assertNotEqual(prediction, None) + |""".stripMargin +
+ protected def tryFitTemplate(entryPointName: String, model: String) = + s"""| my$entryPointName = $entryPointName(model=$model, labelCol="col1", numFeatures=5) + | model = my$entryPointName.fit(data) + | self.assertNotEqual(model, None)""".stripMargin +
+ private def evaluateSetupTemplate(entryPointName: String) = + s"""| def test_$entryPointName(self): + | data = { + | "labelColumn": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1], + | "col1": [2, 3, 4, 5, 1, 3, 3, 4, 0, 2, 3, 4], + | "col2": [0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12, 0.50, 0.40, 0.78, 0.12], + | "col3": [0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34, 0.60, 0.50, 0.99, 0.34], + | "col4": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3] + | } + | sqlC = SQLContext(sc) + | pddf = pd.DataFrame(data) + | data = sqlC.createDataFrame(pddf) + | model = TrainClassifier(model=LogisticRegression(), labelCol="labelColumn", + | numFeatures=256).fit(data) + |""".stripMargin +
+ protected def computeStatisticsTemplate(entryPointName: String) = + s"""|${evaluateSetupTemplate(entryPointName)} + | scoredData = model.transform(data) + | scoredData.limit(10).toPandas() + | evaluatedData = $entryPointName().transform(scoredData) + | self.assertNotEqual(evaluatedData, None) + |""".stripMargin +
+ protected def evaluateTemplate(entryPointName: String) = + s"""|${evaluateSetupTemplate(entryPointName)} + | model = TrainClassifier(model=LogisticRegression(), labelCol="labelColumn", + | numFeatures=256).fit(data) + | evaluateModels = FindBestModel(models=[model, model]).fit(data) + | bestModel = evaluateModels.transform(data) + | self.assertNotEqual(bestModel, None) + |""".stripMargin +
+ // These params need custom handling. For now, just skip them so we have tests that pass. + private lazy val skippedParams = Set[String]("models") + protected def isSkippedParam(paramName: String): Boolean = skippedParams.contains(paramName) + protected def isModel(paramName: String): Boolean = paramName.toLowerCase() == "model" + protected def isBaseTransformer(paramName: String): Boolean = paramName.toLowerCase() == "basetransformer" + protected def tryFitString(entryPointName: String): String = + if (entryPointName.contains("Regressor")) + tryFitTemplate(entryPointName, "LinearRegression(solver=\"l-bfgs\")") + else if (entryPointName.contains("Classifier")) + tryFitTemplate(entryPointName, "LogisticRegression()") + else "" + protected def computeStatisticsString(entryPointName: String): String = computeStatisticsTemplate(entryPointName) + protected def evaluateString(entryPointName: String): String = evaluateTemplate(entryPointName) + protected def tryTransformString(entryPointName: String): String = { + val param: String = + entryPointName match { + case "WriteBlob" => "blobPath=\"file:///tmp/" + java.util.UUID.randomUUID + ".tsv\"" + case "MultiColumnAdapter" => + "baseTransformer=Tokenizer(), inputCols = \"col5,col6\", outputCols = \"output1,output2\"\n " + case "DataConversion" => "col=\"col1\", convertTo=\"double\"" + case "FastVectorAssembler" => "inputCols=\"col1\"" + case "MultiNGram" => "inputColumns=np.array([ \"col5\", \"col6\" ])" + case "SelectColumns" => "cols=[\"col1\"]" + case "ImageFeaturizer" => "modelSaveDir=\"file:///tmp\"" + case "Repartition" => "n=2" + case "_CNTKModel" | "MultiTokenizer" | "NltTokenizeTransform" | "TextTransform" + | "TextNormalizerTransform" | "WordTokenizeTransform" => "inputCol=\"col5\"" + case _ => "" + } + tryTransformTemplate(entryPointName, param) + } +
+ protected def getPythonizedDefault(paramDefault: String, paramType: String, + defaultStringIsParsable: Boolean): String = + paramType match { + case "BooleanParam" => + StringUtils.capitalize(paramDefault) + case "DoubleParam" | "FloatParam" | "IntParam" | "LongParam" => + paramDefault + case x if x == "Param" || defaultStringIsParsable => + "\"" + paramDefault + "\"" + case _ => + "None" + } +
+ protected def getParamDefault(param: Param[_]): (String, String) = { + if (!entryPoint.hasDefault(param)) ("None", null) + else { + val paramParent: String = param.parent + val
paramDefault = entryPoint.getDefault(param).get.toString + if (paramDefault.toLowerCase.contains(paramParent.toLowerCase)) + ("None", + paramDefault.substring(paramDefault.lastIndexOf(paramParent) + paramParent.length)) + else { + val defaultStringIsParsable: Boolean = + try { + entryPoint.getParam(param.name).w(paramDefault) + true + } catch { + case e: Exception => false + } + (getPythonizedDefault(paramDefault, param.getClass.getSimpleName, defaultStringIsParsable), + null) + } + } + } + + protected def getPysparkWrapperTestBase: String = { + // Iterate over the params to build strings + val paramGettersAndSettersString = + entryPoint.params.filter { param => !isSkippedParam(param.name) + }.map { param => + val value = if (isModel(param.name)) "LogisticRegression()" + else if (isBaseTransformer(param.name)) "Tokenizer()" + else getParamDefault(param)._1 + setAndGetTemplate(StringUtils.capitalize(param.name), value) + }.mkString("\n") + val classParamsString = + entryPoint.params.map(param => param.name + "=" + getParamDefault(param)._1).mkString(", ") + classTemplate(classParamsString, paramGettersAndSettersString) + } + + def pysparkWrapperTestBuilder(): String = { + copyrightLines.mkString("\n") + getPysparkWrapperTestBase + } + + def writeWrapperToFile(dir: File): Unit = { + writeFile(new File(dir, entryPointName + "_tests.py"), pysparkWrapperTestBuilder()) + } + +} + +class SparkTransformerWrapperTest(entryPoint: Transformer, + entryPointName: String, + entryPointQualifiedName: String) + extends PySparkWrapperTest(entryPoint, + entryPointName, + entryPointQualifiedName) { + + // The transformer tests for FastVectorAssembler ... UnrollImage are disabled for the moment. + override def pysparkWrapperTestBuilder(): String = { + val transformTest = + entryPointName match { + case "ComputeModelStatistics" => computeStatisticsString(entryPointName) + case "ComputePerInstanceStatistics" => computeStatisticsString(entryPointName) + case "_CNTKModel" | "FastVectorAssembler" | "MultiNGram" | "ImageFeaturizer" + | "_ImageTransformer" | "UnrollImage" | "HashTransform" | "StopWordsRemoverTransform" + => "" + case _ => + tryFitSetupTemplate(entryPointName) + tryTransformString(entryPointName) + } + super.pysparkWrapperTestBuilder + transformTest + unittestString + } + +} + +class SparkEstimatorWrapperTest(entryPoint: Estimator[_], + entryPointName: String, + entryPointQualifiedName: String, + companionModelName: String, + companionModelQualifiedName: String) + extends PySparkWrapperTest(entryPoint, entryPointName, entryPointQualifiedName) { + + private val modelName = entryPointName + "Model" + + override def pysparkWrapperTestBuilder(): String = { + val testString = + if (entryPointName == "FindBestModel") + evaluateString(entryPointName) + else + tryFitSetupTemplate(entryPointName) + tryFitString(entryPointName) + super.pysparkWrapperTestBuilder + testString + unittestString + } + +} diff --git a/src/compute-model-statistics/build.sbt b/src/compute-model-statistics/build.sbt new file mode 100644 index 0000000000..1ddf71d75d --- /dev/null +++ b/src/compute-model-statistics/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: train-regressor +//> DependsOn: train-classifier diff --git a/src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala b/src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala new file mode 100644 index 0000000000..7d958abea8 --- /dev/null +++ b/src/compute-model-statistics/src/main/scala/ComputeModelStatistics.scala @@ -0,0 +1,559 
@@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.contracts.MetricData +import com.microsoft.ml.spark.schema.SchemaConstants._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics, RegressionMetrics} +import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.mllib.linalg.{Matrices, Matrix} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.log4j.Logger + +/** + * Contains constants used by Compute Model Statistics. + */ +object ComputeModelStatistics extends DefaultParamsReadable[ComputeModelStatistics] { + // Regression metrics + val MseSparkMetric = "mse" + val RmseSparkMetric = "rmse" + val R2SparkMetric = "r2" + val MaeSparkMetric = "mae" + + // Binary Classification metrics + val AreaUnderROCMetric = "areaUnderROC" + val AucSparkMetric = "AUC" + val AccuracySparkMetric = "accuracy" + val PrecisionSparkMetric = "precision" + val RecallSparkMetric = "recall" + val AllSparkMetrics = "all" + + // Regression column names + val MseColumnName = "mean_squared_error" + val RmseColumnName = "root_mean_squared_error" + val R2ColumnName = "R^2" + val MaeColumnName = "mean_absolute_error" + + // Binary Classification column names + val AucColumnName = "AUC" + + // Binary and Multiclass (micro-averaged) column names + val PrecisionColumnName = "precision" + val RecallColumnName = "recall" + val AccuracyColumnName = "accuracy" + + // Multiclass Classification column names + val AverageAccuracy = "average_accuracy" + val MacroAveragedRecall = "macro_averaged_recall" + val MacroAveragedPrecision = "macro_averaged_precision" + + // Metric to column name + val metricToColumnName = Map(AccuracySparkMetric -> AccuracyColumnName, + PrecisionSparkMetric -> PrecisionColumnName, + RecallSparkMetric -> RecallColumnName, + MseSparkMetric -> MseColumnName, + RmseSparkMetric -> RmseColumnName, + R2SparkMetric -> R2ColumnName, + MaeSparkMetric -> MaeColumnName) + + val classificationColumns = List(AccuracyColumnName, PrecisionColumnName, RecallColumnName) + + val regressionColumns = List(MseColumnName, RmseColumnName, R2ColumnName, MaeColumnName) + + val ClassificationEvaluationType = "Classification" + val EvaluationType = "evaluation_type" + + val FpRateROCColumnName = "false_positive_rate" + val TpRateROCColumnName = "true_positive_rate" + + val FpRateROCLog = "fpr" + val TpRateROCLog = "tpr" + + val BinningThreshold = 1000 +} + +/** + * Evaluates the given scored dataset. + */ +class ComputeModelStatistics(override val uid: String) extends Transformer with MMLParams { + + def this() = this(Identifiable.randomUID("ComputeModelStatistics")) + + val evaluationMetric: Param[String] = StringParam(this, "evaluationMetric", "Metric to evaluate models with", "all") + + def getEvaluationMetric: String = $(evaluationMetric) + + /** @group setParam **/ + def setEvaluationMetric(value: String): this.type = set(evaluationMetric, value) + + lazy val logger = Logger.getLogger(this.getClass.getName) + + /** + * The ROC curve evaluated for a binary classifier. 
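+ * Populated as a side effect of transform() when the AUC metric is computed; each row is a
+ * (false_positive_rate, true_positive_rate) point taken from BinaryClassificationMetrics.roc().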
+ */ + var rocCurve: DataFrame = null + + override def transform(dataset: Dataset[_]): DataFrame = { + val (modelName, labelColumnName, scoreValueKind) = getSchemaInfo(dataset.schema) + + // For creating the result dataframe in classification or regression case + val spark = dataset.sparkSession + import spark.implicits._ + + if (scoreValueKind == SchemaConstants.ClassificationKind) { + + var resultDF: DataFrame = Seq(ComputeModelStatistics.ClassificationEvaluationType) + .toDF(ComputeModelStatistics.EvaluationType) + val scoredLabelsColumnName = SparkSchema.getScoredLabelsColumnName(dataset.schema, modelName) + + // Get levels for label column if categorical + val levels = CategoricalUtilities.getLevels(dataset.schema, labelColumnName) + + val levelsExist = levels.isDefined + + lazy val levelsToIndexMap: Map[Any, Double] = getLevelsToIndexMap(levels.get) + + lazy val predictionAndLabels = if (levelsExist) { + getPredictionAndLabels(dataset, labelColumnName, scoredLabelsColumnName, levelsToIndexMap) + } else { + selectAndCastToRDD(dataset, scoredLabelsColumnName, labelColumnName) + } + + lazy val scoresAndLabels = { + val scoresColumnName = SparkSchema.getScoresColumnName(dataset.schema, modelName) + if (scoresColumnName != null) { + if (levelsExist) { + getScoresAndLabels(dataset, labelColumnName, scoresColumnName, levelsToIndexMap) + } else { + getScalarScoresAndLabels(dataset, labelColumnName, scoresColumnName) + } + } else { + predictionAndLabels + } + } + + lazy val (labels: Array[Double], confusionMatrix: Matrix) = createConfusionMatrix(predictionAndLabels) + + // If levels exist, use the extra information they give to get better performance + getEvaluationMetric match { + case ComputeModelStatistics.AllSparkMetrics => { + resultDF = addConfusionMatrixToResult(labels, confusionMatrix, resultDF) + resultDF = addAllClassificationMetrics(modelName, + dataset, + labelColumnName, + predictionAndLabels, + confusionMatrix, + scoresAndLabels, + resultDF) + } + case simpleMetric if simpleMetric == ComputeModelStatistics.AccuracySparkMetric || + simpleMetric == ComputeModelStatistics.PrecisionSparkMetric || + simpleMetric == ComputeModelStatistics.RecallSparkMetric => { + resultDF = addSimpleMetric(simpleMetric, predictionAndLabels, resultDF) + } + case ComputeModelStatistics.AucSparkMetric => { + val numLevels = if (levelsExist) { + levels.get.length + } else { + confusionMatrix.numRows + } + if (numLevels <= 2) { + // Add the AUC + val auc: Double = getAUC(modelName, dataset, labelColumnName, scoresAndLabels) + resultDF = resultDF.withColumn(ComputeModelStatistics.AucColumnName, lit(auc)) + } else { + throw new Exception("Error: AUC is not available for multiclass case") + } + } + case default => { + throw new Exception(s"Error: $default is not a classification metric") + } + } + resultDF + } else if (scoreValueKind == SchemaConstants.RegressionKind) { + val scoresColumnName = SparkSchema.getScoresColumnName(dataset.schema, modelName) + + val scoresAndLabels = selectAndCastToRDD(dataset, scoresColumnName, labelColumnName) + + val regressionMetrics = new RegressionMetrics(scoresAndLabels) + + // get all spark metrics possible: "mse", "rmse", "r2", "mae" + val mse = regressionMetrics.meanSquaredError + val rmse = regressionMetrics.rootMeanSquaredError + val r2 = regressionMetrics.r2 + val mae = regressionMetrics.meanAbsoluteError + + logRegressionMetrics(mse, rmse, r2, mae) + + Seq((mse, rmse, r2, mae)).toDF(ComputeModelStatistics.MseColumnName, + ComputeModelStatistics.RmseColumnName, + 
ComputeModelStatistics.R2ColumnName, + ComputeModelStatistics.MaeColumnName) + } else { + throwOnInvalidScoringKind(scoreValueKind) + } + } + + private def getSchemaInfo(schema: StructType): (String, String, String) = { + // TODO: evaluate all models; for now, get first model name found + val firstModelName = schema.collectFirst { + case StructField(c, t, _, m) if (getFirstModelName(m) != null && !getFirstModelName(m).isEmpty) => { + getFirstModelName(m).get + } + } + val modelName = if (!firstModelName.isEmpty) firstModelName.get + else throw new Exception("Please score the model prior to evaluating") + val labelColumnName = SparkSchema.getLabelColumnName(schema, modelName) + + val scoreValueKind = SparkSchema.getScoreValueKind(schema, modelName, labelColumnName) + (modelName, labelColumnName, scoreValueKind) + } + + private def addSimpleMetric(simpleMetric: String, + predictionAndLabels: RDD[(Double, Double)], + resultDF: DataFrame): DataFrame = { + var newResultDF = resultDF + val (labels: Array[Double], confusionMatrix: Matrix) = createConfusionMatrix(predictionAndLabels) + // Compute metrics for binary classification + if (confusionMatrix.numCols == 2) { + val (accuracy: Double, precision: Double, recall: Double) = + getBinaryAccuracyPrecisionRecall(confusionMatrix) + // Add the metrics to the DF + if (simpleMetric == ComputeModelStatistics.AccuracySparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(accuracy)) + } else if (simpleMetric == ComputeModelStatistics.PrecisionSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.PrecisionColumnName, lit(precision)) + } else if (simpleMetric == ComputeModelStatistics.RecallSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.RecallColumnName, lit(recall)) + } + logClassificationMetrics(accuracy, precision, recall) + } else { + val (microAvgAccuracy: Double, microAvgPrecision: Double, microAvgRecall: Double, _, _, _) = + getMulticlassMetrics(predictionAndLabels, confusionMatrix) + + // Add the metrics to the DF + if (simpleMetric == ComputeModelStatistics.AccuracySparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(microAvgAccuracy)) + } else if (simpleMetric == ComputeModelStatistics.PrecisionSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.PrecisionColumnName, lit(microAvgPrecision)) + } else if (simpleMetric == ComputeModelStatistics.RecallSparkMetric) { + newResultDF = newResultDF.withColumn(ComputeModelStatistics.RecallColumnName, lit(microAvgRecall)) + } + logClassificationMetrics(microAvgAccuracy, microAvgPrecision, microAvgRecall) + } + newResultDF + } + + private def addAllClassificationMetrics(modelName: String, + dataset: Dataset[_], + labelColumnName: String, + predictionAndLabels: RDD[(Double, Double)], + confusionMatrix: Matrix, + scoresAndLabels: RDD[(Double, Double)], + resultDF: DataFrame): DataFrame = { + var newResultDF = resultDF + // Compute metrics for binary classification + if (confusionMatrix.numCols == 2) { + val (accuracy: Double, precision: Double, recall: Double) = getBinaryAccuracyPrecisionRecall(confusionMatrix) + // Add the metrics to the DF + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(accuracy)) + .withColumn(ComputeModelStatistics.PrecisionColumnName, lit(precision)) + .withColumn(ComputeModelStatistics.RecallColumnName, lit(recall)) + + logClassificationMetrics(accuracy, precision, recall) + + // 
Add the AUC + val auc: Double = getAUC(modelName, dataset, labelColumnName, scoresAndLabels) + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AucColumnName, lit(auc)) + + logAUC(auc) + } else { + val (microAvgAccuracy: Double, + microAvgPrecision: Double, + microAvgRecall: Double, + averageAccuracy: Double, + macroAveragedPrecision: Double, + macroAveragedRecall: Double) = getMulticlassMetrics(predictionAndLabels, confusionMatrix) + + newResultDF = newResultDF.withColumn(ComputeModelStatistics.AccuracyColumnName, lit(microAvgAccuracy)) + .withColumn(ComputeModelStatistics.PrecisionColumnName, lit(microAvgPrecision)) + .withColumn(ComputeModelStatistics.RecallColumnName, lit(microAvgRecall)) + .withColumn(ComputeModelStatistics.AverageAccuracy, lit(averageAccuracy)) + .withColumn(ComputeModelStatistics.MacroAveragedPrecision, lit(macroAveragedPrecision)) + .withColumn(ComputeModelStatistics.MacroAveragedRecall, lit(macroAveragedRecall)) + + logClassificationMetrics(microAvgAccuracy, microAvgPrecision, microAvgRecall) + + } + newResultDF + } + + private def addConfusionMatrixToResult(labels: Array[Double], + confusionMatrix: Matrix, + resultDF: DataFrame): DataFrame = { + var resultDFModified = resultDF + for (col: Int <- 0 until confusionMatrix.numCols; + row: Int <- 0 until confusionMatrix.numRows) { + resultDFModified = resultDFModified + .withColumn(s"predicted_class_as_${labels(col).toString}_actual_is_${labels(row).toString}", + lit(confusionMatrix(row, col))) + } + resultDFModified + } + + private def selectAndCastToDF(dataset: Dataset[_], + predictionColumnName: String, + labelColumnName: String): DataFrame = { + dataset.select(col(predictionColumnName), col(labelColumnName).cast(DoubleType)) + .na + .drop(Array(predictionColumnName, labelColumnName)) + } + + private def selectAndCastToRDD(dataset: Dataset[_], + predictionColumnName: String, + labelColumnName: String): RDD[(Double, Double)] = { + selectAndCastToDF(dataset, predictionColumnName, labelColumnName) + .rdd + .map { + case Row(prediction: Double, label: Double) => (prediction, label) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getPredictionAndLabels(dataset: Dataset[_], + labelColumnName: String, + scoredLabelsColumnName: String, + levelsToIndexMap: Map[Any, Double]): RDD[(Double, Double)] = { + // Calculate confusion matrix and output it as DataFrame + dataset.select(col(scoredLabelsColumnName), col(labelColumnName)) + .na + .drop(Array(scoredLabelsColumnName, labelColumnName)) + .rdd + .map { + case Row(prediction: Double, label) => (prediction, levelsToIndexMap(label)) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getScalarScoresAndLabels(dataset: Dataset[_], + labelColumnName: String, + scoresColumnName: String): RDD[(Double, Double)] = { + selectAndCastToDF(dataset, scoresColumnName, labelColumnName) + .rdd + .map { + case Row(prediction: Vector, label: Double) => (prediction(1), label) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getScoresAndLabels(dataset: Dataset[_], + labelColumnName: String, + scoresColumnName: String, + levelsToIndexMap: Map[Any, Double]): RDD[(Double, Double)] = { + dataset.select(col(scoresColumnName), col(labelColumnName)) + .na + .drop(Array(scoresColumnName, labelColumnName)) + .rdd + .map { + case Row(prediction: Vector, label) => (prediction(1), 
levelsToIndexMap(label)) + case default => throw new Exception(s"Error: prediction and label columns invalid or missing") + } + } + + private def getLevelsToIndexMap(levels: Array[_]): Map[Any, Double] = { + levels.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + } + + private def getMulticlassMetrics(predictionAndLabels: RDD[(Double, Double)], + confusionMatrix: Matrix): (Double, Double, Double, Double, Double, Double) = { + // Compute multiclass metrics based on paper "A systematic analysis of performance measure for classification + // tasks", Sokolova and Lapalme + var tpSum: Double = 0.0 + for (diag: Int <- 0 until confusionMatrix.numCols) { + tpSum += confusionMatrix(diag, diag) + } + val totalSum = predictionAndLabels.count() + + val microAvgAccuracy = tpSum / totalSum + val microAvgPrecision = microAvgAccuracy + val microAvgRecall = microAvgAccuracy + + // Compute class counts - these are the row and column sums of the matrix, used to calculate the + // average accuracy, macro averaged precision and macro averaged recall + val actualClassCounts = new Array[Double](confusionMatrix.numCols) + val predictedClassCounts = new Array[Double](confusionMatrix.numRows) + val truePositives = new Array[Double](confusionMatrix.numRows) + for (rowIndex: Int <- 0 until confusionMatrix.numRows) { + for (colIndex: Int <- 0 until confusionMatrix.numCols) { + actualClassCounts(rowIndex) += confusionMatrix(rowIndex, colIndex) + predictedClassCounts(colIndex) += confusionMatrix(rowIndex, colIndex) + + if (rowIndex == colIndex) { + truePositives(rowIndex) += confusionMatrix(rowIndex, colIndex) + } + } + } + + var totalAccuracy = 0.0 + var totalPrecision = 0.0 + var totalRecall = 0.0 + for (classIndex: Int <- 0 until confusionMatrix.numCols) { + // compute the class accuracy as: + // (true positive + true negative) / total => + // (true positive + (total - (actual + predicted - true positive))) / total => + // 2 * true positive + (total - (actual + predicted)) / total + totalAccuracy += (2 * truePositives(classIndex) + + (totalSum - (actualClassCounts(classIndex) + predictedClassCounts(classIndex)))) / totalSum + + // compute the class precision as: + // true positive / predicted as positive (=> tp + fp) + totalPrecision += truePositives(classIndex) / predictedClassCounts(classIndex) + + // compute the class recall as: + // true positive / actual positive (=> tp + fn) + totalRecall += truePositives(classIndex) / actualClassCounts(classIndex) + } + + val averageAccuracy = totalAccuracy / confusionMatrix.numCols + val macroAveragedPrecision = totalPrecision / confusionMatrix.numCols + val macroAveragedRecall = totalRecall / confusionMatrix.numCols + (microAvgAccuracy, microAvgPrecision, microAvgRecall, averageAccuracy, macroAveragedPrecision, macroAveragedRecall) + } + + private def getAUC(modelName: String, + dataset: Dataset[_], + labelColumnName: String, + scoresAndLabels: RDD[(Double, Double)]): Double = { + val binaryMetrics = new BinaryClassificationMetrics(scoresAndLabels, + ComputeModelStatistics.BinningThreshold) + + val spark = dataset.sparkSession + import spark.implicits._ + + rocCurve = binaryMetrics.roc() + .toDF(ComputeModelStatistics.FpRateROCColumnName, ComputeModelStatistics.TpRateROCColumnName) + logROC(rocCurve) + val auc = binaryMetrics.areaUnderROC() + logAUC(auc) + auc + } + + private def getBinaryAccuracyPrecisionRecall(confusionMatrix: Matrix): (Double, Double, Double) = { + val TP: Double = confusionMatrix(1, 1) + val FP: Double = confusionMatrix(0, 1) + val TN: Double = 
confusionMatrix(0, 0) + val FN: Double = confusionMatrix(1, 0) + + val accuracy: Double = (TP + TN) / (TP + TN + FP + FN) + val precision: Double = TP / (TP + FP) + val recall: Double = TP / (TP + FN) + (accuracy, precision, recall) + } + + private def createConfusionMatrix(predictionAndLabels: RDD[(Double, Double)]): (Array[Double], Matrix) = { + val metrics = new MulticlassMetrics(predictionAndLabels) + var labels = metrics.labels + var confusionMatrix = metrics.confusionMatrix + + val numCols = confusionMatrix.numCols + val numRows = confusionMatrix.numRows + + // Reformat the confusion matrix if less than binary size + if (numCols < 2 && numRows < 2) { + val values = Array.ofDim[Double](2 * 2) + for (col: Int <- 0 until confusionMatrix.numCols; + row: Int <- 0 until confusionMatrix.numRows) { + // We need to interpret the actual label value + val colLabel = if (labels(col) > 0) 1 else 0 + val rowLabel = if (labels(row) > 0) 1 else 0 + values(colLabel + rowLabel * 2) = + confusionMatrix(row, col) + } + confusionMatrix = Matrices.dense(2, 2, values) + labels = Array(0, 1) + } + (labels, confusionMatrix) + } + + private def logClassificationMetrics(accuracy: Double, precision: Double, recall: Double): Unit = { + val metrics = MetricData.create(Map(ComputeModelStatistics.AccuracyColumnName -> accuracy, + ComputeModelStatistics.PrecisionColumnName -> precision, + ComputeModelStatistics.RecallColumnName -> recall), "Classification Metrics", uid) + logger.info(metrics) + } + + private def logRegressionMetrics(mse: Double, rmse: Double, r2: Double, mae: Double): Unit = { + val metrics = MetricData.create(Map(ComputeModelStatistics.MseColumnName -> mse, + ComputeModelStatistics.RmseColumnName -> rmse, + ComputeModelStatistics.R2ColumnName -> r2, + ComputeModelStatistics.MaeColumnName -> mae), "Regression Metrics", uid) + logger.info(metrics) + } + + private def logAUC(auc: Double): Unit = { + val metrics = MetricData.create(Map(ComputeModelStatistics.AucColumnName -> auc), "AUC Metric", uid) + logger.info(metrics) + } + + private def logROC(roc: DataFrame): Unit = { + val metrics = MetricData.createTable( + Map( + ComputeModelStatistics.TpRateROCLog -> + roc.select(ComputeModelStatistics.TpRateROCColumnName) + .collect() + .map(row => row(0).asInstanceOf[Double]).toSeq, + ComputeModelStatistics.FpRateROCLog -> + roc.select(ComputeModelStatistics.FpRateROCColumnName) + .collect() + .map(row => row(0).asInstanceOf[Double]).toSeq + ), + "ROC Metric", + uid) + logger.info(metrics) + } + + private def getFirstModelName(colMetadata: Metadata): Option[String] = { + if (!colMetadata.contains(MMLTag)) null + else { + val mlTagMetadata = colMetadata.getMetadata(MMLTag) + val metadataKeys = MetadataUtilities.getMetadataKeys(mlTagMetadata) + metadataKeys.find(key => key.startsWith(SchemaConstants.ScoreModelPrefix)) + } + } + + override def copy(extra: ParamMap): Transformer = new ComputeModelStatistics() + + override def transformSchema(schema: StructType): StructType = { + val (_, _, scoreValueKind) = getSchemaInfo(schema) + val columns = + if (scoreValueKind == SchemaConstants.ClassificationKind) ComputeModelStatistics.classificationColumns + else if (scoreValueKind == SchemaConstants.RegressionKind) ComputeModelStatistics.regressionColumns + else throwOnInvalidScoringKind(scoreValueKind) + getTransformedSchema(columns, scoreValueKind) + + } + + private def throwOnInvalidScoringKind(scoreValueKind: String) = { + throw new Exception(s"Error: unknown scoring kind $scoreValueKind") + } + + private def 
getTransformedSchema(columns: List[String], metricType: String) = { + getEvaluationMetric match { + case ComputeModelStatistics.AllSparkMetrics => + StructType(columns.map(StructField(_, DoubleType))) + case metric: String if (ComputeModelStatistics.metricToColumnName.contains(metric)) && + columns.contains(ComputeModelStatistics.metricToColumnName(metric)) => + StructType(Array(StructField(ComputeModelStatistics.metricToColumnName(metric), DoubleType))) + case default => + throw new Exception(s"Error: $default is not a $metricType metric") + } + } +} diff --git a/src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala b/src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala new file mode 100644 index 0000000000..5aa538b549 --- /dev/null +++ b/src/compute-model-statistics/src/test/scala/VerifyComputeModelStatistics.scala @@ -0,0 +1,245 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.TrainRegressorTestUtilities._ +import com.microsoft.ml.spark.TrainClassifierTestUtilities._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} + +/** + * Tests to validate the functionality of Evaluate Model module. + */ +class VerifyComputeModelStatistics extends TestBase { + + test("Smoke test for evaluating a dataset") { + + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val dataset = session.createDataFrame(Seq( + (0.0, 2, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 4, 0.78, 0.99, 2.0), + (3.0, 5, 0.12, 0.34, 3.0), + (0.0, 1, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0), + (0.0, 0, 0.50, 0.60, 0.0), + (1.0, 2, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0))) + .toDF(labelColumn, "col1", "col2", "col3", predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, scoreModelName, labelColumn, SchemaConstants.RegressionKind) + val datasetWithScores = + SparkSchema.setScoresColumnName(datasetWithLabel, scoreModelName, predictionColumn, + SchemaConstants.RegressionKind) + + val evaluatedSchema = new ComputeModelStatistics().transformSchema(datasetWithScores.schema) + + val evaluatedData = new ComputeModelStatistics().transform(datasetWithScores) + val firstRow = evaluatedData.first() + assert(firstRow.get(0) == 0.0) + assert(firstRow.get(1) == 0.0) + assert(firstRow.get(2) == 1.0) + assert(firstRow.get(3) == 0.0) + + assert(evaluatedSchema == StructType(ComputeModelStatistics.regressionColumns.map(StructField(_, DoubleType)))) + } + + test("Evaluate a dataset with missing values") { + + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val dataset = session.createDataFrame(sc.parallelize(Seq( + (0.0, 0.0), + (0.0, null), + (1.0, 1.0), + (2.0, 2.0), + (null, null), + (0.0, 0.0), + (null, 3.0))).map(values => Row(values._1, values._2)), + 
StructType(Array(StructField(labelColumn, DoubleType, true), + StructField(predictionColumn, DoubleType, true)))) + .toDF(labelColumn, predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, scoreModelName, labelColumn, SchemaConstants.RegressionKind) + val datasetWithScores = + SparkSchema.setScoresColumnName(datasetWithLabel, scoreModelName, predictionColumn, + SchemaConstants.RegressionKind) + + val evaluatedData = new ComputeModelStatistics().transform(datasetWithScores) + val firstRow = evaluatedData.first() + assert(firstRow.get(0) == 0.0) + assert(firstRow.get(1) == 0.0) + assert(firstRow.get(2) == 1.0) + assert(firstRow.get(3) == 0.0) + } + + test("Smoke test to train regressor, score and evaluate on a dataset using all three modules") { + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3) + )).toDF("labelColumn", "col1", "col2", "col3", "col4") + + val labelColumn = "someOtherColumn" + + val datasetWithAddedColumn = dataset.withColumn(labelColumn, org.apache.spark.sql.functions.lit(0.0)) + + val linearRegressor = createLinearRegressor(labelColumn) + val scoredDataset = + TrainRegressorTestUtilities.trainScoreDataset(labelColumn, datasetWithAddedColumn, linearRegressor) + + val evaluatedData = new ComputeModelStatistics().transform(scoredDataset) + val firstRow = evaluatedData.first() + assert(firstRow.get(0) == 0.0) + assert(firstRow.get(1) == 0.0) + assert(firstRow.get(2).asInstanceOf[Double].isNaN) + assert(firstRow.get(3) == 0.0) + } + + test("Smoke test to train classifier, score and evaluate on a dataset using all three modules") { + val labelColumn = "Label" + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3) + )).toDF(labelColumn, "col1", "col2", "col3", "col4") + + val logisticRegressor = createLogisticRegressor(labelColumn) + val scoredDataset = TrainClassifierTestUtilities.trainScoreDataset(labelColumn, dataset, logisticRegressor) + val evaluatedData = new ComputeModelStatistics().transform(scoredDataset) + + val evaluatedSchema = new ComputeModelStatistics().transformSchema(scoredDataset.schema) + assert(evaluatedSchema == StructType(ComputeModelStatistics.classificationColumns.map(StructField(_, DoubleType)))) + } + + test("Verify results of multiclass metrics") { + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val labelsAndPrediction = session.createDataFrame( + Seq( + (0.0, 0.0), + (0.0, 0.0), + (0.0, 1.0), + (0.0, 2.0), + (1.0, 0.0), + (1.0, 1.0), + (1.0, 1.0), + (1.0, 1.0), + (2.0, 2.0))).toDF(labelColumn, predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(labelsAndPrediction, scoreModelName, labelColumn, + SchemaConstants.ClassificationKind) + val datasetWithScoredLabels = + SparkSchema.setScoredLabelsColumnName(datasetWithLabel, scoreModelName, 
predictionColumn, + SchemaConstants.ClassificationKind) + + val evaluatedData = new ComputeModelStatistics().transform(datasetWithScoredLabels) + + val tp0 = 2.0 + val tp1 = 3.0 + val tp2 = 1.0 + val tn0 = 4.0 + val tn1 = 4.0 + val tn2 = 7.0 + val numLabels = 3.0 + val total = labelsAndPrediction.count() + + val precision0 = 2.0 / (2 + 1) + val precision1 = 3.0 / (3 + 1) + val precision2 = 1.0 / (1 + 1) + val recall0 = 2.0 / (2 + 2) + val recall1 = 3.0 / (3 + 1) + val recall2 = 1.0 / (1 + 0) + + val overallAccuracy = (tp0 + tp1 + tp2) / total + val evalRow = evaluatedData.first() + assert(evalRow.getAs[Double](ComputeModelStatistics.AccuracyColumnName) == overallAccuracy) + assert(evalRow.getAs[Double](ComputeModelStatistics.PrecisionColumnName) == overallAccuracy) + assert(evalRow.getAs[Double](ComputeModelStatistics.RecallColumnName) == overallAccuracy) + val avgAccuracy = ((tp0 + tn0) / total + (tp1 + tn1) / total + (tp2 + tn2) / total) / numLabels + val macroPrecision = (precision0 + precision1 + precision2) / numLabels + val macroRecall = (recall0 + recall1 + recall2) / numLabels + assert(evalRow.getAs[Double](ComputeModelStatistics.AverageAccuracy) == avgAccuracy) + assert(evalRow.getAs[Double](ComputeModelStatistics.MacroAveragedPrecision) == macroPrecision) + assert(evalRow.getAs[Double](ComputeModelStatistics.MacroAveragedRecall) == macroRecall) + } + + test("validate AUC from compute model statistic and binary classification evaluator gives the same result") { + val fileLocation = ClassifierTestUtils.classificationTrainFile("transfusion.csv").toString + val label = "Donated" + val dataset: DataFrame = + session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("treatEmptyValuesAsNulls", "false") + .option("delimiter", ",") + .load(fileLocation) + + val split = dataset.randomSplit(Array(0.75,0.25)) + val train = split(0) + val test = split(1) + + val trainClassifier = new TrainClassifier() + val model = trainClassifier.setModel(new LogisticRegression()) + .set(trainClassifier.labelCol, label) + .set(trainClassifier.numFeatures, 1 << 18) + .fit(train) + val scored = model.transform(test) + val eval = new ComputeModelStatistics().transform(scored) + val cmsAUC = eval.first().getAs[Double]("AUC") + + val binaryEvaluator = new BinaryClassificationEvaluator() + .setMetricName("areaUnderROC") + .setLabelCol(label) + .setRawPredictionCol(SchemaConstants.ScoresColumn) + + val levels = CategoricalUtilities.getLevels(scored.schema, label) + val levelsToIndexMap: Map[Any, Double] = levels.get.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + + // Calculate confusion matrix and output it as DataFrame + val predictionAndLabels = session + .createDataFrame(scored.select(col(SchemaConstants.ScoresColumn), col(label)).rdd.map { + case Row(prediction: Vector, label) => (prediction(1), levelsToIndexMap(label)) + }).toDF(SchemaConstants.ScoresColumn, label) + + val auc = binaryEvaluator.evaluate(predictionAndLabels) + assert(auc === cmsAUC) + } + +} diff --git a/src/compute-per-instance-statistics/build.sbt b/src/compute-per-instance-statistics/build.sbt new file mode 100644 index 0000000000..1ddf71d75d --- /dev/null +++ b/src/compute-per-instance-statistics/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: train-regressor +//> DependsOn: train-classifier diff --git a/src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala 
b/src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala new file mode 100644 index 0000000000..9cb83ea1a3 --- /dev/null +++ b/src/compute-per-instance-statistics/src/main/scala/ComputePerInstanceStatistics.scala @@ -0,0 +1,110 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.SchemaConstants._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +/** + * Contains constants used by Compute Per Instance Statistics. + */ +object ComputePerInstanceStatistics extends DefaultParamsReadable[ComputePerInstanceStatistics] { + // Regression metrics + val L1LossMetric = "L1_loss" + val L2LossMetric = "L2_loss" + + // Classification metrics + val LogLossMetric = "log_loss" + + val epsilon = 1e-15 +} + +/** + * Evaluates the given scored dataset with per instance metrics. + */ +class ComputePerInstanceStatistics(override val uid: String) extends Transformer with MMLParams { + + def this() = this(Identifiable.randomUID("ComputePerInstanceStatistics")) + + override def transform(dataset: Dataset[_]): DataFrame = { + // TODO: evaluate all models; for now, get first model name found + val firstModelName = dataset.schema.collectFirst { + case StructField(c, t, _, m) if (getFirstModelName(m) != null && !getFirstModelName(m).isEmpty) => { + getFirstModelName(m).get + } + } + val modelName = if (!firstModelName.isEmpty) firstModelName.get + else throw new Exception("Please score the model prior to evaluating") + val dataframe = dataset.toDF() + val labelColumnName = SparkSchema.getLabelColumnName(dataframe, modelName) + + val scoreValueKind = SparkSchema.getScoreValueKind(dataframe, modelName, labelColumnName) + + if (scoreValueKind == SchemaConstants.ClassificationKind) { + // Compute the LogLoss for classification case + val scoredLabelsColumnName = SparkSchema.getScoredLabelsColumnName(dataframe, modelName) + + // Get levels if categorical + val levels = CategoricalUtilities.getLevels(dataframe.schema, labelColumnName) + val numLevels = + if (!levels.isEmpty && levels.get != null) { + if (levels.get.length > 2) levels.get.length else 2 + } else { + // Otherwise compute unique levels + dataset.select(col(labelColumnName).cast(DoubleType)).rdd.distinct().count().toInt + } + + val logLossFunc = udf((scoredLabel: Double, scores: org.apache.spark.ml.linalg.Vector) => + if (scoredLabel < numLevels) { + -Math.log(Math.min(1, Math.max(ComputePerInstanceStatistics.epsilon, scores(scoredLabel.toInt)))) + } else { + // penalize if no label seen in training + -Math.log(ComputePerInstanceStatistics.epsilon) + }) + val probabilitiesColumnName = SparkSchema.getScoredProbabilitiesColumnName(dataframe, modelName) + dataframe.withColumn(ComputePerInstanceStatistics.LogLossMetric, + logLossFunc(dataset(scoredLabelsColumnName), dataset(probabilitiesColumnName))) + } else { + val scoresColumnName = SparkSchema.getScoresColumnName(dataframe, modelName) + // Compute the L1 and L2 loss for regression case + val scoresAndLabels = + dataset.select(col(scoresColumnName), col(labelColumnName).cast(DoubleType)).rdd.map { + case 
Row(prediction: Double, label: Double) => (prediction, label) + } + val l1LossFunc = udf((trueLabel:Double, scoredLabel: Double) => math.abs(trueLabel - scoredLabel)) + val l2LossFunc = udf((trueLabel:Double, scoredLabel: Double) => + { + val loss = math.abs(trueLabel - scoredLabel) + loss * loss + }) + dataframe.withColumn(ComputePerInstanceStatistics.L1LossMetric, + l1LossFunc(dataset(labelColumnName), dataset(scoresColumnName))) + .withColumn(ComputePerInstanceStatistics.L2LossMetric, + l2LossFunc(dataset(labelColumnName), dataset(scoresColumnName))) + } + } + + private def getFirstModelName(colMetadata: Metadata): Option[String] = { + if (!colMetadata.contains(MMLTag)) null + else { + val mlTagMetadata = colMetadata.getMetadata(MMLTag) + val metadataKeys = MetadataUtilities.getMetadataKeys(mlTagMetadata) + metadataKeys.find(key => key.startsWith(SchemaConstants.ScoreModelPrefix)) + } + } + + override def copy(extra: ParamMap): Transformer = new ComputePerInstanceStatistics() + + // TODO: This should be based on the retrieved score value kind + override def transformSchema(schema: StructType): StructType = + schema.add(new StructField(ComputePerInstanceStatistics.L1LossMetric, DoubleType)) + .add(new StructField(ComputePerInstanceStatistics.L2LossMetric, DoubleType)) + +} diff --git a/src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala b/src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala new file mode 100644 index 0000000000..bfacec803e --- /dev/null +++ b/src/compute-per-instance-statistics/src/test/scala/VerifyComputePerInstanceStatistics.scala @@ -0,0 +1,130 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.TrainRegressorTestUtilities._ +import com.microsoft.ml.spark.TrainClassifierTestUtilities._ +import com.microsoft.ml.spark.schema.{SchemaConstants, SparkSchema} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.sql._ + +import scala.tools.nsc.transform.patmat.Lit + +/** + * Tests to validate the functionality of Compute Per Instance Statistics module. 
+ */ +class VerifyComputePerInstanceStatistics extends TestBase { + + test("Smoke test for evaluating a dataset") { + + val labelColumn = "label" + val predictionColumn = SchemaConstants.SparkPredictionColumn + val dataset = session.createDataFrame(Seq( + (0.0, 2, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 4, 0.78, 0.99, 2.0), + (3.0, 5, 0.12, 0.34, 3.0), + (0.0, 1, 0.50, 0.60, 0.0), + (1.0, 3, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0), + (0.0, 0, 0.50, 0.60, 0.0), + (1.0, 2, 0.40, 0.50, 1.0), + (2.0, 3, 0.78, 0.99, 2.0), + (3.0, 4, 0.12, 0.34, 3.0))) + .toDF(labelColumn, "col1", "col2", "col3", predictionColumn) + + val scoreModelName = SchemaConstants.ScoreModelPrefix + "_test model" + + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, scoreModelName, labelColumn, SchemaConstants.RegressionKind) + val datasetWithScores = + SparkSchema.setScoresColumnName(datasetWithLabel, scoreModelName, predictionColumn, + SchemaConstants.RegressionKind) + + val evaluatedData = new ComputePerInstanceStatistics().transform(datasetWithScores) + validatePerInstanceRegressionStatistics(evaluatedData) + } + + test("Smoke test to train regressor, score and evaluate on a dataset using all three modules") { + val label = "label" + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3) + )).toDF(label, "col1", "col2", "col3", "col4") + + val linearRegressor = createLinearRegressor(label) + val scoredDataset = + TrainRegressorTestUtilities.trainScoreDataset(label, dataset, linearRegressor) + + val evaluatedData = new ComputePerInstanceStatistics().transform(scoredDataset) + validatePerInstanceRegressionStatistics(evaluatedData) + } + + test("Smoke test to train classifier, score and evaluate on a dataset using all three modules") { + val labelColumn = "Label" + val dataset = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3) + )).toDF(labelColumn, "col1", "col2", "col3", "col4") + + val logisticRegressor = createLogisticRegressor(labelColumn) + val scoredDataset = TrainClassifierTestUtilities.trainScoreDataset(labelColumn, dataset, logisticRegressor) + val evaluatedData = new ComputePerInstanceStatistics().transform(scoredDataset) + validatePerInstanceClassificationStatistics(evaluatedData) + } + + private def validatePerInstanceRegressionStatistics(evaluatedData: DataFrame): Unit = { + // Validate the per instance statistics + evaluatedData.collect().foreach(row => { + val labelUncast = row(0) + val label = + if (labelUncast.isInstanceOf[Int]) labelUncast.asInstanceOf[Int].toDouble + else labelUncast.asInstanceOf[Double] + val score = row.getDouble(row.length - 3) + val l1Loss = row.getDouble(row.length - 2) + val l2Loss = row.getDouble(row.length - 1) + val loss = math.abs(label - score) + assert(l1Loss === loss) + assert(l2Loss === loss * loss) + }) + } + + private def validatePerInstanceClassificationStatistics(evaluatedData: DataFrame): Unit = { + // Validate the per instance statistics + 
evaluatedData.collect().foreach(row => { + val labelUncast = row(0) + val label = + if (labelUncast.isInstanceOf[Int]) labelUncast.asInstanceOf[Int].toDouble + else labelUncast.asInstanceOf[Double] + val probabilities = row.get(row.length - 3).asInstanceOf[org.apache.spark.ml.linalg.Vector] + val scoredLabel = row.getDouble(row.length - 2).toInt + val logLoss = row.getDouble(row.length - 1) + val computedLogLoss = -Math.log(Math.min(1, Math.max(ComputePerInstanceStatistics.epsilon, + probabilities(scoredLabel.toInt)))) + assert(computedLogLoss === logLoss) + }) + } + +} diff --git a/src/core/build.sbt b/src/core/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/contracts/build.sbt b/src/core/contracts/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/contracts/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/contracts/src/main/scala/Exceptions.scala b/src/core/contracts/src/main/scala/Exceptions.scala new file mode 100644 index 0000000000..aa923474d2 --- /dev/null +++ b/src/core/contracts/src/main/scala/Exceptions.scala @@ -0,0 +1,35 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.contracts + +import java.lang.RuntimeException + +import org.apache.spark.ml.util.Identifiable + +object MMLException { + implicit class MMLID(val i: Identifiable) extends AnyVal { + def id: String = i.uid + } + def throwEx(msg: String, inner: Throwable = null)(implicit i: MMLID): MMLException = { + throw new MMLException(i.id, msg, inner) + } +} +import MMLException._ + +// The caller must *explicitly* pass null to the source and inner exception +class MMLException(source: String, msg: String, inner: Throwable) + // suppression = true by default + // writableStackTrace -> true by design for us + extends RuntimeException(msg, inner, true, true) { + + // Fix this to be structured for operationalized scenarios? + // Or will they consume the object? + override def toString(): String = source + super.toString +} + +class FriendlyException(addedInfo: String, inner: Throwable)(implicit aid: MMLID) + extends MMLException(aid.id, addedInfo, inner) + +class ParamException(reason: String)(implicit aid: MMLID) + extends MMLException(aid.id, reason, null) diff --git a/src/core/contracts/src/main/scala/Metrics.scala b/src/core/contracts/src/main/scala/Metrics.scala new file mode 100644 index 0000000000..da1b5aa1f2 --- /dev/null +++ b/src/core/contracts/src/main/scala/Metrics.scala @@ -0,0 +1,47 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.contracts + +// Case class matching +sealed abstract class Metric + +// Just for clarity in the contract file +object ConvenienceTypes { + type UniqueName = String + type MetricTable = Map[UniqueName, Seq[Metric]] +} +import ConvenienceTypes._ + +// One option +case class TypedMetric[T](name: UniqueName, value: T) extends Metric +case class MetricGroup(name: UniqueName, metrics: MetricTable) { + require ({ + val len = metrics.values.head.length + metrics.values.forall(col => col.length == len) + }, s"All metric lists in the table must be the same length") +} + +// Other option (reflection friendly - do we need reflection?) 
+sealed abstract class TypenameMetric +case class DoubleMetric(name: UniqueName, value: Double) extends TypenameMetric +case class StringMetric(name: UniqueName, value: String) extends TypenameMetric +case class IntegralMetric(name: UniqueName, value: Long) extends TypenameMetric + +case class TypenameMetricGroup(name: UniqueName, values: Map[UniqueName, Seq[TypenameMetric]]) + +/** + * Defines contract for Metric table, which is a metric name to list of values. + * @param data + */ +case class MetricData(data: Map[String, Seq[Double]], metricType: String, modelName: String) + +object MetricData { + def create(data: Map[String, Double], metricType: String, modelName: String): MetricData = { + return new MetricData(data.map(kvp => (kvp._1, List(kvp._2))), metricType, modelName) + } + + def createTable(data: Map[String, Seq[Double]], metricType: String, modelName: String): MetricData = { + return new MetricData(data, metricType, modelName) + } +} diff --git a/src/core/contracts/src/main/scala/Params.scala b/src/core/contracts/src/main/scala/Params.scala new file mode 100644 index 0000000000..f0ab6e8473 --- /dev/null +++ b/src/core/contracts/src/main/scala/Params.scala @@ -0,0 +1,134 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.collection.mutable.Map +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} + +trait MMLParams extends Wrappable with DefaultParamsWritable + +trait Wrappable extends Params { + + // Use this function when instantiating sparkML Identifiable for your + // own use - it allows us to locate the origin of any stacks + def chainedUid(origin: String): String = Identifiable.randomUID(this.uid) + + private var orderCounter = 0 + // TODO: Support non-string "enums"? 
+ val paramDomains = Map[String, Seq[String]]() + + def BooleanParam(i: Identifiable, name: String, description: String): BooleanParam = + BooleanParam(i, name, description, false) + + def BooleanParam(i: Identifiable, name: String, description: String, + default: Boolean): BooleanParam = { + val baseParam = new BooleanParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def IntParam(i: Identifiable, name: String, description: String): IntParam = { + val baseParam = new IntParam(i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def IntParam(i: Identifiable, name: String, description: String, + default: Int): IntParam = { + val baseParam = new IntParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def IntParam(i: Identifiable, name: String, description: String, validation: Int => Boolean): IntParam = { + val baseParam = new IntParam(i, name, description, validation) + MMLParam(baseParam, None, None) + baseParam + } + + def LongParam(i: Identifiable, name: String, description: String): LongParam = { + val baseParam = new LongParam(i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def LongParam(i: Identifiable, name: String, description: String, + default: Long): LongParam = { + val baseParam = new LongParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def DoubleParam(i: Identifiable, name: String, description: String): DoubleParam = { + val baseParam = new DoubleParam(i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def DoubleParam(i: Identifiable, name: String, description: String, + default: Double): DoubleParam = { + val baseParam = new DoubleParam(i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String): Param[String] = { + val baseParam = new Param[String](i, name, description) + MMLParam(baseParam, None, None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String, validation: String => Boolean): Param[String] = { + val baseParam = new Param[String](i, name, description, validation) + MMLParam(baseParam, None, None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String, + default: String): Param[String] = { + val baseParam = new Param[String](i, name, description) + MMLParam(baseParam, Some(default), None) + baseParam + } + + def StringParam(i: Identifiable, name: String, description: String, + default: String, domain: Seq[String]): Param[String] = { + val baseParam = new Param[String](i, name, description) + MMLParam(baseParam, Some(default), Some(domain)) + baseParam + } + + private def MMLParam[T](param: Param[T], + default: Option[T], domain: Option[Seq[String]]): Unit = { + if (default.isDefined) setDefault(param, default.get) + if (domain.isDefined) paramDomains.put(param.name, domain.get) + orderCounter += 1 + } + +} + +trait HasInputCol extends Wrappable { + val inputCol = StringParam(this, "inputCol", "The name of the input column") + def setInputCol(value: String): this.type = set(inputCol, value) + def getInputCol: String = $(inputCol) +} + +trait HasOutputCol extends Wrappable { + val outputCol = StringParam(this, "outputCol", "The name of the output column") + def setOutputCol(value: String): this.type = set(outputCol, value) + def getOutputCol: String = $(outputCol) +} + +trait HasLabelCol extends Wrappable { + val 
labelCol = StringParam(this, "labelCol", "The name of the label column") + def setLabelCol(value: String): this.type = set(labelCol, value) + def getLabelCol: String = $(labelCol) +} + +trait HasFeaturesCol extends Wrappable { + val featuresCol = StringParam(this, "featuresCol", "The name of the features column") + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + def getFeaturesCol: String = $(featuresCol) +} diff --git a/src/core/env/build.sbt b/src/core/env/build.sbt new file mode 100644 index 0000000000..cde1c89ce0 --- /dev/null +++ b/src/core/env/build.sbt @@ -0,0 +1,7 @@ +libraryDependencies ++= Seq( + // "%%" for scala things, "%" for plain java things + "com.typesafe" % "config" % "1.3.1", + "org.apache.logging.log4j" % "log4j-api" % "2.8.1" % "provided", + "org.apache.logging.log4j" % "log4j-core" % "2.8.1" % "provided", + "org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1" % "provided" + ) diff --git a/src/core/env/src/main/scala/CodegenTags.scala b/src/core/env/src/main/scala/CodegenTags.scala new file mode 100644 index 0000000000..fd8ebe8fac --- /dev/null +++ b/src/core/env/src/main/scala/CodegenTags.scala @@ -0,0 +1,13 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.annotation.StaticAnnotation + +/** + * Generate the internal wrapper for a given class. + * Used for complicated wrappers, where the basic functionality is auto-generated, + * and the rest is added in the inherited wrapper. + */ +class InternalWrapper extends StaticAnnotation diff --git a/src/core/env/src/main/scala/Configuration.scala b/src/core/env/src/main/scala/Configuration.scala new file mode 100644 index 0000000000..d2a66c4e7b --- /dev/null +++ b/src/core/env/src/main/scala/Configuration.scala @@ -0,0 +1,51 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
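// Illustrative sketch only -- not part of this patch. It shows how a Transformer might declare its
// parameters with the MMLParams/Wrappable factories and the HasInputCol / HasOutputCol traits defined
// in Params.scala above. The ExampleSuffixer class and its "suffix" parameter are hypothetical names
// invented for this example.
package com.microsoft.ml.spark

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, concat, lit}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class ExampleSuffixer(override val uid: String)
  extends Transformer with MMLParams with HasInputCol with HasOutputCol {

  def this() = this(Identifiable.randomUID("ExampleSuffixer"))

  // A StringParam with a default value; the MMLParam helper registers the default via setDefault
  val suffix = StringParam(this, "suffix", "String appended to every input value", "-suffixed")
  def setSuffix(value: String): this.type = set(suffix, value)
  def getSuffix: String = $(suffix)

  // Append the suffix to the input column and write the result to the output column
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.toDF().withColumn(getOutputCol, concat(col(getInputCol), lit(getSuffix)))

  override def transformSchema(schema: StructType): StructType =
    schema.add(StructField(getOutputCol, StringType))

  override def copy(extra: ParamMap): ExampleSuffixer = defaultCopy(extra)
}

// Hypothetical usage: new ExampleSuffixer().setInputCol("text").setOutputCol("textOut").transform(df)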
+ +package com.microsoft.ml.spark + +import java.util.Properties +import scala.sys.process._ + +import org.apache.spark._ +import org.apache.spark.sql.SparkSession + +// For development convenience - not hard to reimplement the pieces used here +import com.typesafe.config.{Config, ConfigFactory} + +// This is meant to provide a uniform means of configuring the +// SDK and extension packages in a Spark-compatible form while +// also allowing for env vars as we may use via the CLI +abstract class Configuration(config: Config) { + private val namespace = "mmlspark" + + protected def subspace: String + + def root: String = combine(namespace, subspace) + + private def combine(names: String*): String = names.mkString(".") +} + +class MMLConfig(config: Config) extends Configuration(config) { + override val subspace = "sdk" +} + +object MMLConfig { + // Use spark model of one config/JVM + private lazy val baseConfig = new MMLConfig(ConfigFactory.load()) + def get(): MMLConfig = baseConfig + + private def combine(names: String*): String = names.mkString(".") +} + +// Move to CNTK subpackage +class CNTKConfig(config: Config) extends MMLConfig(config) { + override val subspace = "cntk" + // Danil brings up a good point - device configuration is confusing + // we need to not only say number of devices but also which ones in the + // GPU list to use (1080 gaming + Titan DL for example) +} + +// Move to TLC subpackage +class TLCConfig(config: Config) extends MMLConfig(config) { + override val subspace = "tlc" +} diff --git a/src/core/env/src/main/scala/EnvironmentUtils.scala b/src/core/env/src/main/scala/EnvironmentUtils.scala new file mode 100644 index 0000000000..223bb13713 --- /dev/null +++ b/src/core/env/src/main/scala/EnvironmentUtils.scala @@ -0,0 +1,52 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.nio.file.Paths +import scala.sys.process._ + +import org.apache.spark._ +import org.apache.spark.sql.SparkSession + +import ProcessUtils._ + +object EnvironmentUtils { + + // We should use Apache Commons Lang instead + def IsWindows: Boolean = System.getProperty("os.name").toLowerCase().indexOf("win") >= 0 + + // Make this overrideable so people have control over the granularity + private lazy val nvInfo: Option[String] = { + println(s"Computing GPU count on ${if(IsWindows) "Windows" else "Linux"}") + val nvsmicmd = if (IsWindows) { + // Unlikely nvidia is on the path + val nvsmi = Paths.get( + System.getenv("ProgramFiles"), + "NVIDIA Corporation", + "NVSMI", + "nvidia-smi.exe").toAbsolutePath.toString + "\"" + nvsmi + "\"" + } else { + "nvidia-smi" + } + // Probably a more Scala-idiomatic way to do this + try { + Some(ProcessUtils.getProcessOutput(s"$nvsmicmd -L")) + } catch { + // Use the logging API to do this properly + case e: Exception => { + println(s"Couldn't query Nvidia SMI for GPU info: $e") + None + } + } + } + + lazy val GPUCount: Option[Int] = if (nvInfo.isEmpty) None else { + // Commons Lang has isNotBlank + val gpucnt = nvInfo.get.split("\n").filter(!_.trim.isEmpty).length + println(s"$gpucnt GPUs detected") + Some(gpucnt) + } + +} diff --git a/src/core/env/src/main/scala/FileUtilities.scala b/src/core/env/src/main/scala/FileUtilities.scala new file mode 100644 index 0000000000..c473772932 --- /dev/null +++ b/src/core/env/src/main/scala/FileUtilities.scala @@ -0,0 +1,139 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. 
+// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.ByteArrayOutputStream +import java.nio.file.{Files, StandardCopyOption} +import java.util.zip.ZipInputStream + +import org.apache.commons.io.IOUtils +import org.apache.spark.input.PortableDataStream + +import scala.io._ +import scala.util.Random + +object FileUtilities { + + // Make `File` available to everyone who uses these utilities + // (Future TODO: make it some nice type, something like `file` in SBT) + type File = java.io.File + + import scala.util.{Try, Success, Failure} + def using[T <: AutoCloseable, U](disposable: Seq[T])(task: Seq[T] => U): Try[U] = { + try { + Success(task(disposable)) + } catch { + case e: Exception => Failure(e) + } finally { + disposable.foreach(d => d.close()) + } + } + + def delTree(file: File): Boolean = + if (!file.exists) true + else { if (file.isDirectory) file.listFiles.forall(delTree) + file.delete } + + def allFiles(dir: File, pred: (File => Boolean) = null): Array[File] = { + def loop(dir: File): Array[File] = { + val (dirs, files) = dir.listFiles.sorted.partition(_.isDirectory) + (if (pred == null) files else files.filter(pred)) ++ dirs.flatMap(loop) + } + loop(dir) + } + + // readFile takes a file name or a File, and function to extract a value from + // BufferedSource which defaults to _.mkString; performs the read, closes the + // source, and returns the result + def readFile[T](file: File, read: BufferedSource => T): T = { + val i = Source.fromFile(file) + try read(i) finally i.close + } + def readFile(file: File): String = readFile(file, _.mkString) + + def writeFile(file: File, stuff: Any): Unit = { + Files.write(file.toPath, stuff.toString.getBytes()) + () + } + + def copyFile(from: File, toDir: File, overwrite: Boolean = false): Unit = { + Files.copy(from.toPath, (new File(toDir, from.getName)).toPath, + (if (overwrite) Seq(StandardCopyOption.REPLACE_EXISTING) + else Seq()): _*) + () + } + + // Perhaps this should move into a more specific place, not a generic file utils thing + def zipFolder(dir: File, out: File): Unit = { + import java.io.{ BufferedInputStream, FileInputStream, FileOutputStream } + import java.util.zip.{ ZipEntry, ZipOutputStream } + val bufferSize = 2 * 1024 + val data = new Array[Byte](bufferSize) + val zip = new ZipOutputStream(new FileOutputStream(out)) + val prefixLen = dir.getParentFile.toString.length + 1 + allFiles(dir).foreach { file => + zip.putNextEntry(new ZipEntry(file.toString.substring(prefixLen).replace(java.io.File.separator, "/"))) + val in = new BufferedInputStream(new FileInputStream(file), bufferSize) + var b = 0 + while (b >= 0) { zip.write(data, 0, b); b = in.read(data, 0, bufferSize) } + in.close() + zip.closeEntry() + } + zip.close() + } + + /** + * iterate through the entries of a streamed .zip file, selecting only sampleRatio of them + * + * @param portableStream Stream of zip file + * @param zipfile File name is only used to construct the names of the entries + * @param sampleRatio What fraction of files is returned from zip + */ + class ZipIterator(portableStream: PortableDataStream, zipfile: String, sampleRatio: Double = 1) + extends Iterator[(String, Array[Byte])] { + + val stream = portableStream.open + private val zipstream = new ZipInputStream(stream) + + val random = { + val rd = new Random() + rd.setSeed(0) + rd + } + + private def getNext: Option[(String, Array[Byte])] = { + var entry = zipstream.getNextEntry + while(entry != null){ + 
if(!entry.isDirectory && random.nextDouble < sampleRatio) { + + val filename = zipfile + java.io.File.separator + entry.getName() + + //extracting all bytes of a given entry + val byteStream = new ByteArrayOutputStream + IOUtils.copy(zipstream, byteStream) + val bytes = byteStream.toByteArray + + assert(bytes.length == entry.getSize, + "incorrect number of bytes is read from zipstream: " + bytes.length + " instead of " + entry.getSize) + + return Some((filename, bytes)) + } + entry = zipstream.getNextEntry + } + + stream.close() + None + } + + private var nextValue = getNext + + def hasNext: Boolean = !nextValue.isEmpty + + def next: (String, Array[Byte]) = { + val result = nextValue.get + nextValue = getNext + result + } + } +} diff --git a/src/core/env/src/main/scala/Logging.scala b/src/core/env/src/main/scala/Logging.scala new file mode 100644 index 0000000000..99bec2f0a8 --- /dev/null +++ b/src/core/env/src/main/scala/Logging.scala @@ -0,0 +1,23 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.logging.log4j.scala.{Logging => Logging4J} +import org.apache.logging.log4j._ + +// Ilya has the logging functions already in a separate branch, so log APIs here removed. +// Merge those into a single trait "Logging" here and have MMLParams incorporate it. + +// Utility to provide log-related canonical construction +// There should be a separate logger at each package (mml, cntk, tlc) +object Logging { + + lazy val config = MMLConfig.get + lazy val logRoot = config.root + + def getLogger(customSuffix: String): Logger = { + LogManager.getLogger(s"$logRoot.$customSuffix") + } + +} diff --git a/src/core/env/src/main/scala/NativeLoader.java b/src/core/env/src/main/scala/NativeLoader.java new file mode 100644 index 0000000000..846d45f302 --- /dev/null +++ b/src/core/env/src/main/scala/NativeLoader.java @@ -0,0 +1,194 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark; + +import java.io.*; +import java.nio.file.Files; +import java.util.ArrayList; + +/** + * A helper class for loading native libraries from Java + * + *

<p>Some Java interfaces depend on native libraries that need to be loaded at runtime. + * This class is a simple utility that can load the native libraries from a jar in one of two ways:</p> + * + * <ul> + *   <li>By name: If a particular native library is needed, it will extract it to a temp folder + *   (along with its dependencies) and load it from there.</li> + *   <li>All libraries: all libraries will be extracted to a temp folder and the libraries in the + *   load manifest are loaded in the order provided, or loaded in the order specified in the + *   native manifest if no load manifest is provided.</li> + * </ul> + * + * <p>The jar with the native libraries must contain a file named 'NATIVE_MANIFEST' that lists + * all native files (one per line, full name) to be extracted. If the loadAll() method is used, + * the libraries will be loaded in the order specified in the manifest. The native libraries should be + * in folders describing the OS they run on: linux, windows, mac.</p>

+ * */ +public class NativeLoader { + + private static final String manifestName = "NATIVE_MANIFEST"; + private static final String loadManifestName = "NATIVE_LOAD_MANIFEST"; + private String resourcesPath; + private String[] nativeList = new String[0]; + private Boolean extractionDone = false; + private File tempDir; + + public NativeLoader(String topLevelResourcesPath) throws IOException{ + this.resourcesPath = getResourcesPath(topLevelResourcesPath); + tempDir = Files.createTempDirectory("tmp").toFile(); + tempDir.deleteOnExit(); + } + + + /** + * Loads all native libraries from the jar file, if the jar contains a plain text file + * named 'NATIVE_MANIFEST'. + * + *

<p>The NATIVE_MANIFEST lists the libraries to be extracted (one per line, full name) + * and the order in which they should be loaded. Alternatively, if only specific top-level + * libraries should be loaded, they can be specified in the NATIVE_LOAD_MANIFEST file, in order.</p>

+ * */ + public void loadAll(){ + try{ + extractNativeLibraries(); + try{ + // First try to find the NATIVE_LOAD_MANIFEST and load the libraries there + String[] loadList = getResourceLines(loadManifestName); + for (String libName: loadList){ + System.load(tempDir.getAbsolutePath() + File.separator + libName); + } + } + catch (IOException ee){ + // If loading the NATIVE_LOAD_MANIFEST failed, try loading the libraries + // in the order provided by the NATIVE_MANIFEST + for (String libName: nativeList){ + System.load(tempDir.getAbsolutePath() + File.separator + libName); + } + } + } + catch (Exception e){ + // If nothing worked, throw exception + throw new UnsatisfiedLinkError(String.format("Could not load all native libraries because " + + "we encountered the following error: %s", e.getMessage())); + } + } + + /** + * Loads a named native library from the jar file + * + *

<p>This method first tries to load the library from the java.library.path system property. + * Only if that fails are the named native library and its dependencies extracted to + * a temporary folder and loaded from there.</p>

+ * */ + public void loadLibraryByName(String libName){ + try{ + // First try loading by name + // It's possible that the native library is already on a path java can discover + System.loadLibrary(libName); + } + catch (UnsatisfiedLinkError e){ + try{ + extractNativeLibraries(); + // Get the OS specific library name + libName = System.mapLibraryName(libName); + // Try to load library from extracted native resources + System.load(tempDir.getAbsolutePath() + File.separator + libName); + } + catch (Exception ee){ + throw new UnsatisfiedLinkError(String.format( + "Could not load the native libraries because " + + "we encountered the following problems: %s and %s", + e.getMessage(), ee.getMessage())); + } + } + } + + private void extractNativeLibraries() throws IOException{ + if (!extractionDone) { + nativeList = getResourceLines(manifestName); + // Extract all OS specific native libraries to temporary location + for (String libName: nativeList) { + extractResourceFromPath(libName, resourcesPath); + } + } + extractionDone = true; + } + + private String[] getResourceLines(String resourceName) throws IOException{ + // Read resource file if it exists + InputStream inStream = NativeLoader.class + .getResourceAsStream(resourcesPath + resourceName); + if (inStream == null) { + throw new FileNotFoundException("Could not find native resources in jar. " + + "Make sure the jar containing the native libraries was added to the classpath."); + } + BufferedReader resourceReader = new BufferedReader( + new InputStreamReader(inStream, "UTF-8") + ); + ArrayList lines = new ArrayList(); + for (String line; (line = resourceReader.readLine()) != null; ) { + lines.add(line); + } + resourceReader.close(); + inStream.close(); + return lines.toArray(new String[lines.size()]); + } + + private static String getResourcesPath(String topLevelResourcesPath){ + String sep = "/"; + String OS = System.getProperty("os.name").toLowerCase(); + String resourcePrefix = topLevelResourcesPath + + sep + "%s" + + sep; + if (OS.contains("linux")){ + return String.format(resourcePrefix, "linux"); + } + else if (OS.contains("windows")){ + return String.format(resourcePrefix, "windows"); + } + else if (OS.contains("mac")|| OS.contains("darwin")){ + return String.format(resourcePrefix, "mac"); + } + else{ + throw new UnsatisfiedLinkError( + String.format("This component doesn't currently have native support for OS: %s", OS) + ); + } + } + + private void extractResourceFromPath(String libName, String prefix) throws IOException{ + + File temp = new File(tempDir.getPath() + File.separator + libName); + temp.createNewFile(); + temp.deleteOnExit(); + + if (!temp.exists()) { + throw new FileNotFoundException(String.format( + "Temporary file %s could not be created. 
Make sure you can write to this location.", + temp.getAbsolutePath()) + ); + } + + String path = prefix + libName; + InputStream inStream = NativeLoader.class.getResourceAsStream(path); + if (inStream == null) { + throw new FileNotFoundException(String.format("Could not find resource %s in jar.", path)); + } + + FileOutputStream outStream = new FileOutputStream(temp); + byte[] buffer = new byte[1 << 18]; + int bytesRead; + + try { + while ((bytesRead = inStream.read(buffer)) >= 0) { + outStream.write(buffer, 0, bytesRead); + } + } finally { + outStream.close(); + inStream.close(); + } + } + +} diff --git a/src/core/env/src/main/scala/ProcessUtilities.scala b/src/core/env/src/main/scala/ProcessUtilities.scala new file mode 100644 index 0000000000..820577d8b1 --- /dev/null +++ b/src/core/env/src/main/scala/ProcessUtilities.scala @@ -0,0 +1,26 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.sys.process._ + +object ProcessUtils { + + // These are only here until we create a more robust + // stream-redirected utility + def getProcessOutput(cmd: String): String = { + println(s"Capturing external process $cmd...") + val ret = cmd.!! + println(s"$ret...done!") + ret + } + + def runProcess(cmd: String): Int = { + println(s"Executing external process $cmd...") + val ret = cmd .! + println(s"$ret...done!") + ret + } + +} diff --git a/src/core/hadoop/build.sbt b/src/core/hadoop/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/hadoop/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/hadoop/src/main/scala/HadoopUtils.scala b/src/core/hadoop/src/main/scala/HadoopUtils.scala new file mode 100644 index 0000000000..b5a4bf5350 --- /dev/null +++ b/src/core/hadoop/src/main/scala/HadoopUtils.scala @@ -0,0 +1,176 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.hadoop + +import java.nio.file.Paths + +import org.apache.commons.io.FilenameUtils + +import scala.sys.process._ +import org.apache.hadoop.conf.{Configuration, Configured} +import org.apache.hadoop.fs.{Path, PathFilter} +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat +import org.apache.spark.sql.SparkSession +import scala.language.existentials +import scala.util.Random + +class HadoopUtils(hadoopConf: Configuration) { + // Is there a better way? 
We need to deduce full Hadoop conf + // including current active namenode etc as well as YARN properties + // going forward anyway for cluster participation in GPU-YARN-queue mode + // fs.defaultFS isn't good on HDI because we rewrite to WASB + // Answer, Slightly better: + /* + $ hdfs getconf -confKey dfs.nameservices + mycluster + $ hdfs getconf -confKey dfs.ha.namenodes.mycluster + nn1,nn2 + $ hdfs haadmin -getServiceState nn1 + active + $ hdfs haadmin -getServiceState nn2 + standby + */ + private val NAMESERVICES_KEY = "dfs.nameservices" + private val NAMENODE_KEY_ROOT = "dfs.ha.namenodes" + private val RPC_KEY_ROOT = "dfs.namenode.rpc-address" + + private def getNameServices: String = { + hadoopConf.get(NAMESERVICES_KEY) + } + + private def getNameNodes: Seq[String] = { + val nameservices = getNameServices + println(s"Nameservices for cluster at '$nameservices'") + hadoopConf.get(combine(NAMENODE_KEY_ROOT, nameservices)).split(",") + } + + private def isActiveNode(namenode: String): Boolean = { + shellout(s"hdfs haadmin -getServiceState $namenode").startsWith("active") + } + + private def combine(keys: String*): String = keys.mkString(".") + + def getActiveNameNode: String = { + val nameservices = getNameServices + println(s"Nameservices for cluster at '$nameservices'") + val namenodes = getNameNodes + println(s"Querying namenodes:\n${namenodes.foreach(println)}") + val active = namenodes.par + .filter(isActiveNode) + .head + println(s"Found $active as active namenode") + hadoopConf.get(combine(RPC_KEY_ROOT, nameservices, active)) + } + + // This is only to make sure all uses go away ASAP into Process utils + // I realize this means it will be around forever + private def shellout(cmd: String): String = { + println(s"Executing external process $cmd...") + val ret = cmd.!! + println(s"$ret...done!") + ret + } + +} + +/** + * Filter that allows loading a fraction of HDFS files. + */ +class SamplePathFilter extends Configured with PathFilter { + val random = { + val rd = new Random() + rd.setSeed(0) + rd + } + + // Ratio of files to be read from disk + var sampleRatio: Double = 1 + + // When inspectZip is enabled, zip files are treated as directories, and SamplePathFilter can't filter them out. + // Otherwise, zip files are treated as regular files and only sampleRatio of them is read. 
+ var inspectZip: Boolean = true + + override def setConf(conf: Configuration): Unit = { + if (conf != null) { + sampleRatio = conf.getDouble(SamplePathFilter.ratioParam, 1) + inspectZip = conf.getBoolean(SamplePathFilter.inspectZipParam, true) + } + } + + override def accept(path: Path): Boolean = { + // Note: checking fileSystem.isDirectory is very slow here, so we use basic rules instead + !SamplePathFilter.isFile(path) || + (SamplePathFilter.isZipFile(path) && inspectZip) || + random.nextDouble() < sampleRatio + } +} + +object SamplePathFilter { + val ratioParam = "sampleRatio" + val inspectZipParam = "inspectZip" + + def isFile(path: Path): Boolean = FilenameUtils.getExtension(path.toString) != "" + + def isZipFile(filename: String): Boolean = FilenameUtils.getExtension(filename) == "zip" + + def isZipFile(path: Path): Boolean = isZipFile(path.toString) + + /** + * Set/unset hdfs PathFilter + * + * @param value Filter class that is passed to HDFS + * @param sampleRatio Fraction of the files that the filter picks + * @param inspectZip Look into zip files, if true + * @param spark Existing Spark session + * @return + */ + def setPathFilter(value: Option[Class[_]], sampleRatio: Option[Double] = None, + inspectZip: Option[Boolean] = None, spark: SparkSession) + : Option[Class[_]] = { + val flagName = FileInputFormat.PATHFILTER_CLASS + val hadoopConf = spark.sparkContext.hadoopConfiguration + val old = Option(hadoopConf.getClass(flagName, null)) + if (sampleRatio.isDefined) { + hadoopConf.setDouble(SamplePathFilter.ratioParam, sampleRatio.get) + } else { + hadoopConf.unset(SamplePathFilter.ratioParam) + None + } + + if (inspectZip.isDefined) { + hadoopConf.setBoolean(SamplePathFilter.inspectZipParam, inspectZip.get) + } else { + hadoopConf.unset(SamplePathFilter.inspectZipParam) + None + } + + value match { + case Some(v) => hadoopConf.setClass(flagName, v, classOf[PathFilter]) + case None => hadoopConf.unset(flagName) + } + old + } +} + +object RecursiveFlag { + /** + * Sets a value of spark recursive flag + * + * @param value value to set + * @param spark existing spark session + * @return previous value of this flag + */ + def setRecursiveFlag(value: Option[String], spark: SparkSession): Option[String] = { + val flagName = FileInputFormat.INPUT_DIR_RECURSIVE + val hadoopConf = spark.sparkContext.hadoopConfiguration + val old = Option(hadoopConf.get(flagName)) + + value match { + case Some(v) => hadoopConf.set(flagName, v) + case None => hadoopConf.unset(flagName) + } + + old + } +} diff --git a/src/core/ml/build.sbt b/src/core/ml/build.sbt new file mode 100644 index 0000000000..050b722ee9 --- /dev/null +++ b/src/core/ml/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core/test +//> DependsOn: core/spark +//> DependsOn: core/schema diff --git a/src/core/ml/src/test/scala/HashingTFSpec.scala b/src/core/ml/src/test/scala/HashingTFSpec.scala new file mode 100644 index 0000000000..576a35c53f --- /dev/null +++ b/src/core/ml/src/test/scala/HashingTFSpec.scala @@ -0,0 +1,81 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
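// Illustrative sketch only -- not part of this patch. It shows one way the RecursiveFlag and
// SamplePathFilter helpers defined in HadoopUtils.scala above might be combined to read roughly
// a 10% sample of the files under a directory tree; the SampledReadSketch object, the readSample
// method, and the use of binaryFiles are hypothetical choices made for this example.
import org.apache.spark.sql.SparkSession
import com.microsoft.ml.spark.hadoop.{RecursiveFlag, SamplePathFilter}

object SampledReadSketch {
  def readSample(spark: SparkSession, dir: String): Array[String] = {
    // Turn on recursive directory listing and install the sampling filter,
    // remembering the previous settings so they can be restored afterwards.
    val oldRecursive = RecursiveFlag.setRecursiveFlag(Some("true"), spark)
    val oldFilter = SamplePathFilter.setPathFilter(
      Some(classOf[SamplePathFilter]), sampleRatio = Some(0.1), inspectZip = Some(false), spark = spark)
    try {
      // FileInputFormat-based readers such as binaryFiles honor the configured PathFilter,
      // so only about sampleRatio of the files are listed and read.
      spark.sparkContext.binaryFiles(dir).keys.collect()
    } finally {
      // Restore the previous Hadoop configuration.
      RecursiveFlag.setRecursiveFlag(oldRecursive, spark)
      SamplePathFilter.setPathFilter(oldFilter, spark = spark)
    }
  }
}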
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.feature.{HashingTF, Tokenizer} +import org.apache.spark.ml.linalg.SparseVector + +class HashingTFSpec extends TestBase { + + test("operation on tokenized strings") { + val wordDataFrame = session.createDataFrame(Seq( + (0, Array("Hi", "I", "can", "not", "foo", "foo")), + (1, Array("I")), + (2, Array("Logistic", "regression")), + (3, Array("Log", "f", "reg")) + )).toDF("label", "words") + + val hashDF = new HashingTF().setInputCol("words").setOutputCol("hashedTF").transform(wordDataFrame) + val lines = hashDF.getSVCol("hashedTF") + + val trueLines = List( + new SparseVector(262144, Array(36073,51654,113890,139098,242088), Array(1.0,2.0,1.0,1.0,1.0)), + new SparseVector(262144, Array(113890), Array(1.0)), + new SparseVector(262144, Array(13671,142455), Array(1.0,1.0)), + new SparseVector(262144, Array(24152,74466,122984), Array(1.0,1.0,1.0)) + ) + assert(lines === trueLines) + } + + test("support several values for number of features") { + val featureSizes = List(1, 5, 100, 100000) + val words = Array("Hi", "I", "can", "not", "foo", "bar", "foo", "afk") + val wordDataFrame = session.createDataFrame(Seq((0, words))).toDF("label", "words") + + val fsResults = featureSizes.map { n => + new HashingTF() + .setNumFeatures(n) + .setInputCol("words") + .setOutputCol("hashedTF") + .transform(wordDataFrame) + .getSVCol("hashedTF")(0) + } + val trueResults = Array( + new SparseVector( 1, Array(0), Array(8.0)), + new SparseVector( 5, Array(0,2,3), Array(4.0,2.0,2.0)), + new SparseVector( 100, Array(0,10,18,33,62,67,80), Array(1.0,2.0,1.0,1.0,1.0,1.0,1.0)), + new SparseVector(100000, Array(5833,9467,16680,29018,68900,85762,97510), Array(1.0,1.0,1.0,1.0,1.0,1.0,2.0)) + ) + assert(fsResults === trueResults) + } + + test("treat empty strings as another word") { + val wordDataFrame = session.createDataFrame(Seq( + (0, "hey you no way"), + (1, ""))) + .toDF("label", "sentence") + + val tokenized = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(wordDataFrame) + val hashDF = new HashingTF().setInputCol("tokens").setOutputCol("HashedTF").transform(tokenized) + + val lines = hashDF.getSVCol("hashedTF") + assert(lines(1) === new SparseVector(262144, Array(249180), Array(1.0))) + } + + test("raise an error when applied to a null array") { + val tokenDataFrame = session.createDataFrame(Seq( + (0, Some(Array("Hi", "I", "can", "not", "foo"))), + (1, None)) + ).toDF("label", "tokens") + assertSparkException[org.apache.spark.SparkException](new HashingTF().setInputCol("tokens"), tokenDataFrame) + } + + test("raise an error when given strange values of n") { + List(0, -1, -10).foreach { n => + intercept[IllegalArgumentException] { new HashingTF().setNumFeatures(n) } + } + } + +} diff --git a/src/core/ml/src/test/scala/IDFSpec.scala b/src/core/ml/src/test/scala/IDFSpec.scala new file mode 100644 index 0000000000..80c71a7195 --- /dev/null +++ b/src/core/ml/src/test/scala/IDFSpec.scala @@ -0,0 +1,103 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} + +class IDFSpec extends TestBase { + + test("operation on hashingTF output") { + val sentenceData = session.createDataFrame(Seq((0, "Hi I"), + (1, "I wish"), + (2, "we Cant"))) + .toDF("label", "sentence") + + val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words") + val wordsData = tokenizer.transform(sentenceData) + val hashingTF = new HashingTF() + .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(20) + val featurizedData = hashingTF.transform(wordsData) + + val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features") + val idfModel = idf.fit(featurizedData) + val rescaledData = idfModel.transform(featurizedData) + + val lines = rescaledData.getSVCol("features") + val trueLines = List( + new SparseVector(20, Array(0, 9), Array(0.6931471805599453, 0.28768207245178085)), + new SparseVector(20, Array(9, 15), Array(0.28768207245178085, 0.6931471805599453)), + new SparseVector(20, Array(6, 13), Array(0.6931471805599453, 0.6931471805599453)) + ) + assert(lines === trueLines) + } + + test("operation on dense or sparse vectors") { + val denseVects = Seq((0, new DenseVector(Array(1, 1, 0, 0, 0))), + (1, new DenseVector(Array(0, 1, 1, 0, 0))), + (2, new DenseVector(Array(0, 0, 0, 1, 1)))) + + val denseVectDF = session.createDataFrame(denseVects).toDF("label", "features") + val sparseVectDF = session.createDataFrame(denseVects.map(p => (p._1, p._2.toSparse))).toDF("label", "features") + + val rescaledDD = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(denseVectDF).transform(denseVectDF) + val rescaledDS = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(denseVectDF).transform(sparseVectDF) + val rescaledSD = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(sparseVectDF).transform(denseVectDF) + val rescaledSS = + new IDF().setInputCol("features").setOutputCol("scaledFeatures").fit(sparseVectDF).transform(sparseVectDF) + + val resultsD = List(rescaledDD, rescaledSD).map(_.getDVCol("scaledFeatures")) + val resultsS = List(rescaledDS, rescaledSS).map(_.getSVCol("scaledFeatures")) + + assert(resultsD.head === resultsD(1)) + assert(resultsS.head === resultsS(1)) + assert(resultsD.head.map(_.toSparse) === resultsS.head) + } + + test("raise an error when applied to a null array") { + val df = session.createDataFrame(Seq((0, Some(new DenseVector(Array(1, 1, 0, 0, 0)))), + (1, Some(new DenseVector(Array(0, 1, 1, 0, 0)))), + (2, None))) + .toDF("id", "features") + val df2 = new IDF().setInputCol("features") + withoutLogging { + intercept[org.apache.spark.SparkException] { + new IDF().setInputCol("features").fit(df) + } + } + } + + test("support setting minDocFrequency") { + val df = session.createDataFrame(Seq((0, new DenseVector(Array(1, 1, 0, 0, 0))), + (1, new DenseVector(Array(0, 1, 1, 0, 0))), + (2, new DenseVector(Array(0, 0, 0, 1, 1))))) + .toDF("id", "features") + + val df2 = new IDF().setMinDocFreq(2) + .setInputCol("features").setOutputCol("rescaledFeatures") + .fit(df).transform(df) + val lines = df2.getDVCol("rescaledFeatures") + val trueLines = List(new DenseVector(Array(0.0, 0.28768207245178085, 0.0, 0.0, 0.0)), + new DenseVector(Array(0.0, 0.28768207245178085, 0.0, 0.0, 0.0)), + new DenseVector(Array(0.0, 0.0, 0.0, 0.0, 0.0))) + assert(lines === trueLines) + } 
+ + ignore("raise an error when given strange values of minDocumentFrequency") { + val df = session.createDataFrame(Seq((0, new DenseVector(Array(1, 1, 0, 0, 0))), + (1, new DenseVector(Array(0, 1, 1, 0, 0))), + (2, new DenseVector(Array(0, 0, 0, 1, 1))))) + .toDF("id", "features") + // new IDF().setMinDocFreq(-1).setInputCol("features").fit(df).transform(df).show() + List(-1, -10).foreach { n => + val estimator = new IDF().setMinDocFreq(n).setInputCol("features") + assertSparkException[IllegalArgumentException](estimator, df) + } + } + +} diff --git a/src/core/ml/src/test/scala/NGramSpec.scala b/src/core/ml/src/test/scala/NGramSpec.scala new file mode 100644 index 0000000000..a9e9cb247b --- /dev/null +++ b/src/core/ml/src/test/scala/NGramSpec.scala @@ -0,0 +1,74 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.feature.{NGram, Tokenizer} +import org.apache.spark.sql.DataFrame + +import scala.collection.mutable + +class NGramSpec extends TestBase { + + def ngramDFToScalaList(dataFrame: DataFrame, outputCol: String = "ngrams"): Array[List[Any]] = { + dataFrame.select(dataFrame(outputCol)).collect() + .map(_.getAs[mutable.WrappedArray[Any]](0).toList) + } + + test("operation on tokenized strings") { + val wordDataFrame = session.createDataFrame(Seq((0, Array("Hi", "I", "can", "not", "foo")), + (1, Array("I")), + (2, Array("Logistic", "regression")), + (3, Array("Log", "f", "reg")))) + .toDF("label", "words") + + val ngramDF = new NGram().setN(3) + .setInputCol("words").setOutputCol("ngrams") + .transform(wordDataFrame) + val ngrams = ngramDFToScalaList(ngramDF) + assert(ngrams(0) === Array("Hi I can", "I can not", "can not foo")) + assert(ngrams(1) === Array()) + assert(ngrams(2) === Array()) + assert(ngrams(3) === Array("Log f reg")) + } + + test("supporting several values for n") { + val ns = 1 to 6 + val words = Array("Hi", "I", "can", "not", "foo", "bar", "foo", "afk") + val wordDataFrame = session.createDataFrame(Seq((0, words))).toDF("label", "words") + val nGramResults = ns.map { n => + ngramDFToScalaList( + new NGram().setN(n) + .setInputCol("words").setOutputCol("ngrams") + .transform(wordDataFrame)) + } + ns.foreach { n => + assert(nGramResults(n-1)(0).head === words.take(n).mkString(" ")) + } + } + + test("handling empty strings gracefully") { + val wordDataFrame = session.createDataFrame(Seq((0, "hey you no way"), + (1, ""))) + .toDF("label", "sentence") + + val tokenized = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(wordDataFrame) + val ngrams = new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(tokenized) + assert(ngramDFToScalaList(ngrams)(1) === Nil) + } + + test("raise an error when applied to a null array") { + val tokenDataFrame = session.createDataFrame(Seq( + (0, Some(Array("Hi", "I", "can", "not", "foo"))), + (1, None)) + ).toDF("label", "tokens") + assertSparkException[org.apache.spark.SparkException](new NGram().setInputCol("tokens"), tokenDataFrame) + } + + test("raise an error when given strange values of n") { + List(0, -1, -10).foreach { n => + intercept[IllegalArgumentException] { new NGram().setN(n) } + } + } + +} diff --git a/src/core/ml/src/test/scala/OneHotEncoderSpec.scala b/src/core/ml/src/test/scala/OneHotEncoderSpec.scala new file mode 100644 index 0000000000..18bbe4e00e --- /dev/null +++ b/src/core/ml/src/test/scala/OneHotEncoderSpec.scala @@ -0,0 
+1,102 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark._ +import org.apache.spark.ml.feature.OneHotEncoder +import org.apache.spark.ml.linalg.SparseVector + +class OneHotEncoderSpec extends TestBase { + + test("expand category indicies") { + val df = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, 0.0), + (3, 2.0), + (4, 1.0), + (5, 0.0))) + .toDF("id", "categoryIndex") + + val encoded = + new OneHotEncoder() + .setInputCol("categoryIndex").setOutputCol("categoryVec") + .transform(df) + val oneHotList = encoded.getSVCol("categoryVec") + val trueList = List(new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(), Array()), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0))) + assert(oneHotList === trueList) + } + + test("support interger indicies") { + val df = session.createDataFrame(Seq((0, 0), + (1, 1), + (2, 0), + (3, 2), + (4, 1), + (5, 0) + )) + .toDF("id", "categoryIndex") + + val encoded= new OneHotEncoder().setInputCol("categoryIndex").setOutputCol("categoryVec").transform(df) + val oneHotList = encoded.getSVCol("categoryVec") + val trueList = List(new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0)), + new SparseVector(2, Array(), Array()), + new SparseVector(2, Array(1), Array(1.0)), + new SparseVector(2, Array(0), Array(1.0))) + assert(oneHotList === trueList) + } + + test("support not dropping the last feature") { + val df = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, 0.0), + (3, 2.0), + (4, 1.0), + (5, 0.0) + )) + .toDF("id", "categoryIndex") + + val encoded= new OneHotEncoder().setDropLast(false) + .setInputCol("categoryIndex").setOutputCol("categoryVec") + .transform(df) + val oneHotList = encoded.getSVCol("categoryVec") + val trueList = List(new SparseVector(3, Array(0), Array(1.0)), + new SparseVector(3, Array(1), Array(1.0)), + new SparseVector(3, Array(0), Array(1.0)), + new SparseVector(3, Array(2), Array(1.0)), + new SparseVector(3, Array(1), Array(1.0)), + new SparseVector(3, Array(0), Array(1.0))) + assert(oneHotList === trueList) + } + + test("raise an error when applied to a null array") { + val df = session.createDataFrame(Seq((0, Some(0.0)), + (1, Some(1.0)), + (2, None))) + .toDF("id", "categoryIndex") + assertSparkException[SparkException](new OneHotEncoder().setInputCol("categoryIndex"), df) + } + + test("raise an error when it receives a strange float") { + val df = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, 0.4))) + .toDF("id", "categoryIndex") + assertSparkException[SparkException](new OneHotEncoder().setInputCol("categoryIndex"), df) + + val df2 = session.createDataFrame(Seq((0, 0.0), + (1, 1.0), + (2, -1.0))) + .toDF("id", "categoryIndex") + assertSparkException[SparkException](new OneHotEncoder().setInputCol("categoryIndex"), df2) + } + +} diff --git a/src/core/ml/src/test/scala/Word2VecSpec.scala b/src/core/ml/src/test/scala/Word2VecSpec.scala new file mode 100644 index 0000000000..82f7e0ffdd --- /dev/null +++ b/src/core/ml/src/test/scala/Word2VecSpec.scala @@ -0,0 +1,93 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.feature.Word2Vec +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.DataFrame + +class Word2VecSpec extends TestBase { + + def genTokenizedText(): DataFrame = { + session.createDataFrame(Seq( + (0, Array("I", "walked", "the", "dog", "down", "the", "street")), + (1, Array("I", "walked", "with", "the", "dog")), + (2, Array("I", "walked", "the", "pup")) + )).toDF("label", "words") + } + + def genW2V(): Word2Vec = new Word2Vec().setSeed(1234).setMinCount(0) + + test("operation on tokenized strings") { + val df = genTokenizedText() + + val df2 = genW2V().setVectorSize(2) + .setInputCol("words").setOutputCol("features").fit(df).transform(df) + + val lines = df2.getDVCol("features") + assert(lines.forall(_.size == 2)) + } + + test("return vectors") { + val df = genTokenizedText() + val model = genW2V().setVectorSize(2) + .setInputCol("words").setOutputCol("features").fit(df) + val vectors = model.getVectors.getDVCol("vector") + assert(vectors(0).size == 2) + } + + test("return synonyms") { + val df = genTokenizedText() + val model = genW2V().setVectorSize(2) + .setInputCol("words").setOutputCol("features").fit(df) + val synonyms = model.findSynonyms("dog", 2).getColAs[String]("word") + assert(synonyms.length === 2) + } + + test("raise an error when applied to a null array") { + val tokenDataFrame = session.createDataFrame(Seq( + (0, Some(Array("Hi", "I", "can", "not", "foo"))), + (1, None)) + ).toDF("label", "tokens") + assertSparkException[org.apache.spark.SparkException](genW2V().setInputCol("tokens"), tokenDataFrame) + } + + test("raise an error when given strange values of parameters") { + def base(): Word2Vec = genW2V().setInputCol("words") + def assertIllegalArgument[T](f: T => Any, args: T*): Unit = + args.foreach { n => interceptWithoutLogging[IllegalArgumentException] { f(n) } } + assertIllegalArgument[Int](base.setMinCount, -1, -10) + assertIllegalArgument[Int](base.setMaxIter, -1, -10) + assertIllegalArgument[Int](base.setVectorSize, 0, -1, -10) + assertIllegalArgument[Int](base.setWindowSize, 0, -1, -10) + assertIllegalArgument[Int](base.setMaxSentenceLength, 0, -1, -10) + assertIllegalArgument[Int](base.setNumPartitions, 0, -1, -10) + assertIllegalArgument[Double](base.setStepSize, 0.0, -1.0, -10.0) + } + + test("return a vector of zeros when it encounters an OOV word") { + val df = genTokenizedText() + val model = genW2V().setVectorSize(2).setMinCount(1).setInputCol("words").setOutputCol("features").fit(df) + val df2 = session.createDataFrame(Seq( + (0, Array("ketchup")))).toDF("label", "words") + val results = model.transform(df2) + val lines = results.getDVCol("features") + val trueLines = List(new DenseVector(Array(0.0, 0.0))) + assert(lines === trueLines) + } + + test("be able to set vector size") { + val df = genTokenizedText() + val vectorSizes = List(1, 10, 100) + vectorSizes.foreach { n => + val results = + genW2V().setVectorSize(n) + .setInputCol("words").setOutputCol("features").fit(df).transform(df) + .getDVCol("features") + assert(results(0).size === n) + } + } + +} diff --git a/src/core/schema/build.sbt b/src/core/schema/build.sbt new file mode 100644 index 0000000000..d61f197ca6 --- /dev/null +++ b/src/core/schema/build.sbt @@ -0,0 +1,4 @@ +// Explicitly prevent core/test code from depending on core sources +//> DependsOn: core/test +//> DependsOn: core/spark +//> DependsOn: 
core/env diff --git a/src/core/schema/src/main/python/TypeConversionUtils.py b/src/core/schema/src/main/python/TypeConversionUtils.py new file mode 100644 index 0000000000..c3dfc50595 --- /dev/null +++ b/src/core/schema/src/main/python/TypeConversionUtils.py @@ -0,0 +1,17 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +def generateTypeConverter(name, cache, typeConverter): + return lambda value: typeConverter(name, value, cache) + +def complexTypeConverter(name, value, cache): + cache[name]=value + if isinstance(value, list): + java_value=[] + for v in value: + if hasattr(v, "_transfer_params_to_java"): + v._transfer_params_to_java() + java_value.append(v._java_obj) + return java_value + value._transfer_params_to_java() + return value._java_obj diff --git a/src/core/schema/src/main/python/Utils.py b/src/core/schema/src/main/python/Utils.py new file mode 100644 index 0000000000..bebde7030a --- /dev/null +++ b/src/core/schema/src/main/python/Utils.py @@ -0,0 +1,69 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= "3": + basestring = str + +from pyspark.ml.util import JavaMLReadable, JavaMLReader, MLReadable +from pyspark.ml.wrapper import JavaParams +from pyspark.ml.common import inherit_doc + +def from_java(java_stage, stage_name): + """ + Given a Java object, create and return a Python wrapper of it. + Used for ML persistence. + Meta-algorithms such as Pipeline should override this method as a classmethod. + """ + def __get_class(clazz): + """ + Loads Python class from its name. + """ + parts = clazz.split(".") + module = ".".join(parts[:-1]) + m = __import__(module) + for comp in parts[1:]: + m = getattr(m, comp) + return m + # Generate a default new instance from the stage_name class. + py_type = __get_class(stage_name) + if issubclass(py_type, JavaParams): + # Load information from java_stage to the instance. + py_stage = py_type() + py_stage._java_obj = java_stage + py_stage._resetUid(java_stage.uid()) + py_stage._transfer_params_from_java() + elif hasattr(py_type, "_from_java"): + py_stage = py_type._from_java(java_stage) + else: + raise NotImplementedError("This Java stage cannot be loaded into Python currently: %r" + % stage_name) + return py_stage + +@inherit_doc +class JavaMMLReadable(MLReadable): + """ + (Private) Mixin for instances that provide JavaMLReader. + """ + + @classmethod + def read(cls): + """Returns an MLReader instance for this class.""" + return JavaMMLReader(cls) + +@inherit_doc +class JavaMMLReader(JavaMLReader): + """ + (Private) Specialization of :py:class:`MLReader` for :py:class:`JavaParams` types + """ + + def __init__(self, clazz): + super(JavaMMLReader, self).__init__(clazz) + + @classmethod + def _java_loader_class(cls, clazz): + """ + Returns the full class name of the Java ML instance. + """ + return clazz.getJavaPackage() diff --git a/src/core/schema/src/main/scala/BinaryFileSchema.scala b/src/core/schema/src/main/scala/BinaryFileSchema.scala new file mode 100644 index 0000000000..2f0d7f2be9 --- /dev/null +++ b/src/core/schema/src/main/scala/BinaryFileSchema.scala @@ -0,0 +1,32 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
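
Aside (not part of the patch): a minimal sketch of how the BinaryFileSchema object that opens below might be exercised. The SparkSession value `spark`, the column name `file`, and the sample path are assumptions made only for illustration.

    import org.apache.spark.sql.Row
    import org.apache.spark.sql.types.{StructField, StructType}
    import com.microsoft.ml.spark.schema.BinaryFileSchema

    // A DataFrame with one column whose type is the (path, bytes) struct.
    val schema = StructType(Seq(StructField("file", BinaryFileSchema.columnSchema, true)))
    val rows = java.util.Arrays.asList(
      Row(Row("some/illustrative/path.bin", Array[Byte](1, 2, 3))))
    val df = spark.createDataFrame(rows, schema)

    BinaryFileSchema.isBinaryFile(df, "file")        // true: column type matches columnSchema
    BinaryFileSchema.getPath(df.head.getStruct(0))   // "some/illustrative/path.bin"
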
+ +package com.microsoft.ml.spark.schema + +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types.{StructType, StructField, StringType, BinaryType} + +object BinaryFileSchema { + + /* + * schema for the binary file column: Row(String, Array[Byte]) + */ + val columnSchema = StructType(Seq( + StructField("path", StringType, true), + StructField("bytes", BinaryType, true) //raw file bytes + )) + + def getPath(row: Row): String = row.getString(0) + def getBytes(row: Row): Array[Byte] = row.getAs[Array[Byte]](1) + + /** + * Check if the dataframe column contains binary file data (i.e. has BinaryFileSchema) + * + * @param df + * @param column + * @return + */ + def isBinaryFile(df: DataFrame, column: String): Boolean = + df.schema(column).dataType == columnSchema + +} diff --git a/src/core/schema/src/main/scala/Categoricals.scala b/src/core/schema/src/main/scala/Categoricals.scala new file mode 100644 index 0000000000..f9367949da --- /dev/null +++ b/src/core/schema/src/main/scala/Categoricals.scala @@ -0,0 +1,317 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +/** + * Contains objects and functions to manipulate Categoricals + */ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types._ +import org.apache.spark.ml.attribute._ +import org.apache.spark.sql.functions.udf +import SchemaConstants._ + +import scala.reflect.ClassTag + +object CategoricalUtilities { + + /** + * Sets the given levels on the column. + * @return The modified dataset. + */ + def setLevels(dataset: DataFrame, column: String, levels: Array[_]): DataFrame = { + if (levels == null) dataset + else dataset.withColumn(column, + dataset.col(column).as(column, + updateLevelsMetadata(dataset.schema(column).metadata, + levels, + getCategoricalTypeForValue(levels.head)))) + } + + /** + * Update the levels on the existing metadata. + * @param existingMetadata The existing metadata to add to. + * @param levels The levels to add to the metadata. + * @param dataType The datatype of the levels. + * @return The new metadata. + */ + def updateLevelsMetadata(existingMetadata: Metadata, levels: Array[_], dataType: DataType): Metadata = { + val bldr = + if (existingMetadata.contains(MMLTag)) { + new MetadataBuilder().withMetadata(existingMetadata.getMetadata(MMLTag)) + } else { + new MetadataBuilder() + } + bldr.putBoolean(Ordinal, false) + dataType match { + case DataTypes.StringType => bldr.putStringArray(ValuesString, levels.asInstanceOf[Array[String]]) + case DataTypes.DoubleType => bldr.putDoubleArray(ValuesDouble, levels.asInstanceOf[Array[Double]]) + // Ints require special treatment, because Spark does not have putIntArray yet: + case DataTypes.IntegerType => bldr.putLongArray(ValuesInt, levels.asInstanceOf[Array[Int]].map(_.toLong)) + case DataTypes.LongType => bldr.putLongArray(ValuesLong, levels.asInstanceOf[Array[Long]]) + case DataTypes.BooleanType => bldr.putBooleanArray(ValuesBool, levels.asInstanceOf[Array[Boolean]]) + case _ => throw new UnsupportedOperationException("Unsupported categorical data type: " + dataType) + } + val metadata = bldr.build() + + new MetadataBuilder().withMetadata(existingMetadata).putMetadata(MMLTag, metadata).build() + } + + /** + * Gets the levels from the dataset. + * @param schema The schema to get the levels from. + * @param column The column to retrieve metadata levels from. + * @return The levels. 
+ */ + def getLevels(schema: StructType, column: String): Option[Array[_]] = { + val metadata = schema(column).metadata + + if (metadata.contains(MMLTag)) { + val dataType: Option[DataType] = getDataType(metadata) + if (dataType.isEmpty) None + else { + dataType.get match { + case DataTypes.StringType => Some(getMap[String](metadata).levels) + case DataTypes.LongType => Some(getMap[Long](metadata).levels) + case DataTypes.IntegerType => Some(getMap[Int](metadata).levels) + case DataTypes.DoubleType => Some(getMap[Double](metadata).levels) + case DataTypes.BooleanType => Some(getMap[Boolean](metadata).levels) + case default => throw new UnsupportedOperationException("Unknown categorical type: " + default.typeName) + } + } + } else { + None + } + } + + /** + * Gets the number of levels from the dataset. + * @param dataset The dataset to get the levels count from. + * @param column The column to retrieve metadata levels count from. + * @return The number of levels. + */ + def getLevelCount(dataset: DataFrame, column: String): Option[Int] = { + val metadata = dataset.schema(column).metadata + + if (metadata.contains(MMLTag)) { + val dataType: Option[DataType] = getDataType(metadata) + + if (dataType.isEmpty) None + else { + val numLevels = + dataType.get match { + case DataTypes.StringType => getMap[String](metadata).numLevels + case DataTypes.LongType => getMap[Long](metadata).numLevels + case DataTypes.IntegerType => getMap[Int](metadata).numLevels + case DataTypes.DoubleType => getMap[Double](metadata).numLevels + case DataTypes.BooleanType => getMap[Boolean](metadata).numLevels + case default => throw new UnsupportedOperationException("Unknown categorical type: " + default.typeName) + } + Option(numLevels) + } + } else { + None + } + } + + /** + * Get the map of array of T from the metadata. + * + * @param ct Implicit class tag. + * @param metadata The metadata to retrieve from. + * @tparam T The type of map to retrieve. + * @return The map of array of T. + */ + def getMap[T](metadata: Metadata)(implicit ct: ClassTag[T]): CategoricalMap[T] = { + val data = + if (metadata.contains(MMLTag)) { + metadata.getMetadata(MMLTag) + } else if (metadata.contains(MLlibTag)) { + metadata.getMetadata(MLlibTag) + } else { + sys.error("Invalid metadata to retrieve map from") + } + + val categoricalMap = implicitly[ClassTag[T]] match { + case ClassTag.Int => new CategoricalMap[Int](data.getLongArray(ValuesInt).map(_.toInt)) + case ClassTag.Double => new CategoricalMap[Double](data.getDoubleArray(ValuesDouble)) + case ClassTag.Boolean => new CategoricalMap[Boolean](data.getBooleanArray(ValuesBool)) + case ClassTag.Long => new CategoricalMap[Long](data.getLongArray(ValuesLong)) + case _ => new CategoricalMap[String](data.getStringArray(ValuesString)) + } + categoricalMap.asInstanceOf[CategoricalMap[T]] + } + + /** + * Get a type for the given value. + * @param value The value to get the type from. + * @tparam T The generic type of the value. + * @return The DataType based on the value. 
+   */
+  def getCategoricalTypeForValue[T](value: T): DataType = {
+    value match {
+      // Complicated type matching is required to get around type erasure
+      case _: String  => DataTypes.StringType
+      case _: Double  => DataTypes.DoubleType
+      case _: Int     => DataTypes.IntegerType
+      case _: Long    => DataTypes.LongType
+      case _: Boolean => DataTypes.BooleanType
+      case _ => throw new UnsupportedOperationException("Unsupported categorical data type")
+    }
+  }
+
+  private def getDataType(metadata: Metadata): Option[DataType] = {
+    val columnMetadata = metadata.getMetadata(MMLTag)
+    val dataType =
+      if (columnMetadata.contains(ValuesString)) Some(DataTypes.StringType)
+      else if (columnMetadata.contains(ValuesLong)) Some(DataTypes.LongType)
+      else if (columnMetadata.contains(ValuesInt)) Some(DataTypes.IntegerType)
+      else if (columnMetadata.contains(ValuesDouble)) Some(DataTypes.DoubleType)
+      else if (columnMetadata.contains(ValuesBool)) Some(DataTypes.BooleanType)
+      else None
+    dataType
+  }
+
+}
+
+/**
+ * A wrapper around level maps: Map[T -> Int] and Map[Int -> T] that converts
+ * the data to/from Spark Metadata in both MLlib and AzureML formats.
+ * @param levels The level values are assumed to be already sorted as needed
+ * @param isOrdinal A flag that indicates if the data are ordinal
+ * @tparam T Input levels could be String, Double, Int, Long, Boolean
+ */
+class CategoricalMap[T](val levels: Array[T], val isOrdinal: Boolean = false) extends Serializable {
+  //TODO: handle NULL values
+
+  require(levels.distinct.size == levels.size, "Categorical levels are not unique.")
+  require(!levels.isEmpty, "Levels should not be empty")
+
+  /** total number of levels */
+  val numLevels = levels.length //TODO: add the maximum possible number of levels?
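
Aside (not part of the patch): a small usage sketch for the CategoricalUtilities helpers defined above. The SparkSession value `spark` and the column names are illustrative assumptions.

    import com.microsoft.ml.spark.schema.CategoricalUtilities
    import spark.implicits._

    val df = Seq((0, "piano"), (1, "guitar"), (2, "piano")).toDF("id", "instrument")

    // Attach categorical levels to the column as MML metadata...
    val tagged = CategoricalUtilities.setLevels(df, "instrument", Array("guitar", "piano"))

    // ...and read them back from the schema.
    CategoricalUtilities.getLevels(tagged.schema, "instrument")   // Some(Array(guitar, piano))
    CategoricalUtilities.getLevelCount(tagged, "instrument")      // Some(2)
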
+ + /** Spark DataType correspondint to type T */ + val dataType = CategoricalUtilities.getCategoricalTypeForValue(levels.head) + + /** Maps levels to the corresponding integer index */ + private lazy val levelToIndex: Map[T, Int] = levels.zipWithIndex.toMap + + /** Returns the index of the given level, can throw */ + def getIndex(level: T): Int = levelToIndex(level) + + /** Returns the index of a given level as Option; does not throw */ + def getIndexOption(level: T): Option[Int] = levelToIndex.get(level) + + /** Checks if the given level exists */ + def hasLevel(level: T): Boolean = levelToIndex.contains(level) + + /** Returns the level of the given index; can throw */ + def getLevel(index: Int): T = levels(index) + + /** Returns the level of the given index as Option; does not throw */ + def getLevelOption(index: Int): Option[T] = + if (index < 0 || index >= numLevels) None else Some(levels(index)) + + /** Stores levels in Spark Metadata in either MLlib format */ + private def toMetadataMllib(existingMetadata: Metadata): Metadata = { + require(!isOrdinal, "Cannot save Ordinal data in MLlib Nominal format currently," + + " because it does not have a public constructor that accepts Ordinal") + + // Currently, MLlib converts all non-string categorical values to string; + // see org.apache.spark.ml.feature.StringIndexer + val strLevels = levels.map(_.toString).asInstanceOf[Array[String]] + + NominalAttribute.defaultAttr.withValues(strLevels).toMetadata(existingMetadata) + } + + /** Stores levels in Spark Metadata in MML format */ + private def toMetadataMML(existingMetadata: Metadata): Metadata = { + CategoricalUtilities.updateLevelsMetadata(existingMetadata, levels, dataType) + } + + /** Add categorical levels to existing Spark Metadata + * @param existingMetadata [tag, categorical metadata] pair is added to existingMetadata, + * where tag is either MLlib or MML + * @param mmlStyle MML (true) or MLlib metadata (false) + */ + def toMetadata(existingMetadata: Metadata, mmlStyle: Boolean): Metadata = { + + // assert that metadata does not have data with this tag + def assertNoTag(tag: String) = + assert(!existingMetadata.contains(tag), + //TODO: add tests to ensure + s"Metadata already contains the tag $tag; all the data are eraised") + + if (mmlStyle) { + assertNoTag(MMLTag) + toMetadataMML(existingMetadata) + } else { + assertNoTag(MLlibTag) + toMetadataMllib(existingMetadata) + } + } + + /** Add categorical levels and in either MML or MLlib style metadata + * @param mmlStyle MML (true) or MLlib metadata (false) + */ + def toMetadata(mmlStyle: Boolean): Metadata = toMetadata(Metadata.empty, mmlStyle) + +} + +/** + * Extract categorical info from the DataFrame column + * @param df dataframe + * @param column column name + */ +class CategoricalColumnInfo(df: DataFrame, column: String) { + + private val columnSchema = df.schema(column) + private val metadata = columnSchema.metadata + + /** Get the basic info: whether the column is categorical or not, actual type of the column, etc */ + val (isCategorical, isMML, isOrdinal, dataType) = { + + val notCategorical = (false, false, false, NullType) + + if (columnSchema.dataType != DataTypes.IntegerType + && columnSchema.dataType != DataTypes.DoubleType) notCategorical + else if (metadata.contains(MMLTag)) { + val columnMetadata = metadata.getMetadata(MMLTag) + + if (!columnMetadata.contains(Ordinal)) notCategorical + else { + val isOrdinal = columnMetadata.getBoolean(Ordinal) + + val dataType = + if (columnMetadata.contains(ValuesString)) 
DataTypes.StringType + else if (columnMetadata.contains(ValuesLong)) DataTypes.LongType + else if (columnMetadata.contains(ValuesInt)) DataTypes.IntegerType + else if (columnMetadata.contains(ValuesLong)) DataTypes.LongType + else if (columnMetadata.contains(ValuesDouble)) DataTypes.DoubleType + else if (columnMetadata.contains(ValuesBool)) DataTypes.BooleanType + else throw new Exception("Unrecognized datatype in MML metadata") + + (true, true, isOrdinal, dataType) + } + } + else if (metadata.contains(MLlibTag)) { + val columnMetadata = metadata.getMetadata(MLlibTag) + // nominal metadata has ["type" -> "nominal"] pair + val isCategorical = columnMetadata.contains(MLlibTypeTag) && + columnMetadata.getString(MLlibTypeTag) == AttributeType.Nominal.name + + if (!isCategorical) notCategorical + else { + val isOrdinal = if (columnMetadata.contains(Ordinal)) columnMetadata.getBoolean(Ordinal) else false + val dataType = + if (columnMetadata.contains(ValuesString)) DataTypes.StringType + else throw new UnsupportedOperationException("nominal attribute does not contain string levels") + (true, false, isOrdinal, dataType) + } + } else + notCategorical + } + +} diff --git a/src/core/schema/src/main/scala/DatasetExtensions.scala b/src/core/schema/src/main/scala/DatasetExtensions.scala new file mode 100644 index 0000000000..c71a814c68 --- /dev/null +++ b/src/core/schema/src/main/scala/DatasetExtensions.scala @@ -0,0 +1,68 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import org.apache.spark.ml.linalg.{DenseVector, SparseVector} +import org.apache.spark.sql.DataFrame +import scala.collection.mutable + +/** + * Contains methods for manipulating spark dataframes and datasets. + */ +object DatasetExtensions { + + implicit class MMLDataFrame(val df: DataFrame) extends AnyVal { + /** + * Finds an unused column name given initial column name in the given schema. + * The unused column name will be given prefix with a number appended to it, eg "testColumn_5". + * There will be an underscore between the column name and the number appended. + * + * @return The unused column name. + */ + def withDerivativeCol(prefix: String): String = { + val columnNamesSet = mutable.HashSet(df.columns: _*) + findUnusedColumnName(prefix)(columnNamesSet) + } + + /** + * Gets the column values as the given type. + * @param colname The column name to retrieve from. + * @tparam T The type to retrieve. + * @return The sequence of values in the column. + */ + def getColAs[T](colname: String): Seq[T] = { + df.select(colname).collect.map(_.getAs[T](0)) + } + + /** + * Gets the spark sparse vector column. + * @return The spark sparse vector column. + */ + def getSVCol: String => Seq[SparseVector] = getColAs[SparseVector] _ + + /** + * Gets the spark dense vector column. + * @return The spark dense vector column. + */ + def getDVCol: String => Seq[DenseVector] = getColAs[DenseVector] _ + } + + /** + * Finds an unused column name given initial column name and a list of existing column names. + * The unused column name will be given prefix with a number appended to it, eg "testColumn_5". + * There will be an underline between the column name and the number appended. + * + * @return The unused column name. 
+ */ + def findUnusedColumnName(prefix: String)(columnNames: scala.collection.Set[String]): String = { + var counter = 2 + var unusedColumnName = prefix + while (columnNames.contains(unusedColumnName)) { + unusedColumnName += "_" + counter + counter += 1 + } + unusedColumnName + } + +} diff --git a/src/core/schema/src/main/scala/ImageSchema.scala b/src/core/schema/src/main/scala/ImageSchema.scala new file mode 100644 index 0000000000..f5c2502390 --- /dev/null +++ b/src/core/schema/src/main/scala/ImageSchema.scala @@ -0,0 +1,46 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import com.microsoft.ml.spark._ +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.types._ + +import scala.reflect.ClassTag + +object ImageSchema { + + /** + * schema for the image column: Row(String, Int, Int, Int, Array[Byte]) + */ + val columnSchema = StructType( + StructField("path", StringType, true) :: + StructField("height", IntegerType, true) :: + StructField("width", IntegerType, true) :: + StructField("type", IntegerType, true) :: //OpenCV type: CV_8U in most cases + StructField("bytes", BinaryType, true) :: Nil) //OpenCV bytes: row-wise BGR in most cases + + def getPath(row: Row): String = row.getString(0) + def getHeight(row: Row): Int = row.getInt(1) + def getWidth(row: Row): Int = row.getInt(2) + def getType(row: Row): Int = row.getInt(3) + def getBytes(row: Row): Array[Byte] = row.getAs[Array[Byte]](4) + + /** + * Check if the dataframe column contains images (i.e. has imageSchema) + * + * @param df + * @param column + * @return + */ + def isImage(df: DataFrame, column: String): Boolean = + df.schema(column).dataType == columnSchema + + private[spark] def loadLibraryForAllPartitions[T:ClassTag](rdd: RDD[T], lib: String):RDD[T] = { + def perPartition(it: Iterator[T]):Iterator[T] = { + new NativeLoader("/org/opencv/lib").loadLibraryByName(lib); it } + rdd.mapPartitions(perPartition, preservesPartitioning = true) + } +} diff --git a/src/core/schema/src/main/scala/SchemaConstants.scala b/src/core/schema/src/main/scala/SchemaConstants.scala new file mode 100644 index 0000000000..b685f8ea73 --- /dev/null +++ b/src/core/schema/src/main/scala/SchemaConstants.scala @@ -0,0 +1,44 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +/** + * Contains constants used by modules for schema. 
+ */ +object SchemaConstants { + + val ScoreColumnKind = "ScoreColumnKind" + val ScoreValueKind = "ScoreValueKind" + + val TrueLabelsColumn = "true_labels" + val ScoredLabelsColumn = "scored_labels" + val ScoresColumn = "scores" + val ScoredProbabilitiesColumn = "scored_probabilities" + + val ScoreModelPrefix = "score_model" + val MMLTag = "mml" // MML metadata tag + val MLlibTag = "ml_attr" // MLlib metadata tag, see org.apache.spark.ml.attribute.AttributeKeys + + /** The following tags are used in Metadata representation of categorical data + * do not change them or use them directly + * (see org.apache.spark.ml.attribute.AttributeKeys for the first three) + */ + val Ordinal = "ord" // common tag for both MLlib and MML + val MLlibTypeTag = "type" // MLlib tag for the attribute types + val ValuesString = "vals" // common tag for both MLlib and MML + val ValuesInt = "vals_int" + val ValuesLong = "vals_long" + val ValuesDouble = "vals_double" + val ValuesBool = "vals_bool" + + // Score value kinds, or types of ML: + val ClassificationKind = "Classification" + val RegressionKind = "Regression" + + // Spark native column names + val SparkPredictionColumn = "prediction" + val SparkRawPredictionColumn = "rawPrediction" + val SparkProbabilityColumn = "probability" + +} diff --git a/src/core/schema/src/main/scala/SparkSchema.scala b/src/core/schema/src/main/scala/SparkSchema.scala new file mode 100644 index 0000000000..858409cf37 --- /dev/null +++ b/src/core/schema/src/main/scala/SparkSchema.scala @@ -0,0 +1,352 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.functions._ +import SchemaConstants._ +import scala.reflect.ClassTag + +/** + * Schema modification and information retrieval methods. + */ +object SparkSchema { + + /** + * Sets the label column name. + * + * @param dataset The dataset to set the label column name on. + * @param modelName The model name. + * @param columnName The column name to set as the label. + * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setLabelColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(TrueLabelsColumn) + + /** + * Sets the scored labels column name. + * + * @param dataset The dataset to set the scored labels column name on. + * @param modelName The model name. + * @param columnName The column name to set as the scored label. + * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setScoredLabelsColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(ScoredLabelsColumn) + + /** + * Sets the scored probabilities column name. + * + * @param dataset The dataset to set the scored probabilities column name on. + * @param modelName The model name. + * @param columnName The column name to set as the scored probability. + * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setScoredProbabilitiesColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(ScoredProbabilitiesColumn) + + /** + * Sets the scores column name. + * + * @param dataset The dataset to set the scores column name on. + * @param modelName The model name. + * @param columnName The column name to set as the scores. 
+ * @param scoreValueKindModel The model type. + * @return The modified dataset. + */ + def setScoresColumnName: (DataFrame, String, String, String) => DataFrame = + setColumnName(ScoresColumn) + + /** + * Gets the label column name. + * + * @param dataset The dataset to get the label column from. + * @param modelName The model to retrieve the label column from. + * @return The label column name. + */ + def getLabelColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(TrueLabelsColumn)(dataset.schema, modelName) + + /** + * Gets the scored labels column name. + * + * @param dataset The dataset to get the scored labels column from. + * @param modelName The model to retrieve the scored labels column from. + * @return The scored labels column name. + */ + def getScoredLabelsColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(ScoredLabelsColumn)(dataset.schema, modelName) + + /** + * Gets the scores column name. + * + * @param dataset The dataset to get the scores column from. + * @param modelName The model to retrieve the scores column from. + * @return The scores column name. + */ + def getScoresColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(ScoresColumn)(dataset.schema, modelName) + + /** + * Gets the scored probabilities column name. + * + * @param dataset The dataset to get the scored probabilities column from. + * @param modelName The model to retrieve the scored probabilities column from. + * @return The scored probabilities column name. + */ + def getScoredProbabilitiesColumnName(dataset: DataFrame, modelName: String): String = + getScoreColumnKindColumn(ScoredProbabilitiesColumn)(dataset.schema, modelName) + + /** + * Gets the label column name. + * + * @param dataset The dataset to get the label column from. + * @param modelName The model to retrieve the label column from. + * @return The label column name. + */ + def getLabelColumnName: (StructType, String) => String = + getScoreColumnKindColumn(TrueLabelsColumn) + + /** + * Gets the scored labels column name. + * + * @param dataset The dataset to get the scored labels column from. + * @param modelName The model to retrieve the scored labels column from. + * @return The scored labels column name. + */ + def getScoredLabelsColumnName: (StructType, String) => String = + getScoreColumnKindColumn(ScoredLabelsColumn) + + /** + * Gets the scores column name. + * + * @param dataset The dataset to get the scores column from. + * @param modelName The model to retrieve the scores column from. + * @return The scores column name. + */ + def getScoresColumnName: (StructType, String) => String = + getScoreColumnKindColumn(ScoresColumn) + + /** + * Gets the scored probabilities column name. + * + * @param dataset The dataset to get the scored probabilities column from. + * @param modelName The model to retrieve the scored probabilities column from. + * @return The scored probabilities column name. + */ + def getScoredProbabilitiesColumnName: (StructType, String) => String = + getScoreColumnKindColumn(ScoredProbabilitiesColumn) + + /** + * Gets the score value kind or null if it does not exist from a dataset. + * + * @param scoreColumnKindColumn The score column kind to retrieve. + * @param dataset The dataset to get the score column kind column name from. + * @param modelName The model to retrieve the score column kind column name from. + * @param columnName The column to retrieve the score value kind from. 
+ * @return + */ + def getScoreValueKind(dataset: DataFrame, modelName: String, columnName: String): String = { + getScoreValueKind(dataset.schema, modelName, columnName) + } + + /** + * Gets the score value kind or null if it does not exist from the schema. + * + * @param scoreColumnKindColumn The score column kind to retrieve. + * @param schema The schema to get the score column kind column name from. + * @param modelName The model to retrieve the score column kind column name from. + * @param columnName The column to retrieve the score value kind from. + * @return + */ + def getScoreValueKind(schema: StructType, modelName: String, columnName: String): String = { + val metadata = schema(columnName).metadata + if (metadata == null) return null + getMetadataFromModule(metadata, modelName, ScoreValueKind) + } + + /** + * Sets the score column kind. + * + * @param scoreColumnKindColumn The score column kind column. + * @param dataset The dataset to set the score column kind on. + * @param modelName The model name. + * @param columnName The column name to set as the specified score column kind. + * @param scoreValueKindModel The model type. + * @return + */ + private def setColumnName(scoreColumnKindColumn: String) + (dataset: DataFrame, modelName: String, + columnName: String, scoreValueKindModel: String): DataFrame = { + dataset.withColumn(columnName, + dataset.col(columnName).as(columnName, + updateMetadata(dataset.schema(columnName).metadata, + scoreColumnKindColumn, scoreValueKindModel, modelName))) + } + + /** + * Gets the score column kind column name or null if it does not exist. + * + * @param scoreColumnKindColumn The score column kind to retrieve. + * @param schema The schema to get the score column kind column name from. + * @param modelName The model to retrieve the score column kind column name from. 
+ * @return + */ + private def getScoreColumnKindColumn(scoreColumnKindColumn: String) + (schema: StructType, modelName: String): String = { + val structField = schema.find { + case StructField(_, _, _, metadata) => + getMetadataFromModule(metadata, modelName, ScoreColumnKind) == scoreColumnKindColumn + } + if (structField.isEmpty) null else structField.get.name + } + + private def updateMetadata(metadata: Metadata, scoreColumnKindColumn: String, + scoreValueKindModel: String, moduleName: String): Metadata = { + val mmltagMetadata = + if (metadata.contains(MMLTag)) metadata.getMetadata(MMLTag) + else null + val moduleNameMetadata = + if (mmltagMetadata != null && mmltagMetadata.contains(moduleName)) + mmltagMetadata.getMetadata(moduleName) + else null + + val moduleMetadataBuilder = new MetadataBuilder() + if (mmltagMetadata != null && moduleNameMetadata != null) { + moduleMetadataBuilder.withMetadata(moduleNameMetadata) + } + moduleMetadataBuilder.putString(ScoreColumnKind, scoreColumnKindColumn) + moduleMetadataBuilder.putString(ScoreValueKind, scoreValueKindModel) + + val moduleBuilder = new MetadataBuilder() + if (mmltagMetadata != null) { + moduleBuilder.withMetadata(mmltagMetadata) + } + moduleBuilder.putMetadata(moduleName, moduleMetadataBuilder.build()) + + new MetadataBuilder() + .withMetadata(metadata) + .putMetadata(MMLTag, moduleBuilder.build()) + .build() + } + + private def getMetadataFromModule(colMetadata: Metadata, moduleName: String, tag: String): String = { + if (!colMetadata.contains(MMLTag)) return null + val mlTagMetadata = colMetadata.getMetadata(MMLTag) + if (!mlTagMetadata.contains(moduleName)) return null + val modelMetadata = mlTagMetadata.getMetadata(moduleName) + if (!modelMetadata.contains(tag)) return null + modelMetadata.getString(tag) + } + + /** + * Convert the regular column to the categorical one + * @param df dataframe + * @param column column name + * @param newColumn new categorical column name + * @param mmlStyle MML format (true, default) or MLlib format (false) + * @return updated dataframe + */ + def makeCategorical(df: DataFrame, + column: String, + newColumn: String, + mmlStyle: Boolean = true): DataFrame = { + + val dataType = df.schema(column).dataType + val collected = df.select(column).distinct().collect() + + dataType match { + //TODO: all cases below are the same; can we simplify the code with a single generic function? 
+ case _: IntegerType => { + val levels = collected.map(row => row(0).asInstanceOf[Int]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Int) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: LongType => { + val levels = collected.map(row => row(0).asInstanceOf[Long]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Long) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: DoubleType => { + val levels = collected.map(row => row(0).asInstanceOf[Double]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Double) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: StringType => { + val levels = collected.map(row => row(0).asInstanceOf[String]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: String) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + + case _: BooleanType => { + val levels = collected.map(row => row(0).asInstanceOf[Boolean]) + val map = new CategoricalMap(levels.sorted) + val getIndex = udf((level: Boolean) => map.getIndex(level)) + val metadata = map.toMetadata(mmlStyle) + df.withColumn(newColumn, getIndex(df(column)).as(newColumn, metadata)) + } + //case _: BooleanType => makeCategorical[Boolean] + case _ => throw new Exception("Unsupported Categorical type " + dataType.toString) + } + } + + /** + * Convert the regular column to the categorical one + * @param df dataframe + * @param column column name + * @param newColumn new categorical column name + * @param mmlStyle MML format (true, default) or MLlib format (false) + * @return updated dataframe + */ + def makeNonCategorical(df: DataFrame, + column: String, + newColumn: String): DataFrame = { + + val info = new CategoricalColumnInfo(df, column) + require(info.isCategorical, "column " + column + "is not Categorical") + require(info.dataType == StringType, "underlying categorical is not String based") //TODO: add other types too + //(isCategorical, isMML, isOrdinal, dataType) + + val map = CategoricalUtilities.getMap[String](df.schema(column).metadata) + val getLevel = udf((index: Int) => map.getLevel(index)) //TODO: can throw? 
+ df.withColumn(newColumn, getLevel(df(column)).as(newColumn)) //TODO: keeping metadata: .as(newColumn,metadata) + } + + /** find if the given column is a string */ + def isString(df: DataFrame, column: String): Boolean = { + df.schema(column).dataType == DataTypes.StringType + } + + /** find if the given column is numeric */ + def isNumeric(df: DataFrame, column: String): Boolean = { + df.schema(column).dataType.isInstanceOf[NumericType] + } + + /** find if the given column is boolean */ + def isBoolean(df: DataFrame, column: String): Boolean = { + df.schema(column).dataType.isInstanceOf[BooleanType] + } + + /** find if the given column is Categorical; use CategoricalColumnInfo for more details */ + def isCategorical(df: DataFrame, column: String): Boolean = { + val info = new CategoricalColumnInfo(df, column) + info.isCategorical + } + +} diff --git a/src/core/schema/src/test/scala/TestCategoricals.scala b/src/core/schema/src/test/scala/TestCategoricals.scala new file mode 100644 index 0000000000..11bd6f41f4 --- /dev/null +++ b/src/core/schema/src/test/scala/TestCategoricals.scala @@ -0,0 +1,131 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ +import com.microsoft.ml.spark.schema._ + +import scala.reflect.{ClassTag, classTag} + +class TestCategoricalMap extends TestBase { + + /** basic asserts that should be true for all Categorical Maps + * + * @param levels sorted categorical levels + * @param wrong_level a value that is not a level + * @param dataType corresponding Spark datatype + * @param isOrdinal whether levels are Ordinal or not + * @param mmlStyle save to MML (true, default) or MLlib (false) metadata + */ + private def testMapBasic[T: ClassTag](levels: Array[T], wrong_level: T, dataType: DataType, + isOrdinal: Boolean, mmlStyle: Boolean = true): Unit = { + + val map = new CategoricalMap(levels, isOrdinal) + val s = " " + classTag[T]; // to idenfity which type throws the error + + assert(map.numLevels == levels.length, "numLevels" + s) + assert(map.isOrdinal == isOrdinal, "isOrdinal" + s) + assert(map.dataType == dataType, "dataType" + s) + assert(map.getIndex(levels.head) == 0 & map.getIndex(levels.last) == levels.length - 1, "getIndex" + s) + assert(map.getIndexOption(wrong_level) == None & map.getIndexOption(levels(1)) == Some(1), "getIndexOption" + s) + assert(map.hasLevel(levels(1)) == true & map.hasLevel(wrong_level) == false, "hasLevel" + s) + assert(map.getLevel(1) == levels(1), "getLevel" + s) + assert(map.getLevelOption(1) == Some(levels(1)) & map.getLevelOption(-1) == None, "getLevelOption" + s) + + val mml_meta = map.toMetadata(mmlStyle) //TODO: check metadata for correctness + } + + /** test CategoricalMap for different undelying types */ + test("Test: Create basic CategoricalMap") { + + for (mmlStyle <- List(true, false)) { + + val isOrdinal = mmlStyle + + val strArray = Array("as", "", "efe") + testMapBasic(strArray, "wrong_level", StringType, isOrdinal, mmlStyle) + + val intArray = Array[Int](34, 54747, -346, 756, 0) + testMapBasic(intArray, -45, IntegerType, isOrdinal, mmlStyle) + + val longArray = Array[Long](34, 54747, -346, 756, 0) + testMapBasic(longArray, (-45: Long), LongType, isOrdinal, mmlStyle) + + val doubleArray = Array[Double](34.45, 54.747, -3.46, 7.56, 0) + 
testMapBasic(doubleArray, (-45: Double), DoubleType, isOrdinal, mmlStyle) + } + } + + import session.implicits._ + + /** sample dafaframe */ + private val DF = Seq[(Int, Long, Double, Boolean, String)]( + (-3, 24L, 0.32534, true, "piano"), + (1, 5L, 5.67, false, "piano"), + (-3, 5L, 0.32534, false, "guitar")) + .toDF("int", "long", "double", "bool", "string") + + /** sample dafaframe with Null values*/ + private val nullDF = Seq[(String, java.lang.Integer, java.lang.Double)]( + ("Alice", null, 44.3), + (null, 60, null), + ("Josh", 25, Double.NaN)) + .toDF("string", "int", "double") + + /** test CategoricalMap for different undelying types */ + test("Test: Convert the regular column into categorical") { + for (col <- DF.columns; mmlStyle <- List(false, true)) { + val newName = col + "_cat" + val df = SparkSchema.makeCategorical(DF, column = col, newColumn = newName, mmlStyle) + + assert(!SparkSchema.isCategorical(df, col), "Check for non-categorical columns") + assert(SparkSchema.isCategorical(df, newName), "Check for categorical columns") + + val info = new CategoricalColumnInfo(df, newName) + + assert(info.isCategorical, "the column is supposed to be categorical") + assert(info.isMML == mmlStyle, "wrong metadata style in categorical column") + assert(!info.isOrdinal, "wrong ordinal style in categorical column") + if (mmlStyle) + assert(info.dataType == DF.schema(col).dataType, "categorical data type is not correct") + else + assert(info.dataType == StringType, "categorical data type is not String") + } + } + + test("Test: String categorical levels") { + val col = "string" + val true_levels = DF.select("string").collect().map(_(0).toString).distinct.sorted + + for (mmlStyle <- List(false, true)) { + val newName = col + "_cat" + val df = SparkSchema.makeCategorical(DF, column = col, newColumn = newName, mmlStyle) + + val map = CategoricalUtilities.getMap[String](df.schema(newName).metadata) + + val levels = map.levels.sorted + + (true_levels zip levels).foreach { + case (a, b) => assert(a == b, "categorical levels are not the same") + } + } + } + + test("Test: Going to Categorical and Back") { + val col = "string" + for (mmlStyle <- List(false, true)) { + val newName = col + "_cat" + val df = SparkSchema.makeCategorical(DF, column = col, newColumn = newName, mmlStyle) + + val testName = col + "_noncat" + val df1 = SparkSchema.makeNonCategorical(df, column = newName, newColumn = testName) + + df1.select(col, testName).collect.foreach(row => assert(row(0) == row(1), "two columns should be the same")) + } + } + +} diff --git a/src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala b/src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala new file mode 100644 index 0000000000..7cb44dc0c5 --- /dev/null +++ b/src/core/schema/src/test/scala/VerifyFastVectorAssembler.scala @@ -0,0 +1,118 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import com.microsoft.ml.spark.TransformerFuzzingTest +import org.apache.spark.SparkException +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.ml.feature.{FastVectorAssembler, StringIndexer} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType + +/** + * Verifies the fast vector assembler, which only keeps categorical metadata and removes all other metadata. 
+ * TODO: Move this to core/spark and remove MML dependencies for the verification + */ +class VerifyFastVectorAssembler extends TransformerFuzzingTest { + + val invalidExceptionError = "Could not catch correct exception" + + val inputCols = Array("a", "b", "c", "d", "e") + val outputCol = "testCol" + val mockDataset = session.createDataFrame(Seq( + (0, 2, 0.5, 0.6, 0), + (1, 3, 0.4, 0.5, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3) + )).toDF(inputCols: _*) + + test("Verify fast vector assembler does not keep metadata for non-categorical columns") { + val fastAssembler = new FastVectorAssembler().setInputCols(inputCols).setOutputCol(outputCol) + val transformedDataset = fastAssembler.transform(mockDataset) + // Assert metadata is empty + assert(transformedDataset.schema(outputCol).metadata.toString() == + "{\"ml_attr\":{\"attrs\":{},\"num_attrs\":0}}") + } + + test("Verify fast vector assembler throws when the first column is not categorical") { + + val (inputCols: Array[String], catColumn: String, categoricalData: DataFrame) = createCategoricalData + + val outputCol = "testCol" + + val fastAssembler = new FastVectorAssembler() + .setInputCols((inputCols.toList.drop(1) ::: (List(catColumn))).toArray) + .setOutputCol(outputCol) + + var caughtException: Boolean = false + try { + val transformedDataset = fastAssembler.transform(categoricalData) + } + catch { + case exception: SparkException => { + caughtException = true + exception.getMessage.contains("Categorical columns must precede all others") + } + case _: Throwable => throw new Exception(invalidExceptionError) + } + + if (!caughtException) + throw new Exception(invalidExceptionError) + } + + test("Verify fast vector assembler works when the first column is a categorical column") { + + val (inputCols: Array[String], catColumn: String, categoricalData: DataFrame) = createCategoricalData + + val outputCol = "testCol" + + val fastAssembler2 = new FastVectorAssembler() + .setInputCols((catColumn :: inputCols.toList.drop(1)).toArray) + .setOutputCol(outputCol) + val transformedDataset2 = fastAssembler2.transform(categoricalData) + + // Assert metadata is not empty + val mlattrData = transformedDataset2.schema(outputCol).metadata.getMetadata(SchemaConstants.MLlibTag) + // assert the metadata is equal to: "{\"ml_attr\":{\"attrs\":{\"nominal\":[{\"vals\":[\"are\",\"how\", + // \"hello\",\"you\"],\"idx\":0,\"name\":\"cat\"}]},\"num_attrs\":1}}" + val attrsTag = "attrs" + assert(mlattrData.contains(attrsTag)) + val attrsData = mlattrData.getMetadata(attrsTag) + val nominalTag = "nominal" + assert(attrsData.contains(nominalTag)) + val nominalData = attrsData.getMetadataArray(nominalTag) + val valsTag = "vals" + assert(nominalData(0).contains(valsTag)) + assert(nominalData(0).getStringArray(valsTag).contains("are")) + assert(nominalData(0).getStringArray(valsTag).contains("how")) + assert(nominalData(0).getStringArray(valsTag).contains("hello")) + assert(nominalData(0).getStringArray(valsTag).contains("you")) + + } + + def createCategoricalData: (Array[String], String, DataFrame) = { + val inputCols = Array("a", "b", "c", "d", "e") + + val dataset = session.createDataFrame(Seq( + ("hello", 2, 0.5, 0.6, 0), + ("how", 3, 0.4, 0.5, 1), + ("are", 4, 0.78, 0.99, 2), + ("you", 5, 0.12, 0.34, 3) + )).toDF(inputCols: _*) + + val catColumn = "cat" + val indexer = new StringIndexer().setInputCol("a").setOutputCol(catColumn).fit(dataset) + val categoricalData = indexer.transform(dataset).toDF() + (inputCols, catColumn, categoricalData) + } + + override 
def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = + transformer.asInstanceOf[FastVectorAssembler].setInputCols(inputCols).setOutputCol(outputCol) + + override def createDataset: DataFrame = mockDataset + + override def schemaForDataset: StructType = ??? + + override def getTransformer(): Transformer = new FastVectorAssembler() +} diff --git a/src/core/schema/src/test/scala/VerifySparkSchema.scala b/src/core/schema/src/test/scala/VerifySparkSchema.scala new file mode 100644 index 0000000000..5b240a3906 --- /dev/null +++ b/src/core/schema/src/test/scala/VerifySparkSchema.scala @@ -0,0 +1,56 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark.schema + +import com.microsoft.ml.spark.TestBase + +/** + * Verifies the spark schema functions. + */ +class VerifySparkSchema extends TestBase { + + val labelColumn = "label" + val scoreColumn = "score" + val probabilityColumn = "probability" + val scoredLabelsColumn = "scored label" + test("Spark schema should be able to set and get label, score, probability and scored labels column name") { + val dataset = session.createDataFrame(Seq( + (0, Array("Hi", "I", "can", "not", "foo"), 0.50, 0.60, 0), + (1, Array("I"), 0.40, 0.50, 1), + (2, Array("Logistic", "regression"), 0.78, 0.99, 2), + (3, Array("Log","f", "reg"), 0.12, 0.34, 3) + )).toDF(labelColumn, "words", scoreColumn, probabilityColumn, scoredLabelsColumn) + + val modelName = "Test model name" + val datasetWithLabel = + SparkSchema.setLabelColumnName(dataset, modelName, labelColumn, SchemaConstants.RegressionKind) + val labelColumnNameRetrieved = + SparkSchema.getLabelColumnName(datasetWithLabel, modelName) + + assert(labelColumnNameRetrieved == labelColumn) + + val datasetWithScore = + SparkSchema.setScoresColumnName(dataset, modelName, scoreColumn, SchemaConstants.RegressionKind) + val scoreColumnNameRetrieved = + SparkSchema.getScoresColumnName(datasetWithScore, modelName) + + assert(scoreColumnNameRetrieved == scoreColumn) + + val datasetWithProbability = + SparkSchema.setScoredProbabilitiesColumnName(dataset, modelName, probabilityColumn, + SchemaConstants.RegressionKind) + val probabilityColumnNameRetrieved = + SparkSchema.getScoredProbabilitiesColumnName(datasetWithProbability, modelName) + + assert(probabilityColumnNameRetrieved == probabilityColumn) + + val datasetWithScoredLabels = + SparkSchema.setScoredLabelsColumnName(dataset, modelName, scoredLabelsColumn, SchemaConstants.RegressionKind) + val scoredLabelsColumnNameRetrieved = + SparkSchema.getScoredLabelsColumnName(datasetWithScoredLabels, modelName) + + assert(scoredLabelsColumnNameRetrieved == scoredLabelsColumn) + } + +} diff --git a/src/core/spark/build.sbt b/src/core/spark/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/spark/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/spark/src/main/scala/ArrayMapParam.scala b/src/core/spark/src/main/scala/ArrayMapParam.scala new file mode 100644 index 0000000000..2da6645bb8 --- /dev/null +++ b/src/core/spark/src/main/scala/ArrayMapParam.scala @@ -0,0 +1,70 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
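
Aside (not part of the patch): a short sketch of the metadata round-trip that the test above verifies, extended to the score-value-kind lookup. The SparkSession value `spark`, the model name, and the column names are illustrative assumptions.

    import com.microsoft.ml.spark.schema.{SchemaConstants, SparkSchema}
    import spark.implicits._

    val scored = Seq((0.0, 0.12), (1.0, 0.98)).toDF("label", "score")

    // Tag the "score" column as the scores column of a (made-up) model.
    val tagged = SparkSchema.setScoresColumnName(
      scored, "myModel", "score", SchemaConstants.ClassificationKind)

    SparkSchema.getScoresColumnName(tagged, "myModel")          // "score"
    SparkSchema.getScoreValueKind(tagged, "myModel", "score")   // "Classification"
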
+ +package org.apache.spark.ml.param + +import spray.json._ +import org.apache.spark.ml.util.Identifiable +import scala.collection.immutable.Map + +object ArrayMapJsonProtocol extends DefaultJsonProtocol { + + import spray.json._ + implicit object MapJsonFormat extends JsonFormat[Map[String, Any]] { + def write(m: Map[String, Any]): JsValue = { + JsObject(m.mapValues { + case v: Int => JsNumber(v) + case v: Double => JsNumber(v) + case v: String => JsString(v) + case true => JsTrue + case false => JsFalse + case v: Map[_, _] => write(v.asInstanceOf[Map[String, Any]]) + case default => serializationError(s"Unable to serialize $default") + }) + } + + def read(value: JsValue): Map[String, Any] = value.asInstanceOf[JsObject].fields.map(kvp => { + val convValue = kvp._2 match { + case JsNumber(n) => if (n.isValidInt) n.intValue().asInstanceOf[Any] else n.toDouble.asInstanceOf[Any] + case JsString(s) => s + case JsTrue => true + case JsFalse => false + case v: JsValue => read(v) + case default => deserializationError(s"Unable to deserialize $default") + } + (kvp._1, convValue) + }) + } + +} + +import ArrayMapJsonProtocol._ + +/** + * Param for Array of stage parameter maps. + */ +class ArrayMapParam(parent: String, name: String, doc: String, isValid: Array[Map[String, Any]] => Boolean) + extends Param[Array[Map[String, Any]]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Array[Map[String, Any]] => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + override def w(value: Array[Map[String, Any]]): ParamPair[Array[Map[String, Any]]] = super.w(value) + + override def jsonEncode(value: Array[Map[String, Any]]): String = { + val json = value.toSeq.asInstanceOf[Seq[Map[String, Int]]].toJson + json.prettyPrint + } + + override def jsonDecode(json: String): Array[Map[String, Any]] = { + val jsonValue = json.parseJson + jsonValue.convertTo[Seq[Map[String, Any]]].toArray + } + +} diff --git a/src/core/spark/src/main/scala/EstimatorParam.scala b/src/core/spark/src/main/scala/EstimatorParam.scala new file mode 100644 index 0000000000..bfdc213eee --- /dev/null +++ b/src/core/spark/src/main/scala/EstimatorParam.scala @@ -0,0 +1,36 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package org.apache.spark.ml.param + +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.util.Identifiable + +/** + * Param for Estimator. Needed as spark has explicit params for many different types but not Estimator. + */ +class EstimatorParam(parent: String, name: String, doc: String, isValid: Estimator[_ <: Model[_]] => Boolean) + extends Param[Estimator[_ <: Model[_]]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Estimator[_ <: Model[_]] => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = + this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). 
*/
+  override def w(value: Estimator[_ <: Model[_]]): ParamPair[Estimator[_ <: Model[_]]] =
+    super.w(value)
+
+  override def jsonEncode(value: Estimator[_ <: Model[_]]): String = {
+    throw new NotImplementedError("The estimator cannot be encoded.")
+  }
+
+  override def jsonDecode(json: String): Estimator[_ <: Model[_]] = {
+    throw new NotImplementedError("The estimator cannot be decoded.")
+  }
+
+}
diff --git a/src/core/spark/src/main/scala/FastVectorAssembler.scala b/src/core/spark/src/main/scala/FastVectorAssembler.scala
new file mode 100644
index 0000000000..ddd9072d47
--- /dev/null
+++ b/src/core/spark/src/main/scala/FastVectorAssembler.scala
@@ -0,0 +1,154 @@
+// Copyright (C) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License. See LICENSE in project root for information.
+
+package org.apache.spark.ml.feature
+
+import scala.collection.mutable.ArrayBuilder
+import org.apache.spark.SparkException
+import org.apache.spark.ml.Transformer
+import org.apache.spark.ml.attribute.{Attribute, AttributeGroup}
+import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
+import org.apache.spark.ml.param.ParamMap
+import org.apache.spark.ml.param.shared._
+import org.apache.spark.ml.util._
+import org.apache.spark.sql.{DataFrame, Dataset, Row}
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types._
+
+/**
+ * A fast vector assembler. The columns given must be ordered such that categorical columns come first
+ * (otherwise Spark learners will assign categorical attributes to the wrong index).
+ * Does not keep spurious numeric metadata, which can significantly slow down computations when there are
+ * millions of columns.
+ */
+class FastVectorAssembler (override val uid: String)
+  extends Transformer with HasInputCols with HasOutputCol with DefaultParamsWritable {
+
+  def this() = this(Identifiable.randomUID("FastVectorAssembler"))
+
+  /** @group setParam */
+  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
+
+  /** @group setParam */
+  def setOutputCol(value: String): this.type = set(outputCol, value)
+
+  override def transform(dataset: Dataset[_]): DataFrame = {
+    // Schema transformation.
+    val schema = dataset.schema
+    lazy val first = dataset.toDF.first()
+    var addedNumericField = false
+
+    // Propagate only nominal (categorical) attributes (others only slow down the code)
+    val attrs: Array[Attribute] = $(inputCols).flatMap { c =>
+      val field = schema(c)
+      val index = schema.fieldIndex(c)
+      field.dataType match {
+        case _: NumericType | BooleanType =>
+          val attr = Attribute.fromStructField(field)
+          if (attr.isNominal) {
+            if (addedNumericField) {
+              throw new SparkException("Categorical columns must precede all others, column out of order: " + c)
+            }
+            Some(attr.withName(c))
+          } else {
+            addedNumericField = true
+            None
+          }
+        case _: VectorUDT =>
+          val group = AttributeGroup.fromStructField(field)
+          if (group.attributes.isDefined) {
+            // If attributes are defined, copy them with updated names.
+ group.attributes.get.zipWithIndex.map { case (attr, i) => + if (attr.isNominal && attr.name.isDefined) { + if (addedNumericField) { + throw new SparkException("Categorical columns must precede all others, column out of order: " + c) + } + attr.withName(c + "_" + attr.name.get) + } else if (attr.isNominal) { + if (addedNumericField) { + throw new SparkException("Categorical columns must precede all others, column out of order: " + c) + } + attr.withName(c + "_" + i) + } else { + addedNumericField = true + null + } + }.filter(attr => attr != null) + } else { + addedNumericField = true + None + } + case otherType => + throw new SparkException(s"FastVectorAssembler does not support the $otherType type") + } + } + val metadata = new AttributeGroup($(outputCol), attrs).toMetadata() + + // Data transformation. + val assembleFunc = udf { r: Row => + FastVectorAssembler.assemble(r.toSeq: _*) + } + val args = $(inputCols).map { c => + schema(c).dataType match { + case DoubleType => dataset(c) + case _: VectorUDT => dataset(c) + case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid") + } + } + + dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol), metadata)) + } + + override def transformSchema(schema: StructType): StructType = { + val inputColNames = $(inputCols) + val outputColName = $(outputCol) + val inputDataTypes = inputColNames.map(name => schema(name).dataType) + inputDataTypes.foreach { + case _: NumericType | BooleanType => + case t if t.isInstanceOf[VectorUDT] => + case other => + throw new IllegalArgumentException(s"Data type $other is not supported.") + } + if (schema.fieldNames.contains(outputColName)) { + throw new IllegalArgumentException(s"Output column $outputColName already exists.") + } + StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) + } + + override def copy(extra: ParamMap): FastVectorAssembler = defaultCopy(extra) + +} + +object FastVectorAssembler extends DefaultParamsReadable[FastVectorAssembler] { + + override def load(path: String): FastVectorAssembler = super.load(path) + + private[feature] def assemble(vv: Any*): Vector = { + val indices = ArrayBuilder.make[Int] + val values = ArrayBuilder.make[Double] + var cur = 0 + vv.foreach { + case v: Double => + if (v != 0.0) { + indices += cur + values += v + } + cur += 1 + case vec: Vector => + vec.foreachActive { case (i, v) => + if (v != 0.0) { + indices += cur + i + values += v + () + } + } + cur += vec.size + case null => + throw new SparkException("Values to assemble cannot be null.") + case o => + throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") + } + Vectors.sparse(cur, indices.result(), values.result()).compressed + } + +} diff --git a/src/core/spark/src/main/scala/MapArrayParam.scala b/src/core/spark/src/main/scala/MapArrayParam.scala new file mode 100644 index 0000000000..bad158fca7 --- /dev/null +++ b/src/core/spark/src/main/scala/MapArrayParam.scala @@ -0,0 +1,74 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
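
Aside (not part of the patch): a minimal sketch of using the FastVectorAssembler defined above, respecting the categorical-columns-first ordering it enforces. The SparkSession value `spark` and the column names are illustrative assumptions.

    import org.apache.spark.ml.feature.{FastVectorAssembler, StringIndexer}
    import spark.implicits._

    val df = Seq(("piano", 1.0, 0.5), ("guitar", 2.0, 0.25), ("piano", 3.0, 0.75))
      .toDF("instrument", "x", "y")

    // StringIndexer produces a nominal (categorical) column; it must come first.
    val indexed = new StringIndexer()
      .setInputCol("instrument").setOutputCol("instrumentIdx")
      .fit(df).transform(df)

    val assembled = new FastVectorAssembler()
      .setInputCols(Array("instrumentIdx", "x", "y"))
      .setOutputCol("features")
      .transform(indexed)

Placing numeric columns before the indexed column would trigger the SparkException shown in the transform above, which is exactly the behavior the VerifyFastVectorAssembler tests check later in this patch.
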
+ +package org.apache.spark.ml.param + +import org.apache.spark.ml.util.Identifiable + +import scala.collection.JavaConverters._ +import scala.collection.immutable.Map +import scala.collection.mutable +import spray.json._ + +object MapArrayJsonProtocol extends DefaultJsonProtocol { + + import spray.json._ + implicit object MapJsonFormat extends JsonFormat[Map[String, Seq[String]]] { + def write(m: Map[String, Seq[String]]): JsValue = { + JsObject(m.mapValues { + case v: Seq[String] => seqFormat[String].write(v) + case default => serializationError(s"Unable to serialize $default") + }) + } + + def read(value: JsValue): Map[String, Seq[String]] = value.asInstanceOf[JsObject].fields.map(kvp => { + val convValue = kvp._2 match { + case v: JsValue => seqFormat[String].read(v) + case default => deserializationError(s"Unable to deserialize $default") + } + (kvp._1, convValue) + }) + } + +} + +import MapArrayJsonProtocol._ + +/** + * Param for Map of String to Seq of String. + */ +class MapArrayParam(parent: String, name: String, doc: String, isValid: Map[String, Seq[String]] => Boolean) + extends Param[Map[String, Seq[String]]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Map[String, Seq[String]] => Boolean) = + + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + def w(value: java.util.HashMap[String, java.util.List[String]]): ParamPair[Map[String, Seq[String]]] = { + val mutMap = mutable.Map[String, Seq[String]]() + for (key <- value.keySet().asScala) { + val list = value.get(key).asScala + mutMap(key) = list + } + w(mutMap.toMap) + } + + override def jsonEncode(value: Map[String, Seq[String]]): String = { + val convertedMap = value.map(kvp => (kvp._1, kvp._2.toArray)) + val json = convertedMap.toJson + json.prettyPrint + } + + override def jsonDecode(json: String): Map[String, Seq[String]] = { + val jsonValue = json.parseJson + jsonValue.convertTo[Map[String, Seq[String]]] + } + +} diff --git a/src/core/spark/src/main/scala/MetadataUtilities.scala b/src/core/spark/src/main/scala/MetadataUtilities.scala new file mode 100644 index 0000000000..43f36c1272 --- /dev/null +++ b/src/core/spark/src/main/scala/MetadataUtilities.scala @@ -0,0 +1,10 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package org.apache.spark.sql.types + +object MetadataUtilities { + + def getMetadataKeys(metadata: Metadata): Iterable[String] = metadata.map.keys + +} diff --git a/src/core/spark/src/main/scala/TransformParam.scala b/src/core/spark/src/main/scala/TransformParam.scala new file mode 100644 index 0000000000..fb1233a3cc --- /dev/null +++ b/src/core/spark/src/main/scala/TransformParam.scala @@ -0,0 +1,58 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package org.apache.spark.ml.param + +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.util.Identifiable +import scala.collection.JavaConverters._ + +/** + * Param for Transformer. Needed as spark has explicit params for many different types but not Transformer. 
*/ +class TransformerParam(parent: String, name: String, doc: String, isValid: Transformer => Boolean) + extends Param[Transformer](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Transformer => Boolean) = + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = + this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + override def w(value: Transformer): ParamPair[Transformer] = + super.w(value) + + override def jsonEncode(value: Transformer): String = { + throw new NotImplementedError("The transform cannot be encoded.") + } + + override def jsonDecode(json: String): Transformer = { + throw new NotImplementedError("The transform cannot be decoded.") + } + +} + +/** + * Param for an Array of Transformers. + */ +class TransformerArrayParam(parent: String, name: String, doc: String, isValid: Array[Transformer] => Boolean) + extends Param[Array[Transformer]](parent, name, doc, isValid) { + + def this(parent: String, name: String, doc: String) = + + this(parent, name, doc, ParamValidators.alwaysTrue) + + def this(parent: Identifiable, name: String, doc: String, isValid: Array[Transformer] => Boolean) = + + this(parent.uid, name, doc, isValid) + + def this(parent: Identifiable, name: String, doc: String) = this(parent.uid, name, doc) + + /** Creates a param pair with the given value (for Java). */ + def w(value: java.util.List[Transformer]): ParamPair[Array[Transformer]] = w(value.asScala.toArray) + +} diff --git a/src/core/test/base/build.sbt b/src/core/test/base/build.sbt new file mode 100644 index 0000000000..cd0183132b --- /dev/null +++ b/src/core/test/base/build.sbt @@ -0,0 +1 @@ +// nothing here diff --git a/src/core/test/base/src/main/scala/SparkSessionFactory.scala b/src/core/test/base/src/main/scala/SparkSessionFactory.scala new file mode 100644 index 0000000000..4b328406df --- /dev/null +++ b/src/core/test/base/src/main/scala/SparkSessionFactory.scala @@ -0,0 +1,53 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.File + +import org.apache.log4j.{Level, Logger} +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession + +// Convert configuration to JSON/ENV vars moving forward: +// 1. Logging Level +// 2. Warehouse directory +// 3. DiskBlockManager - currently defaults to USER TEMP it seems +// 3a. Does this derive from spark.local.dir? Should be configured as well? +// 4.
Actual Session host instead of local +object SparkSessionFactory { + + // Default spark warehouse = ./spark-warehouse + private val defaultWarehouseDirName = "spark-warehouse" + private val testDir = System.currentTimeMillis.toString + + private lazy val localWarehousePath = + "file:" + + customNormalize(new File(currentDir, defaultWarehouseDirName) + .getAbsolutePath()) + val workingDir = + "file:" + + customNormalize(new File(currentDir, testDir) + .getAbsolutePath()) + // On NTFS-like systems, normalize path + // (solves the problem of sending a path from spark to hdfs on Windows) + def customNormalize(path: String): String = { + if (File.separator != "\\") path + else path.replaceFirst("[A-Z]:", "").replace("\\", "/") + } + def currentDir(): String = System.getProperty("user.dir") + + def getSession(name: String, logLevel: String = "WARN"): SparkSession = { + val conf = new SparkConf() + .setAppName(name) + .setMaster("local[*]") + .set("spark.logConf", "true") + .set("spark.sql.warehouse.dir", SparkSessionFactory.localWarehousePath) + val sess = SparkSession.builder() + .config(conf) + .getOrCreate() + sess.sparkContext.setLogLevel(logLevel) + sess + } + +} diff --git a/src/core/test/base/src/main/scala/TestBase.scala b/src/core/test/base/src/main/scala/TestBase.scala new file mode 100644 index 0000000000..f5debe7ddd --- /dev/null +++ b/src/core/test/base/src/main/scala/TestBase.scala @@ -0,0 +1,155 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import scala.reflect.ClassTag +import org.apache.spark._ +import org.apache.spark.ml._ +import org.apache.spark.sql._ +import org.scalatest._ +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ +import org.scalactic.source.Position + +// Common test tags +object TestBase { + object Extended extends Tag("com.microsoft.ml.spark.test.tags.extended") + object LinuxOnly extends Tag("com.microsoft.ml.spark.test.tags.linuxonly") +} + +trait LinuxOnly extends TestBase { + override def test(testName: String, testTags: Tag*)(testFun: => Any)(implicit pos: Position): Unit = + super.test(testName, testTags.toList.::(TestBase.LinuxOnly): _*)(testFun) +} + +abstract class TestBase extends FunSuite with BeforeAndAfterEachTestData with BeforeAndAfterAll { + + println(s"\n>>>-------------------- $this --------------------<<<") + + // "This Is A Bad Thing" according to my research. However, this is + // just for tests so maybe ok. A better design would be to break the + // session stuff into TestSparkSession as a trait and have test suites + // that need it "with TestSparkSession" instead, but that's a lot of + // changes right now and maybe not desired. 
+ private var sessionInitialized = false + protected lazy val session: SparkSession = { + info(s"Creating a spark session for suite $this") + sessionInitialized = true + SparkSessionFactory + .getSession(s"$this", logLevel = "WARN") + } + + protected lazy val sc: SparkContext = session.sparkContext + protected lazy val dir = SparkSessionFactory.workingDir + protected def normalizePath(path: String) = SparkSessionFactory.customNormalize(path) + + // Timing info + var suiteElapsed: Long = 0 + var testStart: Long = 0 + var testElapsed: Long = 0 + + // Test Fixture Overrides + protected override def beforeEach(td: TestData): Unit = { + testStart = System.currentTimeMillis + testElapsed = 0 + super.beforeEach(td) + } + + protected override def afterEach(td: TestData): Unit = { + try { + super.afterEach(td) + } + finally { + testElapsed = System.currentTimeMillis - testStart + logTime(s"Test ${td.name}", testElapsed, 3000) + suiteElapsed += testElapsed + } + } + + protected override def beforeAll(): Unit = { + if (sessionInitialized) { + info(s"Parallelism: ${session.sparkContext.defaultParallelism.toString}") + } + suiteElapsed = 0 + } + + protected override def afterAll(): Unit = { + logTime(s"Suite $this", suiteElapsed, 10000) + if (sessionInitialized) { + info("Shutting down spark session") + session.stop() + } + } + + // Utilities + + def withoutLogging[T](e: => T): T = { + // This should really keep the old level, but there is no sc.getLogLevel, so + // take the cheap way out for now: just use "WARN", and do something proper + // when/if needed + sc.setLogLevel("OFF") + try e finally sc.setLogLevel("WARN") + } + + def interceptWithoutLogging[E <: Exception: ClassTag](e: => Any): Unit = { + withoutLogging { intercept[E] { e }; () } + } + + def assertSparkException[E <: Exception: ClassTag](stage: PipelineStage, data: DataFrame): Unit = { + withoutLogging { + intercept[E] { + val transformer = stage match { + case e: Estimator[_] => e.fit(data) + case t: Transformer => t + case _ => sys.error(s"Unknown PipelineStage value: $stage") + } + // use .length to force the pipeline (.count might work, but maybe it's sometimes optimized) + transformer.transform(data).foreach { r => r.length; () } + } + () + } + } + + import session.implicits._ + + def makeBasicDF(): DataFrame = { + val df = Seq( + (0, "guitars", "drums"), + (1, "piano", "trumpet"), + (2, "bass", "cymbals")).toDF("numbers","words", "more") + df + } + + def makeBasicNullableDF(): DataFrame = { + val df = Seq( + (0, 2.5, "guitars", "drums"), + (1, Double.NaN, "piano", "trumpet"), + (2, 8.9, "bass", null)).toDF("indices", "numbers","words", "more") + df + } + + def verifyResult(expected: DataFrame, result: DataFrame): Boolean = { + assert(expected.count == result.count) + assert(expected.schema.length == result.schema.length) + (expected.columns zip result.columns).forall{ case (x,y) => x == y } + } + + def time[R](block: => R): R = { + val t0 = System.nanoTime() + val result = block + val t1 = System.nanoTime() + println(s"Elapsed time: ${(t1 - t0) / 1e9} sec") + result + } + + private def logTime(name: String, time: Long, threshold: Long) = { + val msg = s"$name took ${time / 1000.0}s" + if (time > threshold) { + alert(msg) + } else { + info(msg) + } + } + +} diff --git a/src/core/test/build.sbt b/src/core/test/build.sbt new file mode 100644 index 0000000000..e3bafe48f4 --- /dev/null +++ b/src/core/test/build.sbt @@ -0,0 +1 @@ +Extras.noJar diff --git a/src/core/test/datagen/build.sbt b/src/core/test/datagen/build.sbt new file mode 
100644 index 0000000000..6c29f8db94 --- /dev/null +++ b/src/core/test/datagen/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core/test/base diff --git a/src/core/test/datagen/src/main/scala/DatasetConstraints.scala b/src/core/test/datagen/src/main/scala/DatasetConstraints.scala new file mode 100644 index 0000000000..d67c82fca9 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/DatasetConstraints.scala @@ -0,0 +1,68 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import breeze.stats.distributions.{Rand, RandBasis, Uniform} +import org.apache.commons.math3.random.{MersenneTwister, RandomGenerator} + +import scala.util.Random + +/** + * Specifies the trait for constraints on generating a dataset. + */ +trait HasDatasetGenerationConstraints { + var numRows: Int + var numCols: Int + var numSlotsPerCol: Array[Int] + var randomizeColumnNames: Boolean +} + +/** + * Basic constraints for generating a dataset. + */ +class BasicDatasetGenerationConstraints(numberOfRows: Int, numberOfColumns: Int, numberOfSlotsPerColumn: Array[Int]) + extends HasDatasetGenerationConstraints { + override var numRows: Int = numberOfRows + override var numCols: Int = numberOfColumns + override var numSlotsPerCol: Array[Int] = numberOfSlotsPerColumn + override var randomizeColumnNames: Boolean = true +} + +/** + * Constraints on generating a dataset where all parameters are randomly generated. + * @param minRows The min number of rows. + * @param maxRows The max number of rows. + * @param minCols The min number of columns. + * @param maxCols The max number of columns. + * @param minSlots The min number of slots. + * @param maxSlots The max number of slots. + */ +class RandomDatasetGenerationConstraints(minRows: Int, + maxRows: Int, + minCols: Int, + maxCols: Int, + minSlots: Int, + maxSlots: Int) + extends HasDatasetGenerationConstraints { + + override var numRows: Int = _ + override var numCols: Int = _ + override var numSlotsPerCol: Array[Int] = _ + override var randomizeColumnNames: Boolean = _ + + /** + * Generates values for rows, columns and slots based on the given constraints using a random number generator. + * @param random The random number generator. + */ + def generateConstraints(random: Random): Unit = { + val rand = new RandBasis(new MersenneTwister(random.nextInt())) + val distributionRows = new Uniform(minRows.toDouble, maxRows.toDouble)(rand) + val distributionCols = new Uniform(minCols.toDouble, maxCols.toDouble)(rand) + val distributionSlots = new Uniform(minSlots.toDouble, maxSlots.toDouble)(rand) + numRows = distributionRows.draw().toInt + numCols = distributionCols.draw().toInt + numSlotsPerCol = (1 to numCols).map(col => distributionSlots.draw().toInt).toArray + } + +} diff --git a/src/core/test/datagen/src/main/scala/DatasetOptions.scala b/src/core/test/datagen/src/main/scala/DatasetOptions.scala new file mode 100644 index 0000000000..5e75992f58 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/DatasetOptions.scala @@ -0,0 +1,57 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.ColumnOptions.ColumnOptions +import com.microsoft.ml.spark.DataOptions.DataOptions + +/** + * Specifies the column types supported in spark dataframes and modules.
+ */ +object ColumnOptions extends Enumeration { + type ColumnOptions = Value + // TODO: add Categorical, DenseVector, SparseVector + val Scalar = Value +} + +/** + * Specifies the data types supported in spark dataframes and modules. + */ +object DataOptions extends Enumeration { + type DataOptions = Value + val String, Int, Double, Boolean, Date, Timestamp, Byte, Short = Value +} + +/** + * Options used to specify how a dataset will be generated. + * This contains information on what the data and column types + * (specified as flags) for generating a dataset will be limited to. + * It also contain options for all possible missing values generation + * and options for how values will be generated. + */ +case class DatasetOptions(columnTypes: ColumnOptions.ValueSet, + dataTypes: DataOptions.ValueSet, + missingValuesOptions: DatasetMissingValuesGenerationOptions) + +object DatasetOptions { + def apply(columnOptions: ColumnOptions.ValueSet, dataOptions: DataOptions.ValueSet): DatasetOptions = { + val missingValueOptions = DatasetMissingValuesGenerationOptions(0.0, columnOptions, dataOptions) + new DatasetOptions(columnOptions, dataOptions, missingValueOptions) + } + + def apply(columnOption: ColumnOptions, dataOption: DataOptions): DatasetOptions = { + val colOptions = ColumnOptions.ValueSet(columnOption) + val dataOptions = DataOptions.ValueSet(dataOption) + val missingValueOptions = DatasetMissingValuesGenerationOptions(0.0, colOptions, dataOptions) + new DatasetOptions(colOptions, dataOptions, missingValueOptions) + } +} + +case class DatasetMissingValuesGenerationOptions(percentMissing: Double, + columnTypesWithMissings: ColumnOptions.ValueSet, + dataTypesWithMissings: DataOptions.ValueSet) { + def hashMissing(): Boolean = { + !columnTypesWithMissings.isEmpty && !dataTypesWithMissings.isEmpty + } +} diff --git a/src/core/test/datagen/src/main/scala/GenerateDataType.scala b/src/core/test/datagen/src/main/scala/GenerateDataType.scala new file mode 100644 index 0000000000..5b72dc4c84 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/GenerateDataType.scala @@ -0,0 +1,37 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.sql.Timestamp +import java.sql.Date +import org.apache.commons.lang.RandomStringUtils +import scala.util.Random + +/** + * Generates the specified random data type. + */ +class GenerateDataType(random: Random) extends Serializable { + + def nextTimestamp: Timestamp = new Timestamp(random.nextLong()) + + def nextBoolean: Boolean = random.nextBoolean() + + def nextByte: Byte = { + val byteArray = new Array[Byte](1) + random.nextBytes(byteArray) + byteArray(0) + } + + def nextDouble: Double = random.nextDouble() + + def nextInt: Int = random.nextInt() + + def nextShort: Short = random.nextInt(Short.MaxValue).toShort + + def nextString: String = RandomStringUtils.random(random.nextInt(100), 0, 0, true, true, null, + new java.util.Random(random.nextLong())) + + def nextDate: Date = new Date(random.nextLong()) + +} diff --git a/src/core/test/datagen/src/main/scala/GenerateDataset.scala b/src/core/test/datagen/src/main/scala/GenerateDataset.scala new file mode 100644 index 0000000000..ed1432b75f --- /dev/null +++ b/src/core/test/datagen/src/main/scala/GenerateDataset.scala @@ -0,0 +1,114 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
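A minimal sketch of how the options and random-value generator defined above might be combined (illustrative only; it assumes the snippet lives in the com.microsoft.ml.spark package and the seed is arbitrary):

import scala.util.Random

// Restrict generation to scalar columns holding Int or Double values
val options = DatasetOptions(
  ColumnOptions.ValueSet(ColumnOptions.Scalar),
  DataOptions.ValueSet(DataOptions.Int, DataOptions.Double))

// Draw individual random values with a fixed seed for reproducibility
val gen = new GenerateDataType(new Random(42))
val anInt: Int = gen.nextInt
val aDouble: Double = gen.nextDouble
val aString: String = gen.nextString // alphanumeric, fewer than 100 characters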
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.ColumnOptions.ColumnOptions +import com.microsoft.ml.spark.DataOptions.DataOptions +import org.apache.spark.mllib.random.RandomRDDs +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +import scala.util.Random + +/** + * Defines methods to generate a random spark DataFrame dataset based on given options. + */ +object GenerateDataset { + + /** + * Generates a random Spark DataFrame given a set of dataset generation constraints. + * @param sparkSession The spark session. + * @param datasetGenerationConstraints The dataset generation constraints to use. + * @param seed The random seed. + * @return A randomly generated dataset. + */ + def generateDataset(sparkSession: SparkSession, + datasetGenerationConstraints: HasDatasetGenerationConstraints, + seed: Long): DataFrame = { + generateDatasetFromOptions(sparkSession, Map[Int, DatasetOptions](), datasetGenerationConstraints, seed) + } + + /** + * Generates a random Spark DataFrame given a map of index to DataGenerationOptions. + * @param sparkSession The spark session. + * @param indexToOptions The map of indexes to DataGenerationOptions. + * @param datasetGenerationConstraints The constraints for generating the dataset. + * @param seed The random seed. + * @return The randomly generated dataset. + */ + def generateDatasetFromOptions(sparkSession: SparkSession, + indexToOptions: Map[Int, DatasetOptions], + datasetGenerationConstraints: HasDatasetGenerationConstraints, + seed: Long): DataFrame = { + + val random = new Random(seed) + val numCols: Int = datasetGenerationConstraints.numCols + val datasetGenerationOptions = (1 to numCols). + map(index => + if (indexToOptions.contains(index)) indexToOptions(index) + else new DatasetOptions(ColumnOptions.values, + DataOptions.values, + new DatasetMissingValuesGenerationOptions(0.5, + ColumnOptions.values, + DataOptions.values))) + // Get random options chosen from given valid space-dimension complex + val chosenOptions:Array[(ColumnOptions, DataOptions)] = + datasetGenerationOptions.toArray.map(option => chooseOptions(option, random)) + + val rdd = RandomRDDs.randomRDD[Row](sparkSession.sparkContext, + new RandomRowGeneratorCombiner(chosenOptions.map(option => new RandomRowGenerator(option._1, option._2))), + datasetGenerationConstraints.numRows.toLong, 1, random.nextLong()) + sparkSession.createDataFrame(rdd, getSchemaFromOptions(chosenOptions, random)) + } + + def getOptionsFromSchema(schema: StructType): Map[Int, DatasetOptions] = { + val datasetOptions = schema.map(sf => DatasetOptions(ColumnOptions.Scalar, getOptionsFromDataType(sf.dataType))) + datasetOptions.zipWithIndex.map(kvp => (kvp._2 + 1, kvp._1)).toMap + } + + private def chooseOptions(options: DatasetOptions, random: Random) = { + val (optionsColumnArray, optionsDataArray) = (options.columnTypes.toArray, options.dataTypes.toArray) + (optionsColumnArray(random.nextInt(optionsColumnArray.length)), + optionsDataArray(random.nextInt(optionsDataArray.length))) + } + + private def getSchemaFromOptions(chosenOptions: Array[(ColumnOptions, DataOptions)], + random: Random): StructType = { + val generateDataType = new GenerateDataType(random) + new StructType( + chosenOptions + .map(option => getDataTypeFromOptions(option._2)) + .map(dataType => StructField(generateDataType.nextString, dataType))) + } + + lazy val dataTypeToOptions: Map[DataOptions, DataType] = Map( + DataOptions.String -> StringType, + DataOptions.Timestamp -> TimestampType, + DataOptions.Short 
-> ShortType, + DataOptions.Int -> IntegerType, + DataOptions.Boolean -> BooleanType, + DataOptions.Byte -> ByteType, + DataOptions.Date -> DateType, + DataOptions.Double -> DoubleType + ) + + lazy val optionsToDataType: Map[DataType, DataOptions] = dataTypeToOptions.map(kvp => (kvp._2, kvp._1)) + + private def getDataTypeFromOptions(data: DataOptions): DataType = { + if (dataTypeToOptions.contains(data)) { + dataTypeToOptions(data) + } else { + throw new Exception("The type does not exist in spark: " + data) + } + } + + private def getOptionsFromDataType(data: DataType): DataOptions = { + if (optionsToDataType.contains(data)) { + optionsToDataType(data) + } else { + throw new Exception("The corresponding option does not exist for spark data type: " + data) + } + } + +} diff --git a/src/core/test/datagen/src/main/scala/GenerateRow.scala b/src/core/test/datagen/src/main/scala/GenerateRow.scala new file mode 100644 index 0000000000..a258bc1f46 --- /dev/null +++ b/src/core/test/datagen/src/main/scala/GenerateRow.scala @@ -0,0 +1,70 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.ColumnOptions.ColumnOptions +import com.microsoft.ml.spark.DataOptions.DataOptions +import org.apache.spark.mllib.random.RandomDataGenerator +import org.apache.spark.sql.Row + +import scala.util.Random + +/** + * Combines an array of row generators into a single row generator. + * @param generators + */ +class RandomRowGeneratorCombiner(generators: Array[RandomMMLGenerator[Row]]) extends RandomMMLGenerator[Row] { + + override def nextValue(): Row = Row.merge(generators.map(generator => generator.nextValue()): _*) + + override def copy(): RandomRowGeneratorCombiner = new RandomRowGeneratorCombiner(generators) + +} + +/** + * Randomly generates a row given the set space of data, column options. + * @param col The column generation options specifying the column type to generate. + * @param data The data generation options specifying the data to generate. + */ +class RandomRowGenerator(col: ColumnOptions, data: DataOptions) extends RandomMMLGenerator[Row] { + + override def nextValue(): Row = { + if (data == DataOptions.Boolean) + Row(random.nextBoolean) + else if (data == DataOptions.Byte) + Row(random.nextByte) + else if (data == DataOptions.Double) + Row(random.nextDouble) + else if (data == DataOptions.Int) + Row(random.nextInt) + else if (data == DataOptions.Short) + Row(random.nextShort) + else if (data == DataOptions.String) + Row(random.nextString) + else if (data == DataOptions.Date) + Row(random.nextDate) + else if (data == DataOptions.Timestamp) + Row(random.nextTimestamp) + else throw new Exception("Selected type not supported: " + data) + } + + override def copy(): RandomRowGenerator = new RandomRowGenerator(col, data) + +} + +/** + * Base abstract class for random generation of data. + * @tparam T The data to generate. 
+ */ +abstract class RandomMMLGenerator[T] extends RandomDataGenerator[T] { + + var seed: Long = 0 + var random: GenerateDataType = new GenerateDataType(new Random(seed)) + + override def setSeed(seed: Long): Unit = { + random = new GenerateDataType(new Random(seed)) + this.seed = seed + } + +} diff --git a/src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala b/src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala new file mode 100644 index 0000000000..48dd3e0afa --- /dev/null +++ b/src/core/test/datagen/src/main/scala/ModuleFuzzingTest.scala @@ -0,0 +1,52 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.DataFrame + +/** + * Used to provide overrides on datasets to be constructed for testing fit/transform and default values + */ +abstract class EstimatorFuzzingTest extends TestBase { + def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = estimator + + def createFitDataset: DataFrame = { + val schema = schemaForDataset + GenerateDataset.generateDatasetFromOptions(session, + GenerateDataset.getOptionsFromSchema(schema), + new BasicDatasetGenerationConstraints(5, schema.size, Array()), + 0).toDF(schemaForDataset.map(_.name): _*) + } + + def createTransformDataset: DataFrame = createFitDataset + + def schemaForDataset: StructType + + def getEstimator(): Estimator[_] + + def getClassName: String = getEstimator().getClass.getName +} + +/** + * Used to provide overrides on datasets to be constructed for testing transform and default values + */ +abstract class TransformerFuzzingTest extends TestBase { + def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = transformer + + def createDataset: DataFrame = { + val schema = schemaForDataset + GenerateDataset.generateDatasetFromOptions(session, + GenerateDataset.getOptionsFromSchema(schema), + new BasicDatasetGenerationConstraints(5, schema.size, Array()), + 0) + } + + def schemaForDataset: StructType + + def getTransformer(): Transformer + + def getClassName: String = getTransformer().getClass.getName +} diff --git a/src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala b/src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala new file mode 100644 index 0000000000..94a7d5cd52 --- /dev/null +++ b/src/core/test/datagen/src/test/scala/VerifyGenerateDataset.scala @@ -0,0 +1,46 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +/** + * Verifies generating a dataset using the api. 
+ */ +class VerifyGenerateDataset extends TestBase { + + test("Smoke test to verify that generating a dataset works") { + val numRows = 10 + val numCols = 20 + val numSlotsPerVectorCol = Array(15, 15) + val seed = 1337 + val df = GenerateDataset + .generateDataset(session, new BasicDatasetGenerationConstraints(numRows, numCols, numSlotsPerVectorCol), + seed.toLong) + assert(df.columns.length == numCols) + assert(df.count == numRows) + } + + test("Verify that the generated dataset is always the same") { + val numRows = 10 + val numCols = 20 + val numSlotsPerVectorCol = Array(15, 15) + val seed = 1337 + + val datasets = (0 to 10).map(i => GenerateDataset + .generateDataset(session, new BasicDatasetGenerationConstraints(numRows, numCols, numSlotsPerVectorCol), + seed.toLong)) + + assert(datasets.forall(df => verifyResult(df, datasets(0))), "Datasets must be equal") + } + + test("Verify that for different seed, you will get different datasets") { + val numRows = 25 + val numCols = 10 + + val datasets = (0 to 10).map(i => GenerateDataset + .generateDataset(session, new BasicDatasetGenerationConstraints(numRows, numCols, Array()), i.toLong)) + + assert(!datasets.forall(df => verifyResult(df, datasets(0))), "Datasets must not be equal for different seeds") + } + +} diff --git a/src/data-conversion/build.sbt b/src/data-conversion/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/data-conversion/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/data-conversion/src/main/scala/DataConversion.scala b/src/data-conversion/src/main/scala/DataConversion.scala new file mode 100644 index 0000000000..073ca6b714 --- /dev/null +++ b/src/data-conversion/src/main/scala/DataConversion.scala @@ -0,0 +1,161 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import java.sql.Timestamp + +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import schema._ + +object DataConversion extends DefaultParamsReadable[DataConversion] + +/* +This class takes a DataFrame, a comma separated list of column names, and a conversion action and returns +a new DataFrame with the contents of the selected columns coverted to the requested type. 
+ */ +class DataConversion(override val uid: String) extends Transformer with MMLParams { + def this() = this(Identifiable.randomUID("DataConversion")) + + val col: Param[String] = StringParam(this, "col", + "comma separated list of columns whose type will be converted", "") + + /** @group getParam **/ + final def getCol: String = $(col) + + /** @group setParam **/ + def setCol(value: String): this.type = set(col, value) + + val convertTo: Param[String] = StringParam(this, "convertTo", "the result type", "") + + /** @group getParam **/ + final def getConvertTo: String = $(convertTo) + + /** @group setParam **/ + def setConvertTo(value: String): this.type = set(convertTo, value) + + val dateTimeFormat: Param[String] = StringParam(this, "dateTimeFormat", + "format for DateTime when making DateTime:String conversions", "yyyy-MM-dd HH:mm:ss") + + /** @group getParam **/ + final def getDateTimeFormat: String = $(dateTimeFormat) + + /** @group setParam **/ + def setDateTimeFormat(value: String): this.type = set(dateTimeFormat, value) + + override def transform(dataset: Dataset[_]): DataFrame = { + require($(col) != null, "No column name specified") + require(dataset != null, "No dataset supplied") + require(dataset.columns.length != 0, "Dataset with no columns cannot be converted") + val colsList = $(col).split(",").map(_.trim) + val errorList = verifyCols(dataset.toDF(), colsList) + if (!errorList.isEmpty) { + throw new NoSuchElementException + } + var df = dataset.toDF + + val res: DataFrame = { + for (convCol <- colsList) { + df = $(convertTo) match { + case "boolean" => numericTransform(df, BooleanType, convCol) + case "byte" => numericTransform(df, ByteType, convCol) + case "short" => numericTransform(df, ShortType, convCol) + case "integer" => numericTransform(df, IntegerType, convCol) + case "long" => numericTransform(df, LongType, convCol) + case "float" => numericTransform(df, FloatType, convCol) + case "double" => numericTransform(df, DoubleType, convCol) + case "string" => numericTransform(df, StringType, convCol) + case "toCategorical" => SparkSchema.makeCategorical(df, convCol, convCol, true) + case "clearCategorical" => SparkSchema.makeNonCategorical(df, convCol, convCol) + case "date" => toDateConversion(df, convCol) + } + } + df + } + res + } + + /** + * @param dataset - The input dataset, to be transformed + * @param paramMap - ParamMap which contains parameter value to override the default value + * @return - the DataFrame that results from data conversion + */ + override def transform(dataset: Dataset[_], paramMap: ParamMap): DataFrame = { + setCol(paramMap.getOrElse(new Param("col", "col","name of column whose type will be converted"), "")) + setConvertTo(paramMap.getOrElse(new Param("convertTo", "convertTo","result type"), "")) + setDateTimeFormat(paramMap.getOrElse(new Param("dateTimeFormat", "dateTimeFormat", "time string format"), "")) + transform(dataset) + } + + def transformSchema(schema: StructType): StructType = { + System.err.println("transformSchema not implemented yet") + schema + } + + def copy(extra: ParamMap): DataConversion = defaultCopy(extra) + + /* + Convert to a numeric type or a string. If the input type was a TimestampType, tnen do a different conversion? 
+ */ + private def numericTransform(df: DataFrame, outType: DataType, columnName: String): DataFrame = { + val inType = df.schema(columnName).dataType + if (inType == StringType && outType == BooleanType) throw new Exception("String to Boolean not supported") + val res = inType match { + case TimestampType => fromDateConversion(df, outType, columnName) + case _ => df.withColumn(columnName, df(columnName).cast(outType).as(columnName)) + } + res + } + + /* + Convert a TimestampType to a StringType or a LongType, else error + */ + private def fromDateConversion(df: DataFrame, outType: DataType, columnName: String): DataFrame = { + require(outType == StringType || outType == LongType, "Date only converts to string or long") + val res = outType match { + case LongType => { + val getTime = udf((t:java.sql.Timestamp)=>t.getTime()) + df.withColumn(columnName, getTime(df(columnName))) + } + case StringType => { + val parseTimeString = udf((t:java.sql.Timestamp)=>{ + val f:java.text.SimpleDateFormat = new java.text.SimpleDateFormat($(dateTimeFormat));f.format(t)}) + df.withColumn(columnName, parseTimeString(df(columnName))) + } + } + res + } + + private def toDateConversion(df: DataFrame, columnName: String): DataFrame = { + val inType = df.schema(columnName).dataType + require(inType == StringType || inType == LongType, "Can only convert string or long to Date") + val res = inType match { + case StringType => { + val f = new java.text.SimpleDateFormat($(dateTimeFormat)) + val parseTimeFromString = udf((t:String)=>{new Timestamp(f.parse(t).getTime)}) + df.withColumn(columnName, parseTimeFromString(df(columnName)).cast("timestamp")).as(columnName) + } + case LongType => { + val longToTimestamp = udf((t:Long)=>{new java.sql.Timestamp(t)}) + df.withColumn(columnName, longToTimestamp(df(columnName))) + } + } + res + } + + private def verifyCols(df: DataFrame, req: Array[String]): List[String] = { + req.foldLeft(List[String]()) { (l, r) => + if (df.columns.contains(r)) l + else { + System.err.println(s"DataFrame does not contain specified column: $r") + r :: l + } + } + } + +} diff --git a/src/data-conversion/src/test/scala/VerifyDataConversion.scala b/src/data-conversion/src/test/scala/VerifyDataConversion.scala new file mode 100644 index 0000000000..49b42eebf0 --- /dev/null +++ b/src/data-conversion/src/test/scala/VerifyDataConversion.scala @@ -0,0 +1,232 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
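For orientation, a small usage sketch of the DataConversion transformer above; the DataFrame `df` and its column names are hypothetical:

// Convert two comma-listed columns to doubles in a single pass
val asDoubles = new DataConversion()
  .setCol("age, income")
  .setConvertTo("double")
  .transform(df)

// Tag a string column with categorical metadata
val withCategory = new DataConversion()
  .setCol("country")
  .setConvertTo("toCategorical")
  .transform(asDoubles)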
+ +package com.microsoft.ml.spark + +import java.sql.Timestamp + +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types._ +import schema._ + +class VerifyDataConversions extends TestBase { + + import session.implicits._ + + val testVal: Long = (Int.MaxValue).toLong + 100 + val testShort: Integer = Short.MaxValue + 100 + /* + DataFrame for the numerical and string <--> numerical conversions + */ + val masterInDF = Seq((true: Boolean, 1: Byte, 2: Short, 3: Integer, 4: Long, 5.0F, 6.0, "7", "8.0"), + (false, 9: Byte, 10: Short, 11: Integer, 12: Long, 14.5F, 15.5, "16", "17.456"), + (true, -127: Byte, 345: Short, testShort, testVal, 18.91F, 20.21, "100", "200.12345")) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + + /* + DataFrame of Timestamp data + */ + val tsDF = Seq("1986-07-27 12:48:00.123", "1988-11-01 11:08:48.456", "1993-08-06 15:32:00.789").toDF("Col0") + .select($"Col0".cast("timestamp")) + + /* + Timestamps-as-longs DataFrame. These longs were generated by feeding the above timestamp + values to Timestamp.getTime() + */ + val f = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS") + val parseTimeFromString = udf((t:String)=>{new Timestamp(f.parse(t).getTime)}) + val lDF = Seq(f.parse("1986-07-27 12:48:00.123").getTime(), + f.parse("1988-11-01 11:08:48.456").getTime(), + f.parse("1993-08-06 15:32:00.789").getTime()).toDF("Col0") + + /* + Timestamps-as-strings DataFrame + */ + val sDF = Seq("1986-07-27 12:48:00.123", "1988-11-01 11:08:48.456", "1993-08-06 15:32:00.789").toDF("Col0") + + /* + Test conversion of all numeric types to Boolean. + Strings are cast to null, which causes the comparison test to fail, so for now I + am skipping the string tests.
+ Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + */ + test("Test convert all types to Boolean") { + val r1 = new DataConversion().setCol("byte").setConvertTo("boolean").transform(masterInDF) + val r2 = new DataConversion().setCol("short").setConvertTo("boolean").transform(r1) + val r3 = new DataConversion().setCol("int").setConvertTo("boolean").transform(r2) + val r4 = new DataConversion().setCol("long").setConvertTo("boolean").transform(r3) + val r5 = new DataConversion().setCol("float").setConvertTo("boolean").transform(r4) + val r6 = new DataConversion().setCol("double").setConvertTo("boolean").transform(r5) + val expectedRes = Seq(( true, true, true, true, true, true, true, "7", "8.0"), + (false, true, true, true, true, true, true, "16", "17.456"), + (true, true, true, true, true, true, true, "100", "200.12345")) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(r6.schema("byte").dataType == BooleanType) + assert(r6.schema("short").dataType == BooleanType) + assert(r6.schema("int").dataType == BooleanType) + assert(r6.schema("long").dataType == BooleanType) + assert(r6.schema("float").dataType == BooleanType) + assert(r6.schema("double").dataType == BooleanType) + } + + /* + Verify sting to boolean throws an error + */ + test("Test convert string to boolean throws an exception") { + assertThrows[Exception] { + new DataConversion().setCol("intstring").setConvertTo("boolean").transform(masterInDF) + } + } + + /* + Test conversion of all numeric types to Byte, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Byte") { + val expectedDF = Seq(( 1: Byte, 1: Byte, 2: Byte, 3: Byte, 4: Byte, 5: Byte, 6: Byte, 7: Byte, 8: Byte), + (0: Byte, 9: Byte, 10: Byte, 11: Byte, 12: Byte, 14: Byte, 127: Byte, 16: Byte, 17: Byte), + (1: Byte, -127: Byte, 89: Byte, 99: Byte, 99: Byte, 18: Byte, 20: Byte, 100: Byte, -56: Byte)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + val res = generateRes("byte", masterInDF) + assert(res.schema("bool").dataType == ByteType) + assert(res.schema("short").dataType == ByteType) + assert(res.schema("int").dataType == ByteType) + assert(res.schema("long").dataType == ByteType) + assert(res.schema("float").dataType == ByteType) + assert(res.schema("double").dataType == ByteType) + assert(res.schema("intstring").dataType == ByteType) + assert(res.schema("doublestring").dataType == ByteType) + } + + /* + Test conversion of all numeric types to Short, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. 
For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Short") { + val expectedDF = Seq(( 1: Short, 1: Short, 2: Short, 3: Short, 4: Short, 5: Short, 6: Short, 7: Short, 8: Short), + (0: Short, 9: Short, 10: Short, 11: Short, 12: Short, 14: Short, 15: Short, 16: Short, 17: Short), + (1: Short, -127: Short, 345: Short, -32669: Short, 99: Short, 18: Short, 20: Short, 100: Short, 200: Short)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("short", masterInDF)).count == 0) + } + + /* + Test conversion of all numeric types to Integer, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Integer") { + val expectedDF = Seq((1, 1, 2, 3, 4, 5, 6, 7, 8), + (0, 9, 10, 11, 12, 14, 15, 16, 17), + (1, -127, 345, 32867, -2147483549, 18, 20, 100, 200)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("integer", masterInDF)).count == 0) + } + + /* + Test conversion of all numeric types to Long, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + For floats and doubles, the conversion value is the truncated integer portion of + the number. 
For values that exceed the min/max value for integers, the value will be truncated + at the least 32 bits, so a very large number will end up being a very large negative number + */ + test("Test convert to Long") { + val expectedDF = Seq((1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L), + (0L, 9L, 10L, 11L, 12L, 14L, 15L, 16L, 17L), + (1L, -127L, 345L, 32867L, 2147483747L, 18L, 20L, 100L, 200L)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("long", masterInDF)).count == 0) + } + + /* + Test conversion of all numeric types to Double, as well as string representations + of integers and doubles + Types tested are boolean, Byte, Short, Int, Long, Float, Double, and string + */ + test("Test convert to Double") { + val fToD = 18.91F.toDouble + val expectedDF = Seq((1.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0), + (0.0, 9.0, 10.0, 11.0, 12.0, 14.5, 15.5, 16.0, 17.456), + (1.0, -127.0, 345.0, 32867.0, 2147483747.0, fToD, 20.21, 100.0, 200.12345)) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("double", masterInDF)).count == 0) + } + + // Test the conversions to string + test("Test convert all types to String") { + val expectedDF = Seq(("true", "1", "2", "3", "4", "5.0", "6.0", "7", "8.0"), + ("false", "9", "10", "11", "12", "14.5", "15.5", "16", "17.456"), + ("true", "-127", "345", "32867", "2147483747", "18.91", "20.21", "100", "200.12345")) + .toDF("bool", "byte", "short", "int", "long", "float", "double", "intstring", "doublestring") + assert(expectedDF.except(generateRes("string", masterInDF)).count == 0) + } + + // Test convert to categorical: + test("Test convert to categorical") { + val inDF = Seq(("piano", 1, 2), ("drum", 3, 4), ("guitar", 5, 6)).toDF("instruments", "c1", "c2") + val res = new DataConversion().setCol("instruments").setConvertTo("toCategorical").transform(inDF) + assert(SparkSchema.isCategorical(res, "instruments")) + } + + // Test clearing categorical + test("Test that categorical features will be cleared") { + val inDF = Seq(("piano", 1, 2), ("drum", 3, 4), ("guitar", 5, 6)).toDF("instruments", "c1", "c2") + val res = new DataConversion().setCol("instruments").setConvertTo("toCategorical").transform(inDF) + assert(SparkSchema.isCategorical(res, "instruments")) + val res2 = new DataConversion().setCol("instruments").setConvertTo("clearCategorical").transform(res) + assert(!SparkSchema.isCategorical(res2, "instruments")) + assert(inDF.except(res2).count == 0) + } + + // Verify that a TimestampType is converted to a LongType + test("Test timestamp to long conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("long") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(tsDF) + assert(res.schema("Col0").dataType == LongType) + assert(lDF.except(res).count == 0) + } + + // Test the reverse - long to timestamp + test("Test long to timestamp conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("date") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(lDF) + assert(res.schema("Col0").dataType == TimestampType) + assert(tsDF.except(res).count == 0) + } + + test("Test timestamp to string conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("string") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(tsDF) + assert(res.schema("Col0").dataType == StringType) + assert(sDF.except(res).count == 0) + } + + test("Test date string to timestamp 
conversion") { + val res = new DataConversion().setCol("Col0").setConvertTo("date") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(sDF) + val res2 = new DataConversion().setCol("Col0").setConvertTo("long") + .setDateTimeFormat("yyyy-MM-dd HH:mm:ss.SSS").transform(res) + assert(res.schema("Col0").dataType == TimestampType) + assert(tsDF.except(res).count == 0) + } + + def generateRes(convTo: String, inDF: DataFrame): DataFrame = { + val result = new DataConversion().setCol("bool, byte, short, int, long, float, double, intstring, doublestring") + .setConvertTo(convTo).transform(masterInDF) + result + } + +} diff --git a/src/downloader/build.sbt b/src/downloader/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/downloader/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/downloader/src/main/python/ModelDownloader.py b/src/downloader/src/main/python/ModelDownloader.py new file mode 100644 index 0000000000..b774f63184 --- /dev/null +++ b/src/downloader/src/main/python/ModelDownloader.py @@ -0,0 +1,101 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +from pyspark.ml.param.shared import * +from mmlspark.Utils import * + +DEFAULT_URL = "https://mmlspark.azureedge.net/datasets/CNTKModels/" + + +class ModelSchema: + def __init__(self, name, dataset, modelType, uri, hash, size, inputNode, numLayers, layerNames): + """ + An object that represents a model. + + :param name: Name of the model + :param dataset: Dataset it was trained on + :param modelType: Domain that the model operates on + :param uri: The location of the model's bytes + :param hash: The sha256 hash of the models bytes + :param size: the size of the model in bytes + :param inputNode: the node which represents the input + :param numLayers: the number of layers of the model + :param layerNames: the names of nodes that represent layers in the network + """ + self.name = name + self.dataset = dataset + self.modelType = modelType + self.uri = uri + self.hash = hash + self.size = size + self.inputNode = inputNode + self.numLayers = numLayers + self.layerNames = layerNames + + def __str__(self): + return self.__repr__() + + def __repr__(self): + return "ModelSchema".format(self.name, self.dataset, self.uri) + + def toJava(self, sparkSession): + ctx = sparkSession.sparkContext + uri = ctx._jvm.java.net.URI(self.uri) + return ctx._jvm.com.microsoft.ml.spark.ModelSchema( + self.name, self.dataset, self.modelType, + uri, self.hash, self.size, self.inputNode, + self.numLayers, self.layerNames) + + @staticmethod + def fromJava(jobj): + return ModelSchema(jobj.name(), jobj.dataset(), + jobj.modelType(), jobj.uri().toString(), + jobj.hash(), jobj.size(), jobj.inputNode(), + jobj.numLayers(), list(jobj.layerNames())) + + +class ModelDownloader: + def __init__(self, sparkSession, localPath, serverURL=DEFAULT_URL): + """ + A class for downloading CNTK pretrained models in python. To download all models use the downloadModels + function. To browse models from the microsoft server please use remoteModels. + + :param sparkSession: A spark session for interfacing between python and java + :param localPath: The folder to save models to + :param serverURL: The location of the model Server, beware this default can change! 
+ """ + self.localPath = localPath + self.serverURL = serverURL + + self._sparkSession = sparkSession + self._ctx = sparkSession.sparkContext + self._model_downloader = self._ctx._jvm.com.microsoft.ml.spark.ModelDownloader( + sparkSession._jsparkSession, localPath, serverURL) + + def _wrap(self, iter): + return (ModelSchema.fromJava(s) for s in iter) + + def localModels(self): + return self._wrap(self._model_downloader.localModels()) + + def remoteModels(self): + return self._wrap(self._model_downloader.remoteModels()) + + def downloadModel(self, model): + model = model.toJava(self._sparkSession) + return ModelSchema.fromJava(self._model_downloader.downloadModel(model)) + + def downloadByName(self, name): + return ModelSchema.fromJava(self._model_downloader.downloadByName(name)) + + def downloadModels(self, models=None): + if models is None: + models = self.remoteModels() + models = (m.toJava(self._sparkSession) for m in models) + + return list(self._wrap(self._model_downloader.downloadModels(models))) diff --git a/src/downloader/src/main/scala/ModelDownloader.scala b/src/downloader/src/main/scala/ModelDownloader.scala new file mode 100644 index 0000000000..55b42a3bb0 --- /dev/null +++ b/src/downloader/src/main/scala/ModelDownloader.scala @@ -0,0 +1,260 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io._ +import java.net.{URI, URL} +import java.util +import org.apache.commons.io.IOUtils +import org.apache.hadoop.conf.{Configuration => HadoopConf} +import org.apache.hadoop.fs.{FileSystem, LocatedFileStatus, Path} +import org.apache.hadoop.io.{IOUtils => HUtils} +import org.apache.log4j.LogManager +import org.apache.spark.sql.SparkSession +import spray.json._ +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + +/** + * Abstract representation of a repository for future expansion + * + * @tparam S an instantiation of the + */ +private[spark] abstract class Repository[S <: Schema] { + + def listSchemas(): Iterable[S] + + def getBytes(schema: S): InputStream + + def addBytes(schema: S, location: URI, bytes: InputStream): S + +} + +/** + * Exception returned if a repo cannot find the file + * + * @param uri : location of the file + */ +class ModelNotFoundException(uri: URI) extends FileNotFoundException(s"model located at $uri could not be found") + +private[spark] class HDFSRepo[S <: Schema](val uri: URI, val hconf: HadoopConf) + (implicit val jsonFormat: JsonFormat[S]) + extends Repository[S] { + + private val rootPath = new Path(uri) + + private val fs = FileSystem.get(uri, hconf) + + if (!fs.exists(rootPath)) { + fs.mkdirs(rootPath) + } + + override def listSchemas(): Iterable[S] = { + val fileIteratorHadoop = fs.listFiles(rootPath, false) + val fileIterator = new Iterator[LocatedFileStatus] { + def hasNext: Boolean = fileIteratorHadoop.hasNext + + def next(): LocatedFileStatus = fileIteratorHadoop.next() + } + + val schemaStrings = fileIterator + .filter(status => + status.isFile & status.getPath.toString.endsWith(".meta")) + .map(status => + IOUtils.toString(fs.open(status.getPath).getWrappedStream)) + + schemaStrings.map(s => s.parseJson.convertTo[S]).toList + } + + override def getBytes(schema: S): InputStream = { + try { + fs.open(new Path(schema.uri)) + } catch { + case _: IOException => throw new ModelNotFoundException(schema.uri) + } + } + + override def addBytes(schema: S, location: URI, bytes: 
InputStream): S = { + val path = new Path(location) + val os = fs.create(path) + try { + HUtils.copyBytes(bytes, os, hconf) + } finally { + os.close() + } + val downloadedIs = fs.open(path) + try{ + schema.assertMatchingHash(downloadedIs) + }finally{ + downloadedIs.close() + } + + val newSchema = schema.updateURI(location) + val schemaPath = new Path(location.getPath + ".meta") + val osSchema = fs.create(schemaPath) + val SchemaIs = IOUtils.toInputStream(newSchema.toJson.prettyPrint) + try { + HUtils.copyBytes(SchemaIs, osSchema, hconf) + } finally { + osSchema.close() + SchemaIs.close() + } + newSchema + } + +} + +/** + * Class to represent repository of models that will eventually be hosted outside the repo + */ +private[spark] class DefaultModelRepo(val baseURL: URL) extends Repository[ModelSchema] { + var connectTimeout = 15000 + var readTimeout = 5000 + + import SchemaJsonProtocol._ + + private def toStream(url: URL) = { + val urlCon = url.openConnection() + urlCon.setConnectTimeout(connectTimeout) + urlCon.setReadTimeout(readTimeout) + new BufferedInputStream(urlCon.getInputStream) + } + + private def join(root: URL, file: String) = { + new Path(new Path(root.toURI), file).toUri.toURL + } + + override def listSchemas(): Iterable[ModelSchema] = { + val url = join(baseURL, "MANIFEST") + val manifestStream = toStream(url) + try { + val modelStreams = IOUtils.readLines(manifestStream).map(fn => toStream(join(baseURL, fn))) + try { + modelStreams.map(s => IOUtils.toString(s).parseJson.convertTo[ModelSchema]) + } finally { + modelStreams.foreach(_.close()) + } + } finally { + manifestStream.close() + } + } + + override def getBytes(schema: ModelSchema): InputStream = { + try { + val url = schema.uri.toURL + val urlCon = url.openConnection() + urlCon.setConnectTimeout(connectTimeout) + urlCon.setReadTimeout(readTimeout) + new BufferedInputStream(urlCon.getInputStream) + } catch { + case _: IOException => throw new ModelNotFoundException(schema.uri) + } + } + + override def addBytes(schema: ModelSchema, location: URI, bytes: InputStream): ModelSchema = + throw new IllegalAccessError("Do not have the credentials to write a file to the remote repository") +} + +private[spark] abstract class Client { + var quiet = false + + private def log(s: String): Unit = { + LogManager.getRootLogger.info(s) + } + + def repoTransfer[T <: Schema](schema: T, targetLocation: URI, + source: Repository[T], target: Repository[T], + overwrite: Boolean = false, closeStream: Boolean = true): T = { + if (target.listSchemas().exists(s => + (s.uri == targetLocation) && (s.hash == schema.hash))) { + log(s"Using model at $targetLocation, skipping download") + target.listSchemas().find(_.hash == schema.hash).get + } else { + log(s"No model found in local repo, writing bytes to $targetLocation") + val sourceStream = source.getBytes(schema) + try { + target.addBytes(schema, targetLocation, sourceStream) + } finally { + if (closeStream) sourceStream.close() + } + } + } + +} + +private[spark] object ModelDownloader { + private[spark] val defaultURL = new URL("https://mmlspark.azureedge.net/datasets/CNTKModels/") +} + +/** + * Class for downloading models from a server to Local or HDFS + * + * @param spark Spark session so that the downloader can save to HDFS + * @param localPath path to a directory that will store the models (local or HDFS) + * @param serverURL URL of the server which supplies models ( The default URL is subject to change) + */ +class ModelDownloader(val spark: SparkSession, + val localPath: URI, + val 
serverURL: URL = ModelDownloader.defaultURL) extends Client { + + import SchemaJsonProtocol._ + + def this(spark: SparkSession, localPath: String, serverURL: String) = { + this(spark, new URI(localPath), new URL(serverURL)) + } + + private val localModelRepo = new HDFSRepo[ModelSchema](localPath, spark.sparkContext.hadoopConfiguration) + + private val remoteModelRepo = new DefaultModelRepo(serverURL) + + /** + * Function for querying the local repository for its registered models + * + * @return the model schemas found in the downloader's local path + */ + def localModels: util.Iterator[ModelSchema] = localModelRepo.listSchemas().iterator.asJava + + /** + * Function for querying the remote server for its registered models + * + * @return the model schemas found in remote reposiory accessed through the serverURL + */ + def remoteModels: util.Iterator[ModelSchema] = remoteModelRepo.listSchemas().iterator.asJava + + /** + * Method to download a single model + * @param model the remote model schema + * @return the new local model schema with a URI that points to the model's location (on HDFS or local) + */ + def downloadModel(model: ModelSchema): ModelSchema = { + repoTransfer(model, + new Path(new Path(localPath), NamingConventions.canonicalModelFilename(model)).toUri, + remoteModelRepo, localModelRepo) + } + + def downloadByName(name: String): ModelSchema = { + val models = remoteModels.filter(_.name == name).toList + if (models.length != 1) { + throw new IllegalArgumentException(s"there are ${models.length} models with the same name") + } + downloadModel(models.head) + } + + /** + * @param models An iterable of remote model schemas + * @return An list of local model schema whose URI's points to the model's location (on HDFS or local) + */ + def downloadModels(models: Iterable[ModelSchema] = remoteModels.toIterable): List[ModelSchema] = + // Call toList so that all models are downloaded when downloadModels are called + models.map(downloadModel).toList + + /** + * @param models A java iterator of remote model schemas for in the java api (for python wrapper) + * @return A java List of local model schema whose URI's points to the model's location (on HDFS or local) + */ + def downloadModels(models: util.ArrayList[ModelSchema]): util.List[ModelSchema] = + // Call toList so that all models are downloaded when downloadModels are called + models.map(downloadModel).toList.asJava + +} diff --git a/src/downloader/src/main/scala/Schema.scala b/src/downloader/src/main/scala/Schema.scala new file mode 100644 index 0000000000..f30d02a83e --- /dev/null +++ b/src/downloader/src/main/scala/Schema.scala @@ -0,0 +1,92 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io.InputStream +import java.net.URI +import org.apache.commons.codec.digest.DigestUtils +import spray.json._ + +import scala.collection.JavaConversions._ +import scala.collection.JavaConverters._ + +private[spark] object NamingConventions { + + def canonicalModelFilename(name: String, dataset: String): String = + s"${name}_$dataset.model" + + def canonicalModelFilename(model: ModelSchema): String = + s"${model.name}_${model.dataset}.model" + +} + +/** + * Abstract representation of a schema for an item that can be held in a repository + * + * @param uri location of the underlying file (local, HDFS, or HTTP) + * @param hash sha256 hash of the underlying file + * @param size size in bytes of the underlying file + */ +abstract class Schema(val uri: URI, val hash: String, val size: Long) { + + def updateURI(newURI: URI): this.type + + def assertMatchingHash(bytes: InputStream): Unit = { + val newHash = DigestUtils.sha256Hex(bytes) + if (newHash != hash) { + throw new IllegalArgumentException(s"downloaded hash: $newHash does not match given hash: $hash") + } + } + +} + +/** + * Class representing the schema of a CNTK model + * @param name name of the model architecture + * @param dataset dataset the model was trained on + * @param modelType type of problem the model is suited for eg: (image, text, sound, sentiment etc) + * @param uri location of the underlying file (local, HDFS, or HTTP) + * @param hash sha256 hash of the underlying file + * @param size size in bytes of the underlying file + * @param inputNode the node which represents the input + * @param numLayers the number of layers of the model + * @param layerNames the names nodes that represent layers in the network + */ +case class ModelSchema(name: String, + dataset: String, + modelType: String, + override val uri: URI, + override val hash: String, + override val size: Long, + inputNode: Int, + numLayers: Int, + layerNames: Array[String]) + extends Schema(uri, hash, size) { + + def this(name: String, dataset: String, modelType: String, + uri: URI, hash: String, size: Long, inputNode: Int, numLayers: Int, + layerNames: java.util.ArrayList[String]) = { + this(name, dataset, modelType, uri, hash, size, + inputNode, numLayers, layerNames.toList.toArray) + } + + override def updateURI(newURI: URI): this.type = this.copy(uri = newURI).asInstanceOf[this.type] + +} + +private[spark] object SchemaJsonProtocol extends DefaultJsonProtocol { + + implicit object URIJsonFormat extends JsonFormat[URI] { + def write(u: URI): JsValue = { + JsString(u.toString) + } + + def read(value: JsValue): URI = new URI(value.asInstanceOf[JsString].value) + } + + implicit val modelSchemaFormat: RootJsonFormat[ModelSchema] = + jsonFormat(ModelSchema.apply, + "name", "dataset", "modelType", "uri", "hash", "size", "inputNode", "numLayers", "layerNames") + +} diff --git a/src/downloader/src/test/scala/DownloaderSuite.scala b/src/downloader/src/test/scala/DownloaderSuite.scala new file mode 100644 index 0000000000..835c39e716 --- /dev/null +++ b/src/downloader/src/test/scala/DownloaderSuite.scala @@ -0,0 +1,49 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.nio.file.Files +import com.microsoft.ml.spark.FileUtilities.File +import scala.collection.JavaConversions._ + +class DownloaderSuite extends TestBase { + + val saveDir = Files.createTempDirectory("Models-").toFile + val d = new ModelDownloader(session, saveDir.toURI) + + test("A downloader should be able to download a model", TestBase.Extended) { + val m = d.remoteModels.filter(_.name == "CNN").next() + val schema = d.downloadModel(m) + println(schema) + assert(m.size == new File(schema.uri).length()) + assert(d.localModels.toList.length == 1) + } + + test("A downloader should be able to get all Models " + + "and maybeDownload should be fast if models are downloaded", TestBase.Extended) { + + d.downloadModels() + val modTimes = d.localModels.map(s => + new File(s.uri).lastModified()) + + d.downloadModels() + val modTimes2 = d.localModels.map(s => + new File(s.uri).lastModified()) + + // No modification on second call because models are cached + assert(modTimes.toList === modTimes2.toList) + + // the downloader's local models will reflect the change + assert(d.localModels.toList.length == d.remoteModels.toList.length) + + // there will be a metadata file for every model + assert(saveDir.list().count(_.endsWith(".meta")) == d.localModels.toList.length) + } + + override def afterAll(): Unit = { + FileUtilities.delTree(saveDir) + super.afterAll() + } + +} diff --git a/src/featurize/build.sbt b/src/featurize/build.sbt new file mode 100644 index 0000000000..c013e90fa1 --- /dev/null +++ b/src/featurize/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: utils +//> DependsOn: multi-column-adapter diff --git a/src/featurize/src/main/scala/AssembleFeatures.scala b/src/featurize/src/main/scala/AssembleFeatures.scala new file mode 100644 index 0000000000..27b583d00e --- /dev/null +++ b/src/featurize/src/main/scala/AssembleFeatures.scala @@ -0,0 +1,499 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.io._ + +import com.microsoft.ml.spark.schema.{CategoricalColumnInfo, DatasetExtensions} +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.feature._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.linalg.SparseVector +import org.apache.spark.mllib.linalg.VectorUDT +import org.apache.spark.sql._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{StringType, _} + +import scala.collection.mutable +import scala.collection.mutable.ListBuffer +import scala.collection.immutable.{BitSet, HashSet} + +private object AssembleFeaturesUtilities +{ + private val tokenizedColumnName = "tokenizedFeatures" + private val hashedFeaturesColumnName = "hashedFeatures" + private val selectedFeaturesColumnName = "selectedFeatures" + + def getTokenizedColumnName(dataset:DataFrame): String = { + dataset.withDerivativeCol(tokenizedColumnName) + } + + def getHashedFeaturesColumnName(dataset:DataFrame): String = { + dataset.withDerivativeCol(hashedFeaturesColumnName) + } + + def getSelectedFeaturesColumnName(dataset:DataFrame): String = { + dataset.withDerivativeCol(selectedFeaturesColumnName) + } + + def hashStringColumns(nonMissingDataset: DataFrame, colNamesToHash: ListBuffer[String], + hashingTransform: HashingTF): DataFrame = { + val tokenizeFunc = udf((cols: Seq[String]) => cols + .filter(str => str != null && !str.isEmpty) + .flatMap(str => str.toLowerCase.split("\\s"))) + val cols = array(colNamesToHash.map(x => col(x)): _*) + val combinedData = nonMissingDataset.withColumn(hashingTransform.getInputCol, tokenizeFunc(cols)) + hashingTransform.transform(combinedData) + } + + def isNumeric(dataType: DataType): Boolean = dataType == IntegerType || + dataType == BooleanType || + dataType == LongType || + dataType == ByteType || + dataType == ShortType || + dataType == FloatType +} + +/** + * Class containing the list of column names to perform special featurization steps for. + * colNamesToHash - List of column names to hash. + * colNamesToDuplicateForMissings - List of column names containing doubles to duplicate + * so we can remove missing values from them. + * colNamesToTypes - Map of column names to their types. + * colNamesToCleanMissings - List of column names to clean missing values from (ignore). + * colNamesToVectorize - List of column names to vectorize using FastVectorAssembler. + * categoricalColumns - List of categorical columns to pass through or turn into indicator array. + * conversionColumnNamesMap - Map from old column names to new. + */ +@SerialVersionUID(0L) +class ColumnNamesToFeaturize extends Serializable { + val colNamesToHash = ListBuffer[String]() + val colNamesToDuplicateForMissings = ListBuffer[String]() + val colNamesToTypes = mutable.Map[String, DataType]() + val vectorColumnsToAdd = ListBuffer[String]() + val colNamesToCleanMissings = ListBuffer[String]() + val colNamesToVectorize = ListBuffer[String]() + val categoricalColumns = mutable.Map[String, String]() + val conversionColumnNamesMap = mutable.Map[String, String]() +} + +object AssembleFeatures extends DefaultParamsReadable[AssembleFeatures] + +/** + * Assembles the features in a dataset, converting them to a form appropriate for training. 
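+ *
+ * A minimal usage sketch (the DataFrame `df` and its column names are hypothetical):
+ * {{{
+ *   val model = new AssembleFeatures()
+ *     .setColumnsToFeaturize(Array("col1", "col2"))
+ *     .fit(df)
+ *   val assembled = model.transform(df)  // adds the assembled "features" vector column
+ * }}}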
+ */ +class AssembleFeatures(override val uid: String) extends Estimator[AssembleFeaturesModel] + with HasFeaturesCol with MMLParams { + + def this() = this(Identifiable.randomUID("AssembleFeatures")) + + setDefault(featuresCol -> "features") + + val columnsToFeaturize: StringArrayParam = + new StringArrayParam(this, "columnsToFeaturize", "columns to featurize", array => true) + + /** @group getParam **/ + final def getColumnsToFeaturize: Array[String] = $(columnsToFeaturize) + + /** @group setParam **/ + def setColumnsToFeaturize(value: Array[String]): this.type = set(columnsToFeaturize, value) + + val oneHotEncodeCategoricals: Param[Boolean] = BooleanParam(this, + "oneHotEncodeCategoricals", + "one hot encode categoricals", + true) + + /** @group getParam **/ + final def getOneHotEncodeCategoricals: Boolean = $(oneHotEncodeCategoricals) + + /** @group setParam **/ + def setOneHotEncodeCategoricals(value: Boolean): this.type = set(oneHotEncodeCategoricals, value) + + val numberOfFeatures: IntParam = + IntParam(this, "numberOfFeatures", "number of features to hash string columns to") + + /** @group getParam **/ + final def getNumberOfFeatures: Int = $(numberOfFeatures) + + /** @group setParam **/ + def setNumberOfFeatures(value: Int): this.type = set(numberOfFeatures, value) + + /** + * Assembles the features in the dataset. + * + * @param dataset The input dataset to fit. + * @return The model that will return the original dataset with assembled features as a vector. + */ + override def fit(dataset: Dataset[_]): AssembleFeaturesModel = { + val columnNamesToFeaturize = new ColumnNamesToFeaturize + + val columnsToFeaturize = HashSet[String](getColumnsToFeaturize: _*) + + val columns = dataset.columns + + val allIntermediateCols = new mutable.HashSet[String]() + allIntermediateCols ++= columns + + val datasetAsDf = dataset.toDF() + + // Remap and convert columns prior to training + columns.foreach { + col => if (columnsToFeaturize.contains(col)) { + val unusedColumnName = DatasetExtensions.findUnusedColumnName(col)(allIntermediateCols) + allIntermediateCols += unusedColumnName + + // Find out if column is categorical + // If using non-tree learner, one-hot encode them + // Otherwise, pass attributes directly to train classifier, + // but move categoricals to beginning for superior + // runtime and to avoid spark bug + val categoricalInfo = new CategoricalColumnInfo(datasetAsDf, col) + val isCategorical = categoricalInfo.isCategorical + if (isCategorical) { + val oheColumnName = DatasetExtensions.findUnusedColumnName("TmpOHE_" + unusedColumnName)(allIntermediateCols) + columnNamesToFeaturize.categoricalColumns += unusedColumnName -> oheColumnName + } + + dataset.schema(col).dataType match { + case _ @ (dataType: DataType) if dataType == DoubleType + || dataType == FloatType => { + columnNamesToFeaturize.colNamesToTypes += unusedColumnName -> dataType + // For double and float columns, will always need to remove possibly NaN values + columnNamesToFeaturize.colNamesToCleanMissings += unusedColumnName + columnNamesToFeaturize.conversionColumnNamesMap += col -> unusedColumnName + } + case _ @ (dataType: DataType) if (AssembleFeaturesUtilities.isNumeric(dataType)) => { + // Convert all numeric columns to same type double to feed them as a vector to the learner + if (dataset.schema(col).nullable) { + columnNamesToFeaturize.colNamesToCleanMissings += unusedColumnName + } + columnNamesToFeaturize.colNamesToTypes += unusedColumnName -> dataType + columnNamesToFeaturize.conversionColumnNamesMap += col 
-> unusedColumnName + } + case _: StringType => { + // Hash string columns + columnNamesToFeaturize.colNamesToHash += col + columnNamesToFeaturize.colNamesToTypes += col -> StringType + } + case _ @ (dataType: DataType) if dataType.typeName == "vector" || dataType.isInstanceOf[VectorUDT] => { + columnNamesToFeaturize.vectorColumnsToAdd += unusedColumnName + // For double columns, will always need to remove possibly NaN values + columnNamesToFeaturize.colNamesToCleanMissings += unusedColumnName + columnNamesToFeaturize.colNamesToTypes += unusedColumnName -> dataType + columnNamesToFeaturize.conversionColumnNamesMap += col -> unusedColumnName + } + } + } + } + val colNamesToVectorizeWithoutHashOneHot: List[String] = getColumnsToVectorize(columnNamesToFeaturize, + columnNamesToFeaturize.conversionColumnNamesMap.keys.toSeq) + + // Tokenize the string columns + val (transform: Option[HashingTF], colNamesToVectorize: List[String], nonZeroColumns: Option[Array[Int]]) = + if (columnNamesToFeaturize.colNamesToHash.isEmpty) + (None, colNamesToVectorizeWithoutHashOneHot, None) + else { + val hashingTransform = new HashingTF() + .setInputCol(AssembleFeaturesUtilities.getTokenizedColumnName(datasetAsDf)) + .setOutputCol(AssembleFeaturesUtilities.getHashedFeaturesColumnName(datasetAsDf)) + .setNumFeatures(getNumberOfFeatures) + + // Hash data for the vectorizer, to determine which slots are non-zero and should be kept + val hashedData = AssembleFeaturesUtilities.hashStringColumns(datasetAsDf, + columnNamesToFeaturize.colNamesToHash, + hashingTransform) + val encoder = Encoders.kryo[BitSet] + val bitset = hashedData.select(hashingTransform.getOutputCol) + .map(row => toBitSet(row.getAs[SparseVector](0).indices))(encoder) + .reduce(_ | _) + + val nonZeroColumns: Array[Int] = bitset.toArray + + val colsToVectorize = + colNamesToVectorizeWithoutHashOneHot :+ AssembleFeaturesUtilities.getSelectedFeaturesColumnName(datasetAsDf) + + (Some(hashingTransform), + colsToVectorize, + Some(nonZeroColumns)) + } + + columnNamesToFeaturize.colNamesToVectorize ++= colNamesToVectorize + + val vectorAssembler = new FastVectorAssembler() + .setInputCols(colNamesToVectorize.toArray) + .setOutputCol(getFeaturesCol) + + new AssembleFeaturesModel(uid, columnNamesToFeaturize, transform, + nonZeroColumns, vectorAssembler, $(oneHotEncodeCategoricals)) + } + + private def getColumnsToVectorize(columnNamesToFeaturize: ColumnNamesToFeaturize, + columnsToFeaturize: Seq[String]): List[String] = { + val categoricalColumnNames = + if ($(oneHotEncodeCategoricals)) { + columnNamesToFeaturize.categoricalColumns.values + } else { + columnNamesToFeaturize.categoricalColumns.keys + } + + val newColumnNames = + columnsToFeaturize.map(oldColName => columnNamesToFeaturize.conversionColumnNamesMap(oldColName)) + + val colNamesToVectorizeWithoutHash = (categoricalColumnNames.toList + ::: newColumnNames.toList) + .distinct + + // If one hot encoding, remove the columns we are converting from the list to vectorize + val colNamesToVectorizeWithoutHashOneHot = + if ($(oneHotEncodeCategoricals)) { + colNamesToVectorizeWithoutHash.filter { + !columnNamesToFeaturize.categoricalColumns.contains(_) + } + } else { + colNamesToVectorizeWithoutHash + } + colNamesToVectorizeWithoutHashOneHot + } + + def toBitSet(indices: Array[Int]): BitSet = { + indices.foldLeft(BitSet())((bitset, index) => bitset + index) + } + + override def copy(extra: ParamMap): Estimator[AssembleFeaturesModel] = { + new AssembleFeatures() + } + + @DeveloperApi + override def 
transformSchema(schema: StructType): StructType = + schema.add(new StructField(getFeaturesCol, VectorType)) + +} + +/** + * Model produced by [[AssembleFeatures]]. + */ +class AssembleFeaturesModel(val uid: String, + val columnNamesToFeaturize: ColumnNamesToFeaturize, + val hashingTransform: Option[HashingTF], + val nonZeroColumns: Option[Array[Int]], + val vectorAssembler: FastVectorAssembler, + val oneHotEncodeCategoricals: Boolean) + extends Model[AssembleFeaturesModel] with Params with MLWritable { + + /** @group getParam **/ + final def getFeaturesColumn: String = vectorAssembler.getOutputCol + + override def write: MLWriter = new AssembleFeaturesModel.AssembleFeaturesModelWriter(uid, + columnNamesToFeaturize, + hashingTransform, + nonZeroColumns, + vectorAssembler, + oneHotEncodeCategoricals) + + override def copy(extra: ParamMap): AssembleFeaturesModel = + new AssembleFeaturesModel(uid, + columnNamesToFeaturize, + hashingTransform, + nonZeroColumns, + vectorAssembler, + oneHotEncodeCategoricals) + + override def transform(dataset: Dataset[_]): DataFrame = { + val transformedDataset = dataset.select( + dataset.columns.flatMap { + col => { + val dataType = dataset.schema(col).dataType + if (!dataType.isInstanceOf[StringType] + && columnNamesToFeaturize.colNamesToHash.contains(col)) { + throw new Exception("Invalid column type specified during score, should be string for column: " + col) + } + + if (!columnNamesToFeaturize.conversionColumnNamesMap.contains(col)) { + Seq(dataset(col)) + } else { + val colType = columnNamesToFeaturize.colNamesToTypes(columnNamesToFeaturize.conversionColumnNamesMap(col)) + if (colType != dataType) { + throw new Exception(s"Invalid column type specified during score, should be $colType for column: " + col) + } + + // Convert all columns to same type double to feed them as a vector to the learner + dataType match { + case _ @ (dataType: DataType) if (AssembleFeaturesUtilities.isNumeric(dataType)) => { + Seq(dataset(col), + dataset(col).cast(DoubleType).as(columnNamesToFeaturize.conversionColumnNamesMap(col), + dataset.schema(col).metadata)) + } + case _: DoubleType => { + Seq(dataset(col), + dataset(col).as(columnNamesToFeaturize.conversionColumnNamesMap(col), + dataset.schema(col).metadata)) + } + case _ @ (dataType: DataType) if dataType.typeName == "vector" || dataType.isInstanceOf[VectorUDT] => { + Seq(dataset(col), + dataset(col).as(columnNamesToFeaturize.conversionColumnNamesMap(col), + dataset.schema(col).metadata)) + } + case default => Seq(dataset(col)) + } + } + } + }: _* + ) + + // Drop all rows with missing values + val nonMissingDataset = transformedDataset.na.drop(columnNamesToFeaturize.colNamesToCleanMissings) + // Tokenize the string columns + val stringFeaturizedData: DataFrame = + if (columnNamesToFeaturize.colNamesToHash.isEmpty) nonMissingDataset + else { + val hashedData = AssembleFeaturesUtilities.hashStringColumns(nonMissingDataset, + columnNamesToFeaturize.colNamesToHash, + hashingTransform.get) + + val vectorSlicer = new VectorSlicer().setIndices(nonZeroColumns.get) + .setInputCol(hashingTransform.get.getOutputCol) + .setOutputCol(columnNamesToFeaturize.colNamesToVectorize.last) + // Run count based feature selection on the hashed data + val countBasedFeatureSelectedColumns = vectorSlicer.transform(hashedData) + // Remove the intermediate columns tokenized and hashed + countBasedFeatureSelectedColumns + .drop(hashingTransform.get.getInputCol) + .drop(hashingTransform.get.getOutputCol) + } + var columnsToDrop = 
vectorAssembler.getInputCols + // One-hot encode categoricals + val oheData = + if (oneHotEncodeCategoricals && !columnNamesToFeaturize.categoricalColumns.isEmpty) { + val ohe = new OneHotEncoder() + val inputColsKeys = columnNamesToFeaturize.categoricalColumns.keys + val outputColsKeys = columnNamesToFeaturize.categoricalColumns.values + val inputCols = inputColsKeys.mkString(",") + val outputCols = outputColsKeys.mkString(",") + val oheAdapter = + new MultiColumnAdapter().setBaseTransformer(ohe).setInputCols(inputCols).setOutputCols(outputCols) + columnsToDrop = columnsToDrop.union(columnNamesToFeaturize.categoricalColumns.keys.toSeq) + oheAdapter.transform(stringFeaturizedData) + } else { + stringFeaturizedData + } + + val vectorizedData = vectorAssembler.transform(oheData) + + // Drop the vector assembler intermediate columns + vectorizedData.drop(columnsToDrop: _*) + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = + schema.add(new StructField(getFeaturesColumn, VectorType)) + +} + +object AssembleFeaturesModel extends MLReadable[AssembleFeaturesModel] { + + private val hashingTransformPart = "hashingTransform" + private val vectorAssemblerPart = "vectorAssembler" + private val columnNamesToFeaturizePart = "columnNamesToFeaturize" + private val nonZeroColumnsPart = "nonZeroColumns" + private val dataPart = "data" + + override def read: MLReader[AssembleFeaturesModel] = new AssembleFeaturesModelReader + + override def load(path: String): AssembleFeaturesModel = super.load(path) + + /** [[MLWriter]] instance for [[AssembleFeaturesModel]] */ + private[AssembleFeaturesModel] + class AssembleFeaturesModelWriter(val uid: String, + val columnNamesToFeaturize: ColumnNamesToFeaturize, + val hashingTransform: Option[HashingTF], + val nonZeroColumns: Option[Array[Int]], + val vectorAssembler: FastVectorAssembler, + val oneHotEncodeCategoricals: Boolean) + extends MLWriter { + private case class Data(uid: String, oneHotEncodeCategoricals: Boolean) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + AssembleFeaturesModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + val dataPath = new Path(qualPath, dataPart).toString + + // Save data + val data = Data(uid, oneHotEncodeCategoricals) + // save the hashing transform + if (!hashingTransform.isEmpty) { + val hashingTransformPath = new Path(qualPath, hashingTransformPart).toString + val writer = + if (overwrite) hashingTransform.get.write.overwrite() + else hashingTransform.get.write + writer.save(hashingTransformPath) + } + // save the vector assembler + val vectorAssemblerPath = new Path(qualPath, vectorAssemblerPart).toString + val writer = + if (overwrite) vectorAssembler.write.overwrite() + else vectorAssembler.write + writer.save(vectorAssemblerPath) + + // save the column names to featurize + ObjectUtilities.writeObject(columnNamesToFeaturize, qualPath, columnNamesToFeaturizePart, sc, overwrite) + + // save the nonzero columns + ObjectUtilities.writeObject(nonZeroColumns, qualPath, nonZeroColumnsPart, sc, overwrite) + + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class AssembleFeaturesModelReader + 
extends MLReader[AssembleFeaturesModel] { + override def load(path: String): AssembleFeaturesModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid and one hot encoding param + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String, oneHotEncodeCategoricals: Boolean) = + data.select("uid", "oneHotEncodeCategoricals").head() + + // load the hashing transform + val hashingPath = new Path(qualPath, hashingTransformPart).toString + val hashingTransform = + if (new File(hashingPath).exists()) Some(HashingTF.load(hashingPath)) + else None + + // load the vector assembler + val vectorAssemblerPath = new Path(qualPath, vectorAssemblerPart).toString + val vectorAssembler = FastVectorAssembler.load(vectorAssemblerPath) + + // load the column names to featurize + val columnNamesToFeaturize = + ObjectUtilities.loadObject[ColumnNamesToFeaturize](qualPath, columnNamesToFeaturizePart, sc) + + // load the nonzero columns + val nonZeroColumns = ObjectUtilities.loadObject[Option[Array[Int]]](qualPath, nonZeroColumnsPart, sc) + + new AssembleFeaturesModel(uid, + columnNamesToFeaturize, + hashingTransform, + nonZeroColumns, + vectorAssembler, + oneHotEncodeCategoricals) + } + } + +} diff --git a/src/featurize/src/main/scala/Featurize.scala b/src/featurize/src/main/scala/Featurize.scala new file mode 100644 index 0000000000..2090891f79 --- /dev/null +++ b/src/featurize/src/main/scala/Featurize.scala @@ -0,0 +1,92 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel} +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +private object FeaturizeUtilities +{ + // 2^18 features by default + val numFeaturesDefault = 262144 + // 2^12 features for tree-based or NN-based learners + val numFeaturesTreeOrNNBased = 4096 +} + +object Featurize extends DefaultParamsReadable[Featurize] + +/** + * Featurizes a dataset, converting them to a form appropriate for training. 
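+ *
+ * A minimal usage sketch (the DataFrame `df` and its column names are hypothetical). Feature
+ * columns are given as a map from each output feature column to the input columns that
+ * should be featurized into it:
+ * {{{
+ *   val model = new Featurize()
+ *     .setFeatureColumns(Map("features" -> Seq("col1", "col2")))
+ *     .setNumberOfFeatures(4096)
+ *     .fit(df)
+ *   val featurized = model.transform(df)
+ * }}}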
+ */ +class Featurize(override val uid: String) extends Estimator[PipelineModel] with MMLParams { + + def this() = this(Identifiable.randomUID("Featurize")) + + val featureColumns: MapArrayParam = new MapArrayParam(uid, "featureColumns", "feature columns") + + /** @group getParam **/ + final def getFeatureColumns: Map[String, Seq[String]] = $(featureColumns) + + /** @group setParam **/ + def setFeatureColumns(value: Map[String, Seq[String]]): this.type = set(featureColumns, value) + + val oneHotEncodeCategoricals: Param[Boolean] = BooleanParam(this, + "oneHotEncodeCategoricals", + "one hot encode categoricals", + true) + + /** @group getParam **/ + final def getOneHotEncodeCategoricals: Boolean = $(oneHotEncodeCategoricals) + + /** @group setParam **/ + def setOneHotEncodeCategoricals(value: Boolean): this.type = set(oneHotEncodeCategoricals, value) + + val numberOfFeatures: IntParam = + IntParam(this, + "numberOfFeatures", + "number of features to hash string columns to", + FeaturizeUtilities.numFeaturesDefault) + + /** @group getParam **/ + final def getNumberOfFeatures: Int = $(numberOfFeatures) + + /** @group setParam **/ + def setNumberOfFeatures(value: Int): this.type = set(numberOfFeatures, value) + + /** + * Featurizes the dataset. + * + * @param dataset The input dataset to train. + * @return The featurized model. + */ + override def fit(dataset: Dataset[_]): PipelineModel = { + val pipeline = assembleFeaturesEstimators(getFeatureColumns) + pipeline.fit(dataset) + } + + private def assembleFeaturesEstimators(featureColumns: Map[String, Seq[String]]): Pipeline = { + val assembleFeaturesEstimators = featureColumns.map(newColToFeatures => { + new AssembleFeatures() + .setColumnsToFeaturize(newColToFeatures._2.toArray) + .setFeaturesCol(newColToFeatures._1) + .setNumberOfFeatures(getNumberOfFeatures) + .setOneHotEncodeCategoricals(getOneHotEncodeCategoricals) + }).toArray + + new Pipeline().setStages(assembleFeaturesEstimators) + } + + override def copy(extra: ParamMap): Estimator[PipelineModel] = { + new Featurize() + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = + assembleFeaturesEstimators(getFeatureColumns).transformSchema(schema) + +} diff --git a/src/featurize/src/test/scala/VerifyFeaturize.scala b/src/featurize/src/test/scala/VerifyFeaturize.scala new file mode 100644 index 0000000000..d01294856f --- /dev/null +++ b/src/featurize/src/test/scala/VerifyFeaturize.scala @@ -0,0 +1,330 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import java.nio.file.Files + +import com.microsoft.ml.spark.FileUtilities.File +import com.microsoft.ml.spark.schema.SparkSchema +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.feature.StringIndexer +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors} +import org.apache.spark.sql._ +import org.apache.spark.sql.types.StructType + +class VerifyAssembleFeatures extends EstimatorFuzzingTest { + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = { + val assembleFeatures = estimator.asInstanceOf[AssembleFeatures] + assembleFeatures.setColumnsToFeaturize(fitDataset.columns) + } + + override def createFitDataset: DataFrame = { + // TODO: Fix bug for date and timestamp type not supported + val options = DatasetOptions(ColumnOptions.ValueSet(ColumnOptions.Scalar), + DataOptions.ValueSet(DataOptions.String, + DataOptions.Int, + DataOptions.Double, + DataOptions.Boolean, + DataOptions.Byte, + DataOptions.Short)) + val indexToType = Map[Int, DatasetOptions](1 -> options, 2 -> options, 3 -> options, 4 -> options, 5 -> options) + GenerateDataset.generateDatasetFromOptions(session, + indexToType, + new BasicDatasetGenerationConstraints(5, 5, Array()), + 0) + } + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new AssembleFeatures() +} + +class VerifyFeaturize extends EstimatorFuzzingTest { + + val mockLabelColumn = "Label" + val featuresColumn = "testColumn" + val thisDirectory = new File("src/test/scala") + val targetDirectory = new File("target") + + val benchmarkBasicDataTypesFile = "benchmarkBasicDataTypes.json" + val historicDataTypesFile = new File(thisDirectory, benchmarkBasicDataTypesFile) + val benchmarkBasicDataTypesTempFile = getTempFile(benchmarkBasicDataTypesFile) + + val benchmarkVectorsFile = "benchmarkVectors.json" + val historicVectorsFile = new File(thisDirectory, benchmarkVectorsFile) + val benchmarkVectorsTempFile = getTempFile(benchmarkVectorsFile) + + val benchmarkStringFile = "benchmarkString.json" + val historicStringFile = new File(thisDirectory, benchmarkStringFile) + val benchmarkStringTempFile = getTempFile(benchmarkStringFile) + + val benchmarkStringMissingsFile = "benchmarkStringMissing.json" + val historicStringMissingsFile = new File(thisDirectory, benchmarkStringMissingsFile) + val benchmarkStringMissingsTempFile = getTempFile(benchmarkStringMissingsFile) + + val benchmarkOneHotFile = "benchmarkOneHot.json" + val historicOneHotFile = new File(thisDirectory, benchmarkOneHotFile) + val benchmarkOneHotTempFile = getTempFile(benchmarkOneHotFile) + + val benchmarkNoOneHotFile = "benchmarkNoOneHot.json" + val historicNoOneHotFile = new File(thisDirectory, benchmarkNoOneHotFile) + val benchmarkNoOneHotTempFile = getTempFile(benchmarkNoOneHotFile) + + val benchmarkOneHotMissingsFile = "benchmarkOneHotMissings.json" + val historicOneHotMissingsFile = new File(thisDirectory, benchmarkOneHotMissingsFile) + val benchmarkOneHotMissingsTempFile = getTempFile(benchmarkOneHotMissingsFile) + + val benchmarkNoOneHotMissingsFile = "benchmarkNoOneHotMissings.json" + val historicNoOneHotMissingsFile = new File(thisDirectory, benchmarkNoOneHotMissingsFile) + val benchmarkNoOneHotMissingsTempFile = getTempFile(benchmarkNoOneHotMissingsFile) + + val benchmarkStringIndexOneHotFile = "benchmarkStringIndexOneHot.json" + val historicStringIndexOneHotFile = new File(thisDirectory, benchmarkStringIndexOneHotFile) + val 
benchmarkStringIndexOneHotTempFile = getTempFile(benchmarkStringIndexOneHotFile) + + private def getTempFile(fileName: String): File = { + new File(targetDirectory, + s"${fileName}_${System.currentTimeMillis}_.json") + } + + // int label with features of: + // long, double, boolean, int, byte, float + val mockDataset = session.createDataFrame(Seq( + (0, 2L, 0.50, true, 0, 0.toByte, 12F), + (1, 3L, 0.40, false, 1, 100.toByte, 30F), + (0, 4L, 0.78, true, 2, 50.toByte, 12F), + (1, 5L, 0.12, false, 3, 0.toByte, 12F), + (0, 1L, 0.50, true, 0, 0.toByte, 30F), + (1, 3L, 0.40, false, 1, 10.toByte, 12F), + (0, 3L, 0.78, false, 2, 0.toByte, 12F), + (1, 4L, 0.12, false, 3, 0.toByte, 12F), + (0, 0L, 0.50, true, 0, 0.toByte, 12F), + (1, 2L, 0.40, false, 1, 127.toByte, 30F), + (0, 3L, 0.78, true, 2, -128.toByte, 12F), + (1, 4L, 0.12, false, 3, 0.toByte, 12F))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5", "col6") + + test("Featurizing on some basic data types") { + val result: DataFrame = featurizeAndVerifyResult(mockDataset, + benchmarkBasicDataTypesTempFile.toString, + historicDataTypesFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).values.length == 6) + } + + test("Featurizing with vector columns, sparse and dense") { + val dataset: DataFrame = session.createDataFrame(Seq( + (0, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0))), 0.50, 0.60, 0, Vectors.dense(1.0, 0.1, -1.5)), + (1, Vectors.dense(1.5, 0.2, -1.2), 0.40, 0.50, 1, Vectors.dense(1.5, 0.2, -1.2)), + (1, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0))), 0.12, 0.34, 3, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0)))), + (0, Vectors.dense(1.1, 0.5, -1.024), 0.50, 0.60, 0, Vectors.dense(1.0, 0.4, -1.23)), + (1, Vectors.dense(1.1, 0.5, -1.056), 0.40, 0.50, 1, Vectors.dense(1.1, 0.5, -1.024)), + (0, Vectors.dense(Double.NaN, 0.2, -1.23), 0.78, 0.99, 2, Vectors.dense(1.0, 0.1, -1.22)), + (1, Vectors.dense(1.0, 0.4, -1.23), 0.12, 0.34, 3, Vectors.dense(Double.NaN, 0.2, -1.23)))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val result: DataFrame = featurizeAndVerifyResult(dataset, + benchmarkVectorsTempFile.toString, + historicVectorsFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).values.length == 9) + } + + test("Featurizing with text columns - using hashing with count based feature selection") { + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, "pokemon are everywhere"), + (1, 3, 0.40, 0.50, "they are in the woods"), + (0, 4, 0.78, 0.99, "they are in the water"), + (1, 5, 0.12, 0.34, "they are in the fields"), + (0, 3, 0.78, 0.99, "pokemon - gotta catch em all"))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + + val result: DataFrame = featurizeAndVerifyResult(dataset, + benchmarkStringTempFile.toString, + historicStringFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[SparseVector](featuresColumn).size == 11) + } + + test("Verify featurizing text data produces proper tokenized output") { + val wordCountCol = "wordCount" + val wordLengthCol = "wordLength" + val textCol = "textCol" + val mockAmazonData = session.createDataFrame(Seq( + (1, 221, 4.42, "Ok~ but I think the Keirsey Temperment Test is more accurate - and cheaper. This book has its " + + "good points. If anything, it helps you put into words what you want from a supervisor, but it is not very " + + "accurate. 
The online test does not account for a difference between when 2 of their options are both " + + "exactly like you, or if they don't describe you at all. This messes up the results, and it did not " + + "describe me very well. I am not just in denial. I have taken a lot of personality type tests, like " + + "the Keirsey Temperment sorter and have a pretty good idea of my strengths. So, although this book is " + + "pretty good in making you understand the importance of incouraging your strengths, it still " + + "leaves you wondering about how you fit in to their terminology. As for using this book as a manager " + + "to better serve your employees, I'v seen it done and it does not necessarily work because the strengths " + + "spit out for people were not wholly accurate. The company I work for has done this, and most of the " + + "people who were shifted around to better serve their strengths (according to this book) are very " + + "unhappy in their new positions. Your money can be spent better elsewhere. I say its only worth about $10"), + (0, 138, 4.49, "I had a bad feeling about this! And I was right! I was intrigued by the title, which " + + "supposedly links Jedi wisdom to Christianity. Well, after 60 pages or so, I have got the feeling that the " + + "Staub is trying to wrap Jedi in Christian cloth and failing at that. The author speaks of the difficulty in " + + "leading a Christian life. But, I say that any religious life (be it Christian, Islam or otherwise) is hard " + + "because it turns the back on the norm or the conventional. I am convinced that Yoda is a Zen master; " + + "the Force is derived from Tao, not God as interpreted by the orthodox religion(I am purposefully leaving " + + "out Christian Mysticism, which is another beast altogether.). A better book on the subject of theology " + + "in Star wars is \"The Dharma of Star Wars.\""), + (0, 43, 4.98, "Poorly written I tried reading this book but found it so turgid and poorly written that I " + + "put it down in frustration. It reads like a translation from another language by an academic bureacrat. " + + "The theme is interesting, the execution poor. 
Cannot recommend"))) + .toDF(mockLabelColumn, wordCountCol, wordLengthCol, textCol) + + val featModel = new Featurize() + .setFeatureColumns(Map { featuresColumn -> Array(wordCountCol, wordLengthCol, textCol) }) + .setNumberOfFeatures(100000).fit(mockAmazonData) + val nonzeroValuesThreshold = 30 + featModel.transform(mockAmazonData).collect().foreach( + row => assert(row.getAs[SparseVector](featuresColumn).indices.length > nonzeroValuesThreshold, + "Strings improperly tokenized") + ) + } + + test("Featurizing with text columns that have missing values - using hashing with count based feature selection") { + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, "pokemon are everywhere"), + (1, 3, 0.40, null), + (0, 4, 0.78, "they are in the water"), + (1, 5, 0.12, "they are in the fields"), + (0, 3, 0.78, null))) + .toDF(mockLabelColumn, "col1", "col2", "col3") + + val result: DataFrame = featurizeAndVerifyResult(dataset, + benchmarkStringMissingsTempFile.toString, + historicStringMissingsFile) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).size == 8) + } + + test("Featurizing with categorical columns - using one hot encoding") { + val cat = "Cat" + val dog = "Dog" + val bird = "Bird" + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, dog, cat), + (1, 3, 0.40, 0.50, cat, dog), + (0, 4, 0.78, 0.99, dog, bird), + (1, 5, 0.12, 0.34, cat, dog), + (0, 3, 0.78, 0.99, dog, bird), + (1, 4, 0.12, 0.34, bird, dog))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val catDataset = SparkSchema.makeCategorical( + SparkSchema.makeCategorical(dataset, "col4", "col4", false), + "col5", + "col5", + false) + + val result: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkOneHotTempFile.toString, + historicOneHotFile, + oneHotEncode = true) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).size == 7) + + // Verify without one-hot encoding we get expected data + val resultNoOneHot: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkNoOneHotTempFile.toString, + historicNoOneHotFile) + // Verify that features column has the correct number of slots + assert(resultNoOneHot.first().getAs[DenseVector](featuresColumn).size == 5) + + // Verify get equivalent results if we use string indexer for making categoricals + val tmp4col = "col4tmp" + val tmp5col = "col5tmp" + val strind1 = new StringIndexer().setInputCol("col4").setOutputCol(tmp4col) + val strind2 = new StringIndexer().setInputCol("col5").setOutputCol(tmp5col) + val fit1 = strind1.fit(dataset) + val catResult1 = fit1.transform(dataset) + val fit2 = strind2.fit(catResult1) + val catResult2 = fit2.transform(catResult1) + .drop("col4", "col5") + .withColumnRenamed(tmp4col, "col4") + .withColumnRenamed(tmp5col, "col5") + + val resultStringIndexer: DataFrame = featurizeAndVerifyResult(catResult2, + benchmarkStringIndexOneHotTempFile.toString, + historicStringIndexOneHotFile, + oneHotEncode = true) + // Verify that features column has the correct number of slots + assert(resultStringIndexer.first().getAs[DenseVector](featuresColumn).size == 7) + } + + // This test currently fails on makeCategorical, where we should handle missing values (unlike spark, + // which fails with a null reference exception) + ignore("Featurizing with categorical columns that have missings - using one hot encoding") { + val cat = "Cat" + val dog = "Dog" + val 
bird = "Bird" + val dataset: DataFrame = session.createDataFrame(Seq( + (0, cat), + (1, null), + (0, bird), + (1, null), + (0, bird), + (1, dog))) + .toDF(mockLabelColumn, "col1") + + val catDataset = SparkSchema.makeCategorical(dataset, "col1", "col1", false) + + val result: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkOneHotMissingsTempFile.toString, + historicOneHotMissingsFile, + oneHotEncode = true) + // Verify that features column has the correct number of slots + assert(result.first().getAs[DenseVector](featuresColumn).size == 4) + + // Verify without one-hot encoding we get expected data + val resultNoOneHot: DataFrame = featurizeAndVerifyResult(catDataset, + benchmarkNoOneHotMissingsTempFile.toString, + historicNoOneHotMissingsFile) + // Verify that features column has the correct number of slots + assert(resultNoOneHot.first().getAs[DenseVector](featuresColumn).size == 4) + } + + def featurizeAndVerifyResult(dataset: DataFrame, + tempFile: String, + historicFile: File, + oneHotEncode: Boolean = false): DataFrame = { + val featureColumns = dataset.columns.filter(_ != mockLabelColumn) + val feat = new Featurize() + .setNumberOfFeatures(10) + .setFeatureColumns(Map(featuresColumn -> featureColumns)) + .setOneHotEncodeCategoricals(oneHotEncode) + val featModel = feat.fit(dataset) + val result = featModel.transform(dataset) + // Write out file so it is easy to compare the results + result.repartition(1).write.json(tempFile) + if (!Files.exists(historicFile.toPath)) { + // Store result in file for future + val directory = historicFile.toString.replace(".json", "") + result.repartition(1).write.json(directory) + val directoryFile = new File(directory) + val jsonFile = directoryFile.listFiles().filter(file => file.toString.endsWith(".json"))(0) + jsonFile.renameTo(historicFile) + FileUtilities.delTree(directoryFile) + } + val expResult = session.read.json(historicFile.toString) + // Verify the results are the same + verifyResult(expResult, result) + result + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = { + val featureColumns = fitDataset.columns.filter(_ != mockLabelColumn) + estimator.asInstanceOf[Featurize].setFeatureColumns(Map(featuresColumn -> featureColumns)) + } + + override def createFitDataset: DataFrame = mockDataset + + override def schemaForDataset: StructType = ??? 
+ + override def getEstimator(): Estimator[_] = new Featurize() +} diff --git a/src/featurize/src/test/scala/benchmarkBasicDataTypes.json b/src/featurize/src/test/scala/benchmarkBasicDataTypes.json new file mode 100644 index 0000000000..cec71f6716 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkBasicDataTypes.json @@ -0,0 +1,12 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":true,"col4":0,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[0.0,2.0,1.0,12.0,0.5,0.0]}} +{"Label":1,"col1":3,"col2":0.4,"col3":false,"col4":1,"col5":100,"col6":30.0,"testColumn":{"type":1,"values":[1.0,3.0,0.0,30.0,0.4,100.0]}} +{"Label":0,"col1":4,"col2":0.78,"col3":true,"col4":2,"col5":50,"col6":12.0,"testColumn":{"type":1,"values":[2.0,4.0,1.0,12.0,0.78,50.0]}} +{"Label":1,"col1":5,"col2":0.12,"col3":false,"col4":3,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[3.0,5.0,0.0,12.0,0.12,0.0]}} +{"Label":0,"col1":1,"col2":0.5,"col3":true,"col4":0,"col5":0,"col6":30.0,"testColumn":{"type":1,"values":[0.0,1.0,1.0,30.0,0.5,0.0]}} +{"Label":1,"col1":3,"col2":0.4,"col3":false,"col4":1,"col5":10,"col6":12.0,"testColumn":{"type":1,"values":[1.0,3.0,0.0,12.0,0.4,10.0]}} +{"Label":0,"col1":3,"col2":0.78,"col3":false,"col4":2,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[2.0,3.0,0.0,12.0,0.78,0.0]}} +{"Label":1,"col1":4,"col2":0.12,"col3":false,"col4":3,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[3.0,4.0,0.0,12.0,0.12,0.0]}} +{"Label":0,"col1":0,"col2":0.5,"col3":true,"col4":0,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[0.0,0.0,1.0,12.0,0.5,0.0]}} +{"Label":1,"col1":2,"col2":0.4,"col3":false,"col4":1,"col5":127,"col6":30.0,"testColumn":{"type":1,"values":[1.0,2.0,0.0,30.0,0.4,127.0]}} +{"Label":0,"col1":3,"col2":0.78,"col3":true,"col4":2,"col5":-128,"col6":12.0,"testColumn":{"type":1,"values":[2.0,3.0,1.0,12.0,0.78,-128.0]}} +{"Label":1,"col1":4,"col2":0.12,"col3":false,"col4":3,"col5":0,"col6":12.0,"testColumn":{"type":1,"values":[3.0,4.0,0.0,12.0,0.12,0.0]}} diff --git a/src/featurize/src/test/scala/benchmarkNoOneHot.json b/src/featurize/src/test/scala/benchmarkNoOneHot.json new file mode 100644 index 0000000000..bf00792485 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkNoOneHot.json @@ -0,0 +1,6 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":2,"col5":1,"testColumn":{"type":1,"values":[1.0,2.0,2.0,0.6,0.5]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":1,"col5":2,"testColumn":{"type":1,"values":[2.0,1.0,3.0,0.5,0.4]}} +{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[0.0,2.0,4.0,0.99,0.78]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":1,"col5":2,"testColumn":{"type":1,"values":[2.0,1.0,5.0,0.34,0.12]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[0.0,2.0,3.0,0.99,0.78]}} +{"Label":1,"col1":4,"col2":0.12,"col3":0.34,"col4":0,"col5":2,"testColumn":{"type":1,"values":[2.0,0.0,4.0,0.34,0.12]}} diff --git a/src/featurize/src/test/scala/benchmarkOneHot.json b/src/featurize/src/test/scala/benchmarkOneHot.json new file mode 100644 index 0000000000..7b193d5113 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkOneHot.json @@ -0,0 +1,6 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":2,"col5":1,"testColumn":{"type":1,"values":[0.0,1.0,0.0,0.0,2.0,0.6,0.5]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":1,"col5":2,"testColumn":{"type":1,"values":[0.0,0.0,0.0,1.0,3.0,0.5,0.4]}} 
+{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,0.0,4.0,0.99,0.78]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":1,"col5":2,"testColumn":{"type":1,"values":[0.0,0.0,0.0,1.0,5.0,0.34,0.12]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":2,"col5":0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,0.0,3.0,0.99,0.78]}} +{"Label":1,"col1":4,"col2":0.12,"col3":0.34,"col4":0,"col5":2,"testColumn":{"type":1,"values":[0.0,0.0,1.0,0.0,4.0,0.34,0.12]}} diff --git a/src/featurize/src/test/scala/benchmarkString.json b/src/featurize/src/test/scala/benchmarkString.json new file mode 100644 index 0000000000..f6a333ae77 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkString.json @@ -0,0 +1,5 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":"pokemon are everywhere","testColumn":{"type":0,"size":11,"indices":[0,1,2,7,9,10],"values":[2.0,0.6,0.5,1.0,1.0,1.0]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":"they are in the woods","testColumn":{"type":1,"values":[3.0,0.5,0.4,1.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0]}} +{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":"they are in the water","testColumn":{"type":1,"values":[4.0,0.99,0.78,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":"they are in the fields","testColumn":{"type":1,"values":[5.0,0.34,0.12,1.0,0.0,0.0,1.0,0.0,1.0,2.0,0.0]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":"pokemon - gotta catch em all","testColumn":{"type":1,"values":[3.0,0.99,0.78,0.0,1.0,0.0,0.0,1.0,1.0,1.0,2.0]}} diff --git a/src/featurize/src/test/scala/benchmarkStringIndexOneHot.json b/src/featurize/src/test/scala/benchmarkStringIndexOneHot.json new file mode 100644 index 0000000000..c542136b16 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkStringIndexOneHot.json @@ -0,0 +1,6 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":0.6,"col4":0.0,"col5":2.0,"testColumn":{"type":1,"values":[0.0,0.0,1.0,0.0,2.0,0.6,0.5]}} +{"Label":1,"col1":3,"col2":0.4,"col3":0.5,"col4":1.0,"col5":0.0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,1.0,3.0,0.5,0.4]}} +{"Label":0,"col1":4,"col2":0.78,"col3":0.99,"col4":0.0,"col5":1.0,"testColumn":{"type":1,"values":[0.0,1.0,1.0,0.0,4.0,0.99,0.78]}} +{"Label":1,"col1":5,"col2":0.12,"col3":0.34,"col4":1.0,"col5":0.0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,1.0,5.0,0.34,0.12]}} +{"Label":0,"col1":3,"col2":0.78,"col3":0.99,"col4":0.0,"col5":1.0,"testColumn":{"type":1,"values":[0.0,1.0,1.0,0.0,3.0,0.99,0.78]}} +{"Label":1,"col1":4,"col2":0.12,"col3":0.34,"col4":2.0,"col5":0.0,"testColumn":{"type":1,"values":[1.0,0.0,0.0,0.0,4.0,0.34,0.12]}} diff --git a/src/featurize/src/test/scala/benchmarkStringMissing.json b/src/featurize/src/test/scala/benchmarkStringMissing.json new file mode 100644 index 0000000000..5bcc3f4166 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkStringMissing.json @@ -0,0 +1,5 @@ +{"Label":0,"col1":2,"col2":0.5,"col3":"pokemon are everywhere","testColumn":{"type":1,"values":[2.0,0.5,0.0,0.0,1.0,0.0,1.0,1.0]}} +{"Label":1,"col1":3,"col2":0.4,"testColumn":{"type":0,"size":8,"indices":[0,1],"values":[3.0,0.4]}} +{"Label":0,"col1":4,"col2":0.78,"col3":"they are in the water","testColumn":{"type":1,"values":[4.0,0.78,1.0,1.0,0.0,0.0,2.0,1.0]}} +{"Label":1,"col1":5,"col2":0.12,"col3":"they are in the fields","testColumn":{"type":1,"values":[5.0,0.12,1.0,1.0,0.0,1.0,2.0,0.0]}} +{"Label":0,"col1":3,"col2":0.78,"testColumn":{"type":0,"size":8,"indices":[0,1],"values":[3.0,0.78]}} diff --git 
a/src/featurize/src/test/scala/benchmarkVectors.json b/src/featurize/src/test/scala/benchmarkVectors.json new file mode 100644 index 0000000000..4bc319c1a7 --- /dev/null +++ b/src/featurize/src/test/scala/benchmarkVectors.json @@ -0,0 +1,7 @@ +{"Label":0,"col1":{"type":0,"size":3,"indices":[0,2],"values":[1.0,2.0]},"col2":0.5,"col3":0.6,"col4":0,"col5":{"type":1,"values":[1.0,0.1,-1.5]},"testColumn":{"type":1,"values":[0.0,1.0,0.0,2.0,0.6,0.5,1.0,0.1,-1.5]}} +{"Label":1,"col1":{"type":1,"values":[1.5,0.2,-1.2]},"col2":0.4,"col3":0.5,"col4":1,"col5":{"type":1,"values":[1.5,0.2,-1.2]},"testColumn":{"type":1,"values":[1.0,1.5,0.2,-1.2,0.5,0.4,1.5,0.2,-1.2]}} +{"Label":1,"col1":{"type":0,"size":3,"indices":[0,2],"values":[1.0,2.0]},"col2":0.12,"col3":0.34,"col4":3,"col5":{"type":0,"size":3,"indices":[0,2],"values":[1.0,2.0]},"testColumn":{"type":1,"values":[3.0,1.0,0.0,2.0,0.34,0.12,1.0,0.0,2.0]}} +{"Label":0,"col1":{"type":1,"values":[1.1,0.5,-1.024]},"col2":0.5,"col3":0.6,"col4":0,"col5":{"type":1,"values":[1.0,0.4,-1.23]},"testColumn":{"type":1,"values":[0.0,1.1,0.5,-1.024,0.6,0.5,1.0,0.4,-1.23]}} +{"Label":1,"col1":{"type":1,"values":[1.1,0.5,-1.056]},"col2":0.4,"col3":0.5,"col4":1,"col5":{"type":1,"values":[1.1,0.5,-1.024]},"testColumn":{"type":1,"values":[1.0,1.1,0.5,-1.056,0.5,0.4,1.1,0.5,-1.024]}} +{"Label":0,"col1":{"type":1,"values":["NaN",0.2,-1.23]},"col2":0.78,"col3":0.99,"col4":2,"col5":{"type":1,"values":[1.0,0.1,-1.22]},"testColumn":{"type":1,"values":[2.0,"NaN",0.2,-1.23,0.99,0.78,1.0,0.1,-1.22]}} +{"Label":1,"col1":{"type":1,"values":[1.0,0.4,-1.23]},"col2":0.12,"col3":0.34,"col4":3,"col5":{"type":1,"values":["NaN",0.2,-1.23]},"testColumn":{"type":1,"values":[3.0,1.0,0.4,-1.23,0.34,0.12,"NaN",0.2,-1.23]}} diff --git a/src/find-best-model/build.sbt b/src/find-best-model/build.sbt new file mode 100644 index 0000000000..9aa1bc13cf --- /dev/null +++ b/src/find-best-model/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: compute-model-statistics +//> DependsOn: train-classifier diff --git a/src/find-best-model/src/main/scala/FindBestModel.scala b/src/find-best-model/src/main/scala/FindBestModel.scala new file mode 100644 index 0000000000..060283b614 --- /dev/null +++ b/src/find-best-model/src/main/scala/FindBestModel.scala @@ -0,0 +1,331 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.SchemaConstants +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode} +import org.apache.spark.ml._ +import org.apache.spark.ml.param.{Param, ParamMap, TransformerArrayParam} +import org.apache.spark.ml.util._ +import org.apache.spark.sql.types._ + +import scala.collection.mutable.ListBuffer + +object FindBestModel extends DefaultParamsReadable[FindBestModel] { + val modelNameCol = "model_name" + val metricsCol = "metric" + val paramsCol = "parameters" +} + +/** + * Evaluates and chooses the best model from a list of models. 
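+ *
+ * A minimal usage sketch (`trainedModel1`, `trainedModel2` and `testData` are hypothetical
+ * previously trained models and a held-out dataset):
+ * {{{
+ *   val best = new FindBestModel()
+ *     .setModels(Array(trainedModel1, trainedModel2))
+ *     .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric)
+ *     .fit(testData)
+ *   val scored = best.transform(testData)
+ * }}}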
+ */ +class FindBestModel(override val uid: String) extends Estimator[BestModel] with MMLParams { + + def this() = this(Identifiable.randomUID("FindBestModel")) + val models: TransformerArrayParam = new TransformerArrayParam(this, "models", "List of models to be evaluated") + + def getModels: Array[Transformer] = $(models) + + /** @group setParam **/ + def setModels(value: Array[Transformer]): this.type = set(models, value) + + /** @group setParam **/ + val evaluationMetric: Param[String] = StringParam(this, "evaluationMetric", "Metric to evaluate models with", + (s: String) => Seq(ComputeModelStatistics.MseSparkMetric, + ComputeModelStatistics.RmseSparkMetric, + ComputeModelStatistics.R2SparkMetric, + ComputeModelStatistics.MaeSparkMetric, + ComputeModelStatistics.AccuracySparkMetric, + ComputeModelStatistics.PrecisionSparkMetric, + ComputeModelStatistics.RecallSparkMetric, + ComputeModelStatistics.AucSparkMetric) contains s) + + // Set default evaluation metric to accuracy + setDefault(evaluationMetric -> ComputeModelStatistics.AccuracySparkMetric) + + def getEvaluationMetric: String = $(evaluationMetric) + + /** @group setParam **/ + def setEvaluationMetric(value: String): this.type = set(evaluationMetric, value) + + var selectedModel: Transformer = null + + var selectedScoredDataset: Dataset[_] = null + + var selectedROCCurve: DataFrame = null + + var selectedBestModelMetrics: Dataset[_] = null + + /** + * + * @param dataset - The input dataset, to be fitted + * @return The Model that results from the fitting + */ + override def fit(dataset: Dataset[_]): BestModel = { + // Staging + val trainedModels = getModels + if (trainedModels.isEmpty) { + throw new Exception("No trained models to evaluate.") + } + // Find type of trained models + def modelTypeDiscriminant(model: Transformer):String = { + model match { + case reg: TrainedRegressorModel => SchemaConstants.RegressionKind + case cls: TrainedClassifierModel => SchemaConstants.ClassificationKind + case evm: BestModel => modelTypeDiscriminant(evm.getBestModel) + case _ => throw new Exception("Model type not supported for evaluation") + } + } + val modelType = modelTypeDiscriminant(trainedModels(0)) + val evaluator = new ComputeModelStatistics() + evaluator.set(evaluator.evaluationMetric, getEvaluationMetric) + + var bestMetric: Double = Double.NaN + // Setup to store metrics and model name data for model metrics table + val modelMetrics = ListBuffer[Double]() + val models = ListBuffer[String]() + val parameters = ListBuffer[String]() + + // TODO: Add the other metrics + // TODO: Check metrics per model + val chooseHighest = (current: Double, best: Double) => { current > best } + val chooseLowest = (current: Double, best: Double) => { current < best } + val (evaluationMetricColumnName, operator): (String, (Double, Double) => Boolean) = modelType match { + case SchemaConstants.RegressionKind => getEvaluationMetric match { + case ComputeModelStatistics.MseSparkMetric => (ComputeModelStatistics.MseColumnName, chooseLowest) + case ComputeModelStatistics.RmseSparkMetric => (ComputeModelStatistics.RmseColumnName, chooseLowest) + case ComputeModelStatistics.R2SparkMetric => (ComputeModelStatistics.R2ColumnName, chooseHighest) + case ComputeModelStatistics.MaeSparkMetric => (ComputeModelStatistics.MaeColumnName, chooseLowest) + case _ => throw new Exception("Metric is not supported for regressors") + } + case SchemaConstants.ClassificationKind => getEvaluationMetric match { + case ComputeModelStatistics.AucSparkMetric => 
(ComputeModelStatistics.AucColumnName, chooseHighest) + case ComputeModelStatistics.PrecisionSparkMetric => (ComputeModelStatistics.PrecisionColumnName, chooseHighest) + case ComputeModelStatistics.RecallSparkMetric => (ComputeModelStatistics.RecallColumnName, chooseHighest) + case ComputeModelStatistics.AccuracySparkMetric => (ComputeModelStatistics.AccuracyColumnName, chooseHighest) + case _ => throw new Exception("Metric is not supported for classifiers") + } + case _ => throw new Exception("Model type not supported for evaluation") + } + + val compareModels = (model: Transformer, metrics: DataFrame, scoredDataset: Dataset[_]) => { + val currentMetric = metrics.select(evaluationMetricColumnName).first()(0).toString.toDouble + modelMetrics += currentMetric + models += model.uid + def getModelParams(model: Transformer): ParamMap = { + model match { + case reg: TrainedRegressorModel => reg.getParamMap + case cls: TrainedClassifierModel => cls.getParamMap + case evm: BestModel => getModelParams(evm.getBestModel) + case _ => throw new Exception("Model type not supported for evaluation") + } + } + parameters += getModelParams(model).toSeq.map { case pv => s"${pv.param.name}: ${pv.value}" }.mkString(", ") + if (bestMetric.isNaN || operator(currentMetric, bestMetric)) { + bestMetric = currentMetric + selectedModel = model + selectedScoredDataset = scoredDataset + } + } + + for (trainedModel <- trainedModels) { + // Check that models are consistent + if (modelTypeDiscriminant(trainedModel) != modelType) { + throw new Exception("Models are inconsistent. Please evaluate only regressors or classifiers.") + } + val df = trainedModel.transform(dataset) + val metrics = evaluator.transform(df) + compareModels(trainedModel, metrics, df) + } + + // compute ROC curve + evaluator.set(evaluator.evaluationMetric, ComputeModelStatistics.AllSparkMetrics) + selectedBestModelMetrics = evaluator.transform(selectedScoredDataset) + selectedROCCurve = evaluator.rocCurve + + val spark = dataset.sparkSession + val allModelMetricsSchema = StructType(Seq(StructField(FindBestModel.modelNameCol, StringType, true), + StructField(FindBestModel.metricsCol, DoubleType, true), + StructField(FindBestModel.paramsCol, StringType, true))) + var allModelMetrics = spark.createDataFrame(spark.sparkContext.parallelize(models.zip(modelMetrics).zip(parameters) + .map(mmp => Row(mmp._1._1, mmp._1._2, mmp._2))), allModelMetricsSchema) + new BestModel(uid, + selectedModel, + selectedScoredDataset, + selectedROCCurve, + selectedBestModelMetrics, + allModelMetrics) + } + + // Choose a random model as we don't know which one will be chosen yet - all will transform schema in same way + def transformSchema(schema: StructType): StructType = getModels(0).transformSchema(schema) + + def copy(extra: ParamMap): FindBestModel = defaultCopy(extra) + +} + +/** + * Model produced by [[FindBestModel]]. 
+ */ +class BestModel(val uid: String, + val model: Transformer, + val scoredDataset: Dataset[_], + val rocCurve: DataFrame, + val bestModelMetrics: Dataset[_], + val allModelMetrics: Dataset[_]) + extends Model[BestModel] with MLWritable { + + override def write: MLWriter = new BestModel.EvaluateModelWriter(uid, + new Pipeline().setStages(Array(model)).fit(scoredDataset), + scoredDataset, + rocCurve, + bestModelMetrics, + allModelMetrics) + + override def copy(extra: ParamMap): BestModel = + new BestModel(uid, model.copy(extra), scoredDataset, rocCurve, bestModelMetrics, allModelMetrics) + + override def transform(dataset: Dataset[_]): DataFrame = model.transform(dataset) + + /** + * The best model found during evaluation. + * @return The best model. + */ + def getBestModel: Transformer = model + + /** + * Gets the scored dataset. + * @return The scored dataset for the best model. + */ + def getScoredDataset: Dataset[_] = scoredDataset + + /** + * Gets the ROC curve with TPR, FPR. + * @return The evaluation results. + */ + def getEvaluationResults: Dataset[_] = rocCurve + + /** + * Gets all of the best model metrics results from the evaluator. + * @return All of the best model metrics results. + */ + def getBestModelMetrics: Dataset[_] = bestModelMetrics + + /** + * Gets a table of metrics from all models compared from the evaluation comparison. + * @return The model metrics results from all models. + */ + def getAllModelMetrics: Dataset[_] = allModelMetrics + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = model.transformSchema(schema) + +} + +object BestModel extends MLReadable[BestModel] { + + private val modelPart = "model" + private val scoredDatasetPart = "scoredDataset" + private val rocCurvePart = "rocCurve" + private val bestModelMetricsPart = "bestModelMetrics" + private val allModelMetricsPart = "allModelMetrics" + private val dataPart = "data" + + override def read: MLReader[BestModel] = new BestModelReader + + override def load(path: String): BestModel = super.load(path) + + /** [[MLWriter]] instance for [[BestModel]] */ + private[BestModel] + class EvaluateModelWriter(val uid: String, + val model: PipelineModel, + val scoredDataset: Dataset[_], + val rocCurve: DataFrame, + val bestModelMetrics: Dataset[_], + val allModelMetrics: Dataset[_]) + extends MLWriter { + private case class Data(uid: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + BestModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + // save the model + val modelPath = new Path(qualPath, modelPart).toString + val modelWriter = + if (overwrite) model.write.overwrite() + else model.write + modelWriter.save(modelPath) + + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + + // save the scored dataset + val scoredDatasetPath = new Path(qualPath, scoredDatasetPart).toString + scoredDataset.write.mode(saveMode).parquet(scoredDatasetPath) + + // save the roc curve + val rocCurvePath = new Path(qualPath, rocCurvePart).toString + rocCurve.write.mode(saveMode).parquet(rocCurvePath) + + // save the best model metrics + val bestModelMetricsPath = new Path(qualPath, bestModelMetricsPart).toString + bestModelMetrics.write.mode(saveMode).parquet(bestModelMetricsPath) + + // save all model 
metrics + val allModelMetricsPath = new Path(qualPath, allModelMetricsPart).toString + allModelMetrics.write.mode(saveMode).parquet(allModelMetricsPath) + + // save model data + val data = Data(uid) + val dataPath = new Path(qualPath, dataPart).toString + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class BestModelReader + extends MLReader[BestModel] { + + override def load(path: String): BestModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid, label column and model name + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String) = data.select("uid").head() + + // retrieve the underlying model + val modelPath = new Path(qualPath, modelPart).toString + val model = PipelineModel.load(modelPath) + + // retrieve the scored dataset + val scoredDatasetPath = new Path(qualPath, scoredDatasetPart).toString + val scoredDataset = sparkSession.read.parquet(scoredDatasetPath) + + // retrieve the roc curve + val rocCurvePath = new Path(qualPath, rocCurvePart).toString + val rocCurve = sparkSession.read.parquet(rocCurvePath) + + // retrieve the best model metrics + val bestModelMetricsPath = new Path(qualPath, bestModelMetricsPart).toString + val bestModelMetrics = sparkSession.read.parquet(bestModelMetricsPath) + + // retrieve all model metrics + val allModelMetricsPath = new Path(qualPath, allModelMetricsPart).toString + val allModelMetrics = sparkSession.read.parquet(allModelMetricsPath) + + new BestModel(uid, model.stages(0), scoredDataset, rocCurve, bestModelMetrics, allModelMetrics) + } + } + +} diff --git a/src/find-best-model/src/test/scala/VerifyFindBestModel.scala b/src/find-best-model/src/test/scala/VerifyFindBestModel.scala new file mode 100644 index 0000000000..d36998a7a3 --- /dev/null +++ b/src/find-best-model/src/test/scala/VerifyFindBestModel.scala @@ -0,0 +1,106 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
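For orientation, here is a minimal usage sketch of the estimator defined above (not part of the original patch). It assumes two already-trained MMLSpark models held in hypothetical vals trainedLrModel and trainedRfModel, a labeled DataFrame df, and an illustrative save path; everything else uses only the API shown in FindBestModel and BestModel.

import org.apache.spark.ml.Transformer

// Candidate models must all be of the same kind (all classifiers or all regressors).
// trainedLrModel and trainedRfModel are placeholders for trained MMLSpark models.
val candidates: Array[Transformer] = Array(trainedLrModel, trainedRfModel)

val best = new FindBestModel()
  .setModels(candidates)
  .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric)
  .fit(df)                                   // returns a BestModel

val scored   = best.transform(df)            // scores with the winning model
val perModel = best.getAllModelMetrics       // model_name / metric / parameters table
val roc      = best.getEvaluationResults     // ROC curve of the winning model

best.write.overwrite().save("/tmp/bestModel")   // BestModel is MLWritable
val reloaded = BestModel.load("/tmp/bestModel")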
+ +package com.microsoft.ml.spark + +import java.io.File + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.{Estimator, Transformer} +import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} + +class VerifyFindBestModel extends EstimatorFuzzingTest { + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Smoke test to verify that evaluate can be run") { + val dataset = createMockDataset + val randomForestClassifier = TrainClassifierTestUtilities.createRandomForestClassifier(mockLabelColumn) + val model = randomForestClassifier.fit(dataset) + val findBestModel = new FindBestModel() + .setModels(Array(model.asInstanceOf[Transformer], model.asInstanceOf[Transformer])) + .setEvaluationMetric(ComputeModelStatistics.AccuracySparkMetric) + val bestModel = findBestModel.fit(dataset) + bestModel.transform(dataset) + } + + test("Verify the best model can be saved") { + val dataset: DataFrame = createMockDataset + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + val model = logisticRegressor.fit(dataset) + + val findBestModel = new FindBestModel() + .setModels(Array(model.asInstanceOf[Transformer], model.asInstanceOf[Transformer])) + .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric) + val bestModel = findBestModel.fit(dataset) + + val myModelName = "testEvalModel" + bestModel.save(myModelName) + val dir = new File(myModelName) + // assert directory exists + assert(dir.exists()) + // delete the file to cleanup + FileUtilities.delTree(dir) + } + + test("Verify the best model metrics can be retrieved and are valid") { + val dataset: DataFrame = createMockDataset + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + val decisionTreeClassifier = TrainClassifierTestUtilities.createDecisionTreeClassifier(mockLabelColumn) + val GBTClassifier = TrainClassifierTestUtilities.createGradientBoostedTreesClassifier(mockLabelColumn) + val naiveBayesClassifier = TrainClassifierTestUtilities.createNaiveBayesClassifier(mockLabelColumn) + val randomForestClassifier = TrainClassifierTestUtilities.createRandomForestClassifier(mockLabelColumn) + val model1 = logisticRegressor.fit(dataset) + val model2 = decisionTreeClassifier.fit(dataset) + val model3 = GBTClassifier.fit(dataset) + val model4 = naiveBayesClassifier.fit(dataset) + val model5 = randomForestClassifier.fit(dataset) + + val findBestModel = new FindBestModel() + .setModels(Array(model1.asInstanceOf[Transformer], model2, model3, model4, model5)) + .setEvaluationMetric(ComputeModelStatistics.AucSparkMetric) + val bestModel = findBestModel.fit(dataset) + // validate schema is as expected + assert(bestModel.getAllModelMetrics.schema == + StructType(Seq(StructField(FindBestModel.modelNameCol, StringType, true), + StructField(FindBestModel.metricsCol, DoubleType, true), + StructField(FindBestModel.paramsCol, StringType, true)))) + // validate we got metrics for every model + assert(bestModel.getAllModelMetrics.count() == 5) + // validate AUC looks valid + bestModel.getAllModelMetrics + 
.select(FindBestModel.metricsCol) + .collect() + .foreach(value => assert(value.getDouble(0) >= 0.5)) + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = { + val assembleFeatures = estimator.asInstanceOf[FindBestModel] + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + val model = logisticRegressor.fit(createMockDataset) + assembleFeatures.setModels(Array(model, model)) + } + + override def createFitDataset: DataFrame = createMockDataset + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new FindBestModel() +} diff --git a/src/fuzzing/build.sbt b/src/fuzzing/build.sbt new file mode 100644 index 0000000000..4e7b0eb87c --- /dev/null +++ b/src/fuzzing/build.sbt @@ -0,0 +1,5 @@ +//> DependsOn: core +//> DependsOn: utils +//> DependsOn: compute-model-statistics +//> DependsOn: find-best-model +//> DependsOn: featurize diff --git a/src/fuzzing/src/test/scala/Fuzzing.scala b/src/fuzzing/src/test/scala/Fuzzing.scala new file mode 100644 index 0000000000..b4480d1467 --- /dev/null +++ b/src/fuzzing/src/test/scala/Fuzzing.scala @@ -0,0 +1,254 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import FileUtilities.File +import org.apache.spark.ml._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{MLReadable, MLWritable} +import org.apache.spark.sql.DataFrame + +import scala.language.existentials +import scala.util.Random + +/** + * Tests to validate fuzzing of modules + */ +class Fuzzing extends TestBase { + + // Needed because the session in MTB is lazy + session + + val numRows = 10 + val numCols = 20 + val numSlotsPerVectorCol = Array(15, 15) + val randomSeed = new Random() + + // Use this for more detailed output from the Jar Loader + val debug = false + + // use this to quickly see all the results for all failing modules + // Note that this could make the tests pass when they should be failing + val disableFailure = false + + test("Verify all PipelineStages can be saved then loaded") { + + val exemptions: Set[String] = Set() + + val applicableStages = pipelineStages.filter(t => !exemptions(t.getClass.getName)) + applicableStages.foreach(t => if (!readerMap.contains(t.getClass.getName)) { + assertOrLog(false, s"need to have a companion reader for class ${t.getClass.getName}") + }) + + applicableStages.foreach(t => trySave(t, Some(readerMap(t.getClass.getName)))) + } + + // TODO verify that model UIDs match the class names, perhaps use a Trait + + test("Verify all estimators can be turned into pipelines, saved and loaded") { + estimators.foreach(est => { + val estimatorName = est.getClass.getName + println() + println(s"Running estimator: ${est.toString} with name: ${estimatorName}") + val (dataset, pipelineStage) = + if (estimatorFuzzers.contains(estimatorName)) { + println("Generating dataset from estimator fuzzer") + val estimatorFuzzer = estimatorFuzzers(estimatorName) + val fitDataset = estimatorFuzzer.createFitDataset + val estUpdated = estimatorFuzzer.setParams(fitDataset, est.copy(ParamMap())) + (fitDataset, estUpdated.asInstanceOf[PipelineStage]) + } else { + println("Generating random dataset") + (createDataSet, est.copy(ParamMap()).asInstanceOf[PipelineStage]) + } + tryRun(() => { + var pipelineModel = new Pipeline().setStages(Array(pipelineStage)).fit(dataset) + pipelineModel = trySave(pipelineModel, 
+ Some(PipelineModel.asInstanceOf[MLReadable[Any]])).get.asInstanceOf[PipelineModel] + val dfTransform = + if (estimatorFuzzers.contains(estimatorName)) { + estimatorFuzzers(estimatorName).createTransformDataset + } else { + createDataSet + } + pipelineModel.transform(dfTransform) + () + }) + }) + } + + test("Verify all transformers can be turned into pipelines, saved and loaded") { + transformers.foreach(tr => { + val transformerName = tr.getClass.getName + println() + println(s"Running transformer: ${tr.toString} with name: ${transformerName}") + val (dataset, pipelineStage) = + if (transformerFuzzers.contains(transformerName)) { + println("Generating dataset from transformer fuzzer") + val transformerFuzzer = transformerFuzzers(transformerName) + val fitDataset = transformerFuzzer.createDataset + val trUpdated = transformerFuzzer.setParams(fitDataset, tr.copy(ParamMap())) + (fitDataset, trUpdated.asInstanceOf[PipelineStage]) + } else { + println("Generating random dataset") + (createDataSet, tr.copy(ParamMap()).asInstanceOf[PipelineStage]) + } + tryRun(() => { + val pipeline = new Pipeline().setStages(Array(pipelineStage)) + val pipelineModel = pipeline.fit(dataset) + trySave(pipelineModel) + () + }) + }) + } + + test("Verify all pipeline stages don't have exotic characters") { + val badChars = List(",", "\"", "'", ".") + pipelineStages.foreach { pipelineStage => + pipelineStage.params.foreach { param => + // the param name must not contain any of the bad characters + assertOrLog(!badChars.exists(c => param.name.contains(c)), param.name) + assertOrLog(!param.doc.contains("\"")) + } + } + } + + test("Verify all pipeline stage values match their param names") { + val exemptions: Set[String] = Set() + pipelineStages.foreach { pipelineStage => + if (!exemptions(pipelineStage.getClass.getName)) { + val paramFields = pipelineStage.getClass.getDeclaredFields + .filter(f => classOf[Param[Any]].isAssignableFrom(f.getType)) + + val paramNames = paramFields.map { f => + f.setAccessible(true) + val p = f.get(pipelineStage) + p.asInstanceOf[Param[Any]].name + } + val paramFieldNames = paramFields.map(_.getName) + assertOrLog(paramNames === paramFieldNames, pipelineStage.getClass.getName) + } + } + } + + test("Verify correct use of mixins") { + val triggers = Map( + "inputCol" -> classOf[HasInputCol], + "inputColumn" -> classOf[HasInputCol], + "outputCol" -> classOf[HasOutputCol], + "outputColumn" -> classOf[HasOutputCol], + "labelCol" -> classOf[HasLabelCol], + "labelColumn" -> classOf[HasLabelCol], + "featuresCol" -> classOf[HasFeaturesCol], + "featuresColumn" -> classOf[HasFeaturesCol] + ) + + val exemptions = Set[String]( + "org.apache.spark.ml.feature.FastVectorAssembler", // In Spark namespace + "com.microsoft.ml.spark.TextFeaturizer" // needs to hide setters from model + ) + pipelineStages.foreach { stage => + if (!exemptions(stage.getClass.getName)) { + stage.params.foreach { param => + triggers.get(param.name) match { + case Some(clazz) => + assertOrLog(clazz.isAssignableFrom(stage.getClass), + stage.getClass.getName + " needs to extend " + clazz.getName) + case None => + } + } + } + } + } + + private def assertOrLog(condition: Boolean, hint: String = "", + disableFailure: Boolean = disableFailure): Unit = { + if (disableFailure && !condition) { + println(hint) + } else { + assert(condition, hint) + } + () + } + + private def throwOrLog(e: Throwable, message: String = "", + disableFailure: Boolean = disableFailure): Unit = { + println(message) + if (disableFailure) { + println(e.getMessage) + e.printStackTrace(System.out) + } else { + throw e + } + } + + // set the context loader
to pick up on the jars + Thread.currentThread().setContextClassLoader(JarLoadingUtils.classLoader) + + private lazy val transformers: List[Transformer] = JarLoadingUtils.loadClass[Transformer](debug = debug) + + private lazy val estimators: List[Estimator[_]] = JarLoadingUtils.loadClass[Estimator[_]](debug = debug) + + private lazy val readers: List[MLReadable[_]] = JarLoadingUtils.loadObject[MLReadable[_]](debug = debug) + + private lazy val pipelineStages: List[PipelineStage] = JarLoadingUtils.loadClass[PipelineStage](debug = debug) + + private lazy val readerMap = readers.map { + r => (r.getClass.getName.dropRight(1), r.asInstanceOf[MLReadable[Any]]) + }.toMap + + private lazy val transformerFuzzers: Map[String, TransformerFuzzingTest] = + JarLoadingUtils.loadTestClass[TransformerFuzzingTest](debug = debug) + .map(tr => (tr.getClassName, tr)).toMap + + private lazy val estimatorFuzzers: Map[String, EstimatorFuzzingTest] = + JarLoadingUtils.loadTestClass[EstimatorFuzzingTest](debug = debug) + .map(est => (est.getClassName, est)).toMap + + private def trySave(stage: PipelineStage, reader: Option[MLReadable[Any]] = None, + path: String = "testModels"): Option[PipelineStage] = { + stage match { + case w: PipelineStage with MLWritable => + try { + w.write.overwrite().save(path) + reader match { + case Some(r) => + val loaded = r.load(path).asInstanceOf[PipelineStage] + assertOrLog(loaded.params.sameElements(w.params)) + println(s"Round trip succeeded for ${w.getClass.getName}") + Some(loaded) + case None => None + } + } catch { + case e: Throwable => + throwOrLog(e, w.getClass.getName + " encounters an error while saving/loading") + None + } finally { + FileUtilities.delTree(new File(path)) + () + } + case tr => + assertOrLog(false, tr.getClass.getName + " needs to extend MLWritable") + None + } + } + + private def createDataSet: DataFrame = { + GenerateDataset + .generateDataset(session, + new BasicDatasetGenerationConstraints(numRows, numCols, numSlotsPerVectorCol), + randomSeed.nextLong()) + } + + private def tryRun(func: () => Unit): Unit = { + try { + func() + } catch { + case ne: java.util.NoSuchElementException => + throwOrLog(ne, s"Could not transform: $ne", disableFailure=true) + case th: Throwable => + throwOrLog(th, s"Encountered unknown error: $th", disableFailure=true) + } + } + +} diff --git a/src/image-featurizer/build.sbt b/src/image-featurizer/build.sbt new file mode 100644 index 0000000000..bf243751d5 --- /dev/null +++ b/src/image-featurizer/build.sbt @@ -0,0 +1,5 @@ +//> DependsOn: core +//> DependsOn: readers +//> DependsOn: downloader +//> DependsOn: cntk-model +//> DependsOn: image-transformer diff --git a/src/image-featurizer/src/main/scala/ImageFeaturizer.scala b/src/image-featurizer/src/main/scala/ImageFeaturizer.scala new file mode 100644 index 0000000000..7d59780eb9 --- /dev/null +++ b/src/image-featurizer/src/main/scala/ImageFeaturizer.scala @@ -0,0 +1,128 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
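Stripped of the reflection and logging, the round trip that trySave performs above is the standard Spark ML persistence pattern. A minimal sketch of that pattern for a single stage, under the assumption that the stage is MLWritable and its companion reader is at hand (the path name is illustrative):

import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.util.{MLReadable, MLWritable}

// Save a stage, load it back through its companion reader, and check that the
// parameter set survives the round trip.
def roundTrip(stage: PipelineStage with MLWritable,
              reader: MLReadable[_],
              path: String = "testModels"): PipelineStage = {
  stage.write.overwrite().save(path)
  val loaded = reader.load(path).asInstanceOf[PipelineStage]
  assert(loaded.params.sameElements(stage.params), s"params changed for ${stage.getClass.getName}")
  loaded
}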
+ +package com.microsoft.ml.spark + +import java.net.URI + +import com.microsoft.ml.spark.FileUtilities.File +import com.microsoft.ml.spark.schema.DatasetExtensions +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.types.{ArrayType, FloatType, StructType} +import org.apache.spark.sql.{DataFrame, Dataset} + +object ImageFeaturizer extends DefaultParamsReadable[ImageFeaturizer] + +/** + * Class for featurizing images with pretrained CNTK models. The ImageFeaturizer allows one to + * leverage deep representations learned on large supervised datasets to improve image processing + * workflows. + * + * The ImageFeaturizer relies on a CNTK model to do the featurization; the model can be set with + * the modelLocation parameter. To map the nodes of the CNTK model onto the standard "layers" structure + * of a feed-forward neural net, one needs to supply a list of node names that range from the output node + * back towards the input node of the CNTK Function. + * This list does not need to be exhaustive; for models obtained + * from the ModelDownloader, the layer list can be found in the schema of the + * downloaded model. + * + * The ImageFeaturizer takes an input column of images + * (the type returned by the ImageReader), and + * automatically resizes them to fit the CNTKModel's input. It then feeds them through a pre-trained + * CNTK model. One can truncate the model using the cutOutputLayers parameter, which + * determines how many layers to cut from the output of the network. + * For example, layer=0 means that no layers are removed, + * and layer=2 means that the image featurizer returns the activations of the layer that is two layers + * from the output layer.
+ * + * @param uid the uid of the image featurizer + */ +class ImageFeaturizer(val uid: String) extends Transformer with HasInputCol with HasOutputCol with MMLParams { + def this() = this(Identifiable.randomUID("ImageFeaturizer")) + + val inputNode: IntParam = IntParam(this, "inputNode", "which node of the CNTK Function's inputs " + + "to use as the input (default 0)") + + def setInputNode(value: Int): this.type = set(inputNode, value) + + def getInputNode: Int = $(inputNode) + + val cutOutputLayers: IntParam = IntParam(this, "cutOutputLayers", "the number of layers to cut " + + "off the end of the network, 0 leaves the network intact," + + " 1 removes the output layer, etc", ParamValidators.gtEq(0)) + + def setCutOutputLayers(value: Int): this.type = set(cutOutputLayers, value) + + def getCutOutputLayers: Int = $(cutOutputLayers) + + val layerNames: StringArrayParam = new StringArrayParam(this, "layerNames", + "Array with valid CNTK nodes to choose from; the first entries of this array should be closer to the " + + "output node") + + def setLayerNames(value: Array[String]): this.type = set(layerNames, value) + + def getLayerNames: Array[String] = $(layerNames) + + val modelLocation: Param[String] = StringParam(this, "modelLocation", "the location of the model as a URI/URL", + { s: String => + try { + new URI(s) + true + } catch { + case e: Exception => false + } + }) + + def setModelLocation(value: String): this.type = set(modelLocation, value) + + def setModelLocation(value: URI): this.type = set(modelLocation, value.toString) + + def getModelLocation: String = $(modelLocation) + + def setModel(modelSchema: ModelSchema): this.type = { + setLayerNames(modelSchema.layerNames) + .setInputNode(modelSchema.inputNode) + .setModelLocation(modelSchema.uri.toString) + } + + setDefault(cutOutputLayers -> 1, inputNode -> 0, outputCol -> (uid + "_output")) + + override def transform(dataset: Dataset[_]): DataFrame = { + val spark = dataset.sparkSession + + val resizedCol = DatasetExtensions.findUnusedColumnName("resized")(dataset.columns.toSet) + + val cntkModel = new CNTKModel() + .setModel(dataset.sparkSession, getModelLocation) + .setInputNode(getInputNode) + .setOutputNodeName(getLayerNames.apply(getCutOutputLayers)) + .setInputCol(resizedCol) + .setOutputCol(getOutputCol) + + val requiredSize = CNTKModel.loadModelFromBytes(cntkModel.getModel) + .getArguments.get(0).getShape().getDimensions + + val prepare = new ImageTransformer() + .setInputCol($(inputCol)) + .resize(requiredSize(0).toInt, requiredSize(1).toInt) + + val unroll = new UnrollImage() + .setInputCol(prepare.getOutputCol) + .setOutputCol(resizedCol) + + val resizedDF = prepare.transform(dataset) + val unrolledDF = unroll.transform(resizedDF).drop(prepare.getOutputCol) + val featurizedDF = cntkModel.transform(unrolledDF).drop(resizedCol) + featurizedDF + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + schema.add(getOutputCol, new ArrayType(FloatType, false)) + } + +} diff --git a/src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala b/src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala new file mode 100644 index 0000000000..bb81bb56fd --- /dev/null +++ b/src/image-featurizer/src/test/scala/ImageFeaturizerSuite.scala @@ -0,0 +1,66 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information.
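A short end-to-end sketch of how the featurizer above is meant to be used for transfer learning: download a pretrained network and turn an image column into a feature-vector column. The directory paths are placeholders, session denotes the active SparkSession used throughout these suites, and the ModelDownloader and readImages APIs are the ones introduced elsewhere in this patch.

import com.microsoft.ml.spark.Readers.implicits._

// Fetch a pretrained network with the ModelDownloader (local path is illustrative).
val downloader = new ModelDownloader(session, new java.io.File("/tmp/models").toURI)
val resNet = downloader.downloadByName("ResNet50")

// Featurize images: cutOutputLayers = 1 drops the classification layer and
// returns the activations of the penultimate layer as the feature vector.
val images = session.readImages("/tmp/my-images", recursive = true)
val features = new ImageFeaturizer()
  .setModel(resNet)
  .setInputCol("image")
  .setOutputCol("features")
  .setCutOutputLayers(1)
  .transform(images)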
+ +package com.microsoft.ml.spark + +import java.net.URI + +import org.apache.spark.sql.DataFrame +import com.microsoft.ml.spark.FileUtilities.File +import org.apache.spark.ml.linalg.DenseVector +import com.microsoft.ml.spark.Readers.implicits._ + +import scala.collection.JavaConversions._ + +class ImageFeaturizerSuite extends LinuxOnly with CNTKTestUtils { + val images: DataFrame = session.readImages(imagePath, true).withColumnRenamed("image", inputCol) + + val modelDir = new File(filesRoot, "CNTKModel") + val modelDownloader = new ModelDownloader(session, modelDir.toURI) + + lazy val resNetUri: URI = new File(modelDir, "ResNet50_ImageNet.model").toURI + lazy val resNet: ModelSchema = modelDownloader.downloadByName("ResNet50") + + test("Image featurizer should reproduce the CIFAR10 experiment") { + val model = new ImageFeaturizer() + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setModelLocation(s"${sys.env("DATASETS_HOME")}/CNTKModel/ConvNet_CIFAR10.model") + .setCutOutputLayers(0) + .setLayerNames(Array("z")) + val result = model.transform(images) + compareToTestModel(result) + } + + test("the Image feature should work with the modelSchema") { + val model = new ImageFeaturizer() + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setModel(resNet) + .setCutOutputLayers(0) + val result = model.transform(images) + compareToTestModel(result) + } + + test("Image featurizer should work with ResNet50", TestBase.Extended) { + val model = new ImageFeaturizer() + .setModel(resNet) + .setInputCol(inputCol) + .setOutputCol(outputCol) + val result = model.transform(images) + val resVec = result.select(outputCol).collect()(0).getAs[DenseVector](0) + assert(resVec.size == 1000) + } + + test("test layers of network", TestBase.Extended) { + (0 to 9).foreach({ i => + val model = new ImageFeaturizer() + .setModel(resNet) + .setInputCol(inputCol) + .setOutputCol(outputCol) + .setCutOutputLayers(i) + val result = model.transform(images) + }) + } + +} diff --git a/src/image-transformer/build.sbt b/src/image-transformer/build.sbt new file mode 100644 index 0000000000..c354d24346 --- /dev/null +++ b/src/image-transformer/build.sbt @@ -0,0 +1,2 @@ +//> DependsOn: core +//> DependsOn: readers diff --git a/src/image-transformer/src/main/python/ImageTransform.py b/src/image-transformer/src/main/python/ImageTransform.py new file mode 100644 index 0000000000..f2eb61112b --- /dev/null +++ b/src/image-transformer/src/main/python/ImageTransform.py @@ -0,0 +1,96 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
+ +import sys + +if sys.version >= '3': + basestring = str + +import pyspark +from pyspark.ml.common import inherit_doc +from pyspark.sql.types import * +from pyspark.sql.types import Row, _create_row +import numpy as np +from mmlspark._ImageTransformer import _ImageTransformer + +ImageFields = ["path", "height", "width", "type", "bytes"] + +ImageSchema = StructType([ + StructField(ImageFields[0], StringType(), True), + StructField(ImageFields[1], IntegerType(), True), + StructField(ImageFields[2], IntegerType(), True), + StructField(ImageFields[3], IntegerType(), True), # OpenCV type: CV_8U in most cases + StructField(ImageFields[4], BinaryType(), True) ]) # OpenCV bytes: row-wise BGR in most cases + +def toNDArray(image): + return np.asarray(image.bytes, dtype = np.uint8).reshape((image.height, image.width, 3))[:,:,(2,1,0)] + +def toImage(array, path = "", ocvType = 16): + length = np.prod(array.shape) + + data = bytearray(array.astype(dtype=np.int8)[:,:,(2,1,0)].reshape(length)) + height = array.shape[0] + width = array.shape[1] + # Creating new Row with _create_row(), because Row(name = value, ... ) orders fields by name, + # which conflicts with expected ImageSchema order when the new DataFrame is created by UDF + return _create_row(ImageFields, [path, height, width, ocvType, data]) + +from pyspark.ml.common import inherit_doc +@inherit_doc +class ImageTransform(_ImageTransformer): + """ + Resizes the image to the given width and height + :param int height: The height to resize to (>=0) + :param int width: The width to resize to (>=0) + """ + def resize(self, height, width): + self._java_obj.resize(height, width) + return self + + """ + Crops the image given the starting x,y coordinates + and the width and height + :param int x: The initial x coordinate (>=0) + :param int y: The initial y coordinate (>=0) + :param int height: The height to crop to (>=0) + :param int width: The width to crop to (>=0) + """ + def crop(self, x, y, height, width): + self._java_obj.crop(x,y,height,width) + return self + + """ + Formats the image to the given image format + :param int format: The format to convert to, please see OpenCV cvtColor function documentation for all formats + """ + def colorFormat(self, format): + self._java_obj.colorFormat(format) + return self + + """ + Blurs the image using a normalized box filter + :param double height: The height of the box filter (>= 0) + :param double width: The width of the box filter (>= 0) + """ + def blur(self, height, width): + self._java_obj.blur(height, width) + return self + + """ + Thresholds the image, please see OpenCV threshold function documentation for more information + :param double threshold: The threshold value + :param double maxVal: The maximum value to use + :param double thresholdType: The type of threshold, can be binary, binary_inv, trunc, zero, zero_inv + """ + def threshold(self, threshold, maxVal, thresholdType): + self._java_obj.threshold(threshold, maxVal, thresholdType) + return self + + """ + Blurs the image by applying a gaussian kernel + :param double appertureSize: The aperture size, which should be odd and positive + :param double sigma: The standard deviation of the gaussian + """ + def gaussianKernel(self, appertureSize, sigma): + self._java_obj.gaussianKernel(appertureSize, sigma) + return self diff --git a/src/image-transformer/src/main/scala/ImageTransformer.scala b/src/image-transformer/src/main/scala/ImageTransformer.scala new file mode 100644 index 0000000000..ec82e574dc --- /dev/null +++ 
b/src/image-transformer/src/main/scala/ImageTransformer.scala @@ -0,0 +1,314 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.util.DefaultParamsReadable +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.ml.param._ +import com.microsoft.ml.spark.schema.ImageSchema +import scala.collection.mutable.ListBuffer +import com.microsoft.ml.spark.schema.BinaryFileSchema +import scala.collection.mutable.{ListBuffer, WrappedArray} +import org.opencv.core.Core +import org.opencv.core.Mat +import org.opencv.core.{Rect, Size} +import org.opencv.imgproc.Imgproc +import org.apache.spark.ml.util.Identifiable + +abstract class ImageTransformerStage(params: Map[String, Any]) extends Serializable { + def apply(image: Mat): Mat + val stageName: String +} + +class ResizeImage(params: Map[String, Any]) extends ImageTransformerStage(params) { + val height = params(ResizeImage.height).asInstanceOf[Int].toDouble + val width = params(ResizeImage.width).asInstanceOf[Int].toDouble + override val stageName = ResizeImage.stageName + + override def apply(image: Mat): Mat = { + var resized = new Mat() + val sz = new Size(width, height) + Imgproc.resize(image, resized, sz) + resized + } +} + +object ResizeImage { + val stageName = "resize" + val height = "height" + val width = "width" +} + +class CropImage(params: Map[String, Any]) extends ImageTransformerStage(params) { + val x = params(CropImage.x).asInstanceOf[Int] + val y = params(CropImage.y).asInstanceOf[Int] + val height = params(CropImage.height).asInstanceOf[Int] + val width = params(CropImage.width).asInstanceOf[Int] + override val stageName = CropImage.stageName + + override def apply(image: Mat): Mat = { + val rect = new Rect(x, y, width, height) + new Mat(image, rect) + } +} + +object CropImage { + val stageName = "crop" + val x = "x" + val y = "y" + val height = "height" + val width = "width" +} + +/** + * Applies a color format to the image, eg COLOR_BGR2GRAY. 
+ */ +class ColorFormat(params: Map[String, Any]) extends ImageTransformerStage(params) { + val format = params(ColorFormat.format).asInstanceOf[Int] + override val stageName = ColorFormat.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + Imgproc.cvtColor(image, dst, format) + dst + } +} + +object ColorFormat { + val stageName = "colorformat" + val format = "format" +} + +/** + * Blurs the image + * @param params + */ +class Blur(params: Map[String, Any]) extends ImageTransformerStage(params) { + val height = params(Blur.height).asInstanceOf[Double] + val width = params(Blur.width).asInstanceOf[Double] + override val stageName = Blur.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + Imgproc.blur(image, dst, new Size(height, width)) + dst + } +} + +object Blur { + val stageName = "blur" + val height = "height" + val width = "width" +} + +/** + * Applies a threshold to the image + * @param params + */ +class Threshold(params: Map[String, Any]) extends ImageTransformerStage(params) { + val threshold = params(Threshold.threshold).asInstanceOf[Double] + val maxVal = params(Threshold.maxVal).asInstanceOf[Double] + // EG Imgproc.THRESH_BINARY + val thresholdType = params(Threshold.thresholdType).asInstanceOf[Int] + override val stageName = Threshold.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + Imgproc.threshold(image, dst, threshold, maxVal, thresholdType) + dst + } +} + +object Threshold { + val stageName = "threshold" + val threshold = "threshold" + val maxVal = "maxVal" + val thresholdType = "type" +} + +/** + * Applies gaussian kernel to the image + */ +class GaussianKernel(params: Map[String, Any]) extends ImageTransformerStage(params) { + val appertureSize = params(GaussianKernel.appertureSize).asInstanceOf[Int] + val sigma = params(GaussianKernel.sigma).asInstanceOf[Double] + override val stageName = GaussianKernel.stageName + + override def apply(image: Mat): Mat = { + val dst = new Mat() + val kernel = Imgproc.getGaussianKernel(appertureSize, sigma) + Imgproc.filter2D(image, dst, -1, kernel) + dst + } +} + +object GaussianKernel { + val stageName = "gaussiankernel" + val appertureSize = "appertureSize" + val sigma = "sigma" +} + +/** + * Pipelined image processing + */ +object ImageTransformer extends DefaultParamsReadable[ImageTransformer] { + + override def load(path: String): ImageTransformer = super.load(path) + + /** + * Convert Spark image representation to OpenCV format + */ + private def row2mat(row: Row): (String, Mat) = { + val path = ImageSchema.getPath(row) + val height = ImageSchema.getHeight(row) + val width = ImageSchema.getWidth(row) + val ocvType = ImageSchema.getType(row) + val bytes = ImageSchema.getBytes(row) + + val img = new Mat(height, width, ocvType) + img.put(0,0,bytes) + (path, img) + } + + /** + * Convert from OpenCV format to Dataframe Row; unroll if needed + */ + private def mat2row(img: Mat, path: String = ""): Row = { + var ocvBytes = new Array[Byte](img.total.toInt*img.elemSize.toInt) + img.get(0,0,ocvBytes) //extract OpenCV bytes + Row(path, img.height, img.width, img.`type`, ocvBytes) + } + + /** + * Apply all OpenCV transformation stages to a single image; unroll the result if needed + * For null inputs or binary files that could not be parsed, return None. + * Break on OpenCV errors. 
+ */ + def process(stages: Seq[ImageTransformerStage], decode: Boolean)(row: Row): Option[Row] = { + + if (row == null) return None + + val decoded = if (decode) { + val path = BinaryFileSchema.getPath(row) + val bytes = BinaryFileSchema.getBytes(row) + + //early return if the image can't be decompressed + ImageReader.decode(path, bytes).getOrElse(return None) + } else row + + var (path, img) = row2mat(decoded) + for (stage <- stages) { + img = stage.apply(img) + } + Some(mat2row(img, path)) + } +} + +@InternalWrapper +class ImageTransformer(val uid: String) extends Transformer + with HasInputCol with HasOutputCol with MMLParams { + + import com.microsoft.ml.spark.ImageTransformer._ + + def this() = this(Identifiable.randomUID("ImageTransformer")) + + val stages: ArrayMapParam = new ArrayMapParam(this, "stages", "image transformation stages") + def setStages(value: Array[Map[String, Any]]): this.type = set(stages, value) + def getStages: Array[Map[String, Any]] = $(stages) + private def addStage(stage: Map[String, Any]): this.type = set(stages, $(stages) :+ stage) + + setDefault(inputCol -> "image", + outputCol -> (uid + "_output"), + stages -> Array[Map[String, Any]]() + ) + + // every stage has a name like "resize", "normalize", "unroll" + val stageName = "action" + + def resize(height: Int, width: Int): this.type = { + require(width >= 0 && height >= 0, "width and height should be nonnegative") + + addStage(Map(stageName -> ResizeImage.stageName, + ResizeImage.width -> width, + ResizeImage.height -> height)) + } + + def crop(x: Int, y: Int, height: Int, width: Int): this.type = { + require(x >= 0 && y >= 0 && width >= 0 && height >= 0, "crop values should be nonnegative") + + addStage(Map(stageName -> CropImage.stageName, + CropImage.width -> width, + CropImage.height -> height, + CropImage.x -> x, + CropImage.y -> y)) + } + + def colorFormat(format: Int): this.type = { + addStage(Map(stageName -> ColorFormat.stageName, ColorFormat.format -> format)) + } + + def blur(height: Double, width: Double): this.type = { + addStage(Map(stageName -> Blur.stageName, Blur.height -> height, Blur.width -> width)) + } + + def threshold(threshold: Double, maxVal: Double, thresholdType: Int): this.type = { + addStage(Map(stageName -> Threshold.stageName, + Threshold.maxVal -> maxVal, + Threshold.threshold -> threshold, + Threshold.thresholdType -> thresholdType)) + } + + def gaussianKernel(appertureSize: Int, sigma: Double): this.type = { + addStage(Map(stageName -> GaussianKernel.stageName, + GaussianKernel.appertureSize -> appertureSize, + GaussianKernel.sigma -> sigma)) + } + + override def transform(dataset: Dataset[_]): DataFrame = { + + // load native OpenCV library on each partition + // TODO: figure out more elegant way + val spark = dataset.sqlContext + + val schema = dataset.toDF.schema + + val loaded = ImageSchema.loadLibraryForAllPartitions(dataset.toDF.rdd, Core.NATIVE_LIBRARY_NAME) + + val df = spark.createDataFrame(loaded, schema) + + val isBinary = BinaryFileSchema.isBinaryFile(df, $(inputCol)) + assert(ImageSchema.isImage(df, $(inputCol)) || isBinary, "input column should have Image or BinaryFile type") + + var transforms = ListBuffer[ImageTransformerStage]() + for(stage <- $(stages)) { + stage(stageName) match { + case ResizeImage.stageName => transforms += new ResizeImage(stage) + case CropImage.stageName => transforms += new CropImage(stage) + case ColorFormat.stageName => transforms += new ColorFormat(stage) + case Blur.stageName => transforms += new Blur(stage) + case 
Threshold.stageName => transforms += new Threshold(stage) + case GaussianKernel.stageName => transforms += new GaussianKernel(stage) + case unsupported: String => throw new IllegalArgumentException(s"unsupported transformation $unsupported") + } + } + + val func = process(transforms, decode = isBinary)(_) + val convert = udf(func, ImageSchema.columnSchema) + + df.withColumn($(outputCol), convert(df($(inputCol)))) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + schema.add($(outputCol), ImageSchema.columnSchema) + } + +} + + diff --git a/src/image-transformer/src/main/scala/UnrollImage.scala b/src/image-transformer/src/main/scala/UnrollImage.scala new file mode 100644 index 0000000000..f05332de76 --- /dev/null +++ b/src/image-transformer/src/main/scala/UnrollImage.scala @@ -0,0 +1,70 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.ImageSchema._ +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.param.{Param, ParamMap} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.functions.udf +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} + +object UnrollImage extends DefaultParamsReadable[UnrollImage] { + + private def unroll(row: Row): DenseVector = { + val width = getWidth(row) + val height = getHeight(row) + val bytes = getBytes(row) + + val area = width*height + require(area >= 0 && area < 1e8, "image has incorrect dimensions") + require(bytes.length == width*height*3, "image has incorrect number of bytes") + + var rearranged = Array.fill[Double](area*3)(0.0) + var count = 0 + for (c <- 0 until 3) { + for (h <- 0 until height) { + val offset = h*width*3 + for (w <- 0 until width) { + val b = bytes(offset + w*3 + c).toDouble + + // b is a signed byte: negative values wrap up to the 128..255 range, zero must stay zero + //TODO: is there a better way to convert to unsigned byte? + rearranged(count) = if (b >= 0) b else b + 256.0 + count += 1 + } + } + } + new DenseVector(rearranged) + } +} + +class UnrollImage(val uid: String) extends Transformer with HasInputCol with HasOutputCol with MMLParams { + def this() = this(Identifiable.randomUID("UnrollImage")) + + import com.microsoft.ml.spark.UnrollImage._ + + setDefault(inputCol -> "image", outputCol -> (uid + "_output")) + + override def transform(dataset: Dataset[_]): DataFrame = { + val df = dataset.toDF + assert(isImage(df, $(inputCol)), "input column should have Image type") + + val func = unroll(_) + val unrollUDF = udf(func) + + df.withColumn($(outputCol), unrollUDF(df($(inputCol)))) + } + + override def copy(extra: ParamMap): Transformer = defaultCopy(extra) + + override def transformSchema(schema: StructType): StructType = { + schema.add($(outputCol), VectorType) + } + +} + + diff --git a/src/image-transformer/src/test/scala/ImageTransformerSuite.scala b/src/image-transformer/src/test/scala/ImageTransformerSuite.scala new file mode 100644 index 0000000000..fc7e3b1a7b --- /dev/null +++ b/src/image-transformer/src/test/scala/ImageTransformerSuite.scala @@ -0,0 +1,293 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information.
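On the TODO in UnrollImage.unroll about unsigned bytes: masking with 0xFF is the usual JVM idiom and avoids the sign branch altogether. A small equivalent sketch of the same channel-major (CHW) unroll written that way, offered as an alternative rather than as part of the patch:

// Channel-major unroll of a row-wise BGR byte image; `& 0xFF` widens each
// signed byte (-128..127) to its unsigned pixel value (0..255).
def unrollUnsigned(bytes: Array[Byte], height: Int, width: Int): Array[Double] = {
  val out = new Array[Double](height * width * 3)
  var count = 0
  for (c <- 0 until 3; h <- 0 until height; w <- 0 until width) {
    out(count) = (bytes(h * width * 3 + w * 3 + c) & 0xFF).toDouble
    count += 1
  }
  out
}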
+ +package com.microsoft.ml.spark + +import java.awt.GridLayout +import java.nio.file.Paths +import javax.swing._ + +import org.apache.spark.ml.linalg.DenseVector +import org.apache.spark.sql.DataFrame +import org.opencv.core.{Mat, MatOfByte} +import org.opencv.imgcodecs.Imgcodecs +import org.opencv.imgproc.Imgproc +import org.apache.spark.sql.Row +import com.microsoft.ml.spark.FileUtilities.File +import com.microsoft.ml.spark.Readers.implicits._ +import org.apache.spark.sql.SaveMode + +class ImageTransformerSuite extends LinuxOnly { + + val groceriesDirectory = "/Images/Grocery/" + private val fileLocation = s"${sys.env("DATASETS_HOME")}/$groceriesDirectory" + + test("general workflow") { + + val images = session.readImages(fileLocation, recursive = true) + assert(images.count() == 30) + + val size = (224,200) + val tr = new ImageTransformer() + .setOutputCol("out") + .resize(height = size._1, width = size._2) + .crop(x = 0, y = 0, height = 22, width = 26) + .resize(height = 15, width = 10) + + val preprocessed = tr.transform(images) + + val out_sizes = preprocessed.select(preprocessed("out.height"), preprocessed("out.width")).collect + + out_sizes.foreach( + (row:Row) => { + assert(row.getInt(0) == 15 && row.getInt(1) == 10, "output images have incorrect size") + } + ) + + val unroll = new UnrollImage() + .setInputCol(tr.getOutputCol) + .setOutputCol("final") + + val result = unroll.transform(preprocessed).select("final") + result.collect().foreach( + row => assert(row(0).asInstanceOf[DenseVector].toArray.length == 10*15*3, "unrolled image is incorrect")) + + } + + test("to parquet") { + + val filename = "test_images_parquet" + try { + val images = session.readImages(fileLocation, recursive = true) + images.write.mode(SaveMode.Overwrite).parquet(filename) + + val images1 = session.sqlContext.read.parquet(filename) + assert(images1.count() == images.count()) + } finally { + FileUtilities.delTree(new File(filename)) + () + } + } + + test("binary file input") { + + val images = session.readBinaryFiles(fileLocation, recursive = true) + assert(images.count() == 31) + + val tr = new ImageTransformer() + .setInputCol("value") + .setOutputCol("out") + .resize(height = 15, width = 10) + + val preprocessed = tr.transform(images).na.drop + assert(preprocessed.count() == 30) + + val out_sizes = preprocessed.select(preprocessed("out.height"), preprocessed("out.width")).collect + + out_sizes.foreach( + (row:Row) => { + assert(row.getInt(0) == 15 && row.getInt(1) == 10, "output images have incorrect size") + } + ) + } + + test("crop") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .resize(height = 100, width = 200) + .crop(x = 0, y = 0, height = 22, width = 26) + + val preprocessed = tr.transform(images) + + val out_sizes = preprocessed.select(preprocessed("out.height"), preprocessed("out.width")).collect + + out_sizes.foreach( + (row:Row) => { + assert(row.getInt(0) == 22 && row.getInt(1) == 26, "output images have incorrect size") + } + ) + } + + test("color format") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .colorFormat(Imgproc.COLOR_BGR2GRAY) + + val preprocessed = tr.transform(images) + + val grayImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(grayImages) + val bytes = Array(10, 1, 3, 9, 6, 16, 11, 7, 8, 6, 26, 40, 57, 50) + // Validate first image first few bytes have been 
transformed correctly + val firstImageBytes = selectTestImageBytes(grayImages) + for (i <- 0 until bytes.length) { + assert(firstImageBytes(i) == bytes(i)) + } + } + + test("verify blur") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .blur(100, 100) + + val preprocessed = tr.transform(images) + + val blurImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(grayImages) + val bytes = Array(15, 28, 26, 15, 28, 26, 15, 28, 26, 15, 28, 26, 15, 28, 26, 15) + // Validate first image first few bytes have been transformed correctly + val firstImageBytes = selectTestImageBytes(blurImages) + for (i <- 0 until bytes.length) { + assert(firstImageBytes(i) == bytes(i)) + } + } + + test("verify thresholding") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .threshold(100, 100, Imgproc.THRESH_BINARY) + + val preprocessed = tr.transform(images) + + val thresholdedImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(thresholdedImages) + // Validate first image first few bytes have been transformed correctly + thresholdedImages.foreach( + (row:Row) => { + if (!row.getAs[Array[Byte]](3).forall(b => b == 100 || b == 0)) { + throw new Exception("threshold did not result in binary values") + } + } + ) + } + + test("verify application of gaussian kernel (has blur effect)") { + + val images = session.readImages(fileLocation, recursive = true) + + val tr = new ImageTransformer() + .setOutputCol("out") + .gaussianKernel(20, 10) + + val preprocessed = tr.transform(images) + + val gaussianImages = selectImageCols(preprocessed) + + // For visual debugging uncomment: + // displayImages(gaussianImages) + val firstImageBytes = selectTestImageBytes(gaussianImages) + // Validate first image first few bytes have been transformed correctly + val bytes = Array(8, 14, 14, 4, 8, 7, 4, 5, 5, 4, 5, 6, 5, 9, 8, 3, 8, 7, 7, 13, 12, 8, 12) + // Validate first image first few bytes have been transformed correctly + for (i <- 0 until bytes.length) { + assert(firstImageBytes(i) == bytes(i)) + } + } + + test("unroll") { + val filesRoot = s"${sys.env("DATASETS_HOME")}/" + val imagePath = s"$filesRoot/Images/CIFAR" + + val images = session.readImages(imagePath, recursive = true) + assert(images.count() == 6) + + val unroll = new UnrollImage().setOutputCol("result") + val unrolled = unroll.transform(images).select("image.path","result").collect + + unrolled.foreach(row => { + val path = Paths.get(row.getString(0)) + val expected = firstBytes(path.getFileName().toString()) + val result = row(1).asInstanceOf[DenseVector].toArray + + val length =result.length + if(length != 3072) throw new Exception(s"array length should be 3072, not $length ") + + if(!compareArrays(expected, result)) { + println(path) + println("result: " + result.slice(0,10).deep.toString) + println("expected: " + expected.deep.toString) + throw new Exception("incorrect numeric value for flattened image") + } + }) + } + + private def selectTestImageBytes(images: DataFrame): Array[Byte] = { + images.filter(row => row.getString(4).endsWith("negative/5.jpg")) + .head.getAs[Array[Byte]](3) + } + + private def selectImageCols(images: DataFrame): DataFrame = { + images.select(images("out.height"), + images("out.width"), + images("out.type"), + images("out.bytes"), + images("out.path")) + } + + private def displayImages(images: 
DataFrame): Unit = { + val (jframe, panel) = createScrollingFrame(images.count()) + images.collect().foreach( + (row:Row) => { + val img = new Mat(row.getInt(0), row.getInt(1), row.getInt(2)) + img.put(0,0,row.getAs[Array[Byte]](3)) + // Have to do the MatOfByte dance here + val matOfByte = new MatOfByte() + Imgcodecs.imencode(".jpg", img, matOfByte) + val icon = new ImageIcon(matOfByte.toArray) + val label: JLabel = new JLabel() + label.setIcon(icon) + panel.add(label) + () + } + ) + jframe.pack() + jframe.setVisible(true) + Thread.sleep(10000) + } + + private def createScrollingFrame(count: Long): (JFrame, JPanel) = { + val jframe: JFrame = new JFrame("images") + jframe.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE) + val panel: JPanel = new JPanel() + panel.setLayout(new GridLayout(count.toInt, 1)) + val scrPane: JScrollPane = new JScrollPane(panel) + jframe.getContentPane.add(scrPane) + (jframe, panel) + } + + private val firstBytes = Map( + "00001.png" -> Array(235.0, 231.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232.0, 232.0), + "00002.png" -> Array(222.0, 218.0, 194.0, 186.0, 222.0, 236.0, 238.0, 241.0, 243.0, 245.0), + "00000.png" -> Array(49.0, 47.0, 51.0, 53.0, 46.0, 41.0, 47.0, 45.0, 44.0, 41.0), + "00004.png" -> Array(50.0, 64.0, 46.0, 30.0, 22.0, 36.0, 55.0, 57.0, 59.0, 54.0), + "00005.png" -> Array(83.0, 61.0, 26.0, 36.0, 65.0, 67.0, 58.0, 54.0, 63.0, 65.0), + "00003.png" -> Array(149.0, 187.0, 193.0, 205.0, 202.0, 183.0, 181.0, 180.0, 182.0, 189.0) + ) + + private def compareArrays(x: Array[Double], y:Array[Double]): Boolean = { + val length = Math.min(x.length, y.length) + for(i <- 0 to length-1){ + if(Math.abs(x(i) - y(i)) > 1e-5) return false + } + true + } + +} diff --git a/src/multi-column-adapter/build.sbt b/src/multi-column-adapter/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/multi-column-adapter/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala b/src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala new file mode 100644 index 0000000000..521a2b212c --- /dev/null +++ b/src/multi-column-adapter/src/main/scala/MultiColumnAdapter.scala @@ -0,0 +1,121 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.{PipelineStage, Transformer} +import org.apache.spark.ml.param.{Param, ParamMap, TransformerParam} +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.types._ + +object MultiColumnAdapter extends DefaultParamsReadable[MultiColumnAdapter] + +/** + * This transformer takes a unary transformer and a list of input output column pairs + * and applies the transformer to each column + */ +class MultiColumnAdapter(override val uid: String) extends Transformer with MMLParams { + + def this() = this(Identifiable.randomUID("MultiColumnAdapter")) + + val inputCols: Param[String] = + StringParam( + this, + "inputCols", + "comma separated list of column names encoded as a string") + + /** @group getParam **/ + final def getInputCols: String = $(inputCols) + + /** @group setParam **/ + def setInputCols(value: String): this.type = set(inputCols, value) + + val outputCols: Param[String] = + StringParam( + this, + "outputCols", + "comma separated list of column names encoded as a string") + + /** @group getParam **/ + final def getOutputCols: String = $(outputCols) + + /** @group setParam **/ + def setOutputCols(value: String): this.type = set(outputCols, value) + + def getInputOutputPairs: List[(String, String)] = + getInputCols.split(",").zip(getOutputCols.split(",")).toList + + val baseTransformer: TransformerParam = + new TransformerParam(this, + "baseTransformer", + "base transformer to apply to every column") + + /** @group getParam **/ + final def getBaseTransformer: Transformer = $(baseTransformer) + + /** @group setParam **/ + def setBaseTransformer(value: Transformer): this.type = { + try { + //Test to see whether the class has the appropriate getters and setters + value.getParam("inputCol") + value.getParam("outputCol") + setParamInternal(value, "inputCol", this.uid + "__in") + setParamInternal(value, "outputCol", this.uid + "__out") + } catch { + case e: Exception => + throw new IllegalArgumentException( + "Need to pass a transformer with inputCol and outputCol params") + } + set(baseTransformer, value) + } + + private def setParamInternal[M <: PipelineStage, V](model: M, + name: String, + value: V) = { + model.set(model.getParam(name), value) + } + + private def getParamInternal[M <: PipelineStage](model: M, name: String) = { + model.getOrDefault(model.getParam(name)) + } + + private def setInOutCols[M <: PipelineStage]( + model: M, + inputOutputPair: (String, String)) = { + setParamInternal(setParamInternal(model, "inputCol", inputOutputPair._1), + "outputCol", + inputOutputPair._2) + } + + override def transform(dataset: Dataset[_]): DataFrame = { + transformSchema(dataset.schema) + val firstOutput = setInOutCols(getBaseTransformer, + getInputOutputPairs.head).transform(dataset) + getInputOutputPairs.tail.foldLeft(firstOutput: DataFrame) { (df, pair) => + setInOutCols(getBaseTransformer, pair).transform(df) + } + } + + def copy(extra: ParamMap): this.type = defaultCopy(extra) + + private def verifyCols(df: DataFrame, + inputOutputPairs: List[(String, String)]) = { + inputOutputPairs.foreach { + case (s1, s2) if !df.columns.contains(s1) => + throw new IllegalArgumentException( + s"DataFrame does not contain specified column: $s1") + case (s1, s2) if df.columns.contains(s2) => + throw new IllegalArgumentException( + s"DataFrame already contains specified column: $s2") + case _ => + } + } + + override def transformSchema(schema: 
StructType): StructType = { + getInputOutputPairs.foldLeft(schema) { (schema, pair) => + setInOutCols(getBaseTransformer, pair).transformSchema(schema) + } + } + +} diff --git a/src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala b/src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala new file mode 100644 index 0000000000..792228ea56 --- /dev/null +++ b/src/multi-column-adapter/src/test/scala/MultiColumnAdapterSpec.scala @@ -0,0 +1,49 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.feature.Tokenizer +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType + +class MultiColumnAdapterSpec extends TransformerFuzzingTest { + + val wordDF = session.createDataFrame(Seq( + (0, "This is a test", "this is one too"), + (1, "could be a test", "maybe not"), + (2, "foo", "bar"))) + .toDF("label", "words1", "words2") + val inputCols = "words1,words2" + val outputCols = "output1,output2" + + test("parallelize transformers") { + val transformer1 = new Tokenizer() + val adapter1 = + new MultiColumnAdapter().setBaseTransformer(transformer1).setInputCols(inputCols).setOutputCols(outputCols) + val tokenizedDF = adapter1.transform(wordDF) + val lines = tokenizedDF.getColAs[Array[String]]("output2") + + val trueLines = Array( + Array("this", "is", "one", "too"), + Array("maybe", "not"), + Array("bar") + ) + assert(lines === trueLines) + } + + override def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = + transformer.asInstanceOf[MultiColumnAdapter] + .setBaseTransformer(new Tokenizer()) + .setInputCols(inputCols) + .setOutputCols(outputCols) + + override def createDataset: DataFrame = wordDF + + override def schemaForDataset: StructType = ??? + + override def getTransformer(): Transformer = new MultiColumnAdapter() + +} diff --git a/src/partition-sample/build.sbt b/src/partition-sample/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/partition-sample/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/partition-sample/src/main/scala/PartitionSample.scala b/src/partition-sample/src/main/scala/PartitionSample.scala new file mode 100644 index 0000000000..b885f11a1d --- /dev/null +++ b/src/partition-sample/src/main/scala/PartitionSample.scala @@ -0,0 +1,117 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
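A minimal usage sketch for the MultiColumnAdapter above, mirroring the accompanying spec (the DataFrame wordDF and the column names are illustrative):

    import org.apache.spark.ml.feature.Tokenizer

    val adapter = new MultiColumnAdapter()
      .setBaseTransformer(new Tokenizer())
      .setInputCols("words1,words2")       // comma-separated list, one entry per column
      .setOutputCols("tokens1,tokens2")
    val tokenized = adapter.transform(wordDF)   // the Tokenizer is applied to each input/output pair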
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.param._ +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.sql.types._ + +object PSConstants { + final val ModeRS = "RandomSample" + final val ModeHead = "Head" + final val ModeATP = "AssignToPartition" + + final val rsAbsolute = "Absolute" + final val rsPercent = "Percentage" + + final val newColDefault = "Partition" +} + +trait PartitionSampleParams extends MMLParams { + + /* Mode: {RandomSample|AssignToPartition|Head} + - RS: {Absolute|Percentage, Seed} + - Absolute: {Count} + - Percentage: {Percent} + - ATP: {Seed, numParts, newColName} + - Head: {Count} + */ + // TODO: Convert to Enum + final val mode = StringParam(this, "mode", "AssignToPartition, RandomSample, or Head") + setDefault(mode, PSConstants.ModeRS) + final def getMode: String = $(mode) + def setMode(value: String): this.type = set(mode, value) + + // TODO: Convert to Enum + // Relevant on Mode = RS + final val rsMode = StringParam(this, "rsMode", "Absolute or Percentage", PSConstants.rsPercent) + final def getRandomSampleMode: String = $(rsMode) + def setRandomSampleMode(value: String): this.type = set(rsMode, value) + + // Relevant on Mode = RS|ATP + // TODO: We need to create Option[Int] idiom for params + final val seed = LongParam(this, "seed", "seed for random ops", -1L) + final def getSeed: Long = $(seed) + def setSeed(value: Long): this.type = set(seed, value) + + // Relevant on RSMode = Percentage + final val percent = DoubleParam(this, "percent", "percent of rows to return", 0.01) + final def getPercent: Double = $(percent) + def setPercent(value: Double): this.type = set(percent, value) + + // Relevant on Mode = Head | RSMode = Absolute + final val count = LongParam(this, "count", "number of rows to return", 1000L) + final def getCount: Long = $(count) + def setCount(value: Long): this.type = set(count, value) + + // Relevant on Mode = ATP + final val newColName = StringParam(this, "newColName", "name of the partition column", PSConstants.newColDefault) + final def getNewColName: String = $(newColName) + def setNewColName(value: String): this.type = set(newColName, value) + + // Relevant on Mode = ATP + final val numParts = IntParam(this, "numParts", "number of partitions", 10) + final def getNumParts: Int = $(numParts) + def setNumParts(value: Int): this.type = set(numParts, value) + + protected def validateAndTransformSchema(schema: StructType): StructType = { + if (Seq(PSConstants.ModeHead, PSConstants.ModeRS).contains($(mode))) + schema + else + ??? // schema + newCol + } +} + +object PartitionSample extends DefaultParamsReadable[PartitionSample] + +// UID should be overridden by driver for controlled identification at the DAG level +sealed class PartitionSample(override val uid: String) + extends Transformer + with PartitionSampleParams { + + def this() = this(Identifiable.randomUID("PartitionSample")) + + override def transform(dataset: Dataset[_]): DataFrame = { + $(mode) match { + case PSConstants.ModeHead => dataset.limit( + if ($(count) <= 2000000000) $(count).toInt else throw new Exception("Head limit 2b rows")).toDF + case PSConstants.ModeRS => randomSample(dataset, $(rsMode), $(seed)).toDF + case PSConstants.ModeATP => dataset.withColumn($(newColName), /* broken */ dataset.col("input")) + case _ => ??? 
+ } + } + + def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + def copy(extra: ParamMap): PartitionSample = defaultCopy(extra) + + private def randomSample( + ds: Dataset[_], + rsMode: String, + seed: Long, + replace: Boolean = false): Dataset[_] = { + val frac = rsMode match { + case PSConstants.rsPercent => $(percent) + case PSConstants.rsAbsolute => $(count).toDouble / ds.count + case _ => ??? + } + println(s"Sampling ${ds.count} rows by ${frac * 100}% to get ~${ds.count * frac} rows") + return ds.sample(replace, frac, seed) + } + +} diff --git a/src/partition-sample/src/test/scala/VerifyPartitionSample.scala b/src/partition-sample/src/test/scala/VerifyPartitionSample.scala new file mode 100644 index 0000000000..1ed325f9c9 --- /dev/null +++ b/src/partition-sample/src/test/scala/VerifyPartitionSample.scala @@ -0,0 +1,67 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.types._ +import org.apache.spark.ml.param._ + +class PartitionSampleSmokeTests extends TestBase { + + import session.implicits._ + + test("head 3") { + val sampler = new PartitionSample().setMode("Head").setCount(3) + val out = sampler.transform(makeDF) + assert(out.count === 3) + assert(makeDF.head === out.head) + } + + test("random sample smoke") { + val df = extendedDF(10) + val sampler = new PartitionSample() + .setMode("RandomSample") + .setRandomSampleMode("Absolute") + .setSeed(1) + .setCount(10) + val out = sampler.transform(df) + assert(out.count < 16) + assert(out.count > 5) + + val sampler2 = new PartitionSample() + .setMode("RandomSample") + .setRandomSampleMode("Percentage") + .setSeed(1) + .setPercent(0.5) + val out2 = sampler2.transform(df) + assert(out2.count < 100) + assert(out2.count > 60) + } + + def extendedDF(n: Int = 10): DataFrame = { + (2 to n).map(_ => makeDF).foldLeft(makeDF)((a, b) => a.union(b)) + } + + lazy val makeDF: DataFrame = { + Seq(( 1, 2), + ( 3, 4), + ( 5, 6), + ( 7, 8), + ( 9, 10), + (11, 12), + (13, 14), + (15, 16), + (17, 18), + (19, 20), + (21, 22), + (23, 24), + (25, 26), + (27, 28), + (29, 30), + (31, 32)) + .toDF("Col1", "Col2") + } + +} diff --git a/src/pipeline-stages/build.sbt b/src/pipeline-stages/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/pipeline-stages/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/pipeline-stages/src/main/scala/Repartition.scala b/src/pipeline-stages/src/main/scala/Repartition.scala new file mode 100644 index 0000000000..ad277fe1c7 --- /dev/null +++ b/src/pipeline-stages/src/main/scala/Repartition.scala @@ -0,0 +1,42 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
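A minimal usage sketch for PartitionSample, following the smoke tests above (the DataFrame df is assumed; sampled row counts are approximate and depend on the seed):

    val head3 = new PartitionSample().setMode("Head").setCount(3).transform(df)

    val sampled = new PartitionSample()
      .setMode("RandomSample")
      .setRandomSampleMode("Percentage")
      .setSeed(1)
      .setPercent(0.5)
      .transform(df)                       // roughly half of the rows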
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types._ + +object Repartition extends DefaultParamsReadable[Repartition] + +class Repartition(val uid: String) extends Transformer with MMLParams { + def this() = this(Identifiable.randomUID("Repartition")) + + val n: IntParam = IntParam(this, "n", "Number of partitions", + validation = ParamValidators.gt[Int](0)) + + final def getN: Int = $(n) + + def setN(value: Int): this.type = set(n,value) + + override def transform(dataset: Dataset[_]): DataFrame = { + + if (getN < dataset.rdd.getNumPartitions){ + dataset.coalesce(getN).toDF() + }else{ + dataset.sqlContext.createDataFrame( + dataset.rdd.repartition(getN).asInstanceOf[RDD[Row]], + dataset.schema) + } + } + + def transformSchema(schema: StructType): StructType = { + schema + } + + def copy(extra: ParamMap): this.type = defaultCopy(extra) + +} diff --git a/src/pipeline-stages/src/main/scala/SelectColumns.scala b/src/pipeline-stages/src/main/scala/SelectColumns.scala new file mode 100644 index 0000000000..e29b1f6329 --- /dev/null +++ b/src/pipeline-stages/src/main/scala/SelectColumns.scala @@ -0,0 +1,63 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, Dataset} +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ + +object SelectColumns extends DefaultParamsReadable[SelectColumns] + +/** + * This class takes a dataframe and a list of columns to select and returns + * a dataframe comprised of only those columns listed in the input list. + * + * The columns to be selected is a comma separated list of column names, contained in a single string. 
+ */ + +class SelectColumns(val uid: String) extends Transformer with MMLParams { + def this() = this(Identifiable.randomUID("SelectColumns")) + + val cols: StringArrayParam = new StringArrayParam(this, "cols", "comma separated list of selected column names") + + /** @group getParam **/ + final def getCols: Array[String] = $(cols) + + /** @group setParam **/ + def setCols(value: Array[String]): this.type = set(cols, value) + + def setCol(value: String): this.type = set(cols, Array(value)) + + /** + * @param dataset - The input dataset, to be transformed + * @return The DataFrame that results from column selection + */ + override def transform(dataset: Dataset[_]): DataFrame = { + verifySchema(dataset.schema) + dataset.toDF().select(getCols.map(col): _*) + } + + def transformSchema(schema: StructType): StructType = { + verifySchema(schema) + val selectedCols = getCols.toSet + StructType(schema.fields.filter(f => selectedCols(f.name))) + } + + def copy(extra: ParamMap): SelectColumns = defaultCopy(extra) + + private def verifySchema(schema: StructType): Unit = { + val providedCols = schema.fields.map(_.name).toSet + val invalidCols = getCols.filter(!providedCols(_)) + + if (invalidCols.length > 0) { + throw new NoSuchElementException( + s"DataFrame does not contain specified columns: ${invalidCols.reduce(_ + "," + _)}") + } + + } + +} diff --git a/src/pipeline-stages/src/test/scala/RepartitionSuite.scala b/src/pipeline-stages/src/test/scala/RepartitionSuite.scala new file mode 100644 index 0000000000..60d3c4f1b2 --- /dev/null +++ b/src/pipeline-stages/src/test/scala/RepartitionSuite.scala @@ -0,0 +1,50 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.param.{ParamMap, ParamPair} + +class RepartitionSuite extends TestBase { + + import session.implicits._ + + val input = Seq( + (0, "guitars", "drums"), + (1, "piano", "trumpet"), + (2, "bass", "cymbals"), + (3, "guitars", "drums"), + (4, "piano", "trumpet"), + (5, "bass", "cymbals"), + (6, "guitars", "drums"), + (7, "piano", "trumpet"), + (8, "bass", "cymbals"), + (9, "guitars", "drums"), + (10, "piano", "trumpet"), + (11, "bass", "cymbals") + ).toDF("numbers", "words", "more") + + test("Work for several values of n") { + + def test(n: Int): Unit = { + val result = new Repartition() + .setN(n) + .transform(input) + assert(result.rdd.getNumPartitions == n) + () + } + List(1, 2, 3, 10).foreach(test) + + } + + test("Should allow a user to set the partitions" + + " specifically in pipeline transform"){ + val r = new Repartition().setN(1) + val pipe = new Pipeline().setStages(Array(r)) + val fitPipe = pipe.fit(input) + assert(fitPipe.transform(input).rdd.getNumPartitions==1) + assert(fitPipe.transform(input, ParamMap(r.n->5)).rdd.getNumPartitions ==5) + } + +} diff --git a/src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala b/src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala new file mode 100644 index 0000000000..830e48d320 --- /dev/null +++ b/src/pipeline-stages/src/test/scala/SelectColumnsSuite.scala @@ -0,0 +1,75 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
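Minimal sketches of the two pipeline stages above (the DataFrame df and its column names are illustrative):

    val twoParts = new Repartition().setN(2).transform(df)        // exactly 2 partitions

    val selected = new SelectColumns()
      .setCols(Array("words", "more"))                            // keep only these columns
      .transform(df)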
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.Transformer +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.StructType + +class SelectColumnsSuite extends TransformerFuzzingTest { + + import session.implicits._ + + test("Select all columns in a data frame") { + val input = makeBasicDF() + val result = new SelectColumns() + .setCols(Array("numbers", "words", "more")) + .transform(input) + assert(verifyResult(input, result)) + } + + test("Test: Select two columns in a data frame") { + val expected = Seq( + ("guitars", "drums"), + ("piano", "trumpet"), + ("bass", "cymbals") + ).toDF("words", "more") + val result = new SelectColumns() + .setCols(Array("words", "more")) + .transform(makeBasicDF()) + assert(verifyResult(expected, result)) + } + + test("Test: Select columns with spaces") { + val expected = Seq( + ("guitars", "drums"), + ("piano", "trumpet"), + ("bass", "cymbals") + ).toDF("words", "Scored Labels") + val result = new SelectColumns() + .setCols(Array("words", "Scored Labels")) + .transform(makeBasicDF().withColumnRenamed("more", "Scored Labels")) + assert(verifyResult(expected, result)) + } + + test("Test: Select one column from the data frame") { + val expected = Seq( + "guitars", + "piano", + "bass" + ).toDF("words") + val result = new SelectColumns() + .setCols(Array("words")) + .transform(makeBasicDF()) + assert(verifyResult(expected, result)) + } + + test("Invalid column specified") { + try { + new SelectColumns().setCol("four").transform(makeBasicDF()) + fail() + } catch { + case _: NoSuchElementException => + } + } + + override def setParams(fitDataset: DataFrame, transformer: Transformer): Transformer = + transformer.asInstanceOf[SelectColumns].setCols(fitDataset.columns) + + override def createDataset: DataFrame = makeBasicDF() + + override def schemaForDataset: StructType = ??? + + override def getTransformer(): Transformer = new SelectColumns() +} diff --git a/src/project/build.sbt b/src/project/build.sbt new file mode 100644 index 0000000000..898a255da9 --- /dev/null +++ b/src/project/build.sbt @@ -0,0 +1,16 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +// Same options as in build.scala +scalacOptions ++= Seq( + "-encoding", "UTF-8", + // Explain warnings, optimize + "-deprecation", "-unchecked", "-feature", "-optimise", + "-Xfatal-warnings", "-Xlint", // all warnings + // -Y* are Scala options + "-Yno-adapted-args", // "-Ywarn-adapted-args", + "-Ywarn-dead-code", + "-Ywarn-numeric-widen", + "-Ywarn-value-discard" + // this leads to problems sometimes: "-Yinline-warnings" +) diff --git a/src/project/build.scala b/src/project/build.scala new file mode 100644 index 0000000000..a368c0bde9 --- /dev/null +++ b/src/project/build.scala @@ -0,0 +1,201 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
+ +import sbt._ +import Keys._ + +import sys.process.Process + +import sbtassembly._ +import sbtassembly.AssemblyKeys._ +import sbtassembly.AssemblyPlugin.autoImport.assembly + +object Extras { + + private def env(varName: String, default: String = "") = + sys.env.getOrElse(varName, + if (default != null) default + else sys.error(s"Missing $$$varName environment variable")) + + // get the version from $MML_VERSION, or from `show-version` + def mmlVer = sys.env.getOrElse("MML_VERSION", + Process("../tools/runme/show-version").!!.trim) + + def defaultOrg = "com.microsoft.ml.spark" + def scalaVer = env("SCALA_FULL_VERSION", null) + def sparkVer = env("SPARK_VERSION", null) + + def commonResolvers = Seq( + "MMLSpark Maven Repo" at "https://mmlspark.azureedge.net/maven" + ) + def commonLibs = Seq( + "org.apache.spark" %% "spark-core" % sparkVer % "provided", + "org.apache.spark" %% "spark-mllib" % sparkVer % "provided", + "org.scalatest" %% "scalatest" % "3.0.0" % "provided", + // should include these things in the distributed jar + "io.spray" %% "spray-json" % "1.3.2", + "com.microsoft.CNTK" % "cntk_jni" % "2.0rc3", + "org.opencv" % "opencv_jni" % "3.2.0" + ) + def overrideLibs = Set( + // spark wants 2.2.6, but we don't use its tests anyway + "org.scalatest" %% "scalatest" % "3.0.0" % "provided" + ) + + def artifactsDir = file(env("BUILD_ARTIFACTS", "../BuildArtifacts/packages/m2")) + def testsDir = file(env("TEST_RESULTS", "../TestResults")) + def scalacOpts = Seq( + "-encoding", "UTF-8", + // Explain warnings, optimize + "-deprecation", "-unchecked", "-feature", "-optimise", + "-Xfatal-warnings", "-Xlint", // all warnings + // -Y* are Scala options + "-Yno-adapted-args", // "-Ywarn-adapted-args", + "-Ywarn-dead-code", + "-Ywarn-numeric-widen", + "-Ywarn-value-discard" + // this leads to problems sometimes: "-Yinline-warnings" + ) + + // Some convenience commands + val sectionPrefix = + if (env("BUILDMODE") == "server") "##[section]SBT: " else "===>>> SBT: " + def addCommands(st: State, cmds: String*): State = + st.copy(remainingCommands = cmds ++ st.remainingCommands) + def newCommands = Seq( + Command.single("cd") { (st, arg) => + addCommands(st, s"project $arg") }, + Command.args("echo", "") { (st, args) => + println(args.map(s => if (s == "
") "\n" else s).mkString(" ")); st }, + Command.single("show-section") { (st, arg) => + println("\n" + sectionPrefix + arg); st }, + Command.single("noisy-command") { (st, cmd) => + addCommands(st, s"show-section $cmd", cmd) }, + Command.single("on-all-subs") { (st, cmd) => + addCommands(st, SubProjects.all.map("noisy-command " + _ + "/" + cmd): _*) }, + Command.command("full-build") { st => + val steps = Seq(if (env("PUBLISH") == "all") "update" else null, + "run-scalastyle", + "compile", + if (testSpec == "none") null else "test:compile", + "package", + if (testSpec == "none") null else "on-all-subs test", + "codegen/run", + "publish") + addCommands(st, steps.filter(_ != null).map("noisy-command " + _): _*) } + ) ++ ScalaStyleExtras.commands + + // Utilities for sub-project sbt files + def noJar = Seq(Keys.`package` := file("")) + + // Translate $TESTS to command-line arguments + val testSpec = env("TESTS", "-extended") + def testOpts = + // Generate JUnit-style test result files + Seq(testOptions in (ThisBuild, Test) += + Tests.Argument("-u", testsDir.toString())) ++ + (if (testSpec == "all" || testSpec == "none") Seq() + else testSpec.split(",").map { spec => + testOptions in (ThisBuild, Test) += + Tests.Argument(if (spec.substring(0,1) == "+") "-n" else "-l", + "com.microsoft.ml.spark.test.tags." + + spec.substring(1)) }) + + def defaultSettings = Seq( + // Common stuff: defaults for all subprojects + scalaVersion in ThisBuild := scalaVer, + organization in ThisBuild := defaultOrg, + resolvers in ThisBuild ++= commonResolvers, + libraryDependencies in ThisBuild ++= commonLibs, + dependencyOverrides in ThisBuild ++= overrideLibs, + scalacOptions in ThisBuild ++= scalacOpts, + // Don't run tests in parallel, and fork subprocesses for them + parallelExecution in (ThisBuild, Test) := false, + fork in (ThisBuild, Test) := true, + // Assembly options + aggregate in assembly := false, + aggregate in publish := false, + test in assembly := {}, + // Documentation settings + autoAPIMappings in ThisBuild := true, + // Ctrl+C kills a running job, not sbt + cancelable in ThisBuild := true, + // No verbose logs during update + logLevel in (ThisBuild, update) := Level.Warn, + // Fewer errors to display (the default is 100) + maxErrors in ThisBuild := 20, + // Show stack traces up to the first SBT stack frame + traceLevel in ThisBuild := 0, + // Stamp the jar manifests with the build info + packageOptions in (Compile, packageBin) += + Package.ManifestAttributes( + "MMLBuildInfo" -> env("MML_BUILD_INFO", "(direct sbt build, no info collected)")), + // For convenience, import the main package in a scala console + initialCommands in (ThisBuild, console) := "import com.microsoft.ml.spark._", + // Use the above commands + commands in ThisBuild ++= newCommands + ) ++ testOpts + + def rootSettings = + defaultSettings ++ + noJar ++ // no toplevel jar + // With this we get: + // mmlspark_2.11-$ver-assembly.jar{,.md5,.sha1} + // mmlspark_2.11-$ver.pom{,.md5,.sha1} + // mmlspark_2.11-$ver{,-javadoc,-sources}.jar{,.md5,.sha1} + // the first are the combined jar, and the second are the needed pom files. + // The third all look empty and discardable. Without this, we get the same + // structure, except it seems that it tries to write both the empty jar and + // the combined jar onto the same mmlspark_2.11-$ver.jar{,.md5,.sha1} files, + // spitting up a warning, and sometimes the result is the combined jar and + // sometimes it's the empty (probably the above empty jar with the same no + // "-assembly" name). 
Later in the build we discard the junk files, and + // leave only the combined one. + Seq(artifact in (Compile, assembly) := + (artifact in (Compile, assembly)).value.copy(`classifier` = Some("assembly"))) ++ + addArtifact(artifact in (Compile, assembly), assembly) ++ + Seq( + // This creates a maven structure, which we upload to azure storage later + publishTo := Some(Resolver.file("file", artifactsDir)), + // In case we need to add more stuff to the uber-jar, use this: + // unmanagedResourceDirectories in Compile += artifactsDir / "more", + publishArtifact in Test := false, + publishMavenStyle := true, + // Remove the "scala-library" dependency + autoScalaLibrary := false, + // Don't include things we depend on (we leave the dependency in the POM) + assemblyOption in assembly := + (assemblyOption in assembly).value.copy( + includeScala = false, includeDependency = false), + pomPostProcess := { n: scala.xml.Node => + import scala.xml._, scala.xml.transform._ + new RuleTransformer(new RewriteRule { + override def transform(n: Node) = + // Filter out things that shouldn't be a dependency: things that + // have "provided", or "true". + // The latter is generated in meta.sbt for toplevel dependencies. + if (n.label == "dependency" && + (n.child.contains(true) || + n.child.contains(provided))) + Seq.empty + else if (n.label == "repositories") + // Deduplicate repo entries, since we get one for each subproject + { n.child.distinct } + else + Seq(n) + }).transform(Seq(pomPostProcess.value.apply(n))).head + }, + // Show the current project in the prompt + shellPrompt in ThisBuild := (st => { + val ex = Project.extract(st) + val proj = ex.currentRef.project + val root = ex.rootProject(ex.currentRef.build) + s"${if (proj == root) "" else root+"/"}${proj}> " + }), + // Use the same history path for everything instead of per project files + historyPath in ThisBuild := Some((target in LocalRootProject).value / ".history") + ) + + LibraryCheck() // invoke the library checker + +} diff --git a/src/project/lib-check.scala b/src/project/lib-check.scala new file mode 100644 index 0000000000..168542fad5 --- /dev/null +++ b/src/project/lib-check.scala @@ -0,0 +1,34 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +import sbt._ +import Keys._ +import scala.io._ + +// Warn user when the library configuration has changed +object LibraryCheck { + def apply() = () + val info = file(sys.env.getOrElse("HOME", System.getProperty("user.home"))) / + ".mmlspark_installed_libs" + val conf = file("..") / "tools" / "config.sh" + val (len, modif) = (conf.length, conf.lastModified) + def read[T](file: File, read: BufferedSource => T): T = { + val i = Source.fromFile(file); try read(i) finally i.close + } + lazy val text = + "(?s)INSTALLATIONS=\\(.*?\r?\n\\)\r?\n".r.findFirstIn(read(conf, _.mkString)).get + lazy val (len_, modif_, text_) = + read(info, i => { + val meta = i.getLines.take(2).toList.map(_.toLong) + (meta(0), meta(1), i.mkString) }) + def writeInfo() = scala.tools.nsc.io.File(info).writeAll(s"$len\n$modif\n$text") + if (!info.exists) writeInfo() + else if (len_ != len || modif_ != modif) { + if (text_ == text) writeInfo() + else { + println("\n!!! 
Warning: Library configuration changed," + + " consider using ./runme to update !!!\n") + Thread.sleep(1000) + } + } +} diff --git a/src/project/meta.sbt b/src/project/meta.sbt new file mode 100644 index 0000000000..6eefb69158 --- /dev/null +++ b/src/project/meta.sbt @@ -0,0 +1,108 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +// Auto-generate sub-project definitions + +val _ = { + + val topDir = file(".") + val topName = "MMLSpark" + val ignoredDirs = Array(topDir / "project") + + def userError(message: String): Nothing = { + System.err.println(message) + System.err.println("Aborting...") + exit(1) + } + + case class Proj(val dir: File, val children: List[Proj]) { + val name = if (dir == topDir) topName + else Project.normalizeModuleID(dir.toString.substring(2)) + val dirs = Iterator.iterate(dir)(_.getParentFile).takeWhile(_ != null) + .map(_.getName).toList.reverse.drop(1) + val props = { + val i = scala.io.Source.fromFile(dir / "build.sbt") + val rx = "//> *(\\w+) *: *(.*?) *".r + (try i.getLines.toList finally i.close) + .map(_ match { case rx(x,y) => (x,y); case _ => null; }).filter(_ != null) + .foldLeft(Map[String,List[String]]()) { (m,kv) => + m + (kv._1 -> (m.getOrElse(kv._1,Nil) :+ kv._2)) } + } + lazy val deps = children ++ props.getOrElse("DependsOn", Nil).map(nameToProj) + def flatList(): List[Proj] = this +: children.flatMap(_.flatList) + override def toString() = name + } + + def findProjects(dir: File): List[Proj] = { + val (dirs, files) = dir.listFiles().sorted.toList.partition(_.isDirectory) + val nested = dirs.flatMap(findProjects) + if (ignoredDirs.contains(dir) || !files.exists(p => p.getName == "build.sbt")) nested + else List(Proj(dir, nested)) + } + + def nameToProj(name: String): Proj = + nameProjMap.getOrElse(Project.normalizeModuleID(name), + userError(s"Bad project name: $name...")) + + // Cheap topological sort for projects; note that the input is sorted + // alphabetically, and it preserves this order when posible + def depSort(projs: List[List[Proj]]): List[Proj] = { + if (projs.isEmpty) Nil + else projs.find(_.tail.isEmpty) match { + case Some(x +: _) => x +: depSort(projs.map(_.filterNot(_==x)).filterNot(_.isEmpty)) + case _ => userError(s"Dependency cycle! {${projs.map(_.head).mkString(", ")}}") + } + } + + lazy val topProj = findProjects(topDir)(0) + lazy val nameProjMap = topProj.flatList.map(p => (p.name -> p)).toMap + lazy val sortedProjs = depSort(topProj.flatList.map(p => p +: p.deps)) + + def projToSbt(proj: Proj): String = { + def showList(list: List[Proj], what: String, sfx: String) = { + if (list.isEmpty) "" + else s"""\n .${what}(\n ${list.map(p => s"`$p`$sfx").mkString(",\n ")})""" + } + (s"""val `$proj` = (project in ${("topDir" +: proj.dirs.map("\""+_+"\"")) + .mkString(" / ")})""" + + "\n .settings(Extras.defaultSettings: _*)" + + showList(proj.children, "aggregate", "") + // for the root project, use "optional" -- I don't know what it should + // do, but it's visible in the POM file, which allows us to filter our + // dependencies out of it (in "build.scala"). 
+ + showList(proj.deps, "dependsOn", + if (proj != topProj) " % \"compile->compile;test->test\"" + else " % \"compile->compile;optional\""))} + + IO.write(topDir / "autogen.sbt", + s"""// Automatically generated, DO NOT EDIT\n + |val topDir = file(".")\n + |${sortedProjs.map(projToSbt).mkString("\n\n")} + |""".stripMargin) + + IO.write(topDir / "project" / "autogen.scala", + s"""// Automatically generated, DO NOT EDIT\n + |import sbt._\nimport Keys._\n + |object SubProjects { + | val all = Seq( + | ${sortedProjs.filter(_.children.isEmpty) + .map("\"" + _.name + "\"").mkString(",\n ")}) + |} + |""".stripMargin) + + IO.write(topDir / "project" / "project-roots.txt", + sortedProjs + .map(p => { + val d = p.dir + (if (d == topDir) d else d.toString.substring(2)) + "\n"}) + .mkString("")) + + IO.write(topDir / "project" / "dependencies.digraph", + s"""// Automatically generated, DO NOT EDIT\n + |digraph ${topName} { + | ${sortedProjs.flatMap(p => p.deps.map(d => s""""$p" -> "$d";""")) + .mkString("\n ")} + |} + |""".stripMargin) + +} diff --git a/src/project/plugins.sbt b/src/project/plugins.sbt new file mode 100644 index 0000000000..7400f588e0 --- /dev/null +++ b/src/project/plugins.sbt @@ -0,0 +1,5 @@ +logLevel := Level.Warn + +addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.4") +addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0") +addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.0") diff --git a/src/project/scalastyle.scala b/src/project/scalastyle.scala new file mode 100644 index 0000000000..6902ce397a --- /dev/null +++ b/src/project/scalastyle.scala @@ -0,0 +1,136 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +import sbt._ +import Keys._ + +final object ScalaStyleExtras { + + // First string determines options: + // * [st]?: optional src- or test-only rule (default: both) + // * [FS]: whether this is a .scalariform or a .file rule + // * [EWI]: Error / Warn / Ignore + val rules = List( + r("FE", "FileLengthChecker", ("maxFileLength", 800)), + r("FE", "FileTabChecker"), + r("FE", "FileLineLengthChecker", ("maxLineLength", 120)), + r("FE", "NewLineAtEofChecker"), + r("FE", "RegexChecker", ("regex", "\n\n\n")), + r("SW", "TokenChecker", ("regex", ".{33}")), + r("FE", "HeaderMatchesChecker", + ("header", + Seq("^// Copyright \\(C\\) Microsoft Corporation\\. All rights reserved\\.", + "// Licensed under the MIT License\\. See LICENSE in project root for information\\.", + "", + "package (?:com\\.microsoft\\.ml\\.spark|org\\.apache\\.spark)[.\n]") + .mkString("\n")), + ("regex", true)), + r("FI", "IndentationChecker", ("tabSize", 2), ("methodParamIndentSize", 2)), + r("FE", "WhitespaceEndOfLineChecker"), + r("SE", "SpacesAfterPlusChecker"), + r("SE", "SpacesBeforePlusChecker"), + r("SE", "NoWhitespaceBeforeLeftBracketChecker"), + r("SE", "NoWhitespaceAfterLeftBracketChecker"), + r("SE", "EmptyClassChecker"), + r("SW", "EnsureSingleSpaceAfterTokenChecker", ("tokens", "COLON")), // IF, FOR, WHILE, ELSE -- one or more + r("SE", "EnsureSingleSpaceBeforeTokenChecker"), // what is this doing? 
+ r("SW", "DisallowSpaceAfterTokenChecker"), + r("SE", "DisallowSpaceBeforeTokenChecker"), + r("SE", "ClassNamesChecker", ("regex", "^[A-Z][A-Za-z0-9]*$")), + r("SE", "ObjectNamesChecker", ("regex", "^[A-Za-z][A-Za-z0-9]*$")), // allow function-like names + r("SE", "PackageObjectNamesChecker", ("regex", "^[a-z][A-Za-z]*$")), + // this matches the first token after a `val`, which might be `(` in case of val (x, y) = ... + r("SW", "FieldNamesChecker", ("regex", "^([a-z][A-Za-z0-9]*| *\\( *)$")), + r("SW", "MethodNamesChecker", ("regex", "^[a-z][A-Za-z0-9]*(_=)?$")), + r("SE", "ClassTypeParameterChecker", ("regex", "^[A-Z_]$")), + r("SE", "EqualsHashCodeChecker"), + r("SE", "IllegalImportsChecker", ("illegalImports", "sun._")), + r("SE", "DeprecatedJavaChecker"), + r("SE", "ParameterNumberChecker", ("maxParameters", 9)), + r("SW", "MethodLengthChecker", ("maxLength", 50)), + r("SE", "NumberOfTypesChecker", ("maxTypes", 30)), + r("SE", "NumberOfMethodsInTypeChecker", ("maxMethods", 30)), + r("SE", "NumberOfTypesChecker"), + r("SW", "CyclomaticComplexityChecker", ("maximum", 10)), + r("SE", "PublicMethodsHaveTypeChecker"), + r("sSW", "MagicNumberChecker", ("ignore", "-1,0,1,2,3")), + r("SE", "UppercaseLChecker"), + r("SE", "ProcedureDeclarationChecker"), + r("SE", "RedundantIfChecker"), + r("SW", "WhileChecker"), + r("SW", "ReturnChecker"), + r("SW", "NullChecker"), + r("SE", "NoCloneChecker"), + r("SE", "NoFinalizeChecker"), + r("SE", "StructuralTypeChecker"), + r("SE", "CovariantEqualsChecker"), + r("SE", "NonASCIICharacterChecker"), + // looks like this doesn't work + r("SE", "ImportOrderChecker", ("groups" , "our,scala,java,other"), + ("our", "com.microsoft.ml.spark[.].+"), ("scala", "scala[.].+"), ("java", "java[.].+"), ("other", ".+")), + r("SE", "SimplifyBooleanExpressionChecker"), + r("SW", "NotImplementedErrorUsage") + // r("SE", "ScalaDocChecker") <-- use when we add scaladoc + // Rules that are not used: + // VarLocalChecker: needed in some places + // VarFieldChecker: -"- + // BlockImportChecker: we want to be able to name specific imports... + // ImportGroupingChecker: ... and be able to import in the middle of code + // UnderscoreImportChecker: ... and use _ wildcards + // NoNewLineAtEofChecker: want a newline there + // ForBraceChecker: "for {...} yield ..." looks fine, but "for { ... } { ... 
}" looks messy + // XmlLiteralChecker: maybe it'll be useful + // LowercasePatternMatchChecker: Lots of places where it's fine + // MultipleStringLiteralsChecker: applies even in interpolation parts + // PatternMatchAlignChecker: Looks like it's wrong anyway + // SpaceAfterCommentStartChecker: rejects the popular "//TODO:" + // TodoCommentChecker: at least for now we need them + ) + + val conf = file(".") / "scalastyle-config.xml" + + def modes = Map(' ' -> null, 's' -> "src", 't' -> "test") + def prefixes = Map('F' -> "file", 'S' -> "scalariform") + def levels = Map('E' -> "error", 'W' -> "warning", 'I' -> null) + def r(flags: String, name: String, params: (String, Any)*) = { + val f3 = if (flags.length < 3) " "+flags else flags + (modes(f3(0)), s"org.scalastyle.${prefixes(f3(1))}.${name}", levels(f3(2)), params) + } + + def mkRule(curmode: String)(rule: (String, String, String, Seq[(String,Any)])): String = { + val (mode, name, level, params) = rule + if (level == null || (mode != null && curmode != mode)) return null + val paramStr = + if (params.isEmpty) "" + else ("" + + params.map(p => "\n ") + .mkString("") + + "") + s"""$paramStr""" + } + def mkConfig(mode: String): String = + s""" + | Scalastyle Module Configuration ($mode) + | ${rules.map(mkRule(mode)).filter(_ != null).mkString("\n ")} + | + |""".stripMargin + + def commands = Seq( + Command.command("run-scalastyle") { st => + Extras.addCommands(st, "run-scalastyle-on src", "run-scalastyle-on test") + }, + Command.single("scalastyle-make-config") { (st, mode) => + scala.tools.nsc.io.File(conf).writeAll(mkConfig(mode)) + st + }, + Command.single("run-scalastyle-on") { (st, mode) => + val cmd = (if (mode == "src") "" else mode + ":") + "scalastyle" + Extras.addCommands(st, s"scalastyle-make-config $mode", + s"noisy-command on-all-subs $cmd", + "scalastyle-delete-config") + }, + Command.command("scalastyle-delete-config") { st => conf.delete; st } + ) + +} diff --git a/src/readers/build.sbt b/src/readers/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/readers/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/readers/src/main/python/BinaryFileReader.py b/src/readers/src/main/python/BinaryFileReader.py new file mode 100644 index 0000000000..7cbac9205c --- /dev/null +++ b/src/readers/src/main/python/BinaryFileReader.py @@ -0,0 +1,52 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
+ +import sys + +if sys.version >= '3': + basestring = str + +import pyspark +from pyspark import SparkContext +from pyspark import sql +from pyspark.ml.param.shared import * +from pyspark.sql import DataFrame +from pyspark.sql.types import * + +BinaryFileFields = ["path", "bytes"] + +BinaryFileSchema = StructType([ + StructField(BinaryFileFields[0], StringType(), True), + StructField(BinaryFileFields[1], BinaryType(), True) ]) + +def readBinaryFiles(self, path, recursive = False, sampleRatio = 1.0, inspectZip = True): + """ + Reads the directory of binary files from the local or remote (WASB) source + + :param str path: Path to the file directory + :param bool recursive: Recursive search flag + :param double sampleRatio: Fraction of the files loaded into the dataframe + :return: DataFrame with a single column "value"; see binaryFileSchema for details + :rtype: DataFrame + """ + ctx = SparkContext.getOrCreate() + reader = ctx._jvm.com.microsoft.ml.spark.BinaryFileReader + sql_ctx = pyspark.SQLContext.getOrCreate(ctx) + jsession = sql_ctx.sparkSession._jsparkSession + jresult = reader.read(path, recursive, jsession, float(sampleRatio), inspectZip) + return DataFrame(jresult, sql_ctx) + +setattr(sql.SparkSession, 'readBinaryFiles', classmethod(readBinaryFiles)) + +def isBinaryFile(df, column): + """ + Returns True if the column contains binary files + + :param DataFrame df: The DataFrame to be processed + :param bool column: The name of the column being inspected + :return: True if the colum is a binary files column + :rtype: bool + """ + ctx = SparkContext.getOrCreate() + schema = ctx._jvm.com.microsoft.ml.spark.schema.BinaryFileSchema + return schema.isBinaryFile(df._jdf, column) diff --git a/src/readers/src/main/python/ImageReader.py b/src/readers/src/main/python/ImageReader.py new file mode 100644 index 0000000000..176326b1f6 --- /dev/null +++ b/src/readers/src/main/python/ImageReader.py @@ -0,0 +1,50 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys + +if sys.version >= '3': + basestring = str + +import pyspark +from pyspark import SparkContext +from pyspark import sql +from pyspark.ml.param.shared import * +from pyspark.sql import DataFrame + +def readImages(sparkSession, path, recursive = False, sampleRatio = 1.0, inspectZip = True): + """ + Reads the directory of images from the local or remote (WASB) source. + This function is attached to SparkSession class. + Example: spark.readImages(path, recursive, ...) 
+ + :param SparkSession sparkSession: Existing sparkSession + :param str path: Path to the image directory + :param bool recursive: Recursive search flag + :param double sampleRatio: Fraction of the images loaded into dataframe + :param bool inspectZip: Look for images inside zip files + :return: DataFrame with a single column of "images", see imageSchema for details + :rtype: DataFrame + """ + ctx = SparkContext.getOrCreate() + reader = ctx._jvm.com.microsoft.ml.spark.ImageReader + sql_ctx = pyspark.SQLContext.getOrCreate(ctx) + jsession = sql_ctx.sparkSession._jsparkSession + jresult = reader.read(path, recursive, jsession, float(sampleRatio), inspectZip) + return DataFrame(jresult, sql_ctx) + +setattr(sql.SparkSession, 'readImages', classmethod(readImages)) + +def isImage(df, column): + """ + Returns True if the column contains images + + :param DataFrame df: The DataFrame to be processed + :param bool column: The name of the column being inspected + :return: True if the colum is an image column + :rtype: bool + """ + + jvm = SparkContext.getOrCreate()._jvm + schema = jvm.com.microsoft.ml.spark.schema.ImageSchema + return schema.isImage(df._jdf, column) diff --git a/src/readers/src/main/scala/AzureBlobReader.scala b/src/readers/src/main/scala/AzureBlobReader.scala new file mode 100644 index 0000000000..39a8830e7d --- /dev/null +++ b/src/readers/src/main/scala/AzureBlobReader.scala @@ -0,0 +1,72 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark +import com.microsoft.ml.spark.FileFormat.FileFormat +import java.util.NoSuchElementException +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} +import scala.util.parsing.json._ + +object AzureBlobReader { + def read (accountName: String, accountKey: String, containerName: String, filePath: String, + fileFormat: String, hasHeader: Boolean): DataFrame = { + val spark = SparkSession.builder.getOrCreate() + val fileFormatEnum = ReaderUtils.getFileFormat(fileFormat) + + // Register the credential + if (!ReaderUtils.isNullOrEmpty(accountKey)) { + val config = spark.sparkContext.hadoopConfiguration + val azureAccountKeyPrefix = "fs.azure.account.key." 
+ val azureAccountKeyPostfix = ".blob.core.windows.net" + config.set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem") + config.set(azureAccountKeyPrefix.concat(accountName).concat(azureAccountKeyPostfix), accountKey) + } + + // Generate the url + var url: String = null + if (!ReaderUtils.isNullOrEmpty(containerName) && !ReaderUtils.isNullOrEmpty(accountName)) { + val urlPrefix = "wasbs://" + val urlPostfix = ".blob.core.windows.net/" + url = urlPrefix.concat(containerName).concat("@").concat(accountName).concat(urlPostfix).concat(filePath) + } else { + val urlPrefix = "wasbs:///" + url = urlPrefix.concat(filePath) + } + + // Populate the options + val options = ReaderUtils.getOptionsForBlobReader(fileFormatEnum, true, hasHeader) + + // Get the file format + var format = fileFormatEnum.toString + if (format == "tsv") { + format = "csv" + } + + spark.read.format(format).options(options).load(url) + } + + def read2 (jsonStr: String): DataFrame = { + val parsedJsonStr = JSON.parseFull(jsonStr) + var accountName = "" + var accountKey = "" + var containerName = "" + var filePath = "" + var fileFormat = "" + var hasHeader = false; + try { + hasHeader = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("hasHeader").asInstanceOf[Boolean] + fileFormat = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("fileFormat").asInstanceOf[String] + filePath = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("filePath").asInstanceOf[String] + containerName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("containerName").asInstanceOf[String] + accountKey = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("accountKey").asInstanceOf[String] + accountName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("accountName").asInstanceOf[String] + } catch { + case ex: NoSuchElementException => { + throw new IllegalArgumentException("parameter not found or invalid Json format detected in the input.") + } + } + + read(accountName, accountKey, containerName, filePath, fileFormat, hasHeader) + } +} diff --git a/src/readers/src/main/scala/AzureSQLReader.scala b/src/readers/src/main/scala/AzureSQLReader.scala new file mode 100644 index 0000000000..f9dc56c2e1 --- /dev/null +++ b/src/readers/src/main/scala/AzureSQLReader.scala @@ -0,0 +1,53 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
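A minimal sketch of calling the JSON entry point of AzureBlobReader above; the account, key, container, and path values are placeholders, and the keys mirror those extracted in read2:

    val df = AzureBlobReader.read2(
      """{"accountName": "myaccount", "accountKey": "<storage-key>",
          "containerName": "mycontainer", "filePath": "data/sample.csv",
          "fileFormat": "csv", "hasHeader": true}""")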
+ +package com.microsoft.ml.spark + +import java.util.NoSuchElementException +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} +import scala.util.parsing.json.JSON + +object AzureSQLReader { + def read(serverName: String, databaseName: String, query: String, userName: String, password: String): DataFrame = { + // val spark = SQLContext.getOrCreate(null) + val spark = SparkSession.builder.getOrCreate() + + // Convert query to subquery + val subQueryPrefix = "(" + val subQueryPostfix = ") AS mmlTempTable123" + val subQuery = subQueryPrefix.concat(query).concat(subQueryPostfix) + println(subQuery) + val driver = "com.microsoft.sqlserver.jdbc.SQLServerDriver" + val urlPrefix = "jdbc:sqlserver://" + val urlPostfix = ".database.windows.net" + val url = urlPrefix.concat(serverName).concat(urlPostfix) + val options = Map("url" -> url, "databaseName" -> databaseName, "driver" -> driver, "dbtable" -> subQuery, + "user" -> userName, "password" -> password) + + spark.read.format("jdbc").options(options).load() + } + + def read2 (jsonStr: String): DataFrame = { + val parsedJsonStr = JSON.parseFull(jsonStr) + var serverName= "" + var databaseName = "" + var query = "" + var userName = "" + var password = "" + + try { + password = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("password").asInstanceOf[String] + userName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("userName").asInstanceOf[String] + query = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("query").asInstanceOf[String] + databaseName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("databaseName").asInstanceOf[String] + serverName = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("serverName").asInstanceOf[String] + } catch { + case ex: NoSuchElementException => { + throw new IllegalArgumentException("parameter not found or invalid Json format detected in the input.") + } + } + + read(serverName, databaseName, query, userName, password) + } +} diff --git a/src/readers/src/main/scala/BinaryFileReader.scala b/src/readers/src/main/scala/BinaryFileReader.scala new file mode 100644 index 0000000000..69451ac7df --- /dev/null +++ b/src/readers/src/main/scala/BinaryFileReader.scala @@ -0,0 +1,79 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
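A minimal sketch for AzureSQLReader above (server, database, table, and credentials are placeholders); the query is wrapped into a subquery internally, so a plain SELECT statement is expected:

    val df = AzureSQLReader.read(
      serverName   = "myserver",
      databaseName = "mydb",
      query        = "SELECT TOP 100 * FROM dbo.Flights",
      userName     = "<user>",
      password     = "<password>")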
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.BinaryFileSchema +import org.apache.spark.input.PortableDataStream +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + +import scala.language.existentials +import com.microsoft.ml.spark.FileUtilities.{ZipIterator} +import com.microsoft.ml.spark.hadoop.{SamplePathFilter, RecursiveFlag} + +object BinaryFileReader { + + //single column of images named "image" + private val binaryDFSchema = StructType(StructField("value", BinaryFileSchema.columnSchema, true) :: Nil) + + /** + * Read the directory of images from the local or remote source + * + * @param path Path to the image directory + * @param recursive Recursive search flag + * @return Dataframe with a single column of "images", see imageSchema for details + */ + private[spark] def readRDD(path: String, recursive: Boolean, spark: SparkSession, + sampleRatio: Double, inspectZip: Boolean) + : RDD[(String, Array[Byte])] = { + + require(sampleRatio <= 1.0 && sampleRatio >= 0, "sampleRatio should be between 0 and 1") + + val oldRecursiveFlag = RecursiveFlag.setRecursiveFlag(Some(recursive.toString), spark) + val oldPathFilter: Option[Class[_]] = + if (sampleRatio < 1) + SamplePathFilter.setPathFilter(Some(classOf[SamplePathFilter]), Some(sampleRatio), Some(inspectZip), spark) + else + None + + var data: RDD[(String, Array[Byte])] = null + try { + val streams = spark.sparkContext.binaryFiles(path, spark.sparkContext.defaultParallelism) + + // Create files RDD and load bytes + data = if(!inspectZip) { + streams.mapValues((stream: PortableDataStream) => stream.toArray) + } + else{ + // if inspectZip is enabled, examine/sample the contents of zip files + streams.flatMap({ case (filename: String, stream: PortableDataStream) => + if (SamplePathFilter.isZipFile(filename)) { + new ZipIterator(stream, filename, sampleRatio) + } + else { + Some((filename, stream.toArray)) + } + }) + } + } + finally { + // return Hadoop flag to its original value + RecursiveFlag.setRecursiveFlag(oldRecursiveFlag, spark = spark) + SamplePathFilter.setPathFilter(oldPathFilter, spark = spark) + () + } + + data + } + + def read(path: String, recursive: Boolean, spark: SparkSession, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = { + val rowRDD = readRDD(path, recursive, spark, sampleRatio, inspectZip) + .map({row:(String, Array[Byte]) => Row(Row(row._1, row._2))}) + + spark.createDataFrame(rowRDD, binaryDFSchema) + } +} + diff --git a/src/readers/src/main/scala/FileFormat.scala b/src/readers/src/main/scala/FileFormat.scala new file mode 100644 index 0000000000..d7ead617e4 --- /dev/null +++ b/src/readers/src/main/scala/FileFormat.scala @@ -0,0 +1,12 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +object FileFormat extends Enumeration { + type FileFormat = Value + val Csv = Value("csv") + val Tsv = Value("tsv") + val Json = Value("json") + val Parquet = Value("parquet") +} diff --git a/src/readers/src/main/scala/ImageReader.scala b/src/readers/src/main/scala/ImageReader.scala new file mode 100644 index 0000000000..3594dbcb7d --- /dev/null +++ b/src/readers/src/main/scala/ImageReader.scala @@ -0,0 +1,63 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
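A minimal usage sketch of the Scala BinaryFileReader above, assuming an active SparkSession named spark and an illustrative path; the result has a single "value" column whose "path" and "bytes" sub-fields follow BinaryFileSchema:

    val files = BinaryFileReader.read("wasb:///data/images", recursive = true,
                                      spark = spark, sampleRatio = 0.5, inspectZip = true)
    files.select("value.path").show()      // sub-fields are accessible directly in SQL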
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.schema.ImageSchema +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.opencv.core.{Core, MatOfByte} +import org.opencv.imgcodecs.Imgcodecs + +object ImageReader { + + //single column of images named "image" + private val imageDFSchema = StructType(StructField("image", ImageSchema.columnSchema, true) :: Nil) + + /** + * Convert the image from compressd (jpeg, etc.) into OpenCV representation and store it in Row + * See ImageSchema for details. + * + * @param filename arbitrary string + * @param bytes image bytes (for example, jpeg) + * @return returns None if decompression fails + */ + private[spark] def decode(filename: String, bytes: Array[Byte]): Option[Row] = { + val mat = new MatOfByte(bytes: _*) + val decoded = Imgcodecs.imdecode(mat, Imgcodecs.CV_LOAD_IMAGE_COLOR) + + if (decoded.empty()) { + None + } else { + val ocvBytes = new Array[Byte](decoded.total.toInt * decoded.elemSize.toInt) + + // extract OpenCV bytes + decoded.get(0, 0, ocvBytes) + + // type: CvType.CV_8U + Some(Row(filename, decoded.height, decoded.width, decoded.`type`, ocvBytes)) + } + } + + /** + * Read the directory of images from the local or remote source + * + * @param path Path to the image directory + * @param recursive Recursive search flag + * @return Dataframe with a single column of "images", see imageSchema for details + */ + def read(path: String, recursive: Boolean, spark: SparkSession, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = { + + val binaryRDD = BinaryFileReader.readRDD(path, recursive, spark, sampleRatio, inspectZip) + val binaryRDDlib = ImageSchema.loadLibraryForAllPartitions(binaryRDD, Core.NATIVE_LIBRARY_NAME) + + val validImages = binaryRDDlib.flatMap { + case (filename, bytes) => { + decode(filename, bytes).map(x => Row(x)) + } + } + + spark.createDataFrame(validImages, imageDFSchema) + } +} diff --git a/src/readers/src/main/scala/ReaderUtils.scala b/src/readers/src/main/scala/ReaderUtils.scala new file mode 100644 index 0000000000..fa36d768f7 --- /dev/null +++ b/src/readers/src/main/scala/ReaderUtils.scala @@ -0,0 +1,47 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
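A minimal usage sketch of ImageReader above, assuming an active SparkSession named spark and an illustrative path; decoded images land in a single "image" column whose sub-fields follow ImageSchema:

    val images = ImageReader.read("wasb:///data/images", recursive = true, spark = spark)
    images.select("image.path", "image.height", "image.width").show()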
+ +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.FileFormat.FileFormat + +object ReaderUtils { + def isNullOrEmpty(str: String): Boolean = { + return str == null || str.trim.isEmpty + } + + def getFileFormat(str: String): FileFormat = { + if (isNullOrEmpty(str)) { + throw new IllegalArgumentException("str is invalid.") + } + + if (str.equalsIgnoreCase("csv")) { + return FileFormat.Csv + } else if (str.equalsIgnoreCase("tsv")) { + return FileFormat.Tsv + }else if (str.equalsIgnoreCase("json")) { + return FileFormat.Json + } else if (str.equalsIgnoreCase("parquet")) { + return FileFormat.Parquet + } else { + throw new IllegalArgumentException("str is not valid file format.") + } + } + + def getOptionsForBlobReader(fileFormat: FileFormat, inferSchema: Boolean, hasHeader: Boolean): Map[String, String] = { + var headerOpt = "false" + if (hasHeader) { + headerOpt = "true" + } + var schemaOpt = "false" + if (inferSchema) { + schemaOpt = "true" + } + var options = Map("inferSchema" -> schemaOpt, "header" -> headerOpt) + if (fileFormat == FileFormat.Tsv) { + options = options + ("delimiter" -> "\t") + } + + return options + } +} diff --git a/src/readers/src/main/scala/Readers.scala b/src/readers/src/main/scala/Readers.scala new file mode 100644 index 0000000000..a4ee2359ea --- /dev/null +++ b/src/readers/src/main/scala/Readers.scala @@ -0,0 +1,50 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.{DataFrame, SparkSession} +import scala.language.implicitConversions + +/** + * Implicit conversion allows sparkSession.readImages(...) syntax + * Example: + * import com.microsoft.ml.spark.Readers.implicits._ + * sparkSession.readImages(path, recursive = false) + */ +object Readers { + + object implicits { + + class Session(sparkSession: SparkSession) { + + /** + * + * @param path Path to the files directory + * @param recursive Recursive path search flag + * @param sampleRatio Fraction of the files loaded + * @param inspectZip Whether zip files are treated as directories + * @return Dataframe with a single column "value" of binary files, see BinaryFileSchema for details + */ + def readBinaryFiles(path: String, recursive: Boolean, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = + BinaryFileReader.read(path, recursive, sparkSession, sampleRatio, inspectZip) + + /** + * Read the directory of images from the local or remote source + * + * @param path Path to the image directory + * @param recursive Recursive path search flag + * @param sampleRatio Fraction of the files loaded + * @param inspectZip Whether zip files are treated as directories + * @return Dataframe with a single column "image" of images, see ImageSchema for details + */ + def readImages(path: String, recursive: Boolean, + sampleRatio: Double = 1, inspectZip: Boolean = true): DataFrame = + ImageReader.read(path, recursive, sparkSession, sampleRatio, inspectZip) + } + + implicit def ImplicitSession(sparkSession: SparkSession):Session = new Session(sparkSession) + + } +} diff --git a/src/readers/src/main/scala/WasbReader.scala b/src/readers/src/main/scala/WasbReader.scala new file mode 100644 index 0000000000..80d38b22bc --- /dev/null +++ b/src/readers/src/main/scala/WasbReader.scala @@ -0,0 +1,47 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
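The implicit conversion above attaches both readers directly to SparkSession. A minimal sketch, assuming an active session named spark and illustrative paths:

    import com.microsoft.ml.spark.Readers.implicits._

    val images = spark.readImages("wasb:///data/images", recursive = true, sampleRatio = 0.5)
    val files  = spark.readBinaryFiles("wasb:///data", recursive = false)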
+ +package com.microsoft.ml.spark + +import java.util.NoSuchElementException +import com.microsoft.ml.spark.FileFormat.FileFormat +import org.apache.spark.SparkContext +import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession} +import scala.util.parsing.json.JSON + +object WasbReader { + def read (url: String, fileFormat: String, hasHeader: Boolean): DataFrame = { + val spark = SparkSession.builder.getOrCreate() + val fileFormatEnum = ReaderUtils.getFileFormat(fileFormat) + + // Populate the options + val options = ReaderUtils.getOptionsForBlobReader(fileFormatEnum, true, hasHeader) + + // Get the file format + var format = fileFormatEnum.toString + if (format == "tsv") { + format = "csv" + } + + spark.read.format(format).options(options).load(url) + } + + def read2 (jsonStr: String): DataFrame = { + val parsedJsonStr = JSON.parseFull(jsonStr) + var url = "" + var fileFormat = "" + var hasHeader = false + + try { + hasHeader = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("hasHeader").asInstanceOf[Boolean] + fileFormat = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("fileFormat").asInstanceOf[String] + url = parsedJsonStr.get.asInstanceOf[Map[String, Any]]("url").asInstanceOf[String] + } catch { + case ex: NoSuchElementException => { + throw new IllegalArgumentException("parameter not found or invalid Json format detected in the input.") + } + } + + read(url, fileFormat, hasHeader) + } +} diff --git a/src/readers/src/test/scala/BinaryFileReaderSuite.scala b/src/readers/src/test/scala/BinaryFileReaderSuite.scala new file mode 100644 index 0000000000..b1d00f30e7 --- /dev/null +++ b/src/readers/src/test/scala/BinaryFileReaderSuite.scala @@ -0,0 +1,44 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import com.microsoft.ml.spark.schema.BinaryFileSchema.isBinaryFile +import com.microsoft.ml.spark.Readers.implicits._ +import FileReaderSuiteUtils._ + +class BinaryFileReaderSuite extends TestBase { + + test("binary dataframe") { + + val data = session.readBinaryFiles(groceriesDirectory, recursive = true) + + println(time { data.count }) + + assert(isBinaryFile(data, "value")) + + val paths = data.select("value.path") //make sure that SQL has access to the sub-fields + assert(paths.count == 31) //note that text file is also included + } + + test("sample ratio test") { + + val all = session.readBinaryFiles(groceriesDirectory, recursive = true, sampleRatio = 1.0) + val sampled = session.readBinaryFiles(groceriesDirectory, recursive = true, sampleRatio = 0.5) + val count = sampled.count + assert(count > 0 && count < all.count, "incorrect sampling behavior") + } + + test("with zip file") { + /* remove when datasets/Images is updated */ + creatZips + + val images = session.readBinaryFiles(imagesDirectory, recursive = true) + assert(images.count == 74) + + val images1 = session.readBinaryFiles(imagesDirectory, recursive = true, inspectZip = false) + assert(images1.count == 39) + } + +} diff --git a/src/readers/src/test/scala/ImageReaderSuite.scala b/src/readers/src/test/scala/ImageReaderSuite.scala new file mode 100644 index 0000000000..312b376b20 --- /dev/null +++ b/src/readers/src/test/scala/ImageReaderSuite.scala @@ -0,0 +1,75 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
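+// Illustrative sketch of the implicit reader syntax exercised by these test suites;
+// the SparkSession `spark` and the directory `dir` are hypothetical:
+//
+//   import com.microsoft.ml.spark.Readers.implicits._
+//   val binaries = spark.readBinaryFiles(dir, recursive = true)
+//   val images = spark.readImages(dir, recursive = true, inspectZip = false)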
+ +package com.microsoft.ml.spark + +import org.apache.spark.sql._ +import com.microsoft.ml.spark.schema.ImageSchema.isImage +import com.microsoft.ml.spark.schema.BinaryFileSchema.isBinaryFile +import org.apache.spark.input.PortableDataStream +import com.microsoft.ml.spark.Readers.implicits._ +import com.microsoft.ml.spark.FileUtilities._ + +object FileReaderSuiteUtils { + val fileLocation = s"${sys.env("DATASETS_HOME")}" + val imagesDirectory = fileLocation + "/Images" + val groceriesDirectory = imagesDirectory + "/Grocery" + val cifarDirectory = imagesDirectory + "/CIFAR" + + def createZip(directory: String): Unit ={ + val dir = new File(directory) + val zipfile = new File(directory + ".zip") + if(!zipfile.exists()) + zipFolder(dir, zipfile) + } + + def creatZips(): Unit ={ + createZip(groceriesDirectory) + createZip(cifarDirectory) + } +} + +import FileReaderSuiteUtils._ + +class ImageReaderSuite extends TestBase { + + test("image dataframe") { + + val images = session.readImages(groceriesDirectory, recursive = true) + + println(time { images.count }) + + assert(isImage(images, "image")) // make sure the column "images" exists and has the right type + + val paths = images.select("image.path") //make sure that SQL has access to the sub-fields + assert(paths.count == 30) + + val areas = images.select(images("image.width") * images("image.height")) //more complicated SQL statement + + println(s" area of image 1 ${areas.take(1)(0)}") + } + + test("with zip file") { + /* remove when datasets/Images is updated */ + creatZips + + val images = session.readImages(imagesDirectory, recursive = true) + assert(isImage(images, "image")) + assert(images.count == 72) + + val images1 = session.readImages(imagesDirectory, recursive = true, inspectZip = false) + assert(images1.count == 36) + } + + test("sample ratio test") { + + sc.hadoopConfiguration.set("mapreduce.input.fileinputformat.input.dir.recursive", "true") + + val f = sc.binaryFiles(groceriesDirectory) + println(time { f.count }) + + val images = session.readImages(groceriesDirectory, recursive = true, sampleRatio = 0.5) + println(time { images.count }) //the count changes depending on random number generator + } + +} diff --git a/src/summarize-data/build.sbt b/src/summarize-data/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/summarize-data/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/summarize-data/src/main/scala/SummarizeData.scala b/src/summarize-data/src/main/scala/SummarizeData.scala new file mode 100644 index 0000000000..d104c7d2d6 --- /dev/null +++ b/src/summarize-data/src/main/scala/SummarizeData.scala @@ -0,0 +1,189 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
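+// Illustrative usage sketch of the transformer defined below; the DataFrame `df` is hypothetical:
+//
+//   val summary = new SummarizeData().setBasic(true).setPercentiles(false)
+//   val stats = summary.transform(df)   // one output row of statistics per input column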
+ +package com.microsoft.ml.spark + +import org.apache.spark.ml.Transformer +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.storage.StorageLevel + +import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer + +trait SummarizeDataParams extends MMLParams { + + final val counts: BooleanParam = BooleanParam(this, "counts", "compute count statistics", true) + final def getCounts: Boolean = $(counts) + def setCounts(value: Boolean): this.type = set(counts, value) + + final val basic: BooleanParam = new BooleanParam(this, "basic", "compute basic statistics") + setDefault(basic, true) + final def getBasic: Boolean = $(basic) + def setBasic(value: Boolean): this.type = set(basic, value) + + final val sample: BooleanParam = new BooleanParam(this, "sample", "compute sample statistics") + setDefault(sample, true) + final def getSample: Boolean = $(sample) + def setSample(value: Boolean): this.type = set(sample, value) + + final val percentiles: BooleanParam = new BooleanParam(this, "percentiles", "compute percentiles") + setDefault(percentiles, true) + final def getPercentiles: Boolean = $(percentiles) + def setPercentiles(value: Boolean): this.type = set(percentiles, value) + + final val errorThreshold: DoubleParam = + new DoubleParam(this, "errorThreshold", "threshold for quantiles - 0 is exact") + setDefault(errorThreshold, 0.0) + final def getErrorThreshold: Double = $(errorThreshold) + def setErrorThreshold(value: Double): this.type = set(errorThreshold, value) + + protected def validateAndTransformSchema(schema: StructType): StructType = { + val columns = ListBuffer(SummarizeData.featureColumn) + if ($(counts)) columns ++= SummarizeData.countFields + if ($(basic)) columns ++= SummarizeData.basicFields + if ($(sample)) columns ++= SummarizeData.sampleFields + if ($(percentiles)) columns ++= SummarizeData.percentilesFields + StructType(columns) + } +} + +// UID should be overridden by driver for controlled identification at the DAG level +class SummarizeData(override val uid: String) + extends Transformer + with SummarizeDataParams { + + import SummarizeData.Statistic._ + + def this() = this(Identifiable.randomUID("SummarizeData")) + + def setStatistics(stats: List[Statistic]): Unit = ??? + + override def transform(dataset: Dataset[_]): DataFrame = { + + val df = dataset.toDF() + // Some of these statistics are bad to compute + df.persist(StorageLevel.MEMORY_ONLY) + + val subFrames = ListBuffer[DataFrame]() + if ($(counts)) subFrames += computeCounts(df) + if ($(basic)) subFrames += curriedBasic(df) + if ($(sample)) subFrames += sampleStats(df) + if ($(percentiles)) subFrames += curriedPerc(df) + + df.unpersist(false) + + val base = createJoinBase(df) + subFrames.foldLeft(base) { (z, dfi) => z.join(dfi, SummarizeData.featureColumnName) } + } + + def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + def copy(extra: ParamMap): SummarizeData = defaultCopy(extra) + + private def computeCounts = computeOnAll(computeCountsImpl, SummarizeData.countFields) + + private def computeCountsImpl(col: String, df: DataFrame): Array[Double] = { + val column = df.col(col) + val mExpr = isnan(column) || isnull(column) + val countMissings = df.where(mExpr).count().toDouble + // approxCount returns Long which > Double! 
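+    // approx_count_distinct yields a Long; it is read with getLong and converted to Double
+    // so that it fits the Array[Double] of results below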
+ val dExpr = approx_count_distinct(column) + val distinctCount = df.select(dExpr).first.getLong(0).toDouble + Array(df.count() - countMissings, distinctCount, countMissings) + } + + private def sampleStats = computeOnNumeric(sampleStatsImpl, SummarizeData.sampleFields) + + private def sampleStatsImpl(col: String, df: DataFrame): Array[Double] = { + val column = df.col(col) + val k = kurtosis(column) + val sk = skewness(column) + val v = variance(column) + val sd = stddev(column) + df.select(v, sd, sk, k).first.toSeq.map(_.asInstanceOf[Double]).toArray + } + + private def curriedBasic = { + val quants = SummarizeData.basicQuantiles + computeOnNumeric(quantStub(quants, $(errorThreshold)), SummarizeData.basicFields) + } + + private def curriedPerc = { + val quants = SummarizeData.percentilesQuantiles + computeOnNumeric(quantStub(quants, $(errorThreshold)), SummarizeData.percentilesFields) + } + + private def quantStub(vals: Array[Double], err: Double) = + (cn: String, df: DataFrame) => df.stat.approxQuantile(cn, vals, err) + + private def computeOnNumeric = computeColumnStats(sf => sf.dataType.isInstanceOf[NumericType]) _ + + private def computeOnAll = computeColumnStats(sf => true) _ + + private def allNaNs(l: Int): Array[Double] = Array.fill(l)(Double.NaN) + + private def createJoinBase(df: DataFrame) = computeColumnStats(sf => false)((cn, df) => Array(), List())(df) + + private def computeColumnStats + (p: StructField => Boolean) + (statFunc: (String, DataFrame) => Array[Double], newColumns: Seq[StructField]) + (df: DataFrame): DataFrame = { + val emptyRow = allNaNs(newColumns.length) + val outList = df.schema.map(col => (col.name, if (p(col)) statFunc(col.name, df) else emptyRow)) + val rows = outList.map { case (n, r) => Row.fromSeq(n +: r) } + val schema = SummarizeData.featureColumn +: newColumns + df.sparkSession.createDataFrame(rows.asJava, StructType(schema)) + } + +} + +object SummarizeData extends DefaultParamsReadable[SummarizeData] { + + object Statistic extends Enumeration { + type Statistic = Value + val Counts, Basic, Sample, Percentiles = Value + } + + final val featureColumnName = "Feature" + final val featureColumn = StructField(featureColumnName, StringType, false) + + final val percentilesQuantiles = Array(0.005, 0.01, 0.05, 0.95, 0.99, 0.995) + final val percentilesFields = List( + StructField("P0.5", DoubleType, true), + StructField("P1", DoubleType, true), + StructField("P5", DoubleType, true), + StructField("P95", DoubleType, true), + StructField("P99", DoubleType, true), + StructField("P99.5", DoubleType, true)) + + final val sampleFields = List( + StructField("Sample Variance", DoubleType, true), + StructField("Sample Standard Deviation", DoubleType, true), + StructField("Sample Skewness", DoubleType, true), + StructField("Sample Kurtosis", DoubleType, true)) + + final val basicQuantiles = Array(0, 0.25, 0.5, 0.75, 1) + final val basicFields = List( + StructField("Min", DoubleType, true), + StructField("1st Quartile", DoubleType, true), + StructField("Median", DoubleType, true), + StructField("3rd Quartile", DoubleType, true), + StructField("Max", DoubleType, true) + //TODO: StructField("Range", DoubleType, true), + //TODO: StructField("Mean", DoubleType, true), + //TODO: StructField("Mean Deviation", DoubleType, true), + // Mode is JSON Array of modes - needs a little special treatment + //TODO: StructField("Mode", StringType, true)) + ) + + final val countFields = List( + StructField("Count", DoubleType, false), + StructField("Unique Value Count", DoubleType, 
false), + StructField("Missing Value Count", DoubleType, false)) +} diff --git a/src/summarize-data/src/test/scala/SummarizeDataSuite.scala b/src/summarize-data/src/test/scala/SummarizeDataSuite.scala new file mode 100644 index 0000000000..db7b03fcaa --- /dev/null +++ b/src/summarize-data/src/test/scala/SummarizeDataSuite.scala @@ -0,0 +1,52 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions._ + +class SummarizeDataSuite extends TestBase { + + test("Smoke test for summarizing basic DF - schema transform") { + + val input = makeBasicDF() + val summary = new SummarizeData() + val result = summary.transformSchema(input.schema) + assert(result.length > 10) + } + + test("Smoke test for summary params") { + val s = new SummarizeData() + assert(s.params.length == 5) + assert(s.params.map(s.isSet).toSeq == (1 to s.params.length).map(i => false)) + + val sNoCounts = s.setCounts(false).setPercentiles(false) + assert(sNoCounts.params.map(sNoCounts.isSet).toSeq === Seq(false, true, false, true, false)) + } + + test("Smoke test for summarizing basic DF") { + val input = makeBasicDF() + val summary = new SummarizeData() + val result = summary.transform(input) + assert(result.count === input.columns.length) + assert(result.columns.length > 18) + } + + test("Smoke test for summarizing missings DF") { + val input = makeBasicNullableDF() + val summary = new SummarizeData() + val result = summary.transform(input) + assert(result.count === input.columns.length) + assert(result.columns.length > 18) + } + + test("Smoke test for subset summarizing missings DF") { + val input = makeBasicNullableDF() + val summary = new SummarizeData().setPercentiles(false).setCounts(false) + val result = summary.transform(input) + assert(result.count === input.columns.length) + assert(result.columns.length < 11) + } + +} diff --git a/src/text-featurizer/build.sbt b/src/text-featurizer/build.sbt new file mode 100644 index 0000000000..47a5d9cbee --- /dev/null +++ b/src/text-featurizer/build.sbt @@ -0,0 +1,2 @@ +//> DependsOn: core +//> DependsOn: utils diff --git a/src/text-featurizer/src/main/scala/TextFeaturizer.scala b/src/text-featurizer/src/main/scala/TextFeaturizer.scala new file mode 100644 index 0000000000..d97fd3b946 --- /dev/null +++ b/src/text-featurizer/src/main/scala/TextFeaturizer.scala @@ -0,0 +1,442 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.util.NoSuchElementException + +import org.apache.hadoop.fs.Path +import org.apache.spark.ml.feature._ +import org.apache.spark.ml._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.util._ +import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode} +import org.apache.spark.sql.types._ +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.attribute.AttributeGroup + +trait TextFeaturizerParams extends MMLParams{ + + //Tokenizer Params + /** @group param */ + val useTokenizer = BooleanParam(this, "useTokenizer", "Whether to tokenize the input") + + /** @group getParam **/ + final def getUseTokenizer: Boolean = $(useTokenizer) + + /** @group param */ + val tokenizerGaps = BooleanParam( + this, + "tokenizerGaps", + "Indicates whether regex splits on gaps (true) or matches tokens (false)." 
+ ) + + /** @group getParam **/ + final def getTokenizerGaps: Boolean = $(tokenizerGaps) + + /** @group param */ + val minTokenLength = IntParam(this, "minTokenLength", "Minimum token length, >= 0.") + + /** @group getParam **/ + final def getMinTokenLength: Int = $(minTokenLength) + + /** @group param */ + val tokenizerPattern = StringParam( + this, + "tokenizerPattern", + "Regex pattern used to match delimiters if gaps is true or tokens if gaps is false.") + + /** @group getParam **/ + final def getTokenizerPattern: String = $(tokenizerPattern) + + /** @group param */ + val toLowercase = BooleanParam( + this, + "toLowercase", + "Indicates whether to convert all characters to lowercase before tokenizing.") + + /** @group getParam **/ + final def getToLowercase: Boolean = $(toLowercase) + + //Stop Word Remover Params + /** @group param */ + val useStopWordsRemover = BooleanParam(this, + "useStopWordsRemover", + "Whether to remove stop words from tokenized data") + + /** @group getParam **/ + final def getUseStopWordsRemover: Boolean = $(useStopWordsRemover) + + val caseSensitiveStopWords = BooleanParam( + this, + "caseSensitiveStopWords", + "Whether to do a case-sensitive comparison over the stop words") + + final def getCaseSensitiveStopWords: Boolean = $(caseSensitiveStopWords) + + val defaultStopWordLanguage = StringParam(this, + "defaultStopWordLanguage", + "Which language to use for the stop word remover," + + " set this to 'custom' to use the stopWords input") + + final def getDefaultStopWordLanguage: String = $(defaultStopWordLanguage) + + val stopWords = StringParam(this, "stopWords", "The words to be filtered out.") + + final def getStopWords: String = $(stopWords) + + //Ngram Params + /** @group param */ + val useNGram = BooleanParam(this, "useNGram", "Whether to enumerate N-grams") + + /** @group getParam **/ + final def getUseNGram: Boolean = $(useNGram) + + /** @group param */ + val nGramLength = IntParam(this, "nGramLength", "The size of the N-grams") + + /** @group getParam **/ + final def getNGramLength: Int = $(nGramLength) + + //HashingTF Params + /** @group param */ + val binary = BooleanParam( + this, + "binary", + "If true, all nonzero word counts are set to 1") + + /** @group getParam **/ + final def getBinary: Boolean = $(binary) + + /** @group param */ + val numFeatures = IntParam( + this, + "numFeatures", + "Set the number of features to hash each document to") + + /** @group getParam **/ + final def getNumFeatures: Int = $(numFeatures) + + //IDF Params + /** @group param */ + val useIDF = BooleanParam( + this, + "useIDF", + "Whether to scale the Term Frequencies by IDF") + + /** @group getParam **/ + final def getUseIDF: Boolean = $(useIDF) + + /** @group param */ + val minDocFreq = IntParam( + this, + "minDocFreq", + "The minimum number of documents in which a term should appear.") + + /** @group getParam **/ + final def getMinDocFreq: Int = $(minDocFreq) + +} + +object TextFeaturizer extends DefaultParamsReadable[TextFeaturizer] + +class TextFeaturizer(override val uid: String) + extends Estimator[TextFeaturizerModel] + with TextFeaturizerParams with HasInputCol with HasOutputCol { + def this() = this(Identifiable.randomUID("TextFeaturizer")) + + setDefault(outputCol, uid + "__output") + + def setUseTokenizer(value: Boolean): this.type = set(useTokenizer, value) + + setDefault(useTokenizer -> true) + + /** @group setParam **/ + def setTokenizerGaps(value: Boolean): this.type = set(tokenizerGaps, value) + + setDefault(tokenizerGaps -> true) + + /** @group
setParam **/ + def setMinTokenLength(value: Int): this.type = set(minTokenLength, value) + + setDefault(minTokenLength -> 0) + + /** @group setParam **/ + def setTokenizerPattern(value: String): this.type = + set(tokenizerPattern, value) + + setDefault(tokenizerPattern -> "\\s+") + + /** @group setParam **/ + def setToLowercase(value: Boolean): this.type = set(toLowercase, value) + + setDefault(toLowercase -> true) + + /** @group setParam **/ + def setUseStopWordsRemover(value: Boolean): this.type = + set(useStopWordsRemover, value) + + setDefault(useStopWordsRemover -> false) + + /** @group setParam **/ + def setCaseSensitiveStopWords(value: Boolean): this.type = + set(caseSensitiveStopWords, value) + + setDefault(caseSensitiveStopWords -> false) + + /** @group setParam **/ + def setDefaultStopWordLanguage(value: String): this.type = + set(defaultStopWordLanguage, value) + + setDefault(defaultStopWordLanguage -> "english") + + /** @group setParam **/ + def setStopWords(value: String): this.type = set(stopWords, value) + + /** @group setParam **/ + def setUseNGram(value: Boolean): this.type = set(useNGram, value) + + /** @group setParam **/ + def setNGramLength(value: Int): this.type = set(nGramLength, value) + + setDefault(useNGram -> false, nGramLength -> 2) + + /** @group setParam **/ + def setBinary(value: Boolean): this.type = set(binary, value) + + /** @group setParam **/ + def setNumFeatures(value: Int): this.type = set(numFeatures, value) + + setDefault(numFeatures -> (1 << 18), binary -> false) + + /** @group setParam **/ + def setUseIDF(value: Boolean): this.type = set(useIDF, value) + + /** @group setParam **/ + def setMinDocFreq(value: Int): this.type = set(minDocFreq, value) + + setDefault(useIDF -> true, minDocFreq -> 1) + + private def setParamInternal[M <: PipelineStage, T](model: M, + name: String, + value: T) = { + model.set(model.getParam(name), value) + } + + private def getParamInternal[M <: PipelineStage, T](model: M, name: String) = { + model.getOrDefault(model.getParam(name)) + } + + override def fit(dataset: Dataset[_]): TextFeaturizerModel = { + try { + getUseTokenizer + } catch { + case e: NoSuchElementException => setUseTokenizer(needsTokenizer(dataset.schema)) + } + + transformSchema(dataset.schema) + var models: List[PipelineStage] = Nil + if (getUseTokenizer) + models ::= new RegexTokenizer() + .setGaps(getTokenizerGaps) + .setPattern(getTokenizerPattern) + .setMinTokenLength(getMinTokenLength) + .setToLowercase(getToLowercase) + if (getUseStopWordsRemover) { + val swr = + new StopWordsRemover().setCaseSensitive(getCaseSensitiveStopWords) + if (getDefaultStopWordLanguage == "custom") { + models ::= swr.setStopWords(getStopWords.split(",")) + } else { + models ::= swr.setStopWords( + StopWordsRemover.loadDefaultStopWords(getDefaultStopWordLanguage)) + } + } + if (getUseNGram) + models ::= new NGram().setN(getNGramLength) + models ::= new HashingTF() + .setBinary(getBinary) + .setNumFeatures(getNumFeatures) + if (getUseIDF) + models ::= new IDF().setMinDocFreq(getMinDocFreq) + models = models.reverse + + val chainedModels = models + .zip(0 to models.length) + .map( + { pair: (PipelineStage, Int) => + val model = pair._1 + val i = pair._2 + if (i == 0) { + setParamInternal(model, "inputCol", getInputCol) + } else if (i < models.length - 1) { + setParamInternal(model, + "inputCol", + getParamInternal(models(i - 1), "outputCol")) + } else { + val m1 = + setParamInternal(model, + "inputCol", + getParamInternal(models(i - 1), "outputCol")) + setParamInternal(m1, 
"outputCol", getOutputCol) + } + } + ) + val colsToDrop = chainedModels.reverse.tail + .map(getParamInternal(_, "outputCol").asInstanceOf[String]) + val fitPipeline = + new Pipeline().setStages(chainedModels.toArray).fit(dataset) + new TextFeaturizerModel(uid, fitPipeline, colsToDrop).setParent(this) + } + + override def copy(extra: ParamMap): Estimator[TextFeaturizerModel] = + defaultCopy(extra) + + def transformSchema(schema: StructType): StructType = { + val inputType = schema($(inputCol)).dataType + validateInputType(inputType) + if (schema.fieldNames.contains($(outputCol))) { + throw new IllegalArgumentException( + s"Output column ${$(outputCol)} already exists.") + } + val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) + appendColumn(schema, attrGroup.toStructField()) + } + + private def needsTokenizer(schema: StructType) = { + val inputType = schema($(inputCol)).dataType + inputType == StringType + } + + private def validateInputType(inputType: DataType) = { + if (getUseTokenizer) { + if (inputType == ArrayType(StringType)) { + require( + inputType == StringType, + s"Input type must be string type but got $inputType. " + + s"It looks like your data is already tokenized, Try with useTokenizer=False") + } + require(inputType == StringType, + s"Input type must be string type but got $inputType.") + } else if (getUseNGram) { + if (inputType == StringType) { + require( + inputType == ArrayType(StringType), + s"Input type must be Array[string] type but got $inputType. " + + s"It looks like your data not tokenized, Try with useTokenizer=True") + } + require( + inputType == ArrayType(StringType), + s"Input type must be Array(String) type type but got $inputType.") + } else { + if (inputType == StringType) { + require( + inputType.isInstanceOf[ArrayType], + s"Input type must be Array[_] type but got $inputType. 
" + + s"It looks like your data not tokenized, Try with useTokenizer=True") + } + require(inputType.isInstanceOf[ArrayType], + s"Input type must be Array(_) type type but got $inputType.") + } + } + + private def appendColumn(schema: StructType, col: StructField): StructType = { + require(!schema.fieldNames.contains(col.name), + s"Column ${col.name} already exists.") + StructType(schema.fields :+ col) + } +} + +class TextFeaturizerModel(val uid: String, + fitPipeline: PipelineModel, + colsToDrop: List[String]) + extends Model[TextFeaturizerModel] with MLWritable { + + override def write: MLWriter = new TextFeaturizerModel.TextFeaturizerModelWriter(uid, fitPipeline, colsToDrop) + + override def copy(extra: ParamMap): TextFeaturizerModel = defaultCopy(extra) + + override def transform(dataset: Dataset[_]): DataFrame = { + colsToDrop.foldRight(fitPipeline.transform(dataset))((col, df) => + df.drop(col)) + } + + override def transformSchema(schema: StructType): StructType = + colsToDrop.foldRight(fitPipeline.transformSchema(schema))((col, schema) => + StructType(schema.drop(schema.fieldIndex(col)))) +} + +object TextFeaturizerModel extends MLReadable[TextFeaturizerModel] { + + private val fitPipelinePart = "fitPipeline" + private val colsToDropPart = "colsToDrop" + private val dataPart = "data" + + override def read: MLReader[TextFeaturizerModel] = new TextFeaturizerModelReader + + override def load(path: String): TextFeaturizerModel = super.load(path) + + /** [[MLWriter]] instance for [[TextFeaturizerModel]] */ + private[TextFeaturizerModel] + class TextFeaturizerModelWriter(val uid: String, + val fitPipeline: PipelineModel, + val colsToDrop: List[String]) + extends MLWriter { + + private case class Data(uid: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + TextFeaturizerModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + val dataPath = new Path(qualPath, dataPart).toString + + // Save data + val data = Data(uid) + + // save the columns to drop + ObjectUtilities.writeObject(colsToDrop, qualPath, colsToDropPart, sc, overwrite) + + // save the pipeline + val fitPipelinePath = new Path(qualPath, fitPipelinePart).toString + val fitPipelineWriter = + if (overwrite) fitPipeline.write.overwrite() + else fitPipeline.write + fitPipelineWriter.save(fitPipelinePath) + + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class TextFeaturizerModelReader + extends MLReader[TextFeaturizerModel] { + override def load(path: String): TextFeaturizerModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid and one hot encoding param + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String) = data.select("uid").head() + + // load the fit pipeline + val fitPipelinePath = new Path(qualPath, fitPipelinePart).toString + val fitPipeline = PipelineModel.load(fitPipelinePath) + + // load the columns to drop + val colsToDrop = ObjectUtilities.loadObject[List[String]](qualPath, colsToDropPart, sc) + + new TextFeaturizerModel(uid, fitPipeline, colsToDrop) + } + } + +} + diff --git 
a/src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala b/src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala new file mode 100644 index 0000000000..c1018de873 --- /dev/null +++ b/src/text-featurizer/src/test/scala/TextFeaturizerSpec.scala @@ -0,0 +1,86 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import org.apache.spark.ml.feature.{NGram, Tokenizer} +import com.microsoft.ml.spark.schema.DatasetExtensions._ +import org.apache.spark.ml.Estimator +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types.{StringType, StructField, StructType} + +class TextFeaturizerSpec extends EstimatorFuzzingTest { + val dfRaw = session + .createDataFrame(Seq((0, "Hi I"), + (1, "I wish for snow today"), + (2, "we Cant go to the park, because of the snow!"), + (3, ""))) + .toDF("label", "sentence") + val dfTok = new Tokenizer() + .setInputCol("sentence") + .setOutputCol("tokens") + .transform(dfRaw) + val dfNgram = + new NGram().setInputCol("tokens").setOutputCol("ngrams").transform(dfTok) + + test("operate on sentences,tokens,or ngrams") { + val tfRaw = new TextFeaturizer() + .setInputCol("sentence") + .setOutputCol("features") + .setNumFeatures(20) + val tfTok = new TextFeaturizer() + .setUseTokenizer(false) + .setInputCol("tokens") + .setOutputCol("features") + .setNumFeatures(20) + val tfNgram = new TextFeaturizer() + .setUseTokenizer(false) + .setUseNGram(false) + .setInputCol("ngrams") + .setOutputCol("features") + .setNumFeatures(20) + + val dfRaw2 = tfRaw.fit(dfRaw).transform(dfRaw) + val dfTok2 = tfTok.fit(dfTok).transform(dfTok) + val dfNgram2 = tfNgram.fit(dfNgram).transform(dfNgram) + + val linesRaw = dfRaw2.getSVCol("features") + val linesTok = dfTok2.getSVCol("features") + val linesNgram = dfNgram2.getSVCol("features") + + assert(linesRaw.length == 4) + assert(linesTok.length == 4) + assert(linesNgram.length == 4) + assert(linesRaw(0)(0) == 0.9162907318741551) + assert(linesTok(1)(9) == 0.5108256237659907) + assert(linesNgram(2)(7) == 1.8325814637483102) + assert(linesNgram(3)(1) == 0.0) + } + + test("throw errors if the schema is incorrect") { + val tfRaw = new TextFeaturizer() + .setUseTokenizer(true) + .setInputCol("sentence") + .setOutputCol("features") + .setNumFeatures(20) + val tfTok = new TextFeaturizer() + .setUseTokenizer(false) + .setInputCol("tokens") + .setOutputCol("features") + .setNumFeatures(20) + assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens"), dfTok) + assertSparkException[IllegalArgumentException](tfRaw.setInputCol("ngrams"), dfNgram) + assertSparkException[IllegalArgumentException](tfTok.setInputCol("sentence"), dfRaw) + assertSparkException[IllegalArgumentException](tfRaw.setInputCol("tokens_incorrect"), dfTok) + assertSparkException[IllegalArgumentException](tfRaw.setOutputCol("tokens"), dfTok) + } + + val inputCol = "text" + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = + estimator.asInstanceOf[TextFeaturizer].setInputCol(inputCol) + + override def getEstimator(): Estimator[_] = new TextFeaturizer() + + override def schemaForDataset: StructType = new StructType(Array(StructField(inputCol, StringType, false))) +} diff --git a/src/train-classifier/build.sbt b/src/train-classifier/build.sbt new file mode 100644 index 0000000000..f0037af8bd --- /dev/null +++ b/src/train-classifier/build.sbt @@ -0,0 +1,3 @@ +//> DependsOn: core +//> DependsOn: 
utils +//> DependsOn: featurize diff --git a/src/train-classifier/src/main/scala/TrainClassifier.scala b/src/train-classifier/src/main/scala/TrainClassifier.scala new file mode 100644 index 0000000000..509cb1875c --- /dev/null +++ b/src/train-classifier/src/main/scala/TrainClassifier.scala @@ -0,0 +1,367 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.util.UUID + +import com.microsoft.ml.spark.schema._ +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.EstimatorParam +import org.apache.spark.ml.util._ +import org.apache.spark.ml._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types.{DoubleType, StructField, StructType} + +/** + * Trains a classification model. + */ +class TrainClassifier(override val uid: String) extends Estimator[TrainedClassifierModel] + with HasLabelCol with MMLParams { + + def this() = this(Identifiable.randomUID("TrainClassifier")) + + val model = new EstimatorParam(this, "model", "Classifier to run") + + def getModel: Estimator[_ <: Model[_]] = $(model) + /** @group setParam **/ + def setModel(value: Estimator[_ <: Model[_]]): this.type = set(model, value) + + val featuresColumn = this.uid + "_features" + + val numFeatures = IntParam(this, "numFeatures", "number of features to hash to", 0) + def getNumFeatures: Int = $(numFeatures) + def setNumFeatures(value: Int): this.type = set(numFeatures, value) + + val indexLabel = BooleanParam(this, "indexLabel", "index the label column", true) + def getIndexLabel: Boolean = $(indexLabel) + def setIndexLabel(value: Boolean): this.type = set(indexLabel, value) + + /** + * Fits the classification model. + * + * @param dataset The input dataset to train. + * @return The trained classification model. 
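+ * @example Illustrative sketch; the DataFrame `training` and its "Label" column are hypothetical:
+ * {{{
+ * val trained = new TrainClassifier()
+ *   .setModel(new LogisticRegression())
+ *   .setLabelCol("Label")
+ *   .fit(training)
+ * }}}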
+ */ + override def fit(dataset: Dataset[_]): TrainedClassifierModel = { + val labelColumn = getLabelCol + val indexLabelFeaturize = getIndexLabel + var levels: Option[Array[_]] = None + var oneHotEncodeCategoricals = true + var modifyInputLayer = false + + // Convert label column to categorical on train, remove rows with missing labels + val convertedLabelDataset = if (indexLabelFeaturize) { + val dataframe = dataset.toDF().na.drop(Seq(labelColumn)) + if (!SparkSchema.isCategorical(dataframe, labelColumn)) { + val categoricalLabelDataset = SparkSchema.makeCategorical(dataframe, labelColumn, labelColumn, true) + levels = CategoricalUtilities.getLevels(categoricalLabelDataset.schema, labelColumn) + categoricalLabelDataset.withColumn(labelColumn, + categoricalLabelDataset(labelColumn).cast(DoubleType).as(labelColumn, + categoricalLabelDataset.schema(labelColumn).metadata)) + } else { + levels = CategoricalUtilities.getLevels(dataframe.schema, labelColumn) + dataframe + } + } else { + dataset.na.drop(Seq(labelColumn)) + } + + // Create trainer based on the pipeline stage and set the parameters + val numFeatures: Int = getModel match { + case _: DecisionTreeClassifier | _: GBTClassifier | _: RandomForestClassifier => + oneHotEncodeCategoricals = false + FeaturizeUtilities.numFeaturesTreeOrNNBased + case _: MultilayerPerceptronClassifier => + modifyInputLayer = true + FeaturizeUtilities.numFeaturesTreeOrNNBased + case _ => + FeaturizeUtilities.numFeaturesDefault + } + + var classifier: Estimator[_ <: PipelineStage] = getModel match { + case logisticRegressionClassifier: LogisticRegression => { + if (indexLabelFeaturize && levels.isDefined && levels.get.length > 2) { + new OneVsRest() + .setClassifier( + logisticRegressionClassifier + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn)) + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn) + } else { + logisticRegressionClassifier + } + } + case gradientBoostedTreesClassifier: GBTClassifier => { + if (indexLabelFeaturize && levels.isDefined && levels.get.length > 2) { + throw new Exception("Multiclass Gradient Boosted Tree Classifier not supported yet") + } else { + gradientBoostedTreesClassifier + } + } + case default @ defaultType if defaultType.isInstanceOf[Estimator[_ <: PipelineStage]] => { + default + } + case _ => throw new Exception("Unsupported learner type " + getModel.getClass.toString) + } + + classifier = classifier match { + case predictor: Predictor[_, _, _] => { + predictor + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn).asInstanceOf[Estimator[_ <: PipelineStage]] + } + case default @ defaultType if defaultType.isInstanceOf[Estimator[_ <: PipelineStage]] => { + // assume label col and features col already set + default + } + } + + val featuresToHashTo = + if (getNumFeatures != 0) { + getNumFeatures + } else { + numFeatures + } + + val featureColumns = convertedLabelDataset.columns.filter(col => col != labelColumn).toSeq + + val featurizer = new Featurize() + .setFeatureColumns(Map(featuresColumn -> featureColumns)) + .setOneHotEncodeCategoricals(oneHotEncodeCategoricals) + .setNumberOfFeatures(featuresToHashTo) + val featurizedModel = featurizer.fit(convertedLabelDataset) + val processedData = featurizedModel.transform(convertedLabelDataset) + + processedData.cache() + + // For neural network, need to modify input layer so it will automatically work during train + if (modifyInputLayer) { + val multilayerPerceptronClassifier = classifier.asInstanceOf[MultilayerPerceptronClassifier] + val row = 
processedData.take(1)(0) + val featuresVector = row.get(row.fieldIndex(featuresColumn)) + val vectorSize = featuresVector.asInstanceOf[org.apache.spark.ml.linalg.Vector].size + multilayerPerceptronClassifier.getLayers(0) = vectorSize + multilayerPerceptronClassifier.setLayers(multilayerPerceptronClassifier.getLayers) + } + + // Train the learner + val fitModel = classifier.fit(processedData) + + processedData.unpersist() + + // Note: The fit shouldn't do anything here + val pipelineModel = new Pipeline().setStages(Array(featurizedModel, fitModel)).fit(convertedLabelDataset) + new TrainedClassifierModel(uid, labelColumn, pipelineModel, levels, featuresColumn) + } + + override def copy(extra: ParamMap): Estimator[TrainedClassifierModel] = defaultCopy(extra) + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = { + val hasScoreCols = + $(model) match { + case _: GBTClassifier => false + case _: MultilayerPerceptronClassifier => false + case _ => true + } + TrainClassifier.validateTransformSchema(hasScoreCols, schema) + } +} + +object TrainClassifier extends DefaultParamsReadable[TrainClassifier] { + def validateTransformSchema(hasScoreCols: Boolean, schema: StructType): StructType = { + val scoresSchema = + if (hasScoreCols) { + StructType(schema.fields :+ StructField(SchemaConstants.ScoresColumn, DoubleType)) + } else schema + val probSchema = + if (hasScoreCols) { + StructType(scoresSchema.fields :+ StructField(SchemaConstants.ScoredProbabilitiesColumn, DoubleType)) + } else scoresSchema + StructType(probSchema.fields :+ StructField(SchemaConstants.ScoredLabelsColumn, DoubleType)) + } +} + +/** + * Model produced by [[TrainClassifier]]. + */ +class TrainedClassifierModel(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val levels: Option[Array[_]], + val featuresColumn: String) + extends Model[TrainedClassifierModel] with MLWritable { + + override def write: MLWriter = new TrainedClassifierModel.TrainClassifierModelWriter(uid, + labelColumn, + model, + levels, + featuresColumn) + + override def copy(extra: ParamMap): TrainedClassifierModel = + new TrainedClassifierModel(uid, + labelColumn, + model.copy(extra), + levels, + featuresColumn) + + override def transform(dataset: Dataset[_]): DataFrame = { + val hasScoreCols = hasScoreColumns(model.stages.last) + + // re-featurize and score the data + val scoredData = model.transform(dataset) + + // Drop the vectorized features column + val cleanedScoredData = scoredData.drop(featuresColumn) + + // Update the schema - TODO: create method that would generate GUID and add it to the scored model + val moduleName = SchemaConstants.ScoreModelPrefix + UUID.randomUUID().toString + val labelColumnExists = cleanedScoredData.columns.contains(labelColumn) + val schematizedScoredDataWithLabel = + if (labelColumnExists) { + SparkSchema.setLabelColumnName(cleanedScoredData, moduleName, labelColumn, SchemaConstants.ClassificationKind) + } else { + cleanedScoredData + } + + // Note: The GBT model does not have scores, only scored labels. Same for OneVsRest with any binary model. 
+ val schematizedScoredDataWithScores = + if (hasScoreCols) { + setMetadataForColumnName(SparkSchema.setScoredProbabilitiesColumnName, + SchemaConstants.SparkProbabilityColumn, + SchemaConstants.ScoredProbabilitiesColumn, + moduleName, + setMetadataForColumnName(SparkSchema.setScoresColumnName, + SchemaConstants.SparkRawPredictionColumn, + SchemaConstants.ScoresColumn, + moduleName, + schematizedScoredDataWithLabel)) + } else schematizedScoredDataWithLabel + + val scoredDataWithUpdatedScoredLabels = + setMetadataForColumnName(SparkSchema.setScoredLabelsColumnName, + SchemaConstants.SparkPredictionColumn, + SchemaConstants.ScoredLabelsColumn, + moduleName, + schematizedScoredDataWithScores) + + val scoredDataWithUpdatedScoredLevels = + if (levels.isEmpty) scoredDataWithUpdatedScoredLabels + else CategoricalUtilities.setLevels(scoredDataWithUpdatedScoredLabels, + SchemaConstants.ScoredLabelsColumn, + levels.get) + + // add metadata to the scored labels and true labels for the levels in label column + if (levels.isEmpty || !labelColumnExists) scoredDataWithUpdatedScoredLevels + else CategoricalUtilities.setLevels(scoredDataWithUpdatedScoredLevels, + labelColumn, + levels.get) + } + + private def setMetadataForColumnName(setter: (DataFrame, String, String, String) => DataFrame, + sparkColumnName: String, + mmlColumnName: String, + moduleName: String, + dataset: DataFrame): DataFrame = { + if (dataset.columns.contains(sparkColumnName)) { + setter(dataset.withColumnRenamed(sparkColumnName, mmlColumnName), + moduleName, + mmlColumnName, + SchemaConstants.ClassificationKind) + } else { + dataset + } + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = + TrainClassifier.validateTransformSchema(hasScoreColumns(model.stages.last), schema) + + def hasScoreColumns(model: Transformer): Boolean = { + model match { + case _: GBTClassificationModel => false + case _: MultilayerPerceptronClassificationModel => false + case _ => true + } + } + + def getParamMap: ParamMap = model.stages.last.extractParamMap() +} + +object TrainedClassifierModel extends MLReadable[TrainedClassifierModel] { + + private val featurizeModelPart = "featurizeModel" + private val modelPart = "model" + private val levelsPart = "levels" + private val dataPart = "data" + + override def read: MLReader[TrainedClassifierModel] = new TrainedClassifierModelReader + + override def load(path: String): TrainedClassifierModel = super.load(path) + + /** [[MLWriter]] instance for [[TrainedClassifierModel]] */ + private[TrainedClassifierModel] + class TrainClassifierModelWriter(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val levels: Option[Array[_]], + val featuresColumn: String) + extends MLWriter { + private case class Data(uid: String, labelColumn: String, featuresColumn: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + TrainedClassifierModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + + // save the model + val modelWriter = + if (overwrite) model.write.overwrite() + else model.write + modelWriter.save(new Path(qualPath, modelPart).toString) + + // save the levels + ObjectUtilities.writeObject(levels, qualPath, levelsPart, sc, overwrite) + + // save model data + val data = Data(uid, labelColumn, 
featuresColumn) + val dataPath = new Path(qualPath, dataPart).toString + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class TrainedClassifierModelReader + extends MLReader[TrainedClassifierModel] { + + override def load(path: String): TrainedClassifierModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid, label column and model name + val dataPath = new Path(qualPath, dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String, labelColumn: String, featuresColumn: String) = + data.select("uid", "labelColumn", "featuresColumn").head() + + // retrieve the underlying model + val model = PipelineModel.load(new Path(qualPath, modelPart).toString) + + // get the levels + val levels = ObjectUtilities.loadObject[Option[Array[_]]](qualPath, levelsPart, sc) + + new TrainedClassifierModel(uid, labelColumn, model, levels, featuresColumn) + } + } + +} diff --git a/src/train-classifier/src/test/scala/VerifyTrainClassifier.scala b/src/train-classifier/src/test/scala/VerifyTrainClassifier.scala new file mode 100644 index 0000000000..5a8674b3b4 --- /dev/null +++ b/src/train-classifier/src/test/scala/VerifyTrainClassifier.scala @@ -0,0 +1,560 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import FileUtilities._ +import com.microsoft.ml.spark.schema.{CategoricalUtilities, SchemaConstants, SparkSchema} + +import scala.collection.mutable.ArrayBuffer +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.classification._ +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics} +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.StructType + +object ClassifierTestUtils { + + def classificationTrainFile(name: String): File = + new File(s"${sys.env("DATASETS_HOME")}/Binary/Train", name) + + def multiclassClassificationTrainFile(name: String): File = + new File(s"${sys.env("DATASETS_HOME")}/Multiclass/Train", name) + +} + +/** + * Tests to validate the functionality of Train Classifier module. + */ +class VerifyTrainClassifier extends EstimatorFuzzingTest { + + val thisDirectory = new File("src/test/scala") + val targetDirectory = new File("target") + assert(thisDirectory.isDirectory, "-- the test should run in the sub-project root level") + val historicMetricsFile = new File(thisDirectory, "benchmarkMetrics.csv") + val benchmarkMetricsFile = new File(targetDirectory, s"newMetrics_${System.currentTimeMillis}_.csv") + + val LogisticRegressionClassifierName = "LogisticRegression" + val DecisionTreeClassifierName = "DecisionTreeClassification" + val RandomForestClassifierName = "RandomForestClassification" + val GradientBoostedTreesClassifierName = "GradientBoostedTreesClassification" + val NaiveBayesClassifierName = "NaiveBayesClassifier" + val MultilayerPerceptronClassifierName = "MultilayerPerceptronClassifier" + + val accuracyResults = ArrayBuffer.empty[String] + def addAccuracyResult(items: Any*): Unit = { + val line = items.map(_.toString).mkString(",") + println(s"... 
$line") + accuracyResults += line + () + } + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 4, 0.78, 0.99, 2), + (1, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (0, 3, 0.78, 0.99, 2), + (1, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Smoke test for training on a classifier") { + val dataset: DataFrame = createMockDataset + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, dataset, logisticRegressor) + } + + test("Verify you can score on a dataset without a label column") { + val dataset: DataFrame = createMockDataset + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = logisticRegressor.fit(trainData) + + model.transform(testData.drop(mockLabelColumn)) + } + + test("Verify train classifier works on a dataset with categorical columns") { + val cat = "Cat" + val dog = "Dog" + val bird = "Bird" + val dataset: DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, dog, cat), + (1, 3, 0.40, 0.50, cat, dog), + (0, 4, 0.78, 0.99, dog, bird), + (1, 5, 0.12, 0.34, cat, dog), + (0, 1, 0.50, 0.60, dog, bird), + (1, 3, 0.40, 0.50, bird, dog), + (0, 3, 0.78, 0.99, dog, cat), + (1, 4, 0.12, 0.34, cat, dog), + (0, 0, 0.50, 0.60, dog, cat), + (1, 2, 0.40, 0.50, bird, dog), + (0, 3, 0.78, 0.99, dog, bird), + (1, 4, 0.12, 0.34, cat, dog))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val catDataset = SparkSchema.makeCategorical( + SparkSchema.makeCategorical(dataset, "col4", "col4", false), + "col5", + "col5", + false) + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, catDataset, logisticRegressor) + + val randomForestClassifier = TrainClassifierTestUtilities.createRandomForestClassifier(mockLabelColumn) + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, catDataset, randomForestClassifier) + } + + test("Verify a trained classifier model can be saved and loaded") { + val dataset: DataFrame = createMockDataset + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + + val model = logisticRegressor.fit(dataset) + + val myModelName = "testModel" + lazy val dir = new File(myModelName) + try { + model.write.overwrite().save(myModelName) + // write a second time with overwrite flag, verify still works + model.write.overwrite().save(myModelName) + // assert directory exists + assert(dir.exists()) + + // load the model + val loadedModel = TrainedClassifierModel.load(myModelName) + + // verify model data loaded + assert(loadedModel.labelColumn == model.labelColumn) + assert(loadedModel.uid == model.uid) + val transformedDataset = loadedModel.transform(dataset) + val benchmarkDataset = model.transform(dataset) + assert(verifyResult(transformedDataset, benchmarkDataset)) + } finally { + // delete the file to cleanup + FileUtilities.delTree(dir) + () + } + } + + test("Verify you can train on a dataset that contains a vector column") { + val dataset: 
DataFrame = session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0, Vectors.dense(1.0, 0.1, -1.5)), + (1, 3, 0.40, 0.50, 1, Vectors.dense(1.5, 0.2, -1.2)), + (0, 4, 0.78, 0.99, 2, Vectors.dense(1.3, 0.3, -1.1)), + (1, 5, 0.12, 0.34, 3, Vectors.sparse(3, Seq((0, 1.0), (2, 2.0)))), + (0, 1, 0.50, 0.60, 0, Vectors.dense(1.0, 0.4, -1.23)), + (1, 3, 0.40, 0.50, 1, Vectors.dense(1.1, 0.5, -1.024)), + (0, 3, 0.78, 0.99, 2, Vectors.dense(1.0, 0.1, -1.22)), + (1, 4, 0.12, 0.34, 3, Vectors.dense(Double.NaN, 0.2, -1.23)), + (0, 0, 0.50, 0.60, 0, Vectors.dense(0.5, 0.3, 1.0)), + (1, 2, 0.40, 0.50, 1, Vectors.dense(1.0, 0.4, -1.2)), + (0, 3, 0.78, 0.99, 2, Vectors.dense(0.7, 0.5, -1.1)), + (1, 4, 0.12, 0.34, 3, Vectors.dense(1.8, 0.1, 2.02)))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4", "col5") + + val logisticRegressor = TrainClassifierTestUtilities.createLogisticRegressor(mockLabelColumn) + TrainClassifierTestUtilities.trainScoreDataset(mockLabelColumn, dataset, logisticRegressor) + } + + verifyLearnerOnMulticlassCsvFile("abalone.csv", "Rings", 2, true) + // Has multiple columns with the same name. Spark doesn't seem to be able to handle that yet. + // verifyLearnerOnMulticlassCsvFile("arrhythmia.csv", "Arrhythmia") + verifyLearnerOnMulticlassCsvFile("BreastTissue.csv", "Class", 2, true) + verifyLearnerOnMulticlassCsvFile("CarEvaluation.csv", "Col7", 2, true) + // Getting "code generation" exceeded max size limit error + // verifyLearnerOnMulticlassCsvFile("mnist.train.csv", "Label") + // This works with 2.0.0, but on 2.1.0 it looks like it loops infinitely while leaking memory + // verifyLearnerOnMulticlassCsvFile("au3_25000.csv", "class", 2, true) + // This takes way too long for a gated build. Need to make it something like a p3 test case. + // verifyLearnerOnMulticlassCsvFile("Seattle911.train.csv", "Event Clearance Group") + + verifyLearnerOnBinaryCsvFile("PimaIndian.csv", "Diabetes mellitus", 2, true) + verifyLearnerOnBinaryCsvFile("data_banknote_authentication.csv", "class", 2, false) + verifyLearnerOnBinaryCsvFile("task.train.csv", "TaskFailed10", 2, true) + verifyLearnerOnBinaryCsvFile("breast-cancer.train.csv", "Label", 2, true) + verifyLearnerOnBinaryCsvFile("random.forest.train.csv", "#Malignant", 2, true) + verifyLearnerOnBinaryCsvFile("transfusion.csv", "Donated", 2, true) + // verifyLearnerOnBinaryCsvFile("au2_10000.csv", "class", 1) + verifyLearnerOnBinaryCsvFile("breast-cancer-wisconsin.csv", "Class", 2, true) + verifyLearnerOnBinaryCsvFile("fertility_Diagnosis.train.csv", "Diagnosis", 2, false) + verifyLearnerOnBinaryCsvFile("bank.train.csv", "y", 2, false) + verifyLearnerOnBinaryCsvFile("TelescopeData.csv", " Class", 2, false) + + test("Compare benchmark results file to generated file", TestBase.Extended){ + try writeFile(benchmarkMetricsFile, accuracyResults.mkString("\n") + "\n") + catch { + case e: java.io.IOException => throw new Exception("Not able to process benchmarks file") + } + val historicMetrics = readFile(historicMetricsFile, _.getLines.toList) + if (historicMetrics.length != accuracyResults.length) + throw new Exception(s"Mis-matching number of lines in new benchmarks file: $benchmarkMetricsFile") + for (((hist,acc),i) <- (historicMetrics zip accuracyResults).zipWithIndex) { + assert(hist == acc, + s"""Lines do not match on file comparison: + | $historicMetricsFile:$i: + | $hist + | $benchmarkMetricsFile:$i: + | $acc + |.""".stripMargin) + } + } + + def verifyLearnerOnBinaryCsvFile(fileName: String, + labelColumnName: String, + decimals: Int, + 
includeNaiveBayes: Boolean): Unit = { + test("Verify classifier can be trained and scored on " + fileName, TestBase.Extended) { + val fileLocation = ClassifierTestUtils.classificationTrainFile(fileName).toString + val (trainScoreResultLogisticRegression: DataFrame, + trainScoreResultDecisionTree: DataFrame, + trainScoreResultGradientBoostedTrees: Option[DataFrame], + trainScoreResultRandomForest: DataFrame, + trainScoreResultMultilayerPerceptron: Option[DataFrame], + trainScoreResultNaiveBayes: Option[DataFrame]) = + readAndScoreDataset(fileName, labelColumnName, fileLocation, true, includeNaiveBayes) + + // Evaluate and get auc, round to 2 decimals + val (aucLogisticRegression, prLogisticRegression) = + evalAUC(trainScoreResultLogisticRegression, labelColumnName, SchemaConstants.ScoresColumn, decimals) + + val (aucDecisionTree, prDecisionTree) = + evalAUC(trainScoreResultDecisionTree, labelColumnName, SchemaConstants.ScoresColumn, decimals) + + val (aucGradientBoostedTrees, prGradientBoostedTrees) = + evalAUC(trainScoreResultGradientBoostedTrees.get, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + + val (aucRandomForest, prRandomForest) = + evalAUC(trainScoreResultRandomForest, labelColumnName, SchemaConstants.ScoresColumn, decimals) + + val (aucMultilayerPerceptron, prMultilayerPerceptron) = + evalAUC(trainScoreResultMultilayerPerceptron.get, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + + addAccuracyResult(fileName, LogisticRegressionClassifierName, + aucLogisticRegression, prLogisticRegression) + addAccuracyResult(fileName, DecisionTreeClassifierName, + aucDecisionTree, prDecisionTree) + addAccuracyResult(fileName, GradientBoostedTreesClassifierName, + aucGradientBoostedTrees, prGradientBoostedTrees) + addAccuracyResult(fileName, RandomForestClassifierName, + aucRandomForest, prRandomForest) + addAccuracyResult(fileName, MultilayerPerceptronClassifierName, + aucMultilayerPerceptron, prMultilayerPerceptron) + if (includeNaiveBayes) { + val (aucNaiveBayes, prNaiveBayes) = + evalAUC(trainScoreResultNaiveBayes.get, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + addAccuracyResult(fileName, NaiveBayesClassifierName, + aucNaiveBayes, prNaiveBayes) + } + } + } + + def verifyLearnerOnMulticlassCsvFile(fileName: String, + labelColumnName: String, + decimals: Int, + includeNaiveBayes: Boolean): Unit = { + test("Verify classifier can be trained and scored on multiclass " + fileName, TestBase.Extended) { + val fileLocation = ClassifierTestUtils.multiclassClassificationTrainFile(fileName).toString + val (trainScoreResultLogisticRegression: DataFrame, + trainScoreResultDecisionTree: DataFrame, + trainScoreResultGradientBoostedTrees: Option[DataFrame], + trainScoreResultRandomForest: DataFrame, + trainScoreResultMultilayerPerceptron: Option[DataFrame], + trainScoreResultNaiveBayes: Option[DataFrame]) = + readAndScoreDataset(fileName, labelColumnName, fileLocation, false, includeNaiveBayes) + + // Evaluate and get accuracy, F1-Measure + val (accuracyLogisticRegression, f1LogisticRegression) = + evalMulticlass(trainScoreResultLogisticRegression, + labelColumnName, + SchemaConstants.ScoredLabelsColumn, + decimals) + + val (accuracyDecisionTree, f1DecisionTree) = + evalMulticlass(trainScoreResultDecisionTree, labelColumnName, SchemaConstants.ScoredLabelsColumn, decimals) + + val (accuracyRandomForest, f1RandomForest) = + evalMulticlass(trainScoreResultRandomForest, labelColumnName, SchemaConstants.ScoredLabelsColumn, decimals) + + 
addAccuracyResult(fileName, LogisticRegressionClassifierName, + accuracyLogisticRegression, f1LogisticRegression) + + addAccuracyResult(fileName, DecisionTreeClassifierName, + accuracyDecisionTree, f1DecisionTree) + + addAccuracyResult(fileName, RandomForestClassifierName, + accuracyRandomForest, f1RandomForest) + + if (includeNaiveBayes) { + val (accuracyNaiveBayes, f1NaiveBayes) = + evalMulticlass(trainScoreResultNaiveBayes.get, labelColumnName, SchemaConstants.ScoredLabelsColumn, decimals) + + addAccuracyResult(fileName, NaiveBayesClassifierName, + accuracyNaiveBayes, f1NaiveBayes) + } + } + } + + def readAndScoreDataset(fileName: String, + labelColumnName: String, + fileLocation: String, + includeNonProb: Boolean, + includeNaiveBayes: Boolean) + : (DataFrame, DataFrame, Option[DataFrame], DataFrame, Option[DataFrame], Option[DataFrame]) = { + // TODO: Add other file types for testing + val dataset: DataFrame = + session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("treatEmptyValuesAsNulls", "false") + .option("delimiter", if (fileName.endsWith(".csv")) "," else "\t") + .load(fileLocation) + val logisticRegressor = + TrainClassifierTestUtilities.createLogisticRegressor(labelColumnName) + + val decisionTreeClassifier = + TrainClassifierTestUtilities.createDecisionTreeClassifier(labelColumnName) + + val gradientBoostedTreesClassifier = + TrainClassifierTestUtilities.createGradientBoostedTreesClassifier(labelColumnName) + + val randomForestClassifier = + TrainClassifierTestUtilities.createRandomForestClassifier(labelColumnName) + + val multilayerPerceptronClassifier = + TrainClassifierTestUtilities.createMultilayerPerceptronClassifier(labelColumnName) + + val naiveBayesClassifier = + TrainClassifierTestUtilities.createNaiveBayesClassifier(labelColumnName) + + val trainScoreResultLogisticRegression = + TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, logisticRegressor) + + val trainScoreResultDecisionTree = + TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, decisionTreeClassifier) + + val trainScoreResultGradientBoostedTrees = + if (includeNonProb) { + Some(TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, gradientBoostedTreesClassifier)) + } + else None + + val trainScoreResultMultilayerPerceptron = + if (includeNonProb) { + Some(TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, multilayerPerceptronClassifier)) + } + else None + + val trainScoreResultNaiveBayes = + if (includeNaiveBayes) { + Some(TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, naiveBayesClassifier)) + } + else None + + val trainScoreResultRandomForest = + TrainClassifierTestUtilities.trainScoreDataset(labelColumnName, dataset, randomForestClassifier) + (trainScoreResultLogisticRegression, trainScoreResultDecisionTree, + trainScoreResultGradientBoostedTrees, trainScoreResultRandomForest, + trainScoreResultMultilayerPerceptron, trainScoreResultNaiveBayes) + } + + /** + * Get the auc and area over PR for the scored dataset. + * + * @param scoredDataset The scored dataset to evaluate. + * @param labelColumn The label column. + * @param predictionColumn The prediction column. + * @return The AUC for the scored dataset. 
+ */ + def evalAUC(scoredDataset: DataFrame, + labelColumn: String, + predictionColumn: String, + decimals: Int): (Double, Double) = { + // Get levels if categorical + val levels = CategoricalUtilities.getLevels(scoredDataset.schema, labelColumn) + if (levels.isEmpty) throw new Exception("Test unexpectedly received empty levels") + val levelsToIndexMap: Map[Any, Double] = levels.get.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + + val scoreAndLabels = + scoredDataset.select(col(predictionColumn), col(labelColumn)).na.drop().rdd.map { + case Row(prediction: Vector, label) => (prediction(1), levelsToIndexMap(label)) + case Row(prediction: Double, label) => (prediction, levelsToIndexMap(label)) + } + val metrics = new BinaryClassificationMetrics(scoreAndLabels) + val result = (round(metrics.areaUnderROC(), decimals), + round(metrics.areaUnderPR(), decimals)) + metrics.unpersist() + result + } + + /** + * Get the accuracy and f1-score from multiclass data. + * + * @param scoredDataset The scored dataset to evaluate. + * @param labelColumn The label column. + * @param predictionColumn The prediction column. + * @return The AUC for the scored dataset. + */ + def evalMulticlass(scoredDataset: DataFrame, + labelColumn: String, + predictionColumn: String, + decimals: Int): (Double, Double) = { + + // Get levels if categorical + val levels = CategoricalUtilities.getLevels(scoredDataset.schema, labelColumn) + if (levels.isEmpty) throw new Exception("Test unexpectedly received empty levels") + val levelsToIndexMap: Map[Any, Double] = levels.get.zipWithIndex.map(t => t._1 -> t._2.toDouble).toMap + + val scoreAndLabels = + scoredDataset.select(col(predictionColumn), col(labelColumn)).na.drop().rdd.map { + case Row(prediction: Vector, label) => (prediction(1), levelsToIndexMap(label)) + case Row(prediction: Double, label) => (prediction, levelsToIndexMap(label)) + } + val metrics = new MulticlassMetrics(scoreAndLabels) + val result = (round(metrics.accuracy, decimals), + round(metrics.weightedFMeasure, decimals)) + result + } + + /** + * Rounds the given metric to 2 decimals. + * @param metric The metric to round. + * @return The rounded metric. + */ + def round(metric: Double, decimals: Int): Double = { + BigDecimal(metric) + .setScale(decimals, BigDecimal.RoundingMode.HALF_UP).toDouble + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = + estimator.asInstanceOf[TrainClassifier].setModel(new LogisticRegression()).setLabelCol(mockLabelColumn) + + override def createFitDataset: DataFrame = createMockDataset + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new TrainClassifier() +} + +/** + * Test helper methods for Train Classifier module. 
+ */ +object TrainClassifierTestUtilities { + + def createLogisticRegressor(labelColumn: String): Estimator[TrainedClassifierModel] = { + val logisticRegression = new LogisticRegression() + .setRegParam(0.3) + .setElasticNetParam(0.8) + .setMaxIter(10) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(logisticRegression) + .set(trainClassifier.labelCol, labelColumn) + } + + def createDecisionTreeClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val decisionTreeClassifier = new DecisionTreeClassifier() + .setMaxBins(32) + .setMaxDepth(5) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(decisionTreeClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createGradientBoostedTreesClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val decisionTreeClassifier = new GBTClassifier() + .setMaxBins(32) + .setMaxDepth(5) + .setMaxIter(20) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setStepSize(0.1) + .setSubsamplingRate(1.0) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(decisionTreeClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createRandomForestClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val decisionTreeClassifier = new RandomForestClassifier() + .setMaxBins(32) + .setMaxDepth(5) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setNumTrees(20) + .setSubsamplingRate(1.0) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(decisionTreeClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createMultilayerPerceptronClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val layers = Array[Int](2, 5, 2) + val multilayerPerceptronClassifier = new MultilayerPerceptronClassifier() + .setLayers(layers) + .setBlockSize(1) + .setMaxIter(1) + .setTol(1e-6) + .setSeed(0L) + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(multilayerPerceptronClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def createNaiveBayesClassifier(labelColumn: String): Estimator[TrainedClassifierModel] = { + val naiveBayesClassifier = new NaiveBayes() + val trainClassifier = new TrainClassifier() + trainClassifier + .setModel(naiveBayesClassifier) + .set(trainClassifier.labelCol, labelColumn) + } + + def trainScoreDataset(labelColumn: String, dataset: DataFrame, + trainClassifier: Estimator[TrainedClassifierModel]): DataFrame = { + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = trainClassifier.fit(trainData) + + val scoredData = model.transform(testData) + scoredData + } + +} diff --git a/src/train-classifier/src/test/scala/benchmarkMetrics.csv b/src/train-classifier/src/test/scala/benchmarkMetrics.csv new file mode 100644 index 0000000000..0144c21824 --- /dev/null +++ b/src/train-classifier/src/test/scala/benchmarkMetrics.csv @@ -0,0 +1,68 @@ +abalone.csv,LogisticRegression,0.15,0.04 +abalone.csv,DecisionTreeClassification,0.25,0.22 +abalone.csv,RandomForestClassification,0.26,0.22 +abalone.csv,NaiveBayesClassifier,0.21,0.15 +BreastTissue.csv,LogisticRegression,0.43,0.29 +BreastTissue.csv,DecisionTreeClassification,0.59,0.58 +BreastTissue.csv,RandomForestClassification,0.57,0.52 +BreastTissue.csv,NaiveBayesClassifier,0.54,0.5 
+CarEvaluation.csv,LogisticRegression,0.7,0.58 +CarEvaluation.csv,DecisionTreeClassification,0.76,0.74 +CarEvaluation.csv,RandomForestClassification,0.76,0.7 +CarEvaluation.csv,NaiveBayesClassifier,0.74,0.69 +PimaIndian.csv,LogisticRegression,0.5,0.68 +PimaIndian.csv,DecisionTreeClassification,0.62,0.56 +PimaIndian.csv,GradientBoostedTreesClassification,0.68,0.68 +PimaIndian.csv,RandomForestClassification,0.83,0.72 +PimaIndian.csv,MultilayerPerceptronClassifier,0.5,0.68 +PimaIndian.csv,NaiveBayesClassifier,0.51,0.5 +data_banknote_authentication.csv,LogisticRegression,0.92,0.89 +data_banknote_authentication.csv,DecisionTreeClassification,0.98,0.97 +data_banknote_authentication.csv,GradientBoostedTreesClassification,0.98,0.98 +data_banknote_authentication.csv,RandomForestClassification,1.0,1.0 +data_banknote_authentication.csv,MultilayerPerceptronClassifier,0.7,0.74 +task.train.csv,LogisticRegression,0.5,0.57 +task.train.csv,DecisionTreeClassification,0.74,0.71 +task.train.csv,GradientBoostedTreesClassification,0.83,0.85 +task.train.csv,RandomForestClassification,0.9,0.8 +task.train.csv,MultilayerPerceptronClassifier,0.5,0.57 +task.train.csv,NaiveBayesClassifier,0.71,0.56 +breast-cancer.train.csv,LogisticRegression,0.99,0.98 +breast-cancer.train.csv,DecisionTreeClassification,0.96,0.96 +breast-cancer.train.csv,GradientBoostedTreesClassification,0.94,0.94 +breast-cancer.train.csv,RandomForestClassification,1.0,0.99 +breast-cancer.train.csv,MultilayerPerceptronClassifier,0.7,0.71 +breast-cancer.train.csv,NaiveBayesClassifier,0.96,0.96 +random.forest.train.csv,LogisticRegression,1.0,0.99 +random.forest.train.csv,DecisionTreeClassification,0.96,0.96 +random.forest.train.csv,GradientBoostedTreesClassification,0.95,0.95 +random.forest.train.csv,RandomForestClassification,0.99,0.99 +random.forest.train.csv,MultilayerPerceptronClassifier,0.62,0.67 +random.forest.train.csv,NaiveBayesClassifier,0.91,0.91 +transfusion.csv,LogisticRegression,0.5,0.62 +transfusion.csv,DecisionTreeClassification,0.68,0.51 +transfusion.csv,GradientBoostedTreesClassification,0.64,0.52 +transfusion.csv,RandomForestClassification,0.77,0.51 +transfusion.csv,MultilayerPerceptronClassifier,0.5,0.62 +transfusion.csv,NaiveBayesClassifier,0.71,0.61 +breast-cancer-wisconsin.csv,LogisticRegression,1.0,1.0 +breast-cancer-wisconsin.csv,DecisionTreeClassification,0.94,0.95 +breast-cancer-wisconsin.csv,GradientBoostedTreesClassification,0.93,0.95 +breast-cancer-wisconsin.csv,RandomForestClassification,1.0,0.99 +breast-cancer-wisconsin.csv,MultilayerPerceptronClassifier,0.5,0.66 +breast-cancer-wisconsin.csv,NaiveBayesClassifier,0.96,0.95 +fertility_Diagnosis.train.csv,LogisticRegression,0.5,0.56 +fertility_Diagnosis.train.csv,DecisionTreeClassification,0.65,0.18 +fertility_Diagnosis.train.csv,GradientBoostedTreesClassification,0.58,0.29 +fertility_Diagnosis.train.csv,RandomForestClassification,0.68,0.39 +fertility_Diagnosis.train.csv,MultilayerPerceptronClassifier,0.5,0.56 +bank.train.csv,LogisticRegression,0.5,0.56 +bank.train.csv,DecisionTreeClassification,0.53,0.25 +bank.train.csv,GradientBoostedTreesClassification,0.66,0.49 +bank.train.csv,RandomForestClassification,0.88,0.49 +bank.train.csv,MultilayerPerceptronClassifier,0.5,0.06 +TelescopeData.csv,LogisticRegression,0.5,0.68 +TelescopeData.csv,DecisionTreeClassification,0.62,0.58 +TelescopeData.csv,GradientBoostedTreesClassification,0.82,0.83 +TelescopeData.csv,RandomForestClassification,0.89,0.86 +TelescopeData.csv,MultilayerPerceptronClassifier,0.56,0.53 diff --git 
a/src/train-regressor/build.sbt b/src/train-regressor/build.sbt new file mode 100644 index 0000000000..5031f836f0 --- /dev/null +++ b/src/train-regressor/build.sbt @@ -0,0 +1,2 @@ +//> DependsOn: core +//> DependsOn: featurize diff --git a/src/train-regressor/src/main/scala/TrainRegressor.scala b/src/train-regressor/src/main/scala/TrainRegressor.scala new file mode 100644 index 0000000000..e94d921407 --- /dev/null +++ b/src/train-regressor/src/main/scala/TrainRegressor.scala @@ -0,0 +1,246 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.util.UUID +import com.microsoft.ml.spark.schema.{SchemaConstants, SparkSchema} +import org.apache.hadoop.fs.Path +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.ml.param._ +import org.apache.spark.ml.regression._ +import org.apache.spark.ml.util._ +import org.apache.spark.ml._ +import org.apache.spark.sql._ +import org.apache.spark.sql.types._ + +/** + * Trains a regression model. + */ +class TrainRegressor(override val uid: String) extends Estimator[TrainedRegressorModel] + with HasLabelCol with MMLParams { + + def this() = this(Identifiable.randomUID("TrainRegressor")) + + val model = new EstimatorParam(this, "model", "Regressor to run") + + def getModel: Estimator[_ <: Model[_]] = $(model) + /** @group setParam **/ + def setModel(value: Estimator[_ <: Model[_]]): this.type = set(model, value) + + val featuresColumn = this.uid + "_features" + + val numFeatures = IntParam(this, "numFeatures", "number of features to hash to", 0) + def getNumFeatures: Int = $(numFeatures) + def setNumFeatures(value: Int): this.type = set(numFeatures, value) + + /** + * Fits the regression model. + * + * @param dataset The input dataset to train. + * @return The trained regression model. 
+ */ + override def fit(dataset: Dataset[_]): TrainedRegressorModel = { + val labelColumn = getLabelCol + var oneHotEncodeCategoricals = true + + val numFeatures: Int = getModel match { + case _: DecisionTreeRegressor | _: GBTRegressor | _: RandomForestRegressor => + oneHotEncodeCategoricals = false + FeaturizeUtilities.numFeaturesTreeOrNNBased + case _ => + FeaturizeUtilities.numFeaturesDefault + } + + val regressor = getModel match { + case predictor: Predictor[_, _, _] => { + predictor + .setLabelCol(labelColumn) + .setFeaturesCol(featuresColumn).asInstanceOf[Estimator[_ <: PipelineStage]] + } + case default @ defaultType if defaultType.isInstanceOf[Estimator[_ <: PipelineStage]] => { + // assume label col and features col already set + default + } + case _ => throw new Exception("Unsupported learner type " + getModel.getClass.toString) + } + + val featuresToHashTo = + if (getNumFeatures != 0) { + getNumFeatures + } else { + numFeatures + } + + // TODO: Handle DateType, TimestampType and DecimalType for label + // Convert the label column during train to the correct type and drop missings + val convertedLabelDataset = dataset.withColumn(labelColumn, + dataset.schema(labelColumn).dataType match { + case _: IntegerType | + _: BooleanType | + _: FloatType | + _: ByteType | + _: LongType | + _: ShortType => { + dataset(labelColumn).cast(DoubleType) + } + case _: StringType => { + throw new Exception("Invalid type: Regressors are not able to train on a string label column: " + labelColumn) + } + case _: DoubleType => { + dataset(labelColumn) + } + case default => throw new Exception("Unknown type: " + default.typeName + ", for label column: " + labelColumn) + } + ).na.drop(Seq(labelColumn)) + + val featureColumns = convertedLabelDataset.columns.filter(col => col != labelColumn).toSeq + + val featurizer = new Featurize() + .setFeatureColumns(Map(featuresColumn -> featureColumns)) + .setOneHotEncodeCategoricals(oneHotEncodeCategoricals) + .setNumberOfFeatures(featuresToHashTo) + + val featurizedModel = featurizer.fit(convertedLabelDataset) + val processedData = featurizedModel.transform(convertedLabelDataset) + + processedData.cache() + + // Train the learner + val fitModel = regressor.fit(processedData) + + processedData.unpersist() + + // Note: The fit shouldn't do anything here + val pipelineModel = new Pipeline().setStages(Array(featurizedModel, fitModel)).fit(convertedLabelDataset) + new TrainedRegressorModel(uid, labelColumn, pipelineModel, featuresColumn) + } + + override def copy(extra: ParamMap): Estimator[TrainedRegressorModel] = defaultCopy(extra) + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = TrainRegressor.validateTransformSchema(schema) + +} + +object TrainRegressor extends DefaultParamsReadable[TrainRegressor] { + def validateTransformSchema(schema: StructType): StructType = { + StructType(schema.fields :+ StructField(SchemaConstants.ScoresColumn, DoubleType)) + } +} + +/** + * Model produced by [[TrainRegressor]]. 
+ */ +class TrainedRegressorModel(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val featuresColumn: String) + extends Model[TrainedRegressorModel] with MLWritable { + + override def write: MLWriter = new TrainedRegressorModel.TrainedRegressorModelWriter(uid, + labelColumn, + model, + featuresColumn) + + override def copy(extra: ParamMap): TrainedRegressorModel = + new TrainedRegressorModel(uid, + labelColumn, + model.copy(extra), + featuresColumn) + + override def transform(dataset: Dataset[_]): DataFrame = { + // re-featurize and score the data + val scoredData = model.transform(dataset) + + // Drop the vectorized features column + val cleanedScoredData = scoredData.drop(featuresColumn) + + // Update the schema - TODO: create method that would generate GUID and add it to the scored model + val moduleName = SchemaConstants.ScoreModelPrefix + UUID.randomUUID().toString + val labelColumnExists = cleanedScoredData.columns.contains(labelColumn) + val schematizedScoredDataWithLabel = + if (labelColumnExists) { + SparkSchema.setLabelColumnName(cleanedScoredData, moduleName, labelColumn, SchemaConstants.RegressionKind) + } else { + cleanedScoredData + } + + SparkSchema.setScoresColumnName( + schematizedScoredDataWithLabel.withColumnRenamed(SchemaConstants.SparkPredictionColumn, + SchemaConstants.ScoresColumn), + moduleName, + SchemaConstants.ScoresColumn, + SchemaConstants.RegressionKind) + } + + @DeveloperApi + override def transformSchema(schema: StructType): StructType = TrainRegressor.validateTransformSchema(schema) + + def getParamMap: ParamMap = model.stages.last.extractParamMap() +} + +object TrainedRegressorModel extends MLReadable[TrainedRegressorModel] { + + private val featurizeModelPart = "featurizeModel" + private val modelPart = "model" + private val dataPart = "data" + + override def read: MLReader[TrainedRegressorModel] = new TrainedRegressorModelReader + + override def load(path: String): TrainedRegressorModel = super.load(path) + + /** [[MLWriter]] instance for [[TrainedRegressorModel]] */ + private[TrainedRegressorModel] + class TrainedRegressorModelWriter(val uid: String, + val labelColumn: String, + val model: PipelineModel, + val featuresColumn: String) + extends MLWriter { + private case class Data(uid: String, labelColumn: String, featuresColumn: String) + + override protected def saveImpl(path: String): Unit = { + val overwrite = this.shouldOverwrite + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // Required in order to allow this to be part of an ML pipeline + PipelineUtilities.saveMetadata(uid, + TrainedRegressorModel.getClass.getName.replace("$", ""), + new Path(path, "metadata").toString, + sc, + overwrite) + // save the featurize model and regressor + val modelPath = new Path(qualPath, modelPart).toString + val modelWriter = + if (overwrite) model.write.overwrite() + else model.write + modelWriter.save(modelPath) + + // save model data + val data = Data(uid, labelColumn, featuresColumn) + val dataPath = new Path(qualPath, dataPart).toString + val saveMode = + if (overwrite) SaveMode.Overwrite + else SaveMode.ErrorIfExists + sparkSession.createDataFrame(Seq(data)).repartition(1).write.mode(saveMode).parquet(dataPath) + } + } + + private class TrainedRegressorModelReader + extends MLReader[TrainedRegressorModel] { + + override def load(path: String): TrainedRegressorModel = { + val qualPath = PipelineUtilities.makeQualifiedPath(sc, path) + // load the uid, label column and model name + val dataPath = new Path(qualPath, 
dataPart).toString + val data = sparkSession.read.format("parquet").load(dataPath) + val Row(uid: String, labelColumn: String, featuresColumn: String) = + data.select("uid", "labelColumn", "featuresColumn").head() + + // retrieve the underlying model + val model = PipelineModel.load(new Path(qualPath, modelPart).toString) + + new TrainedRegressorModel(uid, labelColumn, model, featuresColumn) + } + } + +} diff --git a/src/train-regressor/src/test/scala/VerifyTrainRegressor.scala b/src/train-regressor/src/test/scala/VerifyTrainRegressor.scala new file mode 100644 index 0000000000..c50b3a84c1 --- /dev/null +++ b/src/train-regressor/src/test/scala/VerifyTrainRegressor.scala @@ -0,0 +1,184 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.io.File + +import org.apache.spark.ml.Estimator +import org.apache.spark.ml.regression.{LinearRegression, RandomForestRegressor} +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.types._ + +/** + * Tests to validate the functionality of Train Regressor module. + */ +class VerifyTrainRegressor extends EstimatorFuzzingTest { + + val regressionTrainFilesDirectory = "/Regression/Train/" + + val mockLabelColumn = "Label" + + def createMockDataset: DataFrame = { + session.createDataFrame(Seq( + (0, 2, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 4, 0.78, 0.99, 2), + (3, 5, 0.12, 0.34, 3), + (0, 1, 0.50, 0.60, 0), + (1, 3, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3), + (0, 0, 0.50, 0.60, 0), + (1, 2, 0.40, 0.50, 1), + (2, 3, 0.78, 0.99, 2), + (3, 4, 0.12, 0.34, 3))) + .toDF(mockLabelColumn, "col1", "col2", "col3", "col4") + } + + test("Smoke test for training on a regressor") { + val dataset = createMockDataset + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(mockLabelColumn) + + TrainRegressorTestUtilities.trainScoreDataset(mockLabelColumn, dataset, linearRegressor) + } + + test("Verify you can score on a dataset without a label column") { + val dataset: DataFrame = createMockDataset + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(mockLabelColumn) + + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = linearRegressor.fit(trainData) + + model.transform(testData.drop(mockLabelColumn)) + } + + test("Verify train regressor works with different output types") { + val dataset = createMockDataset + val castLabelCol = "cast_" + mockLabelColumn + for (outputType <- + Seq(IntegerType, LongType, ByteType, BooleanType, FloatType, DoubleType, ShortType)) { + val modifiedDataset = dataset.withColumn(castLabelCol, dataset(mockLabelColumn).cast(outputType)) + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(castLabelCol) + TrainRegressorTestUtilities.trainScoreDataset(castLabelCol, modifiedDataset, linearRegressor) + } + } + + test("Verify a trained regression model can be saved") { + val dataset: DataFrame = createMockDataset + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(mockLabelColumn) + + val model = linearRegressor.fit(dataset) + + val myModelName = "testModel" + lazy val dir = new File(myModelName) + try { + model.write.overwrite().save(myModelName) + // write a second time with overwrite flag, verify still works + model.write.overwrite().save(myModelName) + // assert directory exists + assert(dir.exists()) + + // 
load the model + val loadedModel = TrainedRegressorModel.load(myModelName) + + // verify model data loaded + assert(loadedModel.labelColumn == model.labelColumn) + assert(loadedModel.uid == model.uid) + val transformedDataset = loadedModel.transform(dataset) + val benchmarkDataset = model.transform(dataset) + assert(verifyResult(transformedDataset, benchmarkDataset)) + } finally { + // delete the file to cleanup + FileUtilities.delTree(dir) + () + } + } + + test("Verify regressor can be trained and scored on airfoil_self_noise-train-csv") { + val fileLocation = + sys.env("DATASETS_HOME") + regressionTrainFilesDirectory + "airfoil_self_noise.train.csv" + val dataset = session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("delimiter", ",").option("treatEmptyValuesAsNulls", "false") + .load(fileLocation) + + val labelColumn = "Scaled sound pressure level" + + val linearRegressor = TrainRegressorTestUtilities.createLinearRegressor(labelColumn) + + TrainRegressorTestUtilities.trainScoreDataset(labelColumn, dataset, linearRegressor) + } + + test("Verify regressor can be trained and scored on CASP-train-csv") { + val fileLocation = + sys.env("DATASETS_HOME") + regressionTrainFilesDirectory + "CASP.train.csv" + val dataset = session.read.format("com.databricks.spark.csv") + .option("header", "true").option("inferSchema", "true") + .option("delimiter", ",").option("treatEmptyValuesAsNulls", "false") + .load(fileLocation) + + val labelColumn = "RMSD" + + val parameters = TrainRegressorTestUtilities.createRandomForestRegressor(labelColumn) + + TrainRegressorTestUtilities.trainScoreDataset(labelColumn, dataset, parameters) + } + + override def setParams(fitDataset: DataFrame, estimator: Estimator[_]): Estimator[_] = + estimator.asInstanceOf[TrainRegressor].setModel(new LinearRegression()).setLabelCol(mockLabelColumn) + + override def createFitDataset: DataFrame = createMockDataset + + override def schemaForDataset: StructType = ??? + + override def getEstimator(): Estimator[_] = new TrainRegressor() +} + +/** + * Test helper methods for Train Regressor module. 
+ */ +object TrainRegressorTestUtilities { + + def createLinearRegressor(labelColumn: String): Estimator[TrainedRegressorModel] = { + val linearRegressor = new LinearRegression() + .setRegParam(0.3) + .setElasticNetParam(0.8) + val trainRegressor = new TrainRegressor() + trainRegressor + .setModel(linearRegressor) + .set(trainRegressor.labelCol, labelColumn) + } + + def createRandomForestRegressor(labelColumn: String): Estimator[TrainedRegressorModel] = { + val linearRegressor = new RandomForestRegressor() + .setFeatureSubsetStrategy("auto") + .setMaxBins(32) + .setMaxDepth(5) + .setMinInfoGain(0.0) + .setMinInstancesPerNode(1) + .setNumTrees(20) + val trainRegressor = new TrainRegressor() + trainRegressor + .setModel(linearRegressor) + .set(trainRegressor.labelCol, labelColumn) + } + + def trainScoreDataset(labelColumn: String, dataset: DataFrame, trainRegressor: Estimator[TrainedRegressorModel]) + : DataFrame = { + val data = dataset.randomSplit(Seq(0.6, 0.4).toArray, 42) + val trainData = data(0) + val testData = data(1) + + val model = trainRegressor.fit(trainData) + val scoredData = model.transform(testData) + scoredData + } + +} diff --git a/src/utils/build.sbt b/src/utils/build.sbt new file mode 100644 index 0000000000..6d55f118b6 --- /dev/null +++ b/src/utils/build.sbt @@ -0,0 +1 @@ +//> DependsOn: core diff --git a/src/utils/src/main/scala/JarLoadingUtils.scala b/src/utils/src/main/scala/JarLoadingUtils.scala new file mode 100644 index 0000000000..06553105c5 --- /dev/null +++ b/src/utils/src/main/scala/JarLoadingUtils.scala @@ -0,0 +1,139 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import java.net.URLClassLoader +import java.util.jar.JarFile + +import FileUtilities._ + +import scala.reflect.ClassTag +import scala.reflect._ +import collection.JavaConverters._ + +/** + * Contains logic for loading classes + */ +object JarLoadingUtils { + + private val jarRelPath = "target/scala-" + sys.env("SCALA_VERSION") + private val testRelPath = "test-classes" + private val projectRoots = "project/project-roots.txt" + + private val outputDirs = { + val topDir = List(".", "..").find(root => new File(root, projectRoots).exists) + if (topDir.isEmpty) { + sys.error(s"Could not find roots file at $projectRoots") + } + val rootsFile = new File(topDir.get, projectRoots) + val roots = readFile(rootsFile, _.getLines.toList) + roots.map { root => + new File(new File(topDir.get, root), jarRelPath) + } + } + + private val testOutputDirs = { + outputDirs.flatMap(dir => { + val filePath = new File(dir, testRelPath) + if (filePath.exists()) { + Some(filePath) + } else { + None + } + }) + } + + private val jarFileLocs = outputDirs.flatMap(dir => + FileUtilities.allFiles(dir, file => file.getName.endsWith(".jar"))) + + private val testFileLocs = testOutputDirs.flatMap(dir => + FileUtilities.allFiles(dir, file => file.getName.endsWith(".class")) + .map(file => file.getCanonicalPath.replace(dir.getCanonicalPath + "/", ""))) + + private val jarURLs = jarFileLocs.map(_.toURI.toURL) + + val classLoader = new URLClassLoader(jarURLs.union(testOutputDirs + .map(file => new File(file.getCanonicalPath).toURI.toURL)).toArray, + this.getClass.getClassLoader) + + private lazy val loadedClasses: List[Class[_]] = { + val jarFiles = jarFileLocs.map(jf => new JarFile(jf.getAbsolutePath)) + try { + val classNames = jarFiles.flatMap(_.entries().asScala) + .filter(je => 
je.getName.endsWith(".class")) + .map(je => je.getName.replace("/", ".").stripSuffix(".class")) + classNames.map(name => classLoader.loadClass(name)) + } finally { + jarFiles.foreach(jf => jf.close()) + } + } + + private lazy val loadedTestClasses: List[Class[_]] = { + val classNames = testFileLocs.map(je => je.stripSuffix(".class").replace("/", ".")) + classNames.map(name => { + try { + classLoader.loadClass(name) + } catch { + case e: Throwable => { println(s"Encountered error $e when loading class"); null } + } + }).filter(_ != null) + } + + private def catchInstantiationErrors[T](clazz: Class[_], func: Function[Class[_], T], debug: Boolean): Option[T] = { + def log(message: String) = { + if (debug) println(message) + } + + try { + Some(func(clazz)) + } catch { + // Classes without default constructor + case ie: InstantiationException => + log(s"Could not generate wrapper without default constructor for " + + s"class ${clazz.getSimpleName}: $ie") + None + // Classes with "private" modifiers on constructors + case iae: IllegalAccessException => + log(s"Could not generate wrapper due to private modifiers or constructors for " + + s"class ${clazz.getSimpleName}: $iae") + None + case ncd: NoClassDefFoundError => + log(s"Could not generate wrapper because no class definition found for class " + + s"${clazz.getSimpleName}: $ncd") + None + case ule: UnsatisfiedLinkError => + log(s"Could not generate wrapper due to link error from: " + + s"${clazz.getSimpleName}: $ule") + None + case e: Exception => + log(s"Could not generate wrapper for class ${clazz.getSimpleName}: ${e.printStackTrace()}") + None + } + } + + def load[T: ClassTag](instantiate: Class[_] => Any, debug: Boolean): List[T] = { + loadedClasses.filter(lc => classTag[T].runtimeClass.isAssignableFrom(lc)).flatMap { lc => + catchInstantiationErrors(lc, instantiate, debug) + }.asInstanceOf[List[T]] + } + + def loadClass[T: ClassTag](debug: Boolean): List[T] = load[T](lc => lc.newInstance(), debug) + + def loadTest[T: ClassTag](instantiate: Class[_] => Any, debug: Boolean): List[T] = { + loadedTestClasses.filter(lc => classTag[T].runtimeClass.isAssignableFrom(lc)).flatMap { lc => + catchInstantiationErrors(lc, instantiate, debug) + }.asInstanceOf[List[T]] + } + + def loadTestClass[T: ClassTag](debug: Boolean): List[T] = loadTest[T](lc => lc.newInstance(), debug) + + def loadObject[T: ClassTag](debug: Boolean): List[T] = load[T]( + lc =>{ + val cons = lc.getDeclaredConstructors()(0) + cons.setAccessible(true) + cons.newInstance()} + , + debug) + +} diff --git a/src/utils/src/main/scala/ObjectUtilities.scala b/src/utils/src/main/scala/ObjectUtilities.scala new file mode 100644 index 0000000000..cc17806321 --- /dev/null +++ b/src/utils/src/main/scala/ObjectUtilities.scala @@ -0,0 +1,71 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. 
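JarLoadingUtils (above) scans each project's target/scala-$SCALA_VERSION output for jars and compiled test classes, loads them through a dedicated URLClassLoader, and filters the loaded classes by assignability to the requested type, skipping anything that cannot be instantiated. A minimal usage sketch, assuming the build layout it expects (a project/project-roots.txt at the repository root and SCALA_VERSION set in the environment); Transformer is used here only as an example filter type:

    import org.apache.spark.ml.Transformer

    // Instantiate every discovered class with an accessible no-arg constructor
    // that is assignable to Transformer; non-instantiable classes are skipped.
    val transformers: List[Transformer] = JarLoadingUtils.loadClass[Transformer](debug = false)
    transformers.foreach(t => println(t.getClass.getName))

    // Objects (including those with private constructors) can be materialized with
    // loadObject, and test classes compiled under test-classes with loadTestClass.
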
+ +package com.microsoft.ml.spark + +import java.io.{InputStream, ObjectInputStream, ObjectOutputStream, ObjectStreamClass} + +import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import FileUtilities._ + +class ObjectInputStreamContextClassLoader(input: InputStream) extends ObjectInputStream(input) { + protected override def resolveClass(desc: ObjectStreamClass): Class[_] = { + try { + Class.forName(desc.getName, false, Thread.currentThread().getContextClassLoader()) + } catch { + case _: ClassNotFoundException => super.resolveClass(desc) + } + } +} + +/** + * Contains logic for reading and writing objects. + */ +object ObjectUtilities { + + /** + * Loads the object from the given path. + * @param corePath The main path for model to load the object from. + * @param objectSubPath The path to the object. + * @param sc The current spark context. + * @tparam ObjectType The type of the object to load. + * @return The loaded object. + */ + def loadObject[ObjectType](corePath: Path, objectSubPath: String, sc: SparkContext): ObjectType = { + val hadoopConf = sc.hadoopConfiguration + val inputPath = new Path(corePath, objectSubPath) + using(Seq(inputPath.getFileSystem(hadoopConf))) { fs => + val inputStream = fs(0).open(inputPath) + using(Seq(new ObjectInputStreamContextClassLoader(inputStream))) { + objectStream => + objectStream(0).readObject().asInstanceOf[ObjectType] + }.get + }.get + } + + /** + * Writes the object to the given path. + * @param objToWrite The object to write. + * @param corePath The main path for model to write the object to. + * @param objectSubPath The path to the object. + * @param sc The current spark context. + * @tparam ObjectType The type of the object to load. + */ + def writeObject[ObjectType](objToWrite: ObjectType, + corePath: Path, + objectSubPath: String, + sc: SparkContext, + overwrite: Boolean): Unit = { + val hadoopConf = sc.hadoopConfiguration + val outputPath = new Path(corePath, objectSubPath) + using(Seq(outputPath.getFileSystem(hadoopConf))) { fs => + val outputStream = fs(0).create(outputPath, overwrite) + using(Seq(new ObjectOutputStream(outputStream))) { + objectStream => + objectStream(0).writeObject(objToWrite) + }.get + }.get + } + +} diff --git a/src/utils/src/main/scala/PipelineUtilities.scala b/src/utils/src/main/scala/PipelineUtilities.scala new file mode 100644 index 0000000000..557e59b5bf --- /dev/null +++ b/src/utils/src/main/scala/PipelineUtilities.scala @@ -0,0 +1,55 @@ +// Copyright (C) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. See LICENSE in project root for information. + +package com.microsoft.ml.spark + +import com.microsoft.ml.spark.FileUtilities.File +import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods._ + +/** + * Exposes utilities used for saving and loading pipelines. + */ +object PipelineUtilities { + /** + * Saves metadata that is required by spark pipeline model in order to read a model. + * @param uid The id of the PipelineModel saved. + * @param cls The class name. + * @param metadataPath The metadata path. + * @param sc The spark context. 
+ */ + def saveMetadata(uid: String, + cls: String, + metadataPath: String, + sc: SparkContext, + overwrite: Boolean): Unit = { + val metadata = ("class" -> cls) ~ + ("timestamp" -> System.currentTimeMillis()) ~ + ("sparkVersion" -> sc.version) ~ + ("uid" -> uid) ~ + ("paramMap" -> "{}") + + val metadataJson: String = compact(render(metadata)) + val metadataFile = new File(metadataPath) + val fileExists = metadataFile.exists() + if (fileExists) { + if (overwrite) { + metadataFile.delete() + } else { + throw new Exception( + s"Failed to save pipeline, metadata file $metadataPath already exists, please turn on overwrite option") + } + } + sc.parallelize(Seq(metadataJson), 1).saveAsTextFile(metadataPath) + } + + def makeQualifiedPath(sc: SparkContext, path: String): Path = { + val modelPath = new Path(path) + val hadoopConf = sc.hadoopConfiguration + // Note: to get correct working dir, must use root path instead of root + part + val fs = modelPath.getFileSystem(hadoopConf) + modelPath.makeQualified(fs.getUri, fs.getWorkingDirectory) + } +} diff --git a/tools/bin/mml-exec b/tools/bin/mml-exec new file mode 100755 index 0000000000..21925fd447 --- /dev/null +++ b/tools/bin/mml-exec @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +HERE="$(cd "$(dirname "$(realpath "$0")")"; pwd)" + +exe="$1"; shift; args=( "$@" ); set -- +if [[ "$exe" = "" ]]; then + echo "Usage: $(basename "$0") [arguments]" + echo " runs with --package and --repositories flags that are set" + echo " up to use the most recent MMLSpark build." + echo " If is \"jupyter-notebook\" then run the notebook server," + echo " and additional arguments are passed to the \"jupyter notebook\" command." + exit 1 +fi + +if [[ -x "$HERE/../../runme" ]]; then . "$HERE/../../runme" +else echo "Could not find \"runme\"" 1>&2; exit 1; fi + +# If we let spark guess a driver, it can find "python2.7" in the path (eg, the +# system's installation) and use that; so do this to force it to use "python" in +# our path, which is a symlink to the conda python. +if [[ "x$PYSPARK_PYTHON" = "x" ]]; then export PYSPARK_PYTHON="python"; fi + +if [[ "$exe" == "jupyter-notebook" ]]; then + if [[ "${#args[@]}" = 0 ]]; then args="" + else args="$(printf "%q " "${args[@]}")"; fi + export PYSPARK_DRIVER_PYTHON="jupyter" + export PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --ip=* $args" + exe="pyspark"; args=() +fi + +MML_M2REPOS="file:$BUILD_ARTIFACTS/packages/m2,$MAVEN_URL" +MML_PACKAGE="com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:$MML_VERSION" + +exec "$exe" --repositories "$MML_M2REPOS" --packages "$MML_PACKAGE" \ + --master "local[*]" "${args[@]}" diff --git a/tools/build-pr/checkout b/tools/build-pr/checkout new file mode 100755 index 0000000000..e14b314f5a --- /dev/null +++ b/tools/build-pr/checkout @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. 
"$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +@ "shared.sh" + +PRDIR="$BASEDIR/.build-pr" +_md "$PRDIR" + +# make it possible to use the main version of these files later too +_ cp -a "$TOOLSDIR/build-pr" "$PRDIR/build-pr" + +if [[ "$BUILDPR" = "" ]]; then exit 0; fi + +echo "##[section] PR Build for #$BUILDPR" + +api "pulls/$BUILDPR" - '.head.sha // error("no such PR")' > /dev/null +sha1="$(api "pulls/$BUILDPR" - '.head.sha')" +repo="$(api "pulls/$BUILDPR" - '.head.repo.full_name')" +ref="$(api "pulls/$BUILDPR" - '.head.ref')" +repourl="https://github.com/$repo" + +printf 'PR BUILD for #%s\n repo: %s\n ref: %s\n sha1: %s\n' \ + "$BUILDPR" "$repo" "$ref" "$sha1" + +git checkout "master" > /dev/null 2>&1 +oldbr="$(git for-each-ref --format="%(refname:short)" "refs/heads/pr-*")" +if [[ "x$oldbr" != "x" ]]; then git branch -D $oldbr; fi + +text="$(jsonq "[A build has started.]($VURL)")" +api "issues/$BUILDPR/comments" -d '{"body":'"$text"'}' - '.id' > "$PRDIR/comment-id" + +_get_T +git fetch "https://$T@github.com/$repo" "$ref:refs/heads/pr-$BUILDPR" +git checkout "pr-$BUILDPR" +git reset --hard "$sha1" + +# useful info in build +{ echo "# This is a build for [github PR #$BUILDPR]($GURL)" + echo "" + echo "Associated Changes (actual ones)" + echo ""; echo "---"; echo "" + git log --format="* [%h]($repourl/commit/%H) [%aN](mailto:%aE) %s" \ + "origin/master..$sha1" + } > "$PRDIR/PR-Build.md" +echo "##vso[task.uploadsummary]$PRDIR/PR-Build.md" + +# variable overrides +prvar() { printf '%s=%q\n' "$1" "$2" >> "$TOOLSDIR/local-config.sh"; } +prvar BUILD_SOURCEVERSION "$sha1" +prvar BUILD_REPOSITORY_NAME "$repo" +prvar BUILD_REPOSITORY_ID "$repourl" +prvar BUILD_REPOSITORY_URI "$repourl" +prvar BUILD_SOURCEBRANCH "refs/heads/$ref" +prvar BUILD_SOURCEBRANCHNAME "$(basename "$ref")" +prvar BUILD_SOURCEVERSIONAUTHOR "$(git log -1 --format="%aN <%aE>")" +prvar BUILD_SOURCEVERSIONMESSAGE "$(git log -1 --format="%s")" diff --git a/tools/build-pr/report b/tools/build-pr/report new file mode 100755 index 0000000000..0173e6573a --- /dev/null +++ b/tools/build-pr/report @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" + +PRDIR="$BASEDIR/.build-pr" +F="$PRDIR/build-pr/$(basename ${BASH_SOURCE[0]})" + +if [[ "${BASH_SOURCE[0]}" != "$F" ]]; then + if [[ -x "$F" ]]; then exec "$F"; fi; exit +fi + +@ "shared.sh" + +ICONS_URL="https://$MAIN_CONTAINER.blob.core.windows.net/icons" +icon="$ICONS_URL/Robot" +case "${AGENT_JOBSTATUS,,}" in + ( succeeded ) icon+="2.png"; box="![PASS]($icon) Pass" ;; + ( canceled ) icon+="1.png"; box="![CANCEL]($icon) Canceled" ;; + ( failed ) icon+="0.png"; box="![FAIL]($icon) Fail" ;; + ( * ) icon+="1.png"; box="![$AGENT_JOBSTATUS]($icon) Unknown" ;; +esac + +if [[ "$BUILDPR" = "" ]]; then + _ az storage blob copy start --account-name "$MAIN_CONTAINER" \ + --destination-container "icons" --destination-blob "BuildStatus.png" \ + --source-uri "$icon" + exit +fi + +if [[ ! -r "$PRDIR/comment-id" ]]; then exit; fi + +cid="$(< "$PRDIR/comment-id")" + +api "issues/comments/$cid" -X DELETE + +text="$(jsonq "[$box! 
The build has ${AGENT_JOBSTATUS,,}.]($VURL)")" +api "issues/$BUILDPR/comments" -d '{"body":'"$text"'}' - '.id' > "$PRDIR/comment-id" diff --git a/tools/build-pr/shared.sh b/tools/build-pr/shared.sh new file mode 100644 index 0000000000..09875b3147 --- /dev/null +++ b/tools/build-pr/shared.sh @@ -0,0 +1,47 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +set -e + +cd "$BASEDIR" + +if [[ "$BUILDPR" = "" ]]; then : +elif [[ "$BUILDPR" = *[^0-9]* ]]; then + echo "ERROR: \$BUILDPR should be a number, got: \"$BUILDPR\"" 1>&2 + exit 1 +fi + +T="" +_get_T() { + if [[ "x$T" = "x" ]]; then + T="$(__ az keyvault secret show --vault-name mmlspark-keys --name github-auth \ + | jq -r ".value" | base64 -d)" + fi +} + +declare -A api_cache +api() { + local call="$1"; shift + local curlargs=() x use_cache=1 json="" + while (($# > 0)); do + x="$1"; shift + if [[ "x$x" = "x-" ]]; then break; else use_cache=0; curlargs+=("$x"); fi; + done + if ((use_cache)); then json="${api_cache["${call} ${curlargs[*]}"]}"; fi + if [[ -z "$json" ]]; then + _get_T + json="$(curl --silent --show-error -H "AUTHORIZATION: bearer ${T#*:}" \ + "https://api.github.com/repos/Azure/mmlspark/$call" \ + "${curlargs[@]}")" + if ((use_cache)); then api_cache["${call} ${curlargs[*]}"]="$json"; fi + fi + if (($# == 0)); then echo "$json"; else jq -r "$@" <<<"$json"; fi +} + +jsonq() { # text...; quotes the text as a json string + jq --null-input --arg txt "$*" '$txt' +} + +VURL="${SYSTEM_TASKDEFINITIONSURI%/}/$SYSTEM_TEAMPROJECT" +VURL+="/_build/index?buildId=$BUILD_BUILDID&_a=summary" +GURL="$(api "pulls/$BUILDPR" - '.html_url')" diff --git a/tools/config.sh b/tools/config.sh new file mode 100644 index 0000000000..8a5222e5f8 --- /dev/null +++ b/tools/config.sh @@ -0,0 +1,274 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Environment Configuration +# (See the `defvar` documentation in "utils.sh" too.) + +# Make it possible to have a local installation by setting HOME +defvar -xp HOME; mkdir -p "$HOME" + +# Definition of things that need to be installed. Each one is followed by misc +# settings, where some of the settings can be computed from others. The used +# settings are: +# * ver: The version of the library. This version can be used in other settings +# by using "<{ver}>", it is also available in the `.setup` and `.init` hooks +# as "$ver". +# * lib: The name of the directory (in ~/lib) to install to, defaults to the +# library name in lowercase. +# * envvar: An environment variable prefix to set to the library's version and +# installation directory. Defaults to the library name in uppercase. If this +# is "FOO", then the two variables set are $FOO_VERSION and $FOO_HOME. +# * url: The installer URL. +# * sha256: The expected sha256 of the installer file. +# * instcmd: The installation command for an `sh` installer (should have `"$1"` +# somewhere for the installer file or more likely `bash "$1"`, will be +# `eval`uated). This must be set for sh installers. +# * exes: Executables to symlink to ~/bin. +# * vers: Version info in a format of "cmd|pattern" where the `cmd` part is the +# command to run to get the version (after the library is installed), and the +# output pattern (usually with "<{ver}>", sometimes can also have shell glob +# patterns like "*"). 
The pattern should identify a complete line in the +# output of `cmd`. +# * bindir: The (relative) sub-directory in which executables are found, +# defaults to "bin". +# * prereq: Prerequisite information in a format of "cmd|msg", where cmd is a +# shell command to run (its output will not be shown), and a message to show +# in case of failure. The message cannot contain "|"s. +# * where: A list of contexts where the library is needed; the contexts are: +# "devel" for developer installation, "build" for just building (eg, on the +# build server), "runtime" for libraries that are needed it a user +# environment. +# In addition, further library-specific setup operations can be put in functions +# named ".setup" and ".init". Both are functions +# run after the library is already populated and its envvar is set and it run in +# its directory (can cd elsewhere), but before executable symlinks are made. +# The .setup function is called to perform setup operation after installation, +# and the .init function is always called when runme starts, so it's useful to +# initialize the environment. + +# First, the common container definition +defvar MAIN_CONTAINER "mmlspark" +# to use the storage directly replace: "azureedge" -> "blob.core.windows" +_main_url() { echo "https://$MAIN_CONTAINER.azureedge.net/$1"; } +# The base URL for our installables +defvar INSTALLER_URL "$(_main_url "installers")" +# Directory for caching installers; if it is empty then no caching is used +defvar INSTALLER_CACHE_DIR "$HOME/.mmlspark_cache" + +INSTALLATIONS=( + + Java ver: "1.8.0" lib: "jdk" + url: "http://cdn.azul.com/zulu/bin/zulu8.21.0.1-jdk8.0.131-linux_x64.tar.gz" + sha256: "17218c6bdd608b5714ffba9d5e28522bb2efc309266ba46232b8b918e6e62133" + exes: "java javac jar javadoc" + vers: "java -version|openjdk version \"<{ver}>_*\"" + where: "devel runtime build" + + SBT ver: "0.13.15" + url: "https://github.com/sbt/sbt/releases/download/v<{ver}>/sbt-<{ver}>.tgz" + sha256: "b6e073d7c201741dcca92cfdd1dd3cd76c42a47dc9d8c8ead8df7117deed7aef" + exes: "sbt" + vers: "sbt -no-colors sbt-version|?info? <{ver}>" + where: "devel build" + + Spark ver: "2.1.1" + url: "https://archive.apache.org/dist/spark/spark-<{ver}>/spark-<{ver}>-bin-hadoop2.7.tgz" + sha256: "372ac4f73221c07696793101007a4f19e31566d1f0d9bd0e5205b6fb5b45bfc2" + exes: "spark-shell spark-sql spark-submit spark-class pyspark sparkR" + vers: "spark-shell --version|* version <{ver}>" + where: "devel runtime build" + + Conda ver: "4.2.12" + url: "https://repo.continuum.io/miniconda/Miniconda3-<{ver}>-Linux-x86_64.sh" + sha256: "c59b3dd3cad550ac7596e0d599b91e75d88826db132e4146030ef471bb434e9a" + instcmd: 'PYTHONPATH="" bash "$1" -b -f -p "$PWD"' + exes: "python python3 ipython ipython3 jupyter conda pip" + vers: "PYTHONDONTWRITEBYTECODE=true conda --version|conda <{ver}>" + where: "devel runtime build" + + DataSets ver: "2017-05-25" + url: "$INSTALLER_URL/datasets-<{ver}>.tgz" + sha256: "9cf58c6d22fa3d3507608c5af23eb791e37bea324d2c98209ae7356becd4ce41" + vers: "cat version|<{ver}>" + where: "devel build" + + # Note: this is different than the version defined in SBT to avoid breaking + # work in progress; but when that's done, we need to sync up the two version + # via a shared version seetting so they cannot diverge. 
+ CNTK ver: "beta12" + url: "$INSTALLER_URL/CNTK-2-0-<{ver}>-0-Linux-64bit-CPU-Only.tar.gz" + sha256: "033c5da4b3034f51d0bde6f0d926f7d075a146b16e7c6148a38cecba928efc6c" + exes: "cntk" + vers: "cntk|*Built time: Feb 22 2017 13:29:08" + bindir: "cntk/bin" + where: "devel build" + + DockerBuildx ver: "0.0.1" + url: "https://github.com/Microsoft/docker-buildx/archive/v<{ver}>.tar.gz" + sha256: "bac3d0036224f4279fc553031849c548296cfae432b3212ea21b2089703b290e" + exes: "docker-buildx" + vers: "docker-buildx -V|<{ver}>" + bindir: "." + where: "devel build" + +) + +# $TESTS holds the specification of tests to run. The syntax is a list of +# `tag`, `+tag` or `-tag`, separated by commas and/or spaces; and `tag` is +# equivalent to `+tag`. The semantics of the specs mimicks the scala semantics +# for tags: we run tests that are tagged with `+tag`s, but not `-tag`s, and if +# there are no `+tag`s then run all tests except for `-tag`s. `all` and `none` +# behave as you'd expect, but they can have additional benefits (e.g., `none` +# will avoid even compiling the tests); avoid using them with other tags. The +# default is `+scala,-extended` for local builds, and `all` for server builds. +# The value is normalized to hold comma-separated `+tag` or `-tag`, except for a +# single `all`/`none` which don't get a sign prefix. $PUBLISH similarly holds +# the specification of things to publish. +defvar -x TESTS "default" +defvar -x PUBLISH "default" +if [[ "$TESTS" = "default" ]]; then + if [[ "$BUILDMODE" = "server" ]]; then TESTS="all"; else TESTS="+scala,-extended"; fi +fi +if [[ "$PUBLISH" = "default" ]]; then + if [[ "$BUILDMODE" = "server" ]]; then PUBLISH="-demo,-docker"; else PUBLISH="none"; fi +fi +# Tag definitions for $TESTS +deftag scala +deftag extended + deftag python extended + deftag e2e extended +deftag linuxonly +# Tag definitions for $PUBLISH +map deftag storage maven pip demo docker + +defvar -p SRCDIR "$BASEDIR/src" +defvar -p BUILD_ARTIFACTS "$BASEDIR/BuildArtifacts" +defvar -p TEST_RESULTS "$BASEDIR/TestResults" + +# Specific installation functions + +SBT.setup() { + local f="$SRCDIR/project/build.properties" txt="sbt.version = $SBT_VERSION" + if [[ ! -e "$f" ]]; then echo "$txt" > "$f"; return; fi + if [[ "x$(< "$f")" != "x$txt" ]]; then failwith "$f exists"; fi +} +defvar SCALA_VERSION "2.11" +defvar SCALA_FULL_VERSION "$SCALA_VERSION.8" +SBT.init() { + setenv SCALA_VERSION "$SCALA_VERSION" + setenv SCALA_FULL_VERSION "$SCALA_FULL_VERSION" +} + +Spark.setup() { + if [[ -e "conf/hive-site.xml" ]]; then failwith "conf/hive-site.xml exists"; fi + { echo "" + echo " " + echo " javax.jdo.option.ConnectionURL" + echo " jdbc:derby:memory:databaseName=metastore_db;create=true" + echo " the URL of the Derby Server database" + echo " " + echo " " + echo " javax.jdo.option.ConnectionDriverName" + echo " org.apache.derby.jdbc.EmbeddedDriver" + echo " " + echo "" + } > "conf/hive-site.xml" +} +Spark.init() { + local f; for f in "python/lib/"*.zip; do + envinit_eval \ + '[[ ":$PYTHONPATH:" != *":$SPARK_HOME/'"$f"':"* ]]' \ + '&& export PYTHONPATH="$PYTHONPATH:$SPARK_HOME/'"$f"'"' + done +} + +Conda.setup() { + show section "Installing Conda & Packages" + _ cp "$TOOLSDIR/mmlspark-packages.spec" . + # Use `--no-update-deps` to avoid updating everything (including conda & + # python) to latest versions; and `--no-deps` is to avoid dependencies that we + # know are not needed, such as QT. 
+ _ ./bin/conda install --name "root" --no-update-deps --no-deps --yes \ + --quiet --file "mmlspark-packages.spec" + if [[ "$BUILDMODE" != "runtime" ]]; then + ./bin/pip install "xmlrunner" "wheel" + else + show section "Minimizing conda directory" + collect_log=2 _ ./bin/conda uninstall -y tk + collect_log=2 _ ./bin/conda clean -y --all + _rm "pkgs" + show command "rm lib/libmkl_....so" + rm -f lib/libmkl_{,vml_}{def,rt,sequential,cmpt,mc{,2,3},avx512{,_mic}}.so + show command "rm **/*.pyc" + rm -rf **/__pycache__/ + rm -f **/*.pyc + show command "strip **/*.so" + # note: running this without output and ignore its exit status, so it can + # fail silently (its stderr is verbose with files it can't strip, and it + # does return an error) + strip **/*.so > /dev/null 2>&1 + fi +} + +_add_to_ld_library_path() { + envinit_eval \ + '[[ ":$LD_LIBRARY_PATH:" != *":'"$1"':"* ]]' \ + '&& export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:'"$1"'"' +} +_req_library_so() { # file.so libname + { /sbin/ldconfig -p | grep -q "$1"; } || + failwith "$1 missing, try apt-get install $2" +} +CNTK.init() { + _req_library_so "libmpi_cxx.so" "libopenmpi1.10" + _req_library_so "libgomp.so" "libgomp1" + _add_to_ld_library_path '$CNTK_HOME/cntk/lib' + _add_to_ld_library_path '$CNTK_HOME/cntk/dependencies/lib' +} + +# Storage for build artifacts +defvar STORAGE_CONTAINER "buildartifacts" +defvar STORAGE_URL "$(_main_url "$STORAGE_CONTAINER")" + +# Container for maven/pip packages +defvar MAVEN_CONTAINER "maven" +defvar -x MAVEN_URL "$(_main_url "$MAVEN_CONTAINER")" +defvar -d MAVEN_PACKAGE "com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:<{MML_VERSION}>" +defvar PIP_CONTAINER "pip" +defvar -x PIP_URL "$(_main_url "$PIP_CONTAINER")" +defvar -d PIP_PACKAGE "mmlspark-<{MML_VERSION}>-py2.py3-none-any.whl" + +# E2E test cluster information +defvar E2E_CLUSTER_NAME "mmlsparktest" +defvar E2E_RESOURCE_GROUP "mmlsparktest" +defvar E2E_CLUSTER_SSH "spark@${E2E_CLUSTER_NAME}-ssh.azurehdinsight.net" +defvar E2E_PARALLEL_RUNS "2" +defvar CLUSTER_SDK_DIR "/mml-sdk" # this is for all clusters + +# Demo cluster information +defvar DEMO_CLUSTER_NAME "mmlsparkdemo" +defvar DEMO_RESOURCE_GROUP "mmlsparkdemo" + +# Public contact email +defvar -x SUPPORT_EMAIL "mmlspark-support@microsoft.com" + +# The following should generally not change + +PROFILE_FILE="$HOME/.mmlspark_profile" +CONF_TRACK_FILE="$HOME/.mmlspark_installed_libs" +ENV_INIT_FILES=(".profile" # first: write here if none of these files exist + ".bash_profile" ".bash_login" ".bashrc" ".zprofile" ".zshrc") +LIB_VERSION_FILE="MMLSPARK_INSTALLED-README.txt" + +CURL_FLAGS="-f --location --retry 20 --retry-max-time 60 --connect-timeout 120" +CURL_FLAGS="$CURL_FLAGS --speed-limit 10 --speed-time 120" +if [[ "$BUILDMODE" = "server" ]]; then CURL_FLAGS="$CURL_FLAGS --silent --show-error" +else CURL_FLAGS="$CURL_FLAGS --progress-bar"; fi + +envinit_eval '[[ ":$PATH:" != *":$HOME/bin:"* ]] && export PATH="$HOME/bin:$PATH"' +envinit_commands+=( + 'ldpaths="$(ldconfig -v 2> /dev/null | while read -r line; do + if [[ "$line" = *: ]]; then echo -n "$line"; fi; done)"' + '[[ ":$LD_LIBRARY_PATH:" != *":$ldpaths"* ]] && export LD_LIBRARY_PATH="$ldpaths$LD_LIBRARY_PATH"' +) diff --git a/tools/docker/Dockerfile b/tools/docker/Dockerfile new file mode 100644 index 0000000000..537678f7ab --- /dev/null +++ b/tools/docker/Dockerfile @@ -0,0 +1,54 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
+ +FROM ubuntu:16.04 + +#META# echo LABEL maintainer="$SUPPORT_EMAIL" + +#META# H="/home/mmlspark"; echo ENV \ + USER="mmlspark" HOME="$H" PATH="$H/bin:\$PATH" + +RUN : '==== create a user (and also hide the random hostname in the prompt)' \ + && echo 'PS1='\''\u:\w\$ '\' >> "/etc/skel/.bashrc" \ + && useradd -c "Microsoft ML for Apache Spark" -U -G root -d "$HOME" -m "$USER" \ + && : '==== install needed packages' \ + && apt-get update --fix-missing \ + && apt-get install -y curl unzip bzip2 libopenmpi1.10 libgomp1 libunwind8 libtiff5 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +WORKDIR $HOME +USER $USER:$USER + +# add the premade runtime environment +ADD mmlspark.tgz .. + +#BUILD# -t mmlspark:keep-for-cache +#META# rm -f "mmlspark.tgz" # avoid re-hashing a big file + +#META# H="/home/mmlspark"; echo ENV \ + MML_VERSION=$(printf "%q" "$MML_VERSION") \ + MML_BUILD_INFO=$(printf "%q" "$MML_BUILD_INFO") \ + MMLSPARK_JUPYTER_PORT="8888" \ + MML_M2REPOS="file:$H/mml-m2,$MAVEN_URL" \ + MML_PACKAGE="com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:$MML_VERSION" + +ADD notebooks.tgz / +ADD mml-m2.tgz . +ADD bin.tgz . + +RUN : '==== make the notebooks convenient to access and have the right date' \ + && find "/notebooks" -execdir touch "{}" + \ + && ln -s /notebooks . \ + && : '==== disable the jupyter security token' \ + && mkdir ".jupyter" \ + && echo "c.NotebookApp.token = ''" > ".jupyter/jupyter_notebook_config.py" \ + && : '==== pre-populate the ivy cache' \ + && bash -c ". ./.mmlspark_profile; \ + spark-shell --master \"local[*]\" \ + --repositories \"$MML_M2REPOS\" --packages \"$MML_PACKAGE\" < /dev/null" + +# #SQUASH# + +# use CMD to get both "docker run this" and "docker run this command" work +EXPOSE $MMLSPARK_JUPYTER_PORT +CMD ["bin/launcher"] diff --git a/tools/docker/bin/EULA.txt b/tools/docker/bin/EULA.txt new file mode 100644 index 0000000000..fbc1b8ccc6 --- /dev/null +++ b/tools/docker/bin/EULA.txt @@ -0,0 +1,203 @@ +MICROSOFT SOFTWARE LICENSE TERMS +================================ + +MICROSOFT MACHINE LEARNING LIBRARY FOR APACHE SPARK +--------------------------------------------------- + +These license terms are an agreement between you and Microsoft +Corporation (or one of its affiliates). They apply to the software +named above and any Microsoft services or software updates (except to +the extent such services or updates are accompanied by new or additional +terms, in which case those different terms apply prospectively and do +not alter your or Microsoft's rights relating to pre-updated software or +services). IF YOU COMPLY WITH THESE LICENSE TERMS, YOU HAVE THE RIGHTS +BELOW. BY DOWNLOADING OR USING THE SOFTWARE, YOU ACCEPT THESE TERMS. + +1. INSTALLATION AND USE RIGHTS. + + a. General. You may install and use any number of copies of the + software to develop and test your applications. + + b. Third Party Software. The software may include third party + applications that Microsoft, not the third party, licenses to you + under this agreement. Any included notices for third party + applications are for your information only. + + c. Open Source Components. The software may contain third party + copyrighted software licensed under open source licenses with + source code availability obligations. Copies of those licenses + are included in the ThirdPartyNotices file or other accompanying + notices file. 
You may obtain the complete corresponding source + code from Microsoft if and as required under the relevant open + source license by sending a money order or check for $5.00 to: + Source Code Compliance Team, Microsoft Corporation, 1 Microsoft + Way, Redmond, WA 98052, USA. Please write “source code for + Microsoft Machine Learning Library for Apache Spark” in the memo + line of your payment. You may also find a copy of the source at + http://aka.ms/getsource . + +2. SCOPE OF LICENSE. The software is licensed, not sold. Microsoft + reserves all other rights. Unless applicable law gives you more + rights despite this limitation, you will not (and have no right to): + + a. work around any technical limitations in the software that only + allow you to use it in certain ways; + + b. reverse engineer, decompile, or disassemble the software, or + attempt to do so, except and only to the extent permitted by + licensing terms governing the use of open-source components that + may be included with the software; + + c. remove, minimize, block, or modify any notices of Microsoft or its + suppliers in the software; + + d. use the software in any way that is against the law or to create + or propagate malware; or + + e. share, publish, distribute, or lend the software (except for any + distributable code, subject to the terms above), provide the + software as a stand-alone hosted solution for others to use, or + transfer the software or this agreement to any third party. + +3. EXPORT RESTRICTIONS. You must comply with all domestic and + international export laws and regulations that apply to the software, + which include restrictions on destinations, end users, and end use. + For further information on export restrictions, visit + http://aka.ms/exporting . + +4. SUPPORT SERVICES. Microsoft is not obligated under this agreement to + provide any support services for the software. Any support provided + is “as is”, “with all faults”, and without warranty of any kind. + +5. UPDATES. The software may periodically check for updates, and + download and install them for you. You may obtain updates only from + Microsoft or authorized sources. Microsoft may need to update your + system to provide you with updates. You agree to receive these + automatic updates without any additional notice. Updates may not + include or support all existing software features, services, or + peripheral devices. + +6. ENTIRE AGREEMENT. This agreement, and any other terms Microsoft may + provide for supplements, updates, or third-party applications, is the + entire agreement for the software. + +7. Governing Law and Venue. This Agreement is governed by and construed + in accordance with the laws of the state of Washington, without + reference to its choice of law principles to the contrary. Each + party hereby consents to the jurisdiction and venue of the state and + federal courts located in King County, Washington, with regard to any + suit or claim arising under or by reason of this Agreement. + +8. CONSUMER RIGHTS; REGIONAL VARIATIONS. This agreement describes + certain legal rights. You may have other rights, including consumer + rights, under the laws of your state or country. Separate and apart + from your relationship with Microsoft, you may also have rights with + respect to the party from which you acquired the software. This + agreement does not change those other rights if the laws of your + state or country do not permit it to do so. 
For example, if you + acquired the software in one of the below regions, or mandatory + country law applies, then the following provisions apply to you: + + a. Australia. You have statutory guarantees under the Australian + Consumer Law and nothing in this agreement is intended to affect + those rights. + + b. Canada. If you acquired this software in Canada, you may stop + receiving updates by turning off the automatic update feature, + disconnecting your device from the Internet (if and when you + re-connect to the Internet, however, the software will resume + checking for and installing updates), or uninstalling the + software. The product documentation, if any, may also specify how + to turn off updates for your specific device or software. + + c. Germany and Austria. + + i. Warranty. The properly licensed software will perform + substantially as described in any Microsoft materials that + accompany the software. However, Microsoft gives no + contractual guarantee in relation to the licensed software. + + ii. Limitation of Liability. In case of intentional conduct, gross + negligence, claims based on the Product Liability Act, as well + as, in case of death or personal or physical injury, Microsoft + is liable according to the statutory law. + + Subject to the foregoing clause ii., Microsoft will only be liable + for slight negligence if Microsoft is in breach of such material + contractual obligations, the fulfillment of which facilitate the + due performance of this agreement, the breach of which would + endanger the purpose of this agreement and the compliance with + which a party may constantly trust in (so-called "cardinal + obligations"). In other cases of slight negligence, Microsoft + will not be liable for slight negligence. + +9. DISCLAIMER OF WARRANTY. THE SOFTWARE IS LICENSED “AS IS.” YOU BEAR + THE RISK OF USING IT. MICROSOFT GIVES NO EXPRESS WARRANTIES, + GUARANTEES, OR CONDITIONS. TO THE EXTENT PERMITTED UNDER APPLICABLE + LAWS, MICROSOFT EXCLUDES ALL IMPLIED WARRANTIES, INCLUDING + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND + NON-INFRINGEMENT. + +10. LIMITATION ON AND EXCLUSION OF DAMAGES. IF YOU HAVE ANY BASIS FOR + RECOVERING DAMAGES DESPITE THE PRECEDING DISCLAIMER OF WARRANTY, YOU + CAN RECOVER FROM MICROSOFT AND ITS SUPPLIERS ONLY DIRECT DAMAGES UP + TO U.S. $5.00. YOU CANNOT RECOVER ANY OTHER DAMAGES, INCLUDING + CONSEQUENTIAL, LOST PROFITS, SPECIAL, INDIRECT, OR INCIDENTAL + DAMAGES. + + This limitation applies to (a) anything related to the software, + services, content (including code) on third party Internet sites, or + third party applications; and (b) claims for breach of contract, + warranty, guarantee, or condition; strict liability, negligence, or + other tort; or any other claim; in each case to the extent permitted + by applicable law. + + It also applies even if Microsoft knew or should have known about + the possibility of the damages. The above limitation or exclusion + may not apply to you because your state, province, or country may + not allow the exclusion or limitation of incidental, consequential, + or other damages. + +Please note: As this software is distributed in Canada, some of the +clauses in this agreement are provided below in French. + +Remarque: Ce logiciel étant distribué au Canada, certaines des clauses +dans ce contrat sont fournies ci-dessous en français. + +EXONÉRATION DE GARANTIE. Le logiciel visé par une licence est offert +«tel quel». Toute utilisation de ce logiciel est à votre seule risque +et péril. 
Microsoft n’accorde aucune autre garantie expresse. Vous +pouvez bénéficier de droits additionnels en vertu du droit local sur la +protection des consommateurs, que ce contrat ne peut modifier. La ou +elles sont permises par le droit locale, les garanties implicites de +qualité marchande, d’adéquation à un usage particulier et d’absence de +contrefaçon sont exclues. + +LIMITATION DES DOMMAGES-INTÉRÊTS ET EXCLUSION DE RESPONSABILITÉ POUR LES +DOMMAGES. Vous pouvez obtenir de Microsoft et de ses fournisseurs une +indemnisation en cas de dommages directs uniquement à hauteur de 5,00 $ +US. Vous ne pouvez prétendre à aucune indemnisation pour les autres +dommages, y compris les dommages spéciaux, indirects ou accessoires et +pertes de bénéfices. + +Cette limitation concerne: + +* tout ce qui est relié au logiciel, aux services ou au contenu (y + compris le code) figurant sur des sites Internet tiers ou dans des + programmes tiers; et + +* les réclamations au titre de violation de contrat ou de garantie, ou + au titre de responsabilité stricte, de négligence ou d’une autre faute + dans la limite autorisée par la loi en vigueur. + +Elle s’applique également, même si Microsoft connaissait ou devrait +connaître l’éventualité d’un tel dommage. Si votre pays n’autorise pas +l’exclusion ou la limitation de responsabilité pour les dommages +indirects, accessoires ou de quelque nature que ce soit, il se peut que +la limitation ou l’exclusion ci-dessus ne s’appliquera pas à votre +égard. + +EFFET JURIDIQUE. Le présent contrat décrit certains droits juridiques. +Vous pourriez avoir d’autres droits prévus par les lois de votre pays. +Le présent contrat ne modifie pas les droits que vous confèrent les lois +de votre pays si celles-ci ne le permettent pas. diff --git a/tools/docker/bin/eula b/tools/docker/bin/eula new file mode 100755 index 0000000000..9b8cdec066 --- /dev/null +++ b/tools/docker/bin/eula @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +more "$(dirname "$0")/EULA.txt" + +echo "" +echo "" +echo "(Note: you can also use \"-e ACCEPT_EULA=Y\" to indicate agreement.)" +echo "" +read -ep "Do you agree to the EULA? " R +if [[ "x${R,,}" != @("xy"|"xyes") ]]; then echo "Bye." +else ACCEPT_EULA=Y launcher; fi diff --git a/tools/docker/bin/eula.html b/tools/docker/bin/eula.html new file mode 100644 index 0000000000..0286e89b05 --- /dev/null +++ b/tools/docker/bin/eula.html @@ -0,0 +1,54 @@ + +MMLSpark EULA + + + +
+ Please read the following EULA for the MMLSpark Docker Image. +

+ +     + +

+ + (Agreement will be remembered, but you can skip this check completely by + setting ACCEPT_EULA to Yes when starting the + container, e.g., docker run ... -e ACCEPT_EULA=Y ...) +

{TEXT}
+
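Putting the Docker pieces above together, a typical invocation of the published
image looks roughly like this (the 8888 port and the ACCEPT_EULA check come from
the Dockerfile and launcher in this patch; `microsoft/mmlspark` is the tag pushed
by the publish step, so use your local tag instead if you built the image
yourself):

    # accept the EULA up front and expose the bundled Jupyter notebooks
    docker run -it -p 8888:8888 -e ACCEPT_EULA=Y microsoft/mmlspark
    # or review the license first via the "eula" helper shown above
    docker run -it -p 8888:8888 microsoft/mmlspark eula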
diff --git a/tools/docker/bin/eula.py b/tools/docker/bin/eula.py new file mode 100755 index 0000000000..9715467593 --- /dev/null +++ b/tools/docker/bin/eula.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +from os import path, environ +from http.server import BaseHTTPRequestHandler, HTTPServer +import sys, threading, codecs + +def read_file(f): + with codecs.open(path.join(path.dirname(__file__), f), "r", + encoding = "utf-8") as inp: + return inp.read() +html = read_file("eula.html").replace("{TEXT}", read_file("EULA.txt")) + +class eulaRequestHandler(BaseHTTPRequestHandler): + def do_GET(self): + if self.path == "/exit-accept": + httpd.exit_code = 0 + threading.Thread(target = httpd.shutdown, daemon = True).start() + elif self.path == "/exit-reject": + httpd.exit_code = 1 + threading.Thread(target = httpd.shutdown, daemon = True).start() + else: + self.send_response(200) + self.send_header("Content-type","text/html") + self.end_headers() + message = html + self.wfile.write(bytes(message, "utf8")) + return + +pvar = "MMLSPARK_JUPYTER_PORT" +port = int(environ[pvar]) if pvar in environ else 8888 + +print("Running EULA server...") +httpd = HTTPServer(("", port), eulaRequestHandler) +httpd.serve_forever() + +print("Done, " + ("accept" if httpd.exit_code == 0 else "reject") + "ing") +sys.exit(httpd.exit_code) diff --git a/tools/docker/bin/launcher b/tools/docker/bin/launcher new file mode 100755 index 0000000000..528dc3c511 --- /dev/null +++ b/tools/docker/bin/launcher @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +shopt -s extglob + +. "$HOME/.mmlspark_profile" +cd "$HOME/notebooks" +echo "spark.sql.warehouse.dir $HOME/spark-warehouse" \ + > "$HOME/lib/spark/conf/spark-defaults.conf" + +if [[ "${ACCEPT_EULA,,}" != @(y|yes) ]]; then + { echo "ERROR: You must accept the End User License Agreement to use this container." + echo "Run this container with \"eula\" to read the EULA." + echo "Set the environment variable ACCEPT_EULA to \"Yes\" (or \"Y\") to accept the" + echo "agreement, e.g., \"docker run -it -e ACCEPT_EULA=Y ...\"." + } 1>&2 + exit 1 + echo "Waiting for EULA agreement"; eula.py || exit 1 +fi + +PYSPARK_DRIVER_PYTHON="jupyter" \ +PYSPARK_DRIVER_PYTHON_OPTS="notebook --no-browser --port=${MMLSPARK_JUPYTER_PORT:=8888} --ip=*" \ + pyspark --master "local[*]" --repositories "$MML_M2REPOS" --packages "$MML_PACKAGE" diff --git a/tools/docker/build-docker b/tools/docker/build-docker new file mode 100755 index 0000000000..1cdb330f66 --- /dev/null +++ b/tools/docker/build-docker @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. 
"$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +main() { + +show section "Building Docker Image" + +_rmcd "$BUILD_ARTIFACTS/docker-work" + +_tgzip() { # outdir workdir (always packs ".") + # avoid tracking times/owners to keep the bits stable (for docker caching) + local out="$1" dir="$2"; shift 2 + if [[ "$dir" != "$out" ]]; then cp -al "$dir" "$out"; fi + tar cf - --mtime 1970-1-1T00:00 --owner=mmlspark --group=mmlspark "$out" \ + | gzip -n9 > "$out.tgz" + rm -rf "$out" +} + +local envtgz="$INSTALLER_CACHE_DIR/$(get_runtime_hash).tgz" +if [[ -r "$envtgz" ]]; then + _ cp -al "$envtgz" "mmlspark.tgz" +else + show - "Creating base environment cache" + docker run --interactive --rm \ + -v "$BASEDIR:/mkenv/src:ro" \ + -v "$INSTALLER_CACHE_DIR:/mkenv/cache:ro" \ + -v "$(pwd):/home" \ + ubuntu:16.04 "/mkenv/src/tools/docker/build-env" \ + 2>&1 | ( IFS=""; while read -r line; do echo "| $line"; done ) + _ cp -al "mmlspark.tgz" "$envtgz" +fi + +_ _tgzip "notebooks" "$BUILD_ARTIFACTS/notebooks/local" +_ _tgzip "mml-m2" "$BUILD_ARTIFACTS/packages/m2" +_ _tgzip "bin" "$TOOLSDIR/docker/bin" +_ cp "$TOOLSDIR/docker/Dockerfile" . + +find . -type f | xargs cksum > ~/tmp/1 + +_ docker system prune -f +_ docker-buildx -t mmlspark . + +_ cd "$BASEDIR" +_rm "$BUILD_ARTIFACTS/docker-work" + +} +main "$@" diff --git a/tools/docker/build-env b/tools/docker/build-env new file mode 100755 index 0000000000..afa0607fe6 --- /dev/null +++ b/tools/docker/build-env @@ -0,0 +1,28 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +# This file is a hack for creating the environment tgz with conda installed in +# the right place (cannot install it in one place and use in another). It would +# be better to do this using a docker multi-stage build, but there is no way to +# tag the intermediate step so it won't be deleted with a `docker system prune`. +# By using a tgz we properly control the caching. + +apt-get update --fix-missing +apt-get install -y curl unzip bzip2 libopenmpi1.10 libgomp1 libunwind8 libtiff5 binutils + +export USER="mmlspark" +export HOME="/home/$USER" + +echo 'PS1='\''\u:\w\$ '\' >> "/etc/skel/.bashrc" +useradd -c "Microsoft ML for Apache Spark" -U -d "$HOME" -m "$USER" +cd "$HOME" + +/mkenv/src/runme BUILDMODE=runtime INSTALLER_CACHE_DIR=/mkenv/cache \ + MML_VERSION="???" MML_BUILD_INFO="???" + +chown -R "$USER:$USER" "$HOME" +cd /home +tar czf "$USER.tgz" "$USER" +rm -rf "$USER" +chown -R "$USER:$USER" "$USER.tgz" diff --git a/tools/hdi/install-mmlspark.sh b/tools/hdi/install-mmlspark.sh new file mode 100755 index 0000000000..5c28b284c0 --- /dev/null +++ b/tools/hdi/install-mmlspark.sh @@ -0,0 +1,165 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +# ----------------------------------------------------------------------------- +# Configurations for installing mmlspark + dependencies on an HDI +# cluster, from a specific storage blob (which is created by the build). + +# These are replaced by the build process. 
+DOWNLOAD_URL="<=<=fill-in-url=>=>"
+MAVEN_PACKAGE="<=<=fill-in-maven-package=>=>"
+MAVEN_URL="<=<=fill-in-maven-url=>=>"
+PIP_PACKAGE="<=<=fill-in-pip-package=>=>"
+SDK_DIR="<=<=fill-in-sdk-dir=>=>"
+HDFS_NOTEBOOKS_FOLDER="/HdiNotebooks/Microsoft ML Spark Examples"
+
+CONDA_ENVS=( "root" "py35" )
+
+CNTK_VER="2.0.beta12.0"
+CNTK_BASE_URL="https://cntk.ai/PythonWheel/CPU-Only"
+CNTK_WHEELS=( # each is "<conda-env>::<wheel-url>"
+  "root::$CNTK_BASE_URL/cntk-$CNTK_VER-cp27-cp27mu-linux_x86_64.whl"
+  "py35::$CNTK_BASE_URL/cntk-$CNTK_VER-cp35-cp35m-linux_x86_64.whl")
+
+get_headnodes() {
+  hdfssite="$(< "/etc/hadoop/conf/hdfs-site.xml")"
+  host1="${hdfssite#*<name>dfs.namenode.http-address.mycluster.nn1</name>*<value>}"
+  host2="${hdfssite#*<name>dfs.namenode.http-address.mycluster.nn2</name>*<value>}"
+  host1="${host1%%:*</value>*}"; num1="${host1%%-*}"; num1="${num1#hn}"
+  host2="${host2%%:*</value>*}"; num2="${host2%%-*}"; num2="${num2#hn}"
+  if [[ "$host1,$host2" = "," ]]; then return; fi
+  if (($num1 < $num2)); then echo "$host1,$host2"; else echo "$host2,$host1"; fi
+}
+
+get_primary_headnode() {
+  headnodes="$(get_headnodes)"
+  echo "${headnodes%%,*}"
+}
+
+# -----------------------------------------------------------------------------
+# Run on all nodes
+
+# Install prerequisites
+apt-get install -y openmpi-bin libunwind8
+
+# Install CNTK in Python 2.7 & 3.5
+_anaconda_bin() { local bin="$1"; shift; . "/usr/bin/anaconda/bin/$bin" "$@"; }
+for cntk_wheel in "${CNTK_WHEELS[@]}"; do
+  condaenv="${cntk_wheel%%::*}" wheel="${cntk_wheel#*::}" pkg="$(pip freeze | grep "cntk")"
+  _anaconda_bin activate "$condaenv"
+  echo -n "[$condaenv] "
+  if [[ ! "$pkg" = "cntk"* ]]; then echo "Installing CNTK..."; pip install "$wheel"
+  elif [[ "$pkg" = *"$CNTK_VER" ]]; then echo "Latest CNTK version is already installed."
+  else echo "Updating CNTK..."; pip install --upgrade --no-deps "$wheel"
+  fi
+  _anaconda_bin deactivate
+done
+
+# Download build artifacts & scripts
+tmp="/tmp/mmlinstall-$$"
+curlflags="--silent --show-error"
+mkdir "$tmp"
+echo "Downloading materials..."
+curl $curlflags -o "$tmp/BuildArifacts.zip" "$DOWNLOAD_URL/BuildArtifacts.zip"
+curl $curlflags -o "$tmp/update_livy.py" "$DOWNLOAD_URL/update_livy.py"
+rm -rf "$SDK_DIR"; mkdir -p "$SDK_DIR"
+cd "$SDK_DIR"; unzip "$tmp/BuildArifacts.zip"; rm "$tmp/BuildArifacts.zip"
+
+# Change Livy configuration
+# Note: cntk has the same .so files in both versions
+# Note: we don't need the sdk directory except for the so files (will soon go away)
+LD_STRING="/usr/bin/anaconda/lib/python2.7/site-packages/cntk/libs"
+LD_STRING+=":$SDK_DIR/sdk"
+echo "Updating Livy configurations..."
+python "$tmp/update_livy.py" \
+  "/home/spark/.sparkmagic/config.json" "$MAVEN_PACKAGE" "$LD_STRING"
+rm -rf "$tmp"
+
+/bin/su livy -c \
+  "spark-shell --packages \"$MAVEN_PACKAGE\" --repositories \"$MAVEN_URL\" < /dev/null"
+
+for env in "${CONDA_ENVS[@]}"; do
+  _anaconda_bin activate "$env"
+  pip install "$PIP_PACKAGE"
+  _anaconda_bin deactivate
+done
+
+# Check whether script is running on headnode
+if [[ "$(get_primary_headnode)" != "$(hostname -f)" ]]; then
+  echo "$(hostname -f) is not primary headnode, exiting."
+ exit 0 +fi + +# ----------------------------------------------------------------------------- +# Run only on the main head node + +# Copy notebooks to storage +hdfs dfs -rm -f -r -skipTrash "$HDFS_NOTEBOOKS_FOLDER" +hdfs dfs -mkdir -p "$HDFS_NOTEBOOKS_FOLDER" + +# pure bash url encoder +urlencode() { + local str="$1" ch + for ((i=0; i < ${#str}; i++)); do + ch="${str:i:1}" + case "$ch" in + ( [a-zA-Z0-9_.-] ) printf '%s' "$ch" ;; + ( * ) printf '%%%02x' "'$ch" ;; + esac + done + printf '\n' +} + +for f in "$SDK_DIR/notebooks/hdinsight/"*.ipynb; do + hdfs dfs -copyFromLocal "$(urlencode "$f")" "$HDFS_NOTEBOOKS_FOLDER" +done + +# Constants needed for changing Ambari configs +AMBARI_HOST="headnodehost" +AMBARI_PORT="8080" +AMBARI_USER="$(python -c ' +import hdinsight_common.Constants as C +print C.AMBARI_WATCHDOG_USERNAME')" +AMBARI_PASSWD="$(python -c ' +import hdinsight_common.ClusterManifestParser as P, hdinsight_common.Constants as C, base64 +base64pwd = P.parse_local_manifest().ambari_users.usersmap[C.AMBARI_WATCHDOG_USERNAME].password +print base64.b64decode(base64pwd)')" +CLUSTERNAME="$(python -c ' +import hdinsight_common.ClusterManifestParser as P +print P.parse_local_manifest().deployment.cluster_name')" + +# Stop and restart affected services +stop_service_via_rest() { # service-name + local name="$1"; echo "Stopping $name" + local data='{"RequestInfo": {"context" :"Stopping service '"$name"' to install MMLSpark"},' + data+=' "Body": {"ServiceInfo": {"state": "INSTALLED"}}}' + curl $curlflags -u "$AMBARI_USER:$AMBARI_PASSWD" -i -H "X-Requested-By: ambari" -X PUT -d "$data" \ + "http://$AMBARI_HOST:$AMBARI_PORT/api/v1/clusters/$CLUSTERNAME/services/$name" + echo "" +} +start_service_via_rest() { # service-name + local name="$1"; echo "Starting $name" + sleep 2 + local data='{"RequestInfo": {"context" :"Starting service '"$name"' with a new MMLSpark version"},' + data+=' "Body": {"ServiceInfo": {"state": "STARTED"}}}' + local args=($curlflags + -u "$AMBARI_USER:$AMBARI_PASSWD" -i -H "X-Requested-By: ambari" -X PUT -d "$data" + "http://$AMBARI_HOST:$AMBARI_PORT/api/v1/clusters/$CLUSTERNAME/services/$name") + local r="$(curl "${args[@]}")" + if [[ "$r" = *"500 Server Error"* || "$r" = *"internal system exception occurred"* ]]; then + sleep 60 + echo "Retry starting $name" + r="$(curl "${args[@]}")" + fi + echo "$r" + echo "" +} + +# Restart affected services +stop_service_via_rest LIVY +stop_service_via_rest JUPYTER +start_service_via_rest LIVY +start_service_via_rest JUPYTER + +echo "Done." diff --git a/tools/hdi/setup-test-authkey.sh b/tools/hdi/setup-test-authkey.sh new file mode 100755 index 0000000000..8d08960e80 --- /dev/null +++ b/tools/hdi/setup-test-authkey.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. 
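# (The <=<=fill-in-...=>=> markers here and in install-mmlspark.sh above are
# rewritten by _upload_artifacts_to_storage in tools/runme/build.sh with plain
# bash substitution, roughly
#   txt="$(< "$f")"
#   txt="${txt//<=<=fill-in-sdk-dir=>=>/$CLUSTER_SDK_DIR}"
# before the scripts are uploaded next to the build artifacts, so the markers
# must appear here verbatim.)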
+ +SDK_DIR="<=<=fill-in-sdk-dir=>=>" +NB_DIR="$SDK_DIR/notebooks" # This gets created as root, need to chown to spark + +# This is the public key used by the build to access the test cluster +PUB_KEY="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC0FUryXQloryZQGXVP9vOqBVsuUWihHs" +PUB_KEY+="YPHvNf8PgR6ctUxPrvdZheAuJ+JLmauZeV2B01lSqCdyhnkwxTKiwLh2dDFx2yruAcXd2" +PUB_KEY+="0MGjD3bc8kC60GxMgRMsRxL6Jgz9FtauLFLxiDuvsRxQcSCBGd+l+pPR/NuFZeSHlmRWC" +PUB_KEY+="mb25fY29tqyitEqytRT9viBA1QpoERSPuzr3DEy3YIJ4BLVen0VYLKMU58L7oyEZxTElm" +PUB_KEY+="7nQMeQKgRBWUZZgCB1pXR3JiTYni/bWP2t9wCWfgfNfSs1oUttt14Libm9NgRbjq2QzN8" +PUB_KEY+="aQtVv1KyAUKOEdPmFqiGCPh1lRvm4KB7MF key-for-VSO" + +cd ~spark || { echo "ERROR: could not find ~spark, aborting" 1>&2; exit 1; } + +# Add public key to authorized key +if [[ ! -f ".ssh/authorized_keys" ]] || ! grep -q " key-for-VSO\$" ".ssh/authorized_keys"; then + echo "Public key not found in authorized keys. Adding key..." + mkdir -p ".ssh" + echo "$PUB_KEY" >> ".ssh/authorized_keys" + chown -R "spark:spark" ".ssh" + chmod 700 ".ssh" +else + echo "Public key already added to authorized keys. Skipping..." +fi + +chown -R "spark:spark" "$NB_DIR" + +. /usr/bin/anaconda/bin/activate +conda update setuptools +pip install --upgrade nbconvert +pip install xmlrunner diff --git a/tools/hdi/update_livy.py b/tools/hdi/update_livy.py new file mode 100755 index 0000000000..3f4a475cf8 --- /dev/null +++ b/tools/hdi/update_livy.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import sys +import json + +def main(): + if len(sys.argv) != 4: + raise Exception(("Not enough" if len(sys.argv)<4 else "Too many") + " arguments.") + [_, config_file, maven_pkg, ld_lib_path] = sys.argv + with open(config_file) as conf_file: + conf=json.load(conf_file) + conf["session_configs"]["conf"] = {} + conf["session_configs"]["conf"]["spark.jars.packages"] = maven_pkg + with open(config_file, "w") as outfile: + json.dump(conf, outfile, indent=2, sort_keys=True) + +if __name__ == "__main__": + try: + main() + except Exception as exn: + for line in str(exn).split("\n"): + print "[ERROR] {0}".format(line) + sys.exit(1) diff --git a/tools/mmlspark-packages.spec b/tools/mmlspark-packages.spec new file mode 100644 index 0000000000..667cf64056 --- /dev/null +++ b/tools/mmlspark-packages.spec @@ -0,0 +1,67 @@ +cycler=0.10.0 +decorator=4.0.11 +expat=2.1.0 +fontconfig=2.12.1 +glib=2.50.2 +ipykernel=4.6.1 +ipython=6.0.0 +ipython_genutils=0.2.0 +jbig=2.1 +jsonschema=2.6.0 +jupyter_client=5.0.1 +jupyter_console=5.1.0 +jupyter_core=4.3.0 +libffi=3.2.1 +libgcc=5.2.0 +libgfortran=3.0.0 +libiconv=1.14 +libsodium=1.0.10 +matplotlib=2.0.1 +nbformat=4.3.0 +numpy=1.12.1 +olefile=0.44 +pandas=0.19.2 +path.py=10.3.1 +pcre=8.39 +pexpect=4.2.1 +pickleshare=0.7.4 +pillow=4.1.0 +prompt_toolkit=1.0.14 +ptyprocess=0.5.1 +pycosat=0.6.1 +pycrypto=2.6.1 +pyparsing=2.1.4 +python=3.5.2 +python-dateutil=2.6.0 +pytz=2017.2 +pyzmq=16.0.2 +readline=6.2 +requests=2.11.1 +scikit-learn=0.18.1 +scipy=0.19.0 +setuptools=27.2.0 +simplegeneric=0.8.1 +sip=4.18 +six=1.10.0 +tornado=4.5.1 +traitlets=4.3.2 +wcwidth=0.1.7 +wheel=0.29.0 +yaml=0.1.6 +zeromq=4.1.5 +zlib=1.2.8 +notebook=5.0.0 +jinja2=2.9.6 +markupsafe=0.23 +mkl=2017.0.1 +pygments=2.2.0 +jpeg=9b +libpng=1.6.27 +nbconvert=5.1.1 +html5lib=0.999 +bleach=1.5.0 +entrypoints=0.2.2 +mistune=0.7.4 +pandocfilters=1.4.1 +testpath=0.3 +freetype=2.5.5 diff --git 
a/tools/notebook/postprocess.py b/tools/notebook/postprocess.py new file mode 100755 index 0000000000..498985accf --- /dev/null +++ b/tools/notebook/postprocess.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python + +DEPLOYMENT_KEY = "mml-deploy" +NOTEBOOK_POSTPROC = {} + +def _get_kernel_language(notebook): + name = notebook.metadata.language_info["name"].lower() + if "py" in name: + return "python" + elif "scala" in name: + return "scala" + else: + raise ValueError("Unknown language") + +def _setup_kernel_local(notebook): + if _get_kernel_language(notebook) == "python": + notebook.metadata["kernelspec"] = { + "display_name": "Python [default]", + "language": "python", + "name": "python3"} + notebook.metadata["language_info"] = { + "codemirror_mode": {"name": "ipython", "version": 3.0}, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2"} + return notebook +NOTEBOOK_POSTPROC["local"] = _setup_kernel_local + +def _setup_kernel_hdinsight(notebook): + from nbformat.notebooknode import NotebookNode + if _get_kernel_language(notebook) == "python": + notebook.metadata["kernelspec"] = { + "display_name": "PySpark3", + "language": "", + "name": "pyspark3kernel"} + notebook.metadata["language_info"] = { + "codemirror_mode": {"name": "python", "version": 3}, + "mimetype": "text/x-python", + "name": "pyspark3", + "pygments_lexer": "python3"} + return notebook +NOTEBOOK_POSTPROC["hdinsight"] = _setup_kernel_hdinsight + +def _notebooks_for_target(notebooks, target): + """Returns the subset of `notebooks` that must be deployed to a given + `target`. + :param notebooks: List of (file_name, NotebookNode) tuples. + :param target: Deployment target. + :rtype: List of (file_name, NotebookNode)""" + + from nbformat.notebooknode import NotebookNode + from copy import deepcopy + return [(notebook[0], deepcopy(notebook[1])) for notebook in notebooks + if target in notebook[1].metadata.get(DEPLOYMENT_KEY, target)] + +def _cells_for_target(notebook, target): + """Returns a notebook containing only the cells that must be deployed + to `target`. 
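    For example, a cell whose metadata contains {"mml-deploy": "hdinsight"} is
    kept only when `target` is "hdinsight", while a cell without the
    "mml-deploy" key is kept for every target (the lookup defaults to `target`
    itself).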
+ :param notebook: NotebookNode containing the cells and other metadata + :param target: Deployment target""" + + notebook["cells"] = [cell for cell in notebook["cells"] + if target in cell.metadata.get(DEPLOYMENT_KEY, target)] + return notebook + +def _postprocessed_notebooks_by_target(notebooks): + """Returns a collection of notebooks for each of the deployment + targets with cells filtered for that target if necessary.""" + + notebooks_by_target = {} + for target in NOTEBOOK_POSTPROC.keys(): + candidate_nb = _notebooks_for_target(notebooks, target) + processed_nb = [(notebook[0], _cells_for_target(notebook[1], target)) + for notebook in candidate_nb] + postprocd_nb = [(notebook[0], NOTEBOOK_POSTPROC[target](notebook[1])) + for notebook in processed_nb] + notebooks_by_target[target] = postprocd_nb + + return notebooks_by_target + +def postprocess_notebooks(input_dir, output_base_dir): + """Scans all notebook files in `input_dir` and outputs + them for each deployment target under `output_base_dir`.""" + + import os + import glob + from nbformat import read, write, NO_CONVERT + notebooks = [(os.path.split(nbfile)[-1], read(nbfile, NO_CONVERT)) + for nbfile in glob.glob(os.path.join(input_dir, "*.ipynb"))] + notebooks_by_target = _postprocessed_notebooks_by_target(notebooks) + + for target, notebooks in notebooks_by_target.items(): + destination_dir = os.path.join(output_base_dir, target) + if not os.path.isdir(destination_dir): + os.makedirs(destination_dir) + for notebook in notebooks: + write(notebook[1], os.path.join(destination_dir, notebook[0])) + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser( + description = "Generate notebooks for each of the deployment targets: %s" + % (", ".join(NOTEBOOK_POSTPROC.keys()))) + parser.add_argument("input_dir", help = "Input directory containing notebooks") + parser.add_argument("output_dir", help = "Output directory for notebooks") + args = parser.parse_args() + postprocess_notebooks(args.input_dir, args.output_dir) diff --git a/tools/notebook/tester/NotebookTestSuite.py b/tools/notebook/tester/NotebookTestSuite.py new file mode 100644 index 0000000000..c17b065c37 --- /dev/null +++ b/tools/notebook/tester/NotebookTestSuite.py @@ -0,0 +1,69 @@ +import unittest + +class NotebookTestSuite(unittest.TestCase): + + # Tese are set if $PROC_SHARD has a "/" value + proc_num, proc_mod = (0, 0) + + def setUp(self): + from nbconvert.preprocessors import ExecutePreprocessor + self.preprocessor = ExecutePreprocessor(timeout=600, enabled=True, allow_errors=False) + + @staticmethod + def _discover_notebooks(): + import os, fnmatch + counter = -1 + for dirpath, dirnames, filenames in os.walk("."): + # skip checkpoint directories + if "ipynb_checkpoints" in dirpath: + continue + dirnames.sort() + filenames.sort() + for notebook_file in fnmatch.filter(filenames, "*.ipynb"): + counter += 1 + if (NotebookTestSuite.proc_num == 0 + or counter % NotebookTestSuite.proc_num == NotebookTestSuite.proc_mod): + yield dirpath, notebook_file + + def _in_pyspark(self): + """ + _in_pyspark: Returns true if this test is run in a context that has access to PySpark + """ + try: + from pyspark.sql import SparkSession + return True + except ImportError: + return False + + def edit_notebook(self, nb): + return nb + + @classmethod + def initialize_tests(cls): + import os, re + proc_shard = re.match("^ *(\d+) */ *(\d+) *$", os.getenv("PROC_SHARD","")) + if proc_shard: + NotebookTestSuite.proc_num = int(proc_shard.group(2)) + NotebookTestSuite.proc_mod = 
int(proc_shard.group(1)) - 1 + if not NotebookTestSuite.proc_mod < NotebookTestSuite.proc_num: + raise Exception("proc_shard: n should be <= m in n/m") + for dirpath, file_name in NotebookTestSuite._discover_notebooks(): + test_name = "test_" + re.sub("\\W+", "_", file_name) + def make_test(nbfile): + return lambda instance: instance.verify_notebook(nbfile) + setattr(cls, test_name, make_test(os.path.join(dirpath, file_name))) + + def verify_notebook(self, nbfile): + """ + verify_notebook: Runs a notebook and ensures that all cells execute without errors. + """ + from nbformat import read as read_nb, NO_CONVERT + try: + # First newline avoids the confusing "F"/"." output of unittest + print("\nTesting " + nbfile) + nb = read_nb(nbfile, NO_CONVERT) + if self._in_pyspark(): + nb = self.edit_notebook(nb) + self.preprocessor.preprocess(nb, {}) + except Exception as err: + self.fail(err) diff --git a/tools/notebook/tester/TestNotebooksLocally.py b/tools/notebook/tester/TestNotebooksLocally.py new file mode 100644 index 0000000000..e1d8e9c840 --- /dev/null +++ b/tools/notebook/tester/TestNotebooksLocally.py @@ -0,0 +1,36 @@ +import unittest +from NotebookTestSuite import NotebookTestSuite + +class LocalNotebookTests(NotebookTestSuite): + + def edit_notebook(self, nb): + """ + Inject the code needed to setup and shutdown spark and sc magic variables. + """ + from nbformat.notebooknode import NotebookNode + from textwrap import dedent + preamble_node = NotebookNode(cell_type="code", source=dedent(""" + from pyspark.sql import SparkSession + spark = SparkSession.builder.appName("NotebookTestSuite").master("local[*]").getOrCreate() + globals()["spark"] = spark + globals()["sc"] = spark.sparkContext + """)) + epilogue_node = NotebookNode(cell_type="code", source=dedent(""" + try: + spark.stop() + except: + pass + """)) + nb.cells.insert(0, preamble_node) + nb.cells.append(epilogue_node) + return nb + +if __name__ == "__main__": + import os, xmlrunner + LocalNotebookTests.initialize_tests() + outsfx = None + if LocalNotebookTests.proc_num > 0: + outsfx = str(LocalNotebookTests.proc_mod + 1) + result = unittest.main(testRunner=xmlrunner.XMLTestRunner(output=os.getenv("TEST_RESULTS","TestResults"), + outsuffix=outsfx), + failfast=False, buffer=False, catchbreak=False) diff --git a/tools/notebook/tester/TestNotebooksOnHdi.py b/tools/notebook/tester/TestNotebooksOnHdi.py new file mode 100644 index 0000000000..3911e9103d --- /dev/null +++ b/tools/notebook/tester/TestNotebooksOnHdi.py @@ -0,0 +1,48 @@ +import unittest +from NotebookTestSuite import NotebookTestSuite +from nbconvert.preprocessors import ExecutePreprocessor +from nbconvert.preprocessors.execute import CellExecutionError +from textwrap import dedent + +class ExecuteSparkmagicPreprocessor(ExecutePreprocessor): + + def preprocess_cell(self, cell, resources, cell_index): + """ + Executes a single code cell. See base.py for details. + + To execute all cells see :meth:`preprocess`. 
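        Unlike the stock ExecutePreprocessor, any "stderr" stream output is
        treated as a failure: the offending cell's source and the stderr text
        are wrapped in a CellExecutionError.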
+ """ + if cell.cell_type != "code": + return cell, resources + outputs = self.run_cell(cell) + cell.outputs = outputs + if not self.allow_errors: + for out in outputs: + if out.output_type == "stream" and out.name == "stderr": + pattern = u"""\ + An error occurred while executing the following cell: + ------------------ + {cell.source} + ------------------ + {out.text} + """ + msg = dedent(pattern).format(out=out, cell=cell) + raise CellExecutionError(msg) + return cell, resources + + +class HdiNotebookTests(NotebookTestSuite): + + def setUp(self): + self.preprocessor = ExecuteSparkmagicPreprocessor(timeout=600, enabled=True, + allow_errors=False) + +if __name__ == "__main__": + import os, xmlrunner + HdiNotebookTests.initialize_tests() + outsfx = None + if HdiNotebookTests.proc_num > 0: + outsfx = str(HdiNotebookTests.proc_mod + 1) + result = unittest.main(testRunner=xmlrunner.XMLTestRunner(output=os.getenv("TEST_RESULTS","TestResults"), + outsuffix=outsfx), + failfast=False, buffer=False, catchbreak=False) diff --git a/tools/notebook/tester/parallel_run.sh b/tools/notebook/tester/parallel_run.sh new file mode 100755 index 0000000000..88f19ecf8e --- /dev/null +++ b/tools/notebook/tester/parallel_run.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# Use this script to run the tests in N parallel processes, where +# notebooks are split among them. It would have been better to combine +# python's unittest with multiprocessing (create a process pool, use +# p.map), but that will be more work, especially for dealing with the +# xml output. + +# Arguments: proc_num py_file [args...] +proc_num="$1"; shift +py_file="$1"; shift + +prefix_lines() { # pfx + local line + while read -r line; do printf "%s| %s\n" "$1" "${line%$'\r'}"; done +} + +onerun() { # id [args...] + local id="$1"; shift + # dump the prefixed output on the correct fd; use script to fake a tty + # so the python process would show progress. + PROC_SHARD="$id" \ + script -qefc "$(printf "%q " python "$py_file" "$@")" /dev/null \ + 1> >(prefix_lines "$id") 2> >(prefix_lines "$id" 1>&2) +} + +procs=() +for ((i=1; i <= proc_num; i++)); do onerun "$i/$proc_num" "$@" & procs+=($!); done + +status=0 +for p in "${procs[@]}"; do wait "$p" || status="$?"; done +exit $status diff --git a/tools/pip/MANIFEST.in b/tools/pip/MANIFEST.in new file mode 100644 index 0000000000..8a2066b50b --- /dev/null +++ b/tools/pip/MANIFEST.in @@ -0,0 +1,5 @@ +# Misc +include LICENSE.txt + +# documentation, if any - there may be a minimal set of docs: +# recursive-include docs *.html *.txt *.js diff --git a/tools/pip/README.txt b/tools/pip/README.txt new file mode 100644 index 0000000000..b73c4462e6 --- /dev/null +++ b/tools/pip/README.txt @@ -0,0 +1,8 @@ +Microsoft ML for Apache Spark +============================= + +This package contains the PySpark library for MMLSpark. + +This library provides spark estimators, transformers, and utility functions +for machine learning on Spark. For more complete documentation, refer to +the MMLSpark repo: https://github.com/Azure/mmlspark . diff --git a/tools/pip/generate-pip.sh b/tools/pip/generate-pip.sh new file mode 100755 index 0000000000..02aed7d7a4 --- /dev/null +++ b/tools/pip/generate-pip.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. 
"$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +main() { + +local srcdir="$TOOLSDIR/pip" +local destdir="$BUILD_ARTIFACTS/packages/pip" +local tempdir="$destdir/mmlspark" +local wheelfile="$destdir/$PIP_PACKAGE" + +# Create the package structure in the temp packaging directory +_rmcd "$tempdir" +_ cp "$srcdir/"* .; _rm *.sh +_ unzip -q "$BUILD_ARTIFACTS/sdk/mmlspark.zip" +_ cp "$BASEDIR/LICENSE" "LICENSE.txt" + +# Create the package +_ python setup.py bdist_wheel --universal -d "$destdir" +if [[ -r "$wheelfile" ]]; then show - "Generated wheel: $wheelfile" +else failwith "expected wheel file missing: $wheelfile"; fi + +# Cleanup +_ cd "$destdir" +_rm "$tempdir" + +} +main "$@" diff --git a/tools/pip/setup.py b/tools/pip/setup.py new file mode 100644 index 0000000000..93dd8e1f88 --- /dev/null +++ b/tools/pip/setup.py @@ -0,0 +1,33 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +import setuptools, os + +setuptools.setup( + name = "mmlspark", + version = os.environ["MML_VERSION"], + description = "Microsoft ML for Spark", + long_description = "The Microsoft ML for Apache Spark package provides a python API to scala.", + license = "MIT", + packages = ["mmlspark"], + + # Project's main homepage. + url = "https://github.com/Azure/mmlspark", + # Author details + author = "Microsoft", + author_email = os.environ["SUPPORT_EMAIL"], + + classifiers = [ + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Intended Audience :: Data Scientists", + "Topic :: Software Development :: Datascience Tools", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 3" + ], + + zip_safe = True, + + package_data = {"mmlspark": ["../LICENSE.txt", "../README.txt"]} +) diff --git a/tools/pytests/auto-tests b/tools/pytests/auto-tests new file mode 100755 index 0000000000..def21357f5 --- /dev/null +++ b/tools/pytests/auto-tests @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +@ "shared.sh" +main() { + +show section "Running Generated Python Tests" +local testdir="$TEST_RESULTS/generated_pytests" t status=0 +cd "$testdir" +for t in *"_tests.py"; do + printf "\n\n==================== %s ====================\n" "$t" + _pytest "$t" || status=$? +done +if ((status)); then failwith "failures in generated python tests"; fi + +} +main "$@" diff --git a/tools/pytests/notebook-tests b/tools/pytests/notebook-tests new file mode 100755 index 0000000000..5149d3face --- /dev/null +++ b/tools/pytests/notebook-tests @@ -0,0 +1,11 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" +@ "shared.sh" + +show section "Running Local Notebook Tests" +cd "$TEST_RESULTS/notebook_tests/local" +_ cp -a "$BASEDIR/tools/notebook/tester/"* . +_pytest "TestNotebooksLocally.py" || failwith "failures in local notebook tests" diff --git a/tools/pytests/shared.sh b/tools/pytests/shared.sh new file mode 100644 index 0000000000..94354caf12 --- /dev/null +++ b/tools/pytests/shared.sh @@ -0,0 +1,16 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
See LICENSE in project root for information. + +_pytest() { + local ret=0 tmperr="/tmp/pytest-stderr-$$" line + # capture stderr and show it on failure, because it looks like log4j is using + # stderr directly, which bypasses unittest's capture of stderr + TEST_RESULTS="$TEST_RESULTS" \ + "$TOOLSDIR/bin/mml-exec" spark-submit "$@" 2> "$tmperr" || { + ret=$? + echo "Standard error for the above failure:" + cat "$tmperr" | while read -r line; do printf " | %s\n" "$line"; done + } + rm -f "$tmperr" + return $ret +} diff --git a/tools/runme/README.txt b/tools/runme/README.txt new file mode 100644 index 0000000000..7eea1b07b0 --- /dev/null +++ b/tools/runme/README.txt @@ -0,0 +1,4 @@ +This directory holds the implementation of the build/install script. +These files are not intended to be used directly, use the toplevel +"runme" script for that. For a description of what it's doing, use +"runme help". diff --git a/tools/runme/build-readme.tmpl b/tools/runme/build-readme.tmpl new file mode 100644 index 0000000000..155e983c34 --- /dev/null +++ b/tools/runme/build-readme.tmpl @@ -0,0 +1,12 @@ +# MMLSpark $MML_VERSION + +* Source: [$BUILD_REPOSITORY_NAME]($BUILD_REPOSITORY_URI), + ${BUILD_SOURCEBRANCH##refs/@(heads/|)} at revision ${BUILD_SOURCEVERSION:0:8} + (by $BUILD_SOURCEVERSIONAUTHOR). + +* Build: $BUILD_DEFINITIONNAME, $BUILD_BUILDNUMBER + (built by $AGENT_NAME on $AGENT_MACHINENAME, $(date +'%F %R')) + +* Info: `$MML_BUILD_INFO` + +Queued by: $BUILD_QUEUEDBY for [$BUILD_REQUESTEDFOR](mailto:$BUILD_REQUESTEDFOREMAIL) diff --git a/tools/runme/build.sh b/tools/runme/build.sh new file mode 100644 index 0000000000..d89b86ed8e --- /dev/null +++ b/tools/runme/build.sh @@ -0,0 +1,249 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Build + +# Since developers usually work in the IDE, most of the build mechanics is done +# by SBT. + +_show_template_line() { + eval show - "$(qstr -not-dollar "${2%$'\n'}")" +} + +_generate_description() { + if [[ "$BUILDMODE" != "server" || "$AGENT_ID" = "" ]]; then return; fi + show section "Generating Build.md" + show command "... > $(qstr "$BUILD_ARTIFACTS/Build.md")" + mapfile -c 1 -C _show_template_line \ + < "$RUNMEDIR/build-readme.tmpl" > "$BUILD_ARTIFACTS/Build.md" + if [[ "$PUBLISH" = "all" ]]; then + printf '\nThis is a publish build.\n' >> "$BUILD_ARTIFACTS/Build.md" + echo "##vso[build.addbuildtag]Publish" + fi + # upload the generated description lazily on exit, so we can add info lines below + echo_exit "##vso[task.uploadsummary]$BUILD_ARTIFACTS/Build.md" +} + +_postprocess_sbt_log() { + # Adapts the SBT output to work nicely with the VSTS build, most of the work + # is for the SPARK output logs + local line rx tag text oIFS="$IFS" + IFS="" # preserve whitespaces + # Prefix finding regexp + rx=$'^(\e[[0-9]+m)?\[?(\e[[0-9]+m)??' + rx+=$'(warning|WARNING|warn|WARN|info|INFO|error|ERROR)' + rx+=$'(\e[[0-9]+m)?\]?(\e[[0-9]+m)? 
*(.*)' + while read -r line || [[ -n "$line" ]]; do + # Drop time stamps from SPARK output lines + line="${line#[0-9][0-9]/[0-9][0-9]/[0-9][0-9] [0-9][0-9]:[0-9][0-9]:[0-9][0-9] }" + # Highlight a prefix of "[warning]" with optional brackets, and the same + # for "warn"s, "error"s and "info"s (for info, just drop the prefix); do + # that for uppercase also, but *not* mixed since spark shows a line that + # starts with "Info provided" + if [[ "${line}" =~ $rx ]]; then + tag="${BASH_REMATCH[3],,}" + if [[ "$tag" = "warn" ]]; then tag="warning" + elif [[ "$tag" = "info" ]]; then tag="-" + fi + # preserve the line (with escape sequences) when in interactive mode + if [[ "${BUILDMODE}${BASH_REMATCH[1]}" != "server" ]]; then text="$line" + else text="${BASH_REMATCH[6]}" + fi + show "$tag" "$text" + else + echo "$line" + fi + done + IFS="$oIFS" +} + +_prepare_build_artifacts() { + show section "Preparing Build" + _rm "$BUILD_ARTIFACTS" "$TEST_RESULTS" + _ mkdir -p "$BUILD_ARTIFACTS/sdk" "$TEST_RESULTS" + _ cp -a "$BASEDIR/LICENSE" "$BUILD_ARTIFACTS" + _ cp -a "$BASEDIR/LICENSE" "$BUILD_ARTIFACTS/sdk" + echo "$MML_VERSION" > "$BUILD_ARTIFACTS/version" + local paths + # copy only the test notebooks from notebooks/tests to the local test + # directory -- running all notebooks is covered better by the E2E tests + for paths in "samples:$BUILD_ARTIFACTS/notebooks" "tests:$TEST_RESULTS/notebook_tests"; do + _ "$BASEDIR/tools/notebook/postprocess.py" "$BASEDIR/notebooks/${paths%%:*}" "${paths#*:}" + done +} + +_sbt_run() { # sbt-args... + local flags=""; if [[ "$BUILDMODE" = "server" ]]; then flags="-no-colors"; fi + (set -o pipefail; _ sbt $flags "$@" < /dev/null 2>&1 | _postprocess_sbt_log) \ + || exit $? +} + +_sbt_build() { + show section "Running SBT Build" + local owd="$PWD" restore_opt="$(shopt -p nullglob)"; shopt -s nullglob + cd "$SRCDIR" + local rmjars=( **/"target/scala-"*/!(*"-$MML_VERSION")".jar" ) + $restore_opt + if [[ "${#rmjars[@]}" != "0" ]]; then + show command "rm **/target/...stale-jars" + __ rm "${rmjars[@]}" + fi + local TESTS="$TESTS" + if ! should test scala; then TESTS="none" + else # Hide the "+scala" tag + TESTS=",$TESTS,"; TESTS="${TESTS//,+scala,/,}"; TESTS="${TESTS#,}"; TESTS="${TESTS%,}" + if [[ "$TESTS" = "" ]]; then TESTS="all"; fi + fi + _sbt_run "full-build" + # leave only the -assembley jars under the proper name (and the pom files) + local f; for f in "$BUILD_ARTIFACTS/packages/m2/"**; do case "$f" in + ( *-@(javadoc|sources).jar@(|.md5|.sha1) ) _rm "$f" ;; + ( *-assembly.jar@(|.md5|.sha1) ) _ mv "$f" "${f//-assembly.jar/.jar}" ;; + esac; done + cd "$owd" +} + +_upload_to_storage() { # name, pkgdir, container + show section "Publishing $1 Package" + _ az storage blob upload-batch --account-name "$MAIN_CONTAINER" \ + --source "$BUILD_ARTIFACTS/packages/$2" --destination "$3" +} + +_e2e_script_action() { # script-name file-name config-name + local cnf="$1" script_name="$2" file="$3"; shift 3 + local cluster="${cnf}_CLUSTER_NAME" group="${cnf}_RESOURCE_GROUP" + local url="$STORAGE_URL/$MML_VERSION/$file" + collect_log=1 \ + _ azure hdinsight script-action create "${!cluster}" -g "${!group}" \ + -n "$script_name" -u "$url" -t "headnode;workernode" + echo "$collected_log" + if [[ ! 
"$collected_log" =~ "Operation state: "+"Succeeded" ]]; then + failwith "script action failed" + fi +} +e2ekey="" +_e2e_ssh() { + local cmd keyfile rm_pid ret + cmd=("ssh"); if [[ "$1" = "scp" ]]; then cmd=("$1"); shift; fi + if [[ "$_e2e_key" = "" ]]; then + e2ekey="$(__ az keyvault secret show --vault-name mmlspark-keys --name testcluster-ssh-key)" + e2ekey="${e2ekey##*\"value\": \"}"; e2ekey="${e2ekey%%\"*}"; e2ekey="${e2ekey//\\n/$'\n'}" + fi + keyfile="/dev/shm/k$$"; touch "$keyfile"; chmod 600 "$keyfile"; echo "$e2ekey" > "$keyfile" + cmd+=(-o "StrictHostKeyChecking=no" -i "$keyfile") + if [[ "${cmd[0]}" = "ssh" ]]; then + { sleep 30; rm -f "$keyfile"; } & + rm_pid="$!" + _ -a "${cmd[@]}" "$@"; ret="$?" + kill -9 "$rm_pid" > /dev/null 2>&1; rm -f "$keyfile" + elif [[ "${cmd[0]}" = "scp" ]]; then + _ -a "${cmd[@]}" "$@"; ret="$?" + rm -f "$keyfile" + fi + return $ret +} +_e2e_tests() { + show section "Running E2E Tests" + _e2e_script_action "E2E" "Install MML to E2E Cluster" "install-mmlspark.sh" + _e2e_script_action "E2E" "Setup authorized-keys for E2E" "setup-test-authkey.sh" + local shost="$E2E_CLUSTER_SSH" sdir="$CLUSTER_SDK_DIR/notebooks/hdinsight" + _e2e_ssh scp -p "$TEST_RESULTS/notebook_tests/hdinsight/"* "$shost:$sdir" + _e2e_ssh scp -p "$BASEDIR/tools/notebook/tester/"* "$shost:$sdir" + _e2e_ssh -t -t "$shost" \ + ". /usr/bin/anaconda/bin/activate; \ + cd \"$sdir\"; rm -rf \"../local\"; \ + ./parallel_run.sh 2 \"TestNotebooksOnHdi.py\"" + local ret="$?" + _e2e_ssh scp "$shost:$sdir/TestResults/*" "$TEST_RESULTS" + if ((ret != 0)); then failwith "E2E test failures"; fi +} + +_publish_to_demo_cluster() { + show section "Installing Demo Cluster" + _e2e_script_action "DEMO" "Install MML to Demo Cluster" "install-mmlspark.sh" +} + +_publish_to_dockerhub() { + @ "../docker/build-docker" + local itag="mmlspark:latest" otag otags + otag="microsoft/mmlspark:$MML_VERSION"; otag="${otag//+/_}"; otags=("$otag") + if [[ "$MML_VERSION" = *([0-9.]) ]]; then otags+=( "microsoft/mmlspark:latest" ); fi + show section "Pushing to Dockerhub as ${otags[*]}" + show - "Image info:" + local info="$(docker images "$itag")" + if [[ "$info" != *$'\n'* ]]; then failwith "tag not found: $itag"; fi + info=" | ${info//$'\n'/$'\n | '}" + echo "$info" + local auth user pswd + __ docker logout > /dev/null + auth="$(__ az keyvault secret show --vault-name mmlspark-keys --name dockerhub-auth)" + auth="${auth##*\"value\": \"}"; auth="${auth%%\"*}"; auth="$(base64 -d <<<"$auth")" + user="${auth%%:*}" pswd="${auth#*:}" + ___ docker login -u "$user" -p "$pswd" > /dev/null + unset user pass auth + for otag in "${otags[@]}"; do + show - "Pushing \"$otag\"" + _ docker tag "$itag" "$otag" + _ docker push "$otag" + _ docker rmi "$otag" + done + __ docker logout > /dev/null +} + +_upload_artifacts_to_VSTS() { + if [[ "$BUILDMODE" != "server" ]]; then return; fi + show section "Uploading Build Artifacts to VSTS" + local f d + for f in "$BUILD_ARTIFACTS/"**/*; do + if [[ -d "$f" ]]; then continue; fi + f="${f#$BUILD_ARTIFACTS}"; d="${f%/*}" + echo "##vso[artifact.upload artifactname=Build$d]$BUILD_ARTIFACTS/$f" + done +} + +_upload_artifacts_to_storage() { + show section "Uploading Build Artifacts to Storage" + _ az account show > /dev/null # this fails if not logged-in + local tmp="/tmp/mmlbuild-$$" # temporary place for uploads + mkdir -p "$tmp" + ( cd "$BUILD_ARTIFACTS" + _ zip -qr9 "$tmp/$(basename "$BUILD_ARTIFACTS.zip")" * ) + local f txt + for f in "$TOOLSDIR/hdi/"*; do + txt="$(< "$f")" + 
txt="${txt//<=<=fill-in-maven-package=>=>/com.microsoft.ml.spark:mmlspark_$SCALA_VERSION:$MML_VERSION}" + txt="${txt//<=<=fill-in-maven-url=>=>/$MAVEN_URL}" + txt="${txt//<=<=fill-in-pip-package=>=>/$PIP_URL/$PIP_PACKAGE}" + txt="${txt//<=<=fill-in-sdk-dir=>=>/$CLUSTER_SDK_DIR}" + txt="${txt//<=<=fill-in-url=>=>/$STORAGE_URL/$MML_VERSION}" + echo "$txt" > "$tmp/$(basename "$f")" + done + _ az storage blob upload-batch --account-name "$MAIN_CONTAINER" \ + --source "$tmp" --destination "$STORAGE_CONTAINER/$MML_VERSION" + _rm "$tmp" + printf '\nCopy the link to [%s](%s) to setup this build on a cluster.' \ + "this HDInsight Script Action" "$STORAGE_URL/$MML_VERSION/install-mmlspark.sh" \ + >> "$BUILD_ARTIFACTS/Build.md" +} + +_full_build() { + show section "Building ($MML_VERSION)" + _ cd "$BASEDIR" + _prepare_build_artifacts + _generate_description + _sbt_build + _ ln -sf "$(realpath --relative-to="$HOME/bin" "$TOOLSDIR/bin/mml-exec")" \ + "$HOME/bin" + should publish maven && _upload_to_storage "Maven" "m2" "$MAVEN_CONTAINER" + should test python && @ "../pytests/auto-tests" + should test python && @ "../pytests/notebook-tests" + should publish pip && @ "../pip/generate-pip.sh" + should publish pip && _upload_to_storage "PIP" "pip" "$PIP_CONTAINER" + should publish storage && _upload_artifacts_to_storage + should test e2e && _e2e_tests + should publish demo && _publish_to_demo_cluster + should publish docker && _publish_to_dockerhub + _upload_artifacts_to_VSTS + return 0 +} diff --git a/tools/runme/install.sh b/tools/runme/install.sh new file mode 100644 index 0000000000..007cab3b1f --- /dev/null +++ b/tools/runme/install.sh @@ -0,0 +1,206 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Environment Installation + +inst_work_done="" +_note_work() { # what, as something that can be shown in "_ done" + # prefer bigger work as the label to keep + for what in "Install" "Update" "Environment update"; do + if [[ "$what" = "$1" ]]; then inst_work_done="$1"; fi + if [[ "$what" = "$inst_work_done" ]]; then return; fi + done +} + +_verify_version() { # [-q] what; prints error if mismatched version + # "-q" => quick mode: just check that the version file matches + local quick="N"; if [[ "x$1" = "x-q" ]]; then quick="Y"; shift; fi + libname="$1"; shift + local lib vers ver; set_install_info_vars "$libname" lib vers ver + local dir="$HOME/lib/$lib" + if [[ ! -d "$dir" ]]; then + echo "$libname is not installed (missing dir: $dir)"; return + fi + local vcmd="" pat1 pat2 actual line ver_file="$dir/$LIB_VERSION_FILE" + if [[ "$vers" != "" ]]; then vcmd="${vers%%|*}"; vers="${vers#*|}"; fi + if [[ "$quick" = "Y" && -r "$ver_file" ]]; then + read -r actual < "$ver_file" + if [[ "$actual" = "$ver" ]]; then return; fi # look for $ver here! 
+ fi + if [[ "$vcmd" = "" ]]; then echo "no version information"; return; fi + actual="$(cd "$dir"; ___ $vcmd 2>&1)" + if [[ $'\n'"$actual"$'\n' != *$'\n'$vers$'\n'* ]]; then # $vers can have globs + printf 'unexpected %s version,\n wanted:\n | %s\n got:\n | %s' \ + "$libname" "$vers" "${actual//$'\n'/$'\n | '}" + fi +} + +_do_envinits() { + cd "$HOME" + local cmd script="" f="$PROFILE_FILE" add_init="N" orig_profile="$MMLSPARK_PROFILE" + # create the init script + for cmd in "${envinit_commands[@]}"; do + script+="$cmd"$'\n' + local var="" val + if [[ "$cmd" = "export "*"="* ]]; then + var="${cmd#export }"; var="${var%%=*}"; val="$(qstr "${!var}")" + fi + # print only commands and setenvs that change values + if [[ "$var" = "" || "$cmd" != "export $var=$val" ]]; then show command "$cmd"; fi + done + eval "$script" + if [[ ! -e "$f" ]]; then add_init="Y"; show section "Creating $f" + elif [[ "${script%$'\n'}" != "$(<"$f")" ]]; then show section "Updating $f" + else return; fi + _note_work "Environment update" + show command "...init code... > \"$f\"" + echo -n "$script" > "$f" + if [[ "$add_init" = "N" ]]; then return; fi + if [[ "$orig_profile" = "yes" ]]; then + show warning "There was no $f file, but \$MMLSPARK_PROFILE is set," + show warning "so the environment was modified in an unexpected way and" + show warning "therefore no shell init files are modified." + return + fi + show section "Adding environment initialization" + local file haveit="N" text fh="$(qstr "$f")" + local cmd="[[ \"\$MMLSPARK_PROFILE\" != \"\" ]] || . $fh" + local qcmd="$(qstr "$cmd")" + for file in "${ENV_INIT_FILES[@]}"; do + if [[ ! -r "$file" ]]; then continue; fi + haveit="Y" # either it's there or we're adding it + text="$(< "$file")" + if [[ "$text" = *"$cmd"* ]]; then continue; fi + show command "...added init line... > \"$file\"" + # add it at the top since some init files have `return`s in the middle + echo "$cmd"$'\n\n'"$text" > "$file" + done + if [[ "$haveit" = "N" ]]; then + show command "echo $qcmd > \"${ENV_INIT_FILES[0]}\"" + echo "$cmd" > "${ENV_INIT_FILES[0]}" + fi + show - "" + show warning "I made your shell initialization load $f, but this" + show warning "shell is still not initialized. Enter \"source $f\"" + show warning "to do so, or start a new terminal." +} + +_unpack_tgz() { + _ tar xzf "$1" --strip-components=1 +} + +_unpack_zip() { + local restore_opt="$(shopt -p dotglob)"; shopt -s dotglob + _ unzip -q "$1" + local paths=( * ) + if [[ "${#paths[@]}" != "1" || ! -d "${paths[0]}" ]]; then + failwith "empty archive or archive with multiple toplevel directories, $1" + fi + show command "mv ${paths[0]}/* .; rmdir ${paths[0]}" + local tmp="...install-tmp-$$" + mv "${paths[0]}" "$tmp" + mv "$tmp"/* . + rmdir "$tmp" + $restore_opt +} + +_unpack_sh() { + if [[ "x$instcmd" != "x" ]]; then eval "_ $instcmd" + else failwith "sh package without instcmd: $1"; fi +} + +_retrieve_file() { # url file sha256 + # Retrieve the $url into $file with a cache left in $INSTALLER_CACHE_DIR; the + # file will actually be a symlink to the cache; if $INSTALLER_CACHE_DIR is + # empty no cache is used; verify sha256 checksum; only verified files are + # cached; files in the cache are assumed to be valid. 
+ local url="$1" target="$2" sha256="$3"; shift 3 + local cache="$INSTALLER_CACHE_DIR/$(basename "$target")" + if [[ -n "$INSTALLER_CACHE_DIR" && -r "$cache" && -r "$cache.sha256" + && "$(< "$cache.sha256")" = "$sha256" ]]; then + _ ln -sf "$cache" "$target"; return + fi + _ curl --output "$target" $CURL_FLAGS "$url" + local sha256sum="$(__ sha256sum "$target")"; sha256sum="${sha256sum%% *}" + if [[ "x$sha256sum" = "x" ]]; then failwith "could not get sha256 checksum"; fi + if [[ "$sha256sum" != "$sha256" ]]; then + failwith "sha256 checksum failed for $target (retrieved from $url)" + fi + if [[ -z "$INSTALLER_CACHE_DIR" ]]; then return; fi + _md "$INSTALLER_CACHE_DIR" + _ mv "$target" "$cache"; echo "$sha256" > "$cache.sha256" + _ ln -s "$cache" "$target" +} + +_install() { # libname + libname="$1"; shift + local lib envvar url sha256 instcmd exes vers ver bindir prereq where + set_install_info_vars "$libname" \ + lib envvar url sha256 instcmd exes vers ver bindir prereq where + if [[ ( "$BUILDMODE" = "server" && " $where " != *" build "* ) + || ( "$BUILDMODE" = "runtime" && " $where " != *" runtime "* ) + || ( " $where " != *" devel "* ) ]]; then return + fi + local dir="$HOME/lib/$lib" + setenv "${envvar}_VERSION" "$ver" + setenv "${envvar}_HOME" "$dir" + if [[ "x$prereq" != "x" ]] && ! eval "${prereq%|*}" > /dev/null 2>&1; then + failwith "$libname: prerequisite failure: ${prereq##*|}" + fi + if [[ "$(_verify_version -q "$libname")" = "" ]]; then + cd "$dir"; call_ifdef "$libname.init" # can use $ver + return + fi + local update="N" Op; if [[ -r "$dir/$LIB_VERSION_FILE" ]]; then update="Y"; fi + if [[ "$update" = "Y" ]]; then Op="Updating"; _note_work "Update" + else Op="Installing"; _note_work "Install"; fi + # avoid output up to here, so there's nothing unless we actually do something + show section "$Op $libname v$ver in $dir" + show command setenv "${envvar}_VERSION" "$ver" + show command setenv "${envvar}_HOME" "$dir" + if [[ "$update" = "Y" && "$(_verify_version "$libname")" = "" ]]; then + show warning "Looks like $libname was already updated, noting new version" + _ cd "$dir" + else + if [[ "$update" = "Y" ]]; then show warning "Removing $dir!"; _rm "$dir"; fi + if [[ -d "$dir" ]]; then failwith "directory exists, please remove it: $dir"; fi + local sfx="$(get_suffix "$url")"; if [[ "$sfx" = "tar.gz" ]]; then sfx="tgz"; fi + local file="/tmp/$lib.$sfx" + _retrieve_file "$url" "$file" "$sha256" + _mcd "$dir" + if [[ "$(type -t _unpack_$sfx)" = "function" ]]; then _unpack_$sfx "$file" + else failwith "unknown package file suffix: $sfx"; fi + _rm "$file" + fi + map call_ifdef "$libname.setup" "$libname.init" # can use $ver + if [[ "$setup_function" != "" ]]; then _ "$setup_function"; fi + show command "...text... > $(qstr "$LIB_VERSION_FILE")" + { echo "$ver" + echo "" + echo "This directory has an installation of $libname v$ver" + echo "It has been created by the MMLSpark build script: as long as this file" + echo "exists, the build script is allowed to remove it for version updates" + echo "when needed. Please do not modify it." 
+ } > "$dir/$LIB_VERSION_FILE" + _ cd "$HOME/bin" + local exe + for exe in $exes; do _ ln -sf "../lib/$lib/$bindir/$exe" "$exe"; done + if [[ "$vers" != "" ]]; then + show debug "verifying $libname installation version" + local err="$(_verify_version "$libname")" + if [[ "$err" != "" ]]; then _rm -rf "$dir"; failwith "$err"; fi + fi +} + +# Main entry point +_install_environment() { + # Common directories + _md "$HOME/bin" "$HOME/lib" + # Installations + map _install "${install_packages[@]}" + _ cd "$BASEDIR" + _rm "$CONF_TRACK_FILE" + # Set vars and setup environment initialization + _do_envinits +} diff --git a/tools/runme/runme.sh b/tools/runme/runme.sh new file mode 100755 index 0000000000..f66917ce8a --- /dev/null +++ b/tools/runme/runme.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +# Load once +if [[ "${RUNME_LOADED:-}" = "$$" ]]; then return; else RUNME_LOADED="$$"; fi + +# extra bash globs, quote expansion of quoted parameters +shopt -s globstar extglob extquote + +# Where are we? +RUNMEDIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)" +TOOLSDIR="$(dirname "$RUNMEDIR")" +BASEDIR="$(dirname "$TOOLSDIR")" + +if [[ "${OS:-}" = "Windows_NT" ]]; then + echo "This script cannot run on Windows (yet)." 1>&2; exit 1 +fi + +# PATH for these scripts: conservative (will include "$HOME/bin" later) +PATH="/usr/bin:/bin" + +# shared for runme and all scriplets +. "$RUNMEDIR/utils.sh" +[[ -r "$TOOLSDIR/local-config.sh" ]] && @ "$TOOLSDIR/local-config.sh" +@ "../config.sh"; _post_config + +# main runme functionality +_runme() { + @ "install.sh" + @ "build.sh" + case "$BUILDMODE" in + ( "build" | "server" ) + _install_environment + _full_build + ;; + ( "setup" | "runtime" ) + _install_environment + ;; + ( "" ) + _install_environment + if [[ "$inst_work_done" = "" ]]; then _full_build; exit; fi + show section "$inst_work_done done" + show warning "You can use the environment now," \ + "or run this script again to build." + ;; + ( * ) + failwith "unknown build mode: $BUILDMODE" + ;; + esac +} diff --git a/tools/runme/show-version b/tools/runme/show-version new file mode 100755 index 0000000000..f014081741 --- /dev/null +++ b/tools/runme/show-version @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" + +echo "$MML_VERSION" diff --git a/tools/runme/utils.sh b/tools/runme/utils.sh new file mode 100644 index 0000000000..a102502526 --- /dev/null +++ b/tools/runme/utils.sh @@ -0,0 +1,450 @@ +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +################################################################################ +# Utilities + +# ---< defvar [opts] var val >-------------------------------------------------- +# Use this to define customize-able variables (no effect if it exists). +# Additional arguments are concatenated (wihtout spaces) to make long value +# settings look nice. Use "-x" to export the variable, "-p" to resolve the +# value to an absolute path from where we are, "-f" to set the value even if +# it's already set. You can also use "-d" to define values with delayed +# references to other variables using "...<{var}>..." 
-- these will be replaced +# at the end of processing the config file. +_delayed_vars=() +defvar() { + local opts=""; while [[ "x$1" == "x-"* ]]; do opts+="${1:1}"; shift; done + local var="$1" val v; shift + if [[ "$opts" == *"f"* || -z "${!var+x}" ]]; then + val=""; for v; do val+="$v"; done; printf -v "$var" "%s" "$val"; fi + if [[ "$opts" == *"p"* && "x${!var}" != "/"* ]]; then + printf -v "$var" "%s" "$(realpath -m "${!var}")"; fi + if [[ "$opts" == *"x"* ]]; then export "$var"; fi + if [[ "$opts" == *"d"* ]]; then _delayed_vars+=( "$var" ); fi +} +_replace_delayed_vars() { + local var val pfx sfx change=1 + for var in "${_delayed_vars[@]}"; do + val="${!var}" + while [[ "$val" = *"<{"*"}>"* ]]; do + pfx="${val%%"<{"*}"; val="${val#*"<{"}" + sfx="${val#*"}>"}"; val="${val%%"}>"*}" + val="$pfx${!val}$sfx" + printf -v "$var" "%s" "$val" + done + done +} + +# Parse `X=Y` arguments, stop at a "--" +while [[ "$#" -gt 0 ]]; do case "$1" in + ( *"="* ) defvar -f "${1%%=*}" "${1#*=}" ;; + ( "--" ) shift; break ;; + ( "-h" | "--help" | "help" ) + text="$(<"$BASEDIR/runme")" + text="${text#*$'\n'+(#)$'\n# '}"; text="${text%$'\n'+(#)$'\n'*}" + text="${text//$'\n'#?( )/$'\n'}" + echo "$text" + exit + ;; + ( * ) echo "WARNING: ignoring unrecognized argument \"$1\"" 1>&2; sleep 1 ;; +esac; shift; done + +# ---< @ bash-file arg... >----------------------------------------------------- +# Similar to `script` for loading a bash library file, except that the path is +# relative to the file that used `@`. +@ () { + local lib="$1" srcdir="$(dirname ${BASH_SOURCE[1]})"; shift + lib="$(cd "$srcdir"; realpath "$lib")" + if [[ ! -r "$lib" ]]; then failwith "lib: file not found, $lib"; fi + . "$lib" "$@" +} + +# VSTS: +# Details on available environment variables: +# https://www.visualstudio.com/en-us/docs/build/define/variables +# to color output, start an output line with "##[]", these are +# known tags: section, command, error, warning, debug; also, there are +# various meta "##vso[...]" instructions, see: +# https://github.com/Microsoft/vsts-tasks/blob/master/docs/authoring/commands.md + +# ---< show tag message... >---------------------------------------------------- +# Display a message type classified by the given tag, in a way that is proper +# for the build context: on the build server use VSO magic outputs. Accepted +# tags are: "section", "warning", "command", "debug", "error", or "-" for +# generic output. $hide_in_log can be set to a string holding sensitive +# information that should be hidden in the output. The display uses "$HOME" +# instead of the actual value whenever it appears. +first_show="Y" +hide_in_log="" +show() { + local tag="$1"; shift + if [[ "$first_show" = "Y" ]]; then first_show="N"; + elif [[ "x$tag" = "xsection" ]]; then echo "" + fi + if [[ "x$tag" = "x-" ]]; then tag="" + elif [[ "$BUILDMODE" = "server" ]]; then tag="##[$tag]" + else case "$tag" in + ( "section" ) tag="===>>> " ;; + ( "warning" ) tag="*** " ;; + ( "command" ) tag="$ " ;; + ( "debug" ) tag=">> " ;; + ( "error" ) tag="!!! " ;; + ( * ) failwith "this script is broken, don't know about display tag: $tag" ;; + esac; fi + local line="$tag${*//"$HOME"/\$HOME}" + if [[ "$hide_in_log" != "" ]]; then + line="$tag${*//"$hide_in_log"/[...]}" + fi + echo "$line" +} + +# ---< failwith message... >---------------------------------------------------- +# Abort the run with the given error message. 
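+# It does not return (the whole run exits with status 1), so it can be used as
+# the right-hand side of a guard, e.g.:
+#   [[ -d "$BASEDIR" ]] || failwith "missing base directory: $BASEDIR"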
+failwith() { show error "Error: $*" 1>&2; exit 1; } + +_killed_handler() { echo ""; failwith "Aborting..."; } +builtin trap _killed_handler 2 3 9 15 + +# ---< map cmd arg... >--------------------------------------------------------- +# Apply $cmd on each of the arguments. +map() { local cmd="$1" arg; shift; for arg; do $cmd "$arg"; done; } + +# ---< echo_exit message... >--------------------------------------------------- +# Echo a message on exit. +_exit_strings=() +_show_exit_strings() { map echo "${_exit_strings[@]}"; } +trap _show_exit_strings 0 +echo_exit() { _exit_strings+=("$*"); } + +# protection from mistakingly overwriting traps in scriplets +trap() { failwith "cannot overwrite traps (in \"trap $*\")"; } + +# ---< qstr [-not-dollar] str... >---------------------------------------------- +# Quotes the input as a shell-parsable string, also using $HOME instead of its +# value (better than printf with "%q" which tends to uglingly backslash spaces). +# If "-not-dollar" then avoid quoting dollar signs. +qstr() { + local replace='\ " ` $' + if [[ "x$1" = "x-not-dollar" ]]; then replace='\ " `'; shift; fi + local str="$*" ch + for ch in $replace; do str="${str//"$ch"/\\$ch}"; done + echo "\"${str//$HOME/\$HOME}\"" +} + +# ---< maybe_qstr str... >------------------------------------------------------ +# Quotes the input as a shell-parsable string (using qstr) only if needed. +maybe_qstr() { + local str="$*" + if [[ "$(printf "%q" "$str")" = "$str" ]]; then echo "$str"; else qstr "$str"; fi +} + +# ---< _ [flags] cmd arg... >--------------------------------------------------- +# Run the given $cmd very carefuly. Exit on error, unless flags have "-a". +# Normally, the command is shown (using "show command") unless flags have "-q". +# If $collect_log is set to 1 then instead of showing the command's stdout it is +# captured in $collected_logs (which can also be used to suppress showing the +# output), or set it to 2 to capture both stdout and stderr (this is better than +# redirecting to /dev/null since that will swallow failure messages as well). +collect_log=0 collected_log="" +declare -A known_exes +_() { + local verbose=1 abortonfail=1 + while [[ "x$1" = "x-"* ]]; do + case "${1#-}" in + ( "q" ) verbose=0 ;; + ( "a" ) abortonfail=0 ;; + ( * ) failwith "internal error, unknown flag for '_': $1" + esac + shift + done + local sets=() + while [[ "$1" =~ ^[A-Za-z_][A-Za-z_0-9]*= ]]; do sets+=( "$1" ); shift; done + local cmd="$1"; shift + local exe="${known_exes[$cmd]}" + if [[ "$exe" = "" ]]; then + exe="$(type -p "$cmd")" + if [[ "$exe" = "" && "$(type -t "$cmd")" != "" ]]; then exe="$cmd"; fi + if [[ "$exe" = "" ]]; then failwith "could not find executable: $cmd"; fi + known_exes[$cmd]="$exe" + fi + if ((verbose)); then + local to_show="" x + for x in "${sets[@]}"; do to_show+=" ${x%%=*}=$(maybe_qstr "${x#*=}")"; done + for x in "$cmd" "$@"; do to_show+=" $(maybe_qstr "$x")"; done + show command "${to_show:1}" + fi + args=( "$@" ) + __run_it__() { + case $collect_log in + ( 2 ) collected_log="$("$exe" "${args[@]}" 2>&1)" ;; + ( 1 ) collected_log="$("$exe" "${args[@]}")" ;; + ( * ) "$exe" "${args[@]}" ;; + esac + } + if [[ "${#sets[@]}" = 0 ]]; then __run_it__ + else # can't put "x=y"s in a variable, so use eval + local pfx="" + for x in "${sets[@]}"; do pfx+="${x%%=*}=$(printf "%q" "${x#*=}") "; done + eval "${pfx}__run_it__" + fi + local ret=$? + if [[ $ret != 0 && $abortonfail -ge 1 ]]; then failwith "failure when running $cmd $*" + else return $ret; fi +} + +# ---< __ cmd arg... 
>----------------------------------------------------------
+# Convenient shorthand for "_ -q cmd arg..."
+__() { _ -q "$@"; } # same, but no command display
+
+# ---< ___ cmd arg... >---------------------------------------------------------
+# Convenient shorthand for "_ -q -a cmd arg..."
+___() { _ -q -a "$@"; } # same, but no command display and no aborting on failure
+
+# ---< _rm path... >------------------------------------------------------------
+# Removes a file or directory if it exists.
+_rm_() {
+  if [[ -d "$1" ]]; then _ rm -rf "$1"; elif [[ -e "$1" ]]; then _ rm -f "$1"; fi;
+}
+_rm() { map _rm_ "$@"; }
+
+# ---< _md dir >----------------------------------------------------------------
+# Create a directory (with -p) if it doesn't exist.
+_md_() { if [[ ! -d "$1" ]]; then _ mkdir -p "$1"; fi; }
+_md() { map _md_ "$@"; }
+
+# ---< _mcd dir >---------------------------------------------------------------
+# Create a directory (with -p) and cd into it.
+_mcd() { _md "$1"; _ cd "$1"; }
+
+# ---< _rmcd dir >--------------------------------------------------------------
+# Same as _mcd, removing the directory if it exists.
+_rmcd() { _rm "$1"; _mcd "$1"; }
+
+# ---< get_suffix path >--------------------------------------------------------
+# Prints the suffix of a given path. Properly deals with filenames that begin
+# with a "." and with multiple suffixes (like ".tar.gz"); suffixes are
+# alphanumeric with at least one alphabetic character.
+get_suffix() {
+  rx="[^/](([.][a-zA-Z0-9_]*[a-zA-Z][a-zA-Z0-9_]*)+)$"
+  if [[ "$1" =~ $rx ]]; then echo "${BASH_REMATCH[1]:1}"; fi
+}
+
+# ---< call_ifdef [_] fun arg... >----------------------------------------------
+# If the named function exists, calls it with the given arguments. Calls only
+# functions, not external executables or builtins.
+call_ifdef() {
+  local pfx=""; if [[ "$1" = "_" ]]; then pfx="_"; shift; fi
+  local fun="$1"; shift
+  if [[ "$(type -t "$fun")" = "function" ]]; then $pfx "$fun" "$@"; fi
+}
+
+# ---< deftag tag [supertag] >--------------------------------------------------
+# Define a tag, possibly as a subtag of supertag (which defaults to `all`).
+# Note that tags are shared for both `$TESTS` and `$PUBLISH`.
+declare -A _tag_parent
+deftag() {
+  if [[ -n "$2" && "$2" != "all" && -z "${_tag_parent[$2]}" ]]; then
+    failwith "deftag: unknown parent tag, $2"
+  fi
+  _tag_parent[$1]="${2:-all}"
+}
+
+# ---< should what tag... >-----------------------------------------------------
+# Returns a zero (success) status if `tag` should be "what"-ed (tested or
+# published) according to $TESTS or $PUBLISH, or one (failure) status otherwise.
+# If more than one tag is given, scalatest semantics are followed: succeed if at
+# least one of the tags is included and none are excluded. Convenient to use as:
+# `should test foo && run_foo_test`.
+_get_tag_value() {
+  local ret="${info[$1]}"
+  if [[ "$ret" = "" ]]; then
+    if [[ -z "${_tag_parent[$1]}" ]]; then ret="."
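+    # "." marks a tag that was never mentioned in the spec (neither included
+    # nor excluded); _has_tag below only reacts to "1" (include) and "0" (exclude)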
+ else ret="$(_get_tag_value "${_tag_parent[$1]}")"; fi + info[$1]=$ret + fi + echo $ret +} +_get_valid_tag_value() { + if [[ "$1" != "all" && "$1" != "none" && -z "${_tag_parent[$1]}" ]]; then + failwith "should: unknown tag, $1 {{${_tag_parent[$1]}}}" + fi + _get_tag_value "$1" +} +_has_tag() { + local -n info="$1"; shift + if (($# == 0)); then failwith "should: missing tag(s)"; fi + # mimic the scalatest logic: needs one tag included and none excluded + local r="" t; for t; do r+="$(_get_valid_tag_value "$t")"; done + [[ "$r" = *1* && "$r" != *0* ]] +} +should() { + what="$1"; shift + case "$what" in + ( "test" | "publish" ) _has_tag "_${what}_info" "$@" ;; + ( * ) failwith "should: unknown tag info, $what" ;; + esac +} + +# ---< get_install_info libname key >------------------------------------------- +# Print the value for $key in the setup section of $libname. Properly deals +# with various default values. +get_install_info() { + local ret="${_install_info[$1.$2]}" + if [[ "$ret" = "" ]]; then + case "$2" in + ( "lib" ) ret="${1,,}" ;; + ( "envvar" ) ret="${1^^}" ;; + ( "bindir" ) ret="bin" ;; + esac + if [[ "$ret" != "" ]]; then + _install_info[$1.$2]="$ret"; _replace_ver_in_info $1.$2 + fi + fi + echo "$ret" +} + +# ---< set_install_info_vars libname key... >----------------------------------- +# Get the value for each $key in the setup section of $libname, and set the +# variable whose name is $key to this value. +set_install_info_vars() { + local libname="$1" var val; shift + for var; do + printf -v "$var" "%s" "$(get_install_info "$libname" "$var")" + done +} + +# ---< env_eval str... >-------------------------------------------------------- +# Evaluate an expression and make sure that it's also included in the +# user's environment. The commands are held in $envinit_commands which +# can be added to if you want to include something in the environment +# but not evaluate it right now. (This is written out by code in "install.sh".) +envinit_commands=('export MMLSPARK_PROFILE="yes"') +envinit_eval() { envinit_commands+=("$*"); eval "$*"; } + +# ---< setenv var val >--------------------------------------------------------- +# Set an environment variable; include the setting in the user environment too. +setenv() { envinit_eval "export $1=$(qstr "$2")"; } + +# ---< get_runtime_hash >------------------------------------------------------- +# Prints out a hash of the currently configured runtime environment. The hash +# depends on the relevant bits of configuration, including a .setup and .init +# function definitions, if any. 
+get_runtime_hash() { + local hash="$( + for libname in "${install_packages[@]}"; do + set_install_info_vars "$libname" lib sha256 instcmd exes where + if [[ " $where " != *" runtime "* ]]; then continue; fi + printf "%s\n" "$libname" "$lib" "$sha256" "$instcmd" "$exes" \ + "$(declare -f "$libname.setup" "$libname.init")" + done | sha256sum)" + echo "${hash%% *}" +} + +# ------------------------------------------------------------------------------ +# Internal functions follow + +# Parse tag specs, used for $TESTS +_parse_tags() { + local -n tags="$1" info="$2" + tags="${tags,,}"; tags="${tags// /,}" + while [[ "$tags" != "${tags//,,/,}" ]]; do tags="${tags//,,/,}"; done + tags="${tags#,}"; tags="${tags%,}"; tags=",$tags" + while [[ "$tags" =~ (.*)","([^+-].*) ]]; do # just "tag" is the same as "+tag" + tags="${BASH_REMATCH[1]},+${BASH_REMATCH[2]}"; done + tags="${tags#,}" + if [[ "$tags," =~ [+-], ]]; then + failwith "empty tag in \$$1" + elif [[ "$tags" =~ [+-]([a-zA-Z0-9_]*[^a-zA-Z0-9_,][^,]*) ]]; then + failwith "bad \$$1 tag name: ${BASH_REMATCH[1]}" + fi + local t pos=0 ts="$tags" + ts="${ts//,/ }" ts="${ts//+/1}"; ts="${ts//-/0}" + for t in $ts; do [[ ${t:0:1} = 1 ]] && pos=1; info[${t:1}]=${t:0:1}; done + if ((!pos)); then info[all]=${info[all]:-1}; fi # no positives => all + if [[ "$tags" == "+"@("all"|"none") ]]; then tags="${tags:1}"; fi +} +declare -A _test_info _publish_info +_parse_TESTS() { _parse_tags TESTS _test_info; } +_parse_PUBLISH() { _parse_tags PUBLISH _publish_info; } + +# Defines $MML_VERSION and $MML_BUILD_INFO +_set_build_info() { + local info version + # make it possible to avoid running git + if [[ ! -z "$MML_BUILD_INFO" && ! -z "$MML_VERSION" ]]; then + info="$MML_BUILD_INFO"; version="$MML_VERSION" + else + local owd="$PWD"; cd "$BASEDIR" + # sanity checks for version tags + local t rx="(0|[1-9][0-9]*)"; rx="^v$rx[.]$rx([.]$rx)?$" + for t in $(git tag -l); do + if [[ ! "$t" =~ $rx ]]; then failwith "found a bad tag name \"$t\""; fi + done + if [[ -r "$BUILD_ARTIFACTS/version" ]]; then + # if there is a built version, use it, so that we don't get a new + # version after commits are made + version="$(< "$BUILD_ARTIFACTS/version")" + else + version="$(git describe --dirty=".dirty" --match "v*")" + # convert it to something that works for pip wheels + version="${version#v}"; version="${version/-g/+g}"; version="${version/-/.dev}" + fi + if [[ "$BUILDMODE" != "server" || "$AGENT_ID" = "" ]]; then + if [[ ! -r "$BUILD_ARTIFACTS/version" ]]; then + version="${version/+/+local.}" + fi + info="Local build: ${USERNAME:-$USER} ${BASEDIR:-$PWD}" + local line + info+="$( + git branch --no-color -vv --contains HEAD --merged | \ + while read line; do + if [[ "x$line" = "x*"* ]]; then + line="${line#"* "}"; + if [[ "$line" = *\[*\]* ]]; then line="${line%%\]*}]"; fi + if [[ "$line" = *\(*\)* ]]; then line="${line%%)*})"; fi + echo "//$line"; fi; done)" + if ! git diff-index --quiet HEAD; then info+=" (dirty)"; fi + else + local branch="${BUILD_SOURCEBRANCH#refs/heads/}" + # drop the commit sha1 for builds that are on the main line + if [[ "$BUILDPR:$branch" = ":master" && ! 
-r "$BUILD_ARTIFACTS/version" ]]; then + version="${version%+g[0-9a-f][0-9a-f]*}" + fi + info="$BUILD_REPOSITORY_NAME/$branch@${BUILD_SOURCEVERSION:0:8}" + info+="; $BUILD_DEFINITIONNAME#$BUILD_BUILDNUMBER" + fi + info="$version: $info" + cd "$owd" + fi + defvar -x MML_VERSION "$version" + defvar -x MML_BUILD_INFO "$info" +} + +# Parse $INSTALLATIONS info +declare -A _install_info +install_packages=() +_parse_install_info() { + local key="" libname="" x keys=1 + for x in "${INSTALLATIONS[@]}"; do + if [[ "$key" != "" ]]; then _install_info[${libname}.${key%:}]="$x" key="" + elif [[ "$x" = *: ]]; then key="$x" keys=1 + elif [[ "$x" != [A-Z]* ]]; then failwith "bad package name: $x" + elif ((!keys)); then failwith "install entry with no keys: $libname" + else libname="$x"; key=""; keys=0; install_packages+=("$x") + fi + done + # replace "<{ver}>"s + for x in "${!_install_info[@]}"; do _replace_ver_in_info "$x"; done +} +_replace_ver_in_info() { # lib.field + _install_info[$1]="${_install_info[$1]//"<{ver}>"/"${_install_info[${1%%.*}.ver]}"}" +} + +_post_config() { + _set_build_info + _parse_install_info + _parse_TESTS + _parse_PUBLISH + _replace_delayed_vars +} diff --git a/tools/tests/tags.sh b/tools/tests/tags.sh new file mode 100755 index 0000000000..d036958364 --- /dev/null +++ b/tools/tests/tags.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash +# Copyright (C) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See LICENSE in project root for information. + +. "$(dirname "${BASH_SOURCE[0]}")/../../runme" "$@" + +map deftag a b c +num=0 fail=0 + +try() { + local test="$*" + local flags="${test%% => *}" expect="${test##* => }" + flags=$(echo $flags) + local res=$(TESTS="$flags"; unset _test_info; declare -A _test_info + _parse_TESTS + should test a && echo A + should test b && echo B + should test c && echo C + should test a b && echo AB) + res=$(echo { $res }) + ((num++)) + if [[ "$expect" != "$res" ]]; then + ((fail++)) + echo "FAIL: TEST=\"$flags\": expected $expect, got $res" + fi +} + +report() { + if ((fail == 0)); then echo "All tests passed"; exit 0 + else echo "$fail/$num tests failed"; exit 1; fi +} + +# The following is an exhaustive list of all a/b/c options, verified with +# scalatest. To try it: +# import org.scalatest.{FunSuite, Tag} +# object A extends Tag("a"); object B extends Tag("b"); object C extends Tag("c") +# class ExampleSpec extends FunSuite { +# test("A", A) {}; test("B", B) {}; test("C", C) {}; test("AB", A, B) {} +# } +# and then in sbt use -n for + and -l for -, eg: test-only * -- -n a -n b -l c + +try " => { A B C AB }" +try "+a +b +c => { A B C AB }" +try "+a +b => { A B AB }" +try "+a +b -c => { A B AB }" +try " -c => { A B AB }" +try "+a +c => { A C AB }" +try " +b +c => { B C AB }" +try "+a => { A AB }" +try "+a -c => { A AB }" +try " +b => { B AB }" +try " +b -c => { B AB }" +try "-a => { B C }" +try "-a +b +c => { B C }" +try " -b => { A C }" +try "+a -b +c => { A C }" +try "+a -b => { A }" +try " -b -c => { A }" +try "+a -b -c => { A }" +try "-a +b => { B }" +try "-a -c => { B }" +try "-a +b -c => { B }" +try "-a -b => { C }" +try " +c => { C }" +try "-a +c => { C }" +try " -b +c => { C }" +try "-a -b +c => { C }" +try "-a -b -c => { }" + +report